diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,102186 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999691643539932, + "eval_steps": 500, + "global_step": 14592, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00020557097337855896, + "grad_norm": 6.1312150955200195, + "learning_rate": 2.0547945205479452e-07, + "loss": 1.5536, + "step": 1 + }, + { + "epoch": 0.0004111419467571179, + "grad_norm": 6.5972065925598145, + "learning_rate": 4.1095890410958903e-07, + "loss": 1.5863, + "step": 2 + }, + { + "epoch": 0.0006167129201356768, + "grad_norm": 5.653270244598389, + "learning_rate": 6.164383561643835e-07, + "loss": 1.491, + "step": 3 + }, + { + "epoch": 0.0008222838935142358, + "grad_norm": 6.296363830566406, + "learning_rate": 8.219178082191781e-07, + "loss": 1.5794, + "step": 4 + }, + { + "epoch": 0.0010278548668927947, + "grad_norm": 1.6855748891830444, + "learning_rate": 1.0273972602739727e-06, + "loss": 0.8016, + "step": 5 + }, + { + "epoch": 0.0012334258402713536, + "grad_norm": 5.977898120880127, + "learning_rate": 1.232876712328767e-06, + "loss": 1.5352, + "step": 6 + }, + { + "epoch": 0.0014389968136499125, + "grad_norm": 1.6171352863311768, + "learning_rate": 1.4383561643835616e-06, + "loss": 0.7893, + "step": 7 + }, + { + "epoch": 0.0016445677870284717, + "grad_norm": 5.830033302307129, + "learning_rate": 1.6438356164383561e-06, + "loss": 1.5201, + "step": 8 + }, + { + "epoch": 0.0018501387604070306, + "grad_norm": 5.7432990074157715, + "learning_rate": 1.8493150684931507e-06, + "loss": 1.5304, + "step": 9 + }, + { + "epoch": 0.0020557097337855893, + "grad_norm": 5.845605373382568, + "learning_rate": 2.0547945205479454e-06, + "loss": 1.5429, + "step": 10 + }, + { + "epoch": 0.0022612807071641485, + "grad_norm": 5.0305399894714355, + "learning_rate": 2.2602739726027396e-06, + "loss": 1.4662, + "step": 11 + }, + { + "epoch": 0.002466851680542707, + "grad_norm": 5.128103256225586, + "learning_rate": 2.465753424657534e-06, + "loss": 1.4793, + "step": 12 + }, + { + "epoch": 0.0026724226539212663, + "grad_norm": 4.975289344787598, + "learning_rate": 2.6712328767123286e-06, + "loss": 1.4448, + "step": 13 + }, + { + "epoch": 0.002877993627299825, + "grad_norm": 4.8694987297058105, + "learning_rate": 2.876712328767123e-06, + "loss": 1.4351, + "step": 14 + }, + { + "epoch": 0.003083564600678384, + "grad_norm": 1.9994945526123047, + "learning_rate": 3.0821917808219177e-06, + "loss": 0.7466, + "step": 15 + }, + { + "epoch": 0.0032891355740569434, + "grad_norm": 5.214486598968506, + "learning_rate": 3.2876712328767123e-06, + "loss": 1.293, + "step": 16 + }, + { + "epoch": 0.003494706547435502, + "grad_norm": 5.771518707275391, + "learning_rate": 3.493150684931507e-06, + "loss": 1.2649, + "step": 17 + }, + { + "epoch": 0.0037002775208140612, + "grad_norm": 5.648902893066406, + "learning_rate": 3.6986301369863014e-06, + "loss": 1.2114, + "step": 18 + }, + { + "epoch": 0.00390584849419262, + "grad_norm": 2.0775961875915527, + "learning_rate": 3.904109589041096e-06, + "loss": 1.1923, + "step": 19 + }, + { + "epoch": 0.004111419467571179, + "grad_norm": 1.4989817142486572, + "learning_rate": 4.109589041095891e-06, + "loss": 1.1838, + "step": 20 + }, + { + "epoch": 0.004316990440949738, + "grad_norm": 1.3304322957992554, + "learning_rate": 4.315068493150685e-06, + "loss": 1.1334, + "step": 21 + }, + { + "epoch": 0.004522561414328297, + "grad_norm": 1.2907218933105469, + "learning_rate": 4.520547945205479e-06, + "loss": 1.126, + "step": 22 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 1.3639134168624878, + "learning_rate": 4.726027397260274e-06, + "loss": 1.1328, + "step": 23 + }, + { + "epoch": 0.004933703361085414, + "grad_norm": 1.2764439582824707, + "learning_rate": 4.931506849315068e-06, + "loss": 1.1774, + "step": 24 + }, + { + "epoch": 0.005139274334463974, + "grad_norm": 1.016863465309143, + "learning_rate": 5.136986301369863e-06, + "loss": 0.7168, + "step": 25 + }, + { + "epoch": 0.005344845307842533, + "grad_norm": 0.9651275277137756, + "learning_rate": 5.342465753424657e-06, + "loss": 0.7096, + "step": 26 + }, + { + "epoch": 0.005550416281221091, + "grad_norm": 0.922505795955658, + "learning_rate": 5.547945205479452e-06, + "loss": 1.0915, + "step": 27 + }, + { + "epoch": 0.00575598725459965, + "grad_norm": 1.0005972385406494, + "learning_rate": 5.753424657534246e-06, + "loss": 1.1167, + "step": 28 + }, + { + "epoch": 0.00596155822797821, + "grad_norm": 1.006510615348816, + "learning_rate": 5.958904109589041e-06, + "loss": 1.1025, + "step": 29 + }, + { + "epoch": 0.006167129201356768, + "grad_norm": 1.069066047668457, + "learning_rate": 6.1643835616438354e-06, + "loss": 1.0833, + "step": 30 + }, + { + "epoch": 0.006372700174735327, + "grad_norm": 1.1197434663772583, + "learning_rate": 6.36986301369863e-06, + "loss": 0.7086, + "step": 31 + }, + { + "epoch": 0.006578271148113887, + "grad_norm": 1.1849225759506226, + "learning_rate": 6.5753424657534245e-06, + "loss": 0.7198, + "step": 32 + }, + { + "epoch": 0.0067838421214924454, + "grad_norm": 1.0908714532852173, + "learning_rate": 6.7808219178082195e-06, + "loss": 1.0882, + "step": 33 + }, + { + "epoch": 0.006989413094871004, + "grad_norm": 1.1033886671066284, + "learning_rate": 6.986301369863014e-06, + "loss": 1.0619, + "step": 34 + }, + { + "epoch": 0.007194984068249563, + "grad_norm": 0.9067010283470154, + "learning_rate": 7.191780821917809e-06, + "loss": 1.0383, + "step": 35 + }, + { + "epoch": 0.0074005550416281225, + "grad_norm": 0.7680827379226685, + "learning_rate": 7.397260273972603e-06, + "loss": 1.0172, + "step": 36 + }, + { + "epoch": 0.007606126015006681, + "grad_norm": 0.6832679510116577, + "learning_rate": 7.602739726027398e-06, + "loss": 1.0656, + "step": 37 + }, + { + "epoch": 0.00781169698838524, + "grad_norm": 0.631285548210144, + "learning_rate": 7.808219178082192e-06, + "loss": 1.0222, + "step": 38 + }, + { + "epoch": 0.008017267961763799, + "grad_norm": 0.6489036083221436, + "learning_rate": 8.013698630136987e-06, + "loss": 1.0526, + "step": 39 + }, + { + "epoch": 0.008222838935142357, + "grad_norm": 0.7755447626113892, + "learning_rate": 8.219178082191782e-06, + "loss": 0.7246, + "step": 40 + }, + { + "epoch": 0.008428409908520916, + "grad_norm": 0.7579307556152344, + "learning_rate": 8.424657534246575e-06, + "loss": 1.0303, + "step": 41 + }, + { + "epoch": 0.008633980881899476, + "grad_norm": 0.746900200843811, + "learning_rate": 8.63013698630137e-06, + "loss": 1.0118, + "step": 42 + }, + { + "epoch": 0.008839551855278035, + "grad_norm": 0.6753754615783691, + "learning_rate": 8.835616438356165e-06, + "loss": 1.0531, + "step": 43 + }, + { + "epoch": 0.009045122828656594, + "grad_norm": 0.6792585253715515, + "learning_rate": 9.041095890410958e-06, + "loss": 1.0317, + "step": 44 + }, + { + "epoch": 0.009250693802035153, + "grad_norm": 0.6036022305488586, + "learning_rate": 9.246575342465753e-06, + "loss": 1.0008, + "step": 45 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 0.5249003767967224, + "learning_rate": 9.452054794520548e-06, + "loss": 1.0103, + "step": 46 + }, + { + "epoch": 0.00966183574879227, + "grad_norm": 0.48237892985343933, + "learning_rate": 9.657534246575343e-06, + "loss": 1.0129, + "step": 47 + }, + { + "epoch": 0.009867406722170829, + "grad_norm": 0.4669821858406067, + "learning_rate": 9.863013698630136e-06, + "loss": 0.6748, + "step": 48 + }, + { + "epoch": 0.01007297769554939, + "grad_norm": 0.7257899045944214, + "learning_rate": 1.0068493150684931e-05, + "loss": 1.0394, + "step": 49 + }, + { + "epoch": 0.010278548668927948, + "grad_norm": 0.5101274847984314, + "learning_rate": 1.0273972602739726e-05, + "loss": 0.9956, + "step": 50 + }, + { + "epoch": 0.010484119642306507, + "grad_norm": 0.4904460906982422, + "learning_rate": 1.0479452054794521e-05, + "loss": 1.0081, + "step": 51 + }, + { + "epoch": 0.010689690615685065, + "grad_norm": 0.49294978380203247, + "learning_rate": 1.0684931506849315e-05, + "loss": 0.9707, + "step": 52 + }, + { + "epoch": 0.010895261589063624, + "grad_norm": 0.5110352039337158, + "learning_rate": 1.089041095890411e-05, + "loss": 0.9684, + "step": 53 + }, + { + "epoch": 0.011100832562442183, + "grad_norm": 0.44021663069725037, + "learning_rate": 1.1095890410958904e-05, + "loss": 0.9872, + "step": 54 + }, + { + "epoch": 0.011306403535820742, + "grad_norm": 0.5229463577270508, + "learning_rate": 1.13013698630137e-05, + "loss": 0.9821, + "step": 55 + }, + { + "epoch": 0.0115119745091993, + "grad_norm": 0.4633481502532959, + "learning_rate": 1.1506849315068493e-05, + "loss": 0.9858, + "step": 56 + }, + { + "epoch": 0.01171754548257786, + "grad_norm": 0.43951645493507385, + "learning_rate": 1.1712328767123288e-05, + "loss": 0.9608, + "step": 57 + }, + { + "epoch": 0.01192311645595642, + "grad_norm": 0.46415814757347107, + "learning_rate": 1.1917808219178083e-05, + "loss": 0.9831, + "step": 58 + }, + { + "epoch": 0.012128687429334978, + "grad_norm": 0.35238775610923767, + "learning_rate": 1.2123287671232878e-05, + "loss": 0.671, + "step": 59 + }, + { + "epoch": 0.012334258402713537, + "grad_norm": 0.4979459047317505, + "learning_rate": 1.2328767123287671e-05, + "loss": 0.9634, + "step": 60 + }, + { + "epoch": 0.012539829376092096, + "grad_norm": 0.40928781032562256, + "learning_rate": 1.2534246575342466e-05, + "loss": 0.9618, + "step": 61 + }, + { + "epoch": 0.012745400349470654, + "grad_norm": 0.35449472069740295, + "learning_rate": 1.273972602739726e-05, + "loss": 0.6745, + "step": 62 + }, + { + "epoch": 0.012950971322849213, + "grad_norm": 0.5600117444992065, + "learning_rate": 1.2945205479452054e-05, + "loss": 0.9651, + "step": 63 + }, + { + "epoch": 0.013156542296227773, + "grad_norm": 0.4429936110973358, + "learning_rate": 1.3150684931506849e-05, + "loss": 0.9478, + "step": 64 + }, + { + "epoch": 0.013362113269606332, + "grad_norm": 0.47870925068855286, + "learning_rate": 1.3356164383561644e-05, + "loss": 0.9631, + "step": 65 + }, + { + "epoch": 0.013567684242984891, + "grad_norm": 0.4984883964061737, + "learning_rate": 1.3561643835616439e-05, + "loss": 0.9612, + "step": 66 + }, + { + "epoch": 0.01377325521636345, + "grad_norm": 0.43905192613601685, + "learning_rate": 1.3767123287671232e-05, + "loss": 0.9495, + "step": 67 + }, + { + "epoch": 0.013978826189742008, + "grad_norm": 0.4528709650039673, + "learning_rate": 1.3972602739726027e-05, + "loss": 0.9597, + "step": 68 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 0.2834670841693878, + "learning_rate": 1.4178082191780822e-05, + "loss": 0.6768, + "step": 69 + }, + { + "epoch": 0.014389968136499126, + "grad_norm": 0.736508846282959, + "learning_rate": 1.4383561643835617e-05, + "loss": 0.9616, + "step": 70 + }, + { + "epoch": 0.014595539109877684, + "grad_norm": 0.2635529935359955, + "learning_rate": 1.458904109589041e-05, + "loss": 0.6671, + "step": 71 + }, + { + "epoch": 0.014801110083256245, + "grad_norm": 0.5397729873657227, + "learning_rate": 1.4794520547945205e-05, + "loss": 0.9488, + "step": 72 + }, + { + "epoch": 0.015006681056634804, + "grad_norm": 0.23914408683776855, + "learning_rate": 1.5e-05, + "loss": 0.6537, + "step": 73 + }, + { + "epoch": 0.015212252030013362, + "grad_norm": 0.6451640129089355, + "learning_rate": 1.5205479452054795e-05, + "loss": 0.954, + "step": 74 + }, + { + "epoch": 0.015417823003391921, + "grad_norm": 0.37705564498901367, + "learning_rate": 1.541095890410959e-05, + "loss": 0.9367, + "step": 75 + }, + { + "epoch": 0.01562339397677048, + "grad_norm": 0.5562038421630859, + "learning_rate": 1.5616438356164384e-05, + "loss": 0.9374, + "step": 76 + }, + { + "epoch": 0.01582896495014904, + "grad_norm": 0.2332352101802826, + "learning_rate": 1.582191780821918e-05, + "loss": 0.6542, + "step": 77 + }, + { + "epoch": 0.016034535923527597, + "grad_norm": 0.5999805331230164, + "learning_rate": 1.6027397260273974e-05, + "loss": 0.9342, + "step": 78 + }, + { + "epoch": 0.016240106896906158, + "grad_norm": 0.3581260144710541, + "learning_rate": 1.623287671232877e-05, + "loss": 0.9542, + "step": 79 + }, + { + "epoch": 0.016445677870284715, + "grad_norm": 0.5643858909606934, + "learning_rate": 1.6438356164383563e-05, + "loss": 0.9312, + "step": 80 + }, + { + "epoch": 0.016651248843663275, + "grad_norm": 0.5196654200553894, + "learning_rate": 1.6643835616438355e-05, + "loss": 0.9256, + "step": 81 + }, + { + "epoch": 0.016856819817041832, + "grad_norm": 0.37860536575317383, + "learning_rate": 1.684931506849315e-05, + "loss": 0.9139, + "step": 82 + }, + { + "epoch": 0.017062390790420393, + "grad_norm": 0.6562532186508179, + "learning_rate": 1.7054794520547945e-05, + "loss": 0.8984, + "step": 83 + }, + { + "epoch": 0.017267961763798953, + "grad_norm": 0.4133750796318054, + "learning_rate": 1.726027397260274e-05, + "loss": 0.905, + "step": 84 + }, + { + "epoch": 0.01747353273717751, + "grad_norm": 0.38232654333114624, + "learning_rate": 1.7465753424657535e-05, + "loss": 0.9202, + "step": 85 + }, + { + "epoch": 0.01767910371055607, + "grad_norm": 0.5049018859863281, + "learning_rate": 1.767123287671233e-05, + "loss": 0.9235, + "step": 86 + }, + { + "epoch": 0.017884674683934627, + "grad_norm": 0.4014778137207031, + "learning_rate": 1.7876712328767125e-05, + "loss": 0.9272, + "step": 87 + }, + { + "epoch": 0.018090245657313188, + "grad_norm": 0.45734459161758423, + "learning_rate": 1.8082191780821916e-05, + "loss": 0.9312, + "step": 88 + }, + { + "epoch": 0.018295816630691745, + "grad_norm": 0.46464303135871887, + "learning_rate": 1.828767123287671e-05, + "loss": 0.9394, + "step": 89 + }, + { + "epoch": 0.018501387604070305, + "grad_norm": 0.39655131101608276, + "learning_rate": 1.8493150684931506e-05, + "loss": 0.9133, + "step": 90 + }, + { + "epoch": 0.018706958577448866, + "grad_norm": 0.36367830634117126, + "learning_rate": 1.86986301369863e-05, + "loss": 0.9085, + "step": 91 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 0.4867264926433563, + "learning_rate": 1.8904109589041096e-05, + "loss": 0.8848, + "step": 92 + }, + { + "epoch": 0.019118100524205983, + "grad_norm": 0.3669883906841278, + "learning_rate": 1.910958904109589e-05, + "loss": 0.8986, + "step": 93 + }, + { + "epoch": 0.01932367149758454, + "grad_norm": 0.4508739411830902, + "learning_rate": 1.9315068493150686e-05, + "loss": 0.9478, + "step": 94 + }, + { + "epoch": 0.0195292424709631, + "grad_norm": 0.4065166711807251, + "learning_rate": 1.952054794520548e-05, + "loss": 0.9318, + "step": 95 + }, + { + "epoch": 0.019734813444341658, + "grad_norm": 0.21278417110443115, + "learning_rate": 1.9726027397260273e-05, + "loss": 0.6272, + "step": 96 + }, + { + "epoch": 0.019940384417720218, + "grad_norm": 0.5677651762962341, + "learning_rate": 1.9931506849315068e-05, + "loss": 0.9209, + "step": 97 + }, + { + "epoch": 0.02014595539109878, + "grad_norm": 0.4079231023788452, + "learning_rate": 2.0136986301369863e-05, + "loss": 0.9189, + "step": 98 + }, + { + "epoch": 0.020351526364477335, + "grad_norm": 0.3942011892795563, + "learning_rate": 2.0342465753424658e-05, + "loss": 0.8827, + "step": 99 + }, + { + "epoch": 0.020557097337855896, + "grad_norm": 0.5771577954292297, + "learning_rate": 2.0547945205479453e-05, + "loss": 0.885, + "step": 100 + }, + { + "epoch": 0.020762668311234453, + "grad_norm": 0.35876256227493286, + "learning_rate": 2.0753424657534248e-05, + "loss": 0.867, + "step": 101 + }, + { + "epoch": 0.020968239284613013, + "grad_norm": 0.47500577569007874, + "learning_rate": 2.0958904109589043e-05, + "loss": 0.8921, + "step": 102 + }, + { + "epoch": 0.02117381025799157, + "grad_norm": 0.4215965270996094, + "learning_rate": 2.1164383561643834e-05, + "loss": 0.883, + "step": 103 + }, + { + "epoch": 0.02137938123137013, + "grad_norm": 0.41377994418144226, + "learning_rate": 2.136986301369863e-05, + "loss": 0.9116, + "step": 104 + }, + { + "epoch": 0.021584952204748688, + "grad_norm": 0.4422590434551239, + "learning_rate": 2.1575342465753424e-05, + "loss": 0.9215, + "step": 105 + }, + { + "epoch": 0.021790523178127248, + "grad_norm": 0.39756667613983154, + "learning_rate": 2.178082191780822e-05, + "loss": 0.8749, + "step": 106 + }, + { + "epoch": 0.02199609415150581, + "grad_norm": 0.3924627900123596, + "learning_rate": 2.1986301369863014e-05, + "loss": 0.9013, + "step": 107 + }, + { + "epoch": 0.022201665124884366, + "grad_norm": 0.4422127306461334, + "learning_rate": 2.219178082191781e-05, + "loss": 0.8741, + "step": 108 + }, + { + "epoch": 0.022407236098262926, + "grad_norm": 0.37621861696243286, + "learning_rate": 2.2397260273972604e-05, + "loss": 0.8726, + "step": 109 + }, + { + "epoch": 0.022612807071641483, + "grad_norm": 0.38060134649276733, + "learning_rate": 2.26027397260274e-05, + "loss": 0.8584, + "step": 110 + }, + { + "epoch": 0.022818378045020044, + "grad_norm": 0.2121458202600479, + "learning_rate": 2.2808219178082194e-05, + "loss": 0.6438, + "step": 111 + }, + { + "epoch": 0.0230239490183986, + "grad_norm": 0.5301511883735657, + "learning_rate": 2.3013698630136985e-05, + "loss": 0.894, + "step": 112 + }, + { + "epoch": 0.02322951999177716, + "grad_norm": 0.3643994629383087, + "learning_rate": 2.3219178082191784e-05, + "loss": 0.8608, + "step": 113 + }, + { + "epoch": 0.02343509096515572, + "grad_norm": 0.4830370843410492, + "learning_rate": 2.3424657534246575e-05, + "loss": 0.9062, + "step": 114 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 0.384884774684906, + "learning_rate": 2.3630136986301374e-05, + "loss": 0.8855, + "step": 115 + }, + { + "epoch": 0.02384623291191284, + "grad_norm": 0.3976382315158844, + "learning_rate": 2.3835616438356165e-05, + "loss": 0.8806, + "step": 116 + }, + { + "epoch": 0.024051803885291396, + "grad_norm": 0.1835232675075531, + "learning_rate": 2.404109589041096e-05, + "loss": 0.611, + "step": 117 + }, + { + "epoch": 0.024257374858669956, + "grad_norm": 0.5072860717773438, + "learning_rate": 2.4246575342465755e-05, + "loss": 0.9086, + "step": 118 + }, + { + "epoch": 0.024462945832048513, + "grad_norm": 0.3984593152999878, + "learning_rate": 2.445205479452055e-05, + "loss": 0.8694, + "step": 119 + }, + { + "epoch": 0.024668516805427074, + "grad_norm": 0.4669335186481476, + "learning_rate": 2.4657534246575342e-05, + "loss": 0.8798, + "step": 120 + }, + { + "epoch": 0.024874087778805634, + "grad_norm": 0.4184141159057617, + "learning_rate": 2.486301369863014e-05, + "loss": 0.8805, + "step": 121 + }, + { + "epoch": 0.02507965875218419, + "grad_norm": 0.4648849070072174, + "learning_rate": 2.5068493150684932e-05, + "loss": 0.8941, + "step": 122 + }, + { + "epoch": 0.02528522972556275, + "grad_norm": 0.503567636013031, + "learning_rate": 2.527397260273973e-05, + "loss": 0.9006, + "step": 123 + }, + { + "epoch": 0.02549080069894131, + "grad_norm": 0.4252830445766449, + "learning_rate": 2.547945205479452e-05, + "loss": 0.8887, + "step": 124 + }, + { + "epoch": 0.02569637167231987, + "grad_norm": 0.4380176067352295, + "learning_rate": 2.5684931506849317e-05, + "loss": 0.8662, + "step": 125 + }, + { + "epoch": 0.025901942645698426, + "grad_norm": 0.3882461488246918, + "learning_rate": 2.5890410958904108e-05, + "loss": 0.8969, + "step": 126 + }, + { + "epoch": 0.026107513619076986, + "grad_norm": 0.43722933530807495, + "learning_rate": 2.6095890410958907e-05, + "loss": 0.8589, + "step": 127 + }, + { + "epoch": 0.026313084592455547, + "grad_norm": 0.46026188135147095, + "learning_rate": 2.6301369863013698e-05, + "loss": 0.8831, + "step": 128 + }, + { + "epoch": 0.026518655565834104, + "grad_norm": 0.36106160283088684, + "learning_rate": 2.6506849315068496e-05, + "loss": 0.8433, + "step": 129 + }, + { + "epoch": 0.026724226539212664, + "grad_norm": 0.19909483194351196, + "learning_rate": 2.6712328767123288e-05, + "loss": 0.6199, + "step": 130 + }, + { + "epoch": 0.02692979751259122, + "grad_norm": 0.5032296180725098, + "learning_rate": 2.6917808219178086e-05, + "loss": 0.9036, + "step": 131 + }, + { + "epoch": 0.027135368485969782, + "grad_norm": 0.40603938698768616, + "learning_rate": 2.7123287671232878e-05, + "loss": 0.8892, + "step": 132 + }, + { + "epoch": 0.02734093945934834, + "grad_norm": 0.43442800641059875, + "learning_rate": 2.7328767123287673e-05, + "loss": 0.8975, + "step": 133 + }, + { + "epoch": 0.0275465104327269, + "grad_norm": 0.442852258682251, + "learning_rate": 2.7534246575342465e-05, + "loss": 0.8509, + "step": 134 + }, + { + "epoch": 0.027752081406105456, + "grad_norm": 0.4811699688434601, + "learning_rate": 2.7739726027397263e-05, + "loss": 0.8496, + "step": 135 + }, + { + "epoch": 0.027957652379484017, + "grad_norm": 0.38817986845970154, + "learning_rate": 2.7945205479452054e-05, + "loss": 0.8383, + "step": 136 + }, + { + "epoch": 0.028163223352862577, + "grad_norm": 0.41808751225471497, + "learning_rate": 2.8150684931506853e-05, + "loss": 0.8662, + "step": 137 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 0.49768969416618347, + "learning_rate": 2.8356164383561644e-05, + "loss": 0.8526, + "step": 138 + }, + { + "epoch": 0.028574365299619695, + "grad_norm": 0.3861895203590393, + "learning_rate": 2.856164383561644e-05, + "loss": 0.8454, + "step": 139 + }, + { + "epoch": 0.02877993627299825, + "grad_norm": 0.4545285999774933, + "learning_rate": 2.8767123287671234e-05, + "loss": 0.8717, + "step": 140 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 0.20150704681873322, + "learning_rate": 2.897260273972603e-05, + "loss": 0.6377, + "step": 141 + }, + { + "epoch": 0.02919107821975537, + "grad_norm": 0.42400142550468445, + "learning_rate": 2.917808219178082e-05, + "loss": 0.8583, + "step": 142 + }, + { + "epoch": 0.02939664919313393, + "grad_norm": 0.3788576126098633, + "learning_rate": 2.938356164383562e-05, + "loss": 0.8476, + "step": 143 + }, + { + "epoch": 0.02960222016651249, + "grad_norm": 0.17580586671829224, + "learning_rate": 2.958904109589041e-05, + "loss": 0.6334, + "step": 144 + }, + { + "epoch": 0.029807791139891047, + "grad_norm": 0.17598563432693481, + "learning_rate": 2.979452054794521e-05, + "loss": 0.6251, + "step": 145 + }, + { + "epoch": 0.030013362113269607, + "grad_norm": 0.7843010425567627, + "learning_rate": 3e-05, + "loss": 0.8495, + "step": 146 + }, + { + "epoch": 0.030218933086648164, + "grad_norm": 0.478127121925354, + "learning_rate": 3.0205479452054796e-05, + "loss": 0.8733, + "step": 147 + }, + { + "epoch": 0.030424504060026725, + "grad_norm": 0.6210460066795349, + "learning_rate": 3.041095890410959e-05, + "loss": 0.8513, + "step": 148 + }, + { + "epoch": 0.03063007503340528, + "grad_norm": 0.5364311337471008, + "learning_rate": 3.061643835616439e-05, + "loss": 0.8532, + "step": 149 + }, + { + "epoch": 0.030835646006783842, + "grad_norm": 0.5108141899108887, + "learning_rate": 3.082191780821918e-05, + "loss": 0.852, + "step": 150 + }, + { + "epoch": 0.031041216980162403, + "grad_norm": 0.4817136228084564, + "learning_rate": 3.102739726027397e-05, + "loss": 0.8431, + "step": 151 + }, + { + "epoch": 0.03124678795354096, + "grad_norm": 0.5212568044662476, + "learning_rate": 3.123287671232877e-05, + "loss": 0.8591, + "step": 152 + }, + { + "epoch": 0.03145235892691952, + "grad_norm": 0.4288831949234009, + "learning_rate": 3.143835616438356e-05, + "loss": 0.8614, + "step": 153 + }, + { + "epoch": 0.03165792990029808, + "grad_norm": 0.1943136751651764, + "learning_rate": 3.164383561643836e-05, + "loss": 0.6347, + "step": 154 + }, + { + "epoch": 0.031863500873676634, + "grad_norm": 0.7128695249557495, + "learning_rate": 3.184931506849315e-05, + "loss": 0.87, + "step": 155 + }, + { + "epoch": 0.032069071847055194, + "grad_norm": 0.40213656425476074, + "learning_rate": 3.205479452054795e-05, + "loss": 0.8425, + "step": 156 + }, + { + "epoch": 0.032274642820433755, + "grad_norm": 0.4853759706020355, + "learning_rate": 3.226027397260274e-05, + "loss": 0.8643, + "step": 157 + }, + { + "epoch": 0.032480213793812315, + "grad_norm": 0.5050686001777649, + "learning_rate": 3.246575342465754e-05, + "loss": 0.8628, + "step": 158 + }, + { + "epoch": 0.032685784767190876, + "grad_norm": 0.5028424263000488, + "learning_rate": 3.267123287671233e-05, + "loss": 0.8267, + "step": 159 + }, + { + "epoch": 0.03289135574056943, + "grad_norm": 0.4855990707874298, + "learning_rate": 3.287671232876713e-05, + "loss": 0.8549, + "step": 160 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 0.40553873777389526, + "learning_rate": 3.308219178082192e-05, + "loss": 0.8548, + "step": 161 + }, + { + "epoch": 0.03330249768732655, + "grad_norm": 0.22181855142116547, + "learning_rate": 3.328767123287671e-05, + "loss": 0.6371, + "step": 162 + }, + { + "epoch": 0.03350806866070511, + "grad_norm": 0.7873424887657166, + "learning_rate": 3.349315068493151e-05, + "loss": 0.8876, + "step": 163 + }, + { + "epoch": 0.033713639634083664, + "grad_norm": 0.4477074444293976, + "learning_rate": 3.36986301369863e-05, + "loss": 0.8418, + "step": 164 + }, + { + "epoch": 0.033919210607462225, + "grad_norm": 0.6497864127159119, + "learning_rate": 3.39041095890411e-05, + "loss": 0.8605, + "step": 165 + }, + { + "epoch": 0.034124781580840785, + "grad_norm": 0.41493016481399536, + "learning_rate": 3.410958904109589e-05, + "loss": 0.8276, + "step": 166 + }, + { + "epoch": 0.034330352554219346, + "grad_norm": 0.5347689390182495, + "learning_rate": 3.4315068493150685e-05, + "loss": 0.8809, + "step": 167 + }, + { + "epoch": 0.034535923527597906, + "grad_norm": 0.4067676365375519, + "learning_rate": 3.452054794520548e-05, + "loss": 0.8329, + "step": 168 + }, + { + "epoch": 0.03474149450097646, + "grad_norm": 0.4063913822174072, + "learning_rate": 3.4726027397260275e-05, + "loss": 0.8556, + "step": 169 + }, + { + "epoch": 0.03494706547435502, + "grad_norm": 0.4246818721294403, + "learning_rate": 3.493150684931507e-05, + "loss": 0.8664, + "step": 170 + }, + { + "epoch": 0.03515263644773358, + "grad_norm": 0.41586360335350037, + "learning_rate": 3.5136986301369865e-05, + "loss": 0.842, + "step": 171 + }, + { + "epoch": 0.03535820742111214, + "grad_norm": 0.3807069659233093, + "learning_rate": 3.534246575342466e-05, + "loss": 0.824, + "step": 172 + }, + { + "epoch": 0.0355637783944907, + "grad_norm": 0.7290697693824768, + "learning_rate": 3.5547945205479455e-05, + "loss": 0.6189, + "step": 173 + }, + { + "epoch": 0.035769349367869255, + "grad_norm": 0.19204974174499512, + "learning_rate": 3.575342465753425e-05, + "loss": 0.6093, + "step": 174 + }, + { + "epoch": 0.035974920341247815, + "grad_norm": 0.6416502594947815, + "learning_rate": 3.5958904109589045e-05, + "loss": 0.8379, + "step": 175 + }, + { + "epoch": 0.036180491314626376, + "grad_norm": 0.3935816287994385, + "learning_rate": 3.616438356164383e-05, + "loss": 0.8263, + "step": 176 + }, + { + "epoch": 0.036386062288004936, + "grad_norm": 0.47259315848350525, + "learning_rate": 3.6369863013698635e-05, + "loss": 0.8132, + "step": 177 + }, + { + "epoch": 0.03659163326138349, + "grad_norm": 0.47834697365760803, + "learning_rate": 3.657534246575342e-05, + "loss": 0.8393, + "step": 178 + }, + { + "epoch": 0.03679720423476205, + "grad_norm": 0.3470703363418579, + "learning_rate": 3.6780821917808224e-05, + "loss": 0.6182, + "step": 179 + }, + { + "epoch": 0.03700277520814061, + "grad_norm": 0.5120542645454407, + "learning_rate": 3.698630136986301e-05, + "loss": 0.8336, + "step": 180 + }, + { + "epoch": 0.03720834618151917, + "grad_norm": 0.42222753167152405, + "learning_rate": 3.719178082191781e-05, + "loss": 0.837, + "step": 181 + }, + { + "epoch": 0.03741391715489773, + "grad_norm": 0.38363730907440186, + "learning_rate": 3.73972602739726e-05, + "loss": 0.8651, + "step": 182 + }, + { + "epoch": 0.037619488128276285, + "grad_norm": 0.4108883738517761, + "learning_rate": 3.76027397260274e-05, + "loss": 0.8175, + "step": 183 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 0.41021236777305603, + "learning_rate": 3.780821917808219e-05, + "loss": 0.8412, + "step": 184 + }, + { + "epoch": 0.038030630075033406, + "grad_norm": 0.24833433330059052, + "learning_rate": 3.801369863013699e-05, + "loss": 0.6215, + "step": 185 + }, + { + "epoch": 0.038236201048411966, + "grad_norm": 0.465718537569046, + "learning_rate": 3.821917808219178e-05, + "loss": 0.842, + "step": 186 + }, + { + "epoch": 0.03844177202179052, + "grad_norm": 0.41596537828445435, + "learning_rate": 3.842465753424658e-05, + "loss": 0.8296, + "step": 187 + }, + { + "epoch": 0.03864734299516908, + "grad_norm": 0.3815116286277771, + "learning_rate": 3.863013698630137e-05, + "loss": 0.8131, + "step": 188 + }, + { + "epoch": 0.03885291396854764, + "grad_norm": 0.38065505027770996, + "learning_rate": 3.883561643835617e-05, + "loss": 0.8227, + "step": 189 + }, + { + "epoch": 0.0390584849419262, + "grad_norm": 0.40238457918167114, + "learning_rate": 3.904109589041096e-05, + "loss": 0.829, + "step": 190 + }, + { + "epoch": 0.03926405591530476, + "grad_norm": 0.39533552527427673, + "learning_rate": 3.924657534246576e-05, + "loss": 0.8062, + "step": 191 + }, + { + "epoch": 0.039469626888683315, + "grad_norm": 0.2254960983991623, + "learning_rate": 3.9452054794520546e-05, + "loss": 0.6202, + "step": 192 + }, + { + "epoch": 0.039675197862061876, + "grad_norm": 0.5490075945854187, + "learning_rate": 3.965753424657535e-05, + "loss": 0.8587, + "step": 193 + }, + { + "epoch": 0.039880768835440436, + "grad_norm": 0.3820808231830597, + "learning_rate": 3.9863013698630135e-05, + "loss": 0.8461, + "step": 194 + }, + { + "epoch": 0.040086339808818996, + "grad_norm": 0.48500680923461914, + "learning_rate": 4.006849315068494e-05, + "loss": 0.8319, + "step": 195 + }, + { + "epoch": 0.04029191078219756, + "grad_norm": 0.20103423297405243, + "learning_rate": 4.0273972602739725e-05, + "loss": 0.6231, + "step": 196 + }, + { + "epoch": 0.04049748175557611, + "grad_norm": 0.5550208687782288, + "learning_rate": 4.047945205479452e-05, + "loss": 0.8343, + "step": 197 + }, + { + "epoch": 0.04070305272895467, + "grad_norm": 0.37427324056625366, + "learning_rate": 4.0684931506849315e-05, + "loss": 0.8292, + "step": 198 + }, + { + "epoch": 0.04090862370233323, + "grad_norm": 0.2106785923242569, + "learning_rate": 4.089041095890411e-05, + "loss": 0.603, + "step": 199 + }, + { + "epoch": 0.04111419467571179, + "grad_norm": 0.7520186305046082, + "learning_rate": 4.1095890410958905e-05, + "loss": 0.86, + "step": 200 + }, + { + "epoch": 0.041319765649090345, + "grad_norm": 0.38897809386253357, + "learning_rate": 4.13013698630137e-05, + "loss": 0.82, + "step": 201 + }, + { + "epoch": 0.041525336622468906, + "grad_norm": 0.5800373554229736, + "learning_rate": 4.1506849315068495e-05, + "loss": 0.8282, + "step": 202 + }, + { + "epoch": 0.041730907595847466, + "grad_norm": 0.46717479825019836, + "learning_rate": 4.171232876712329e-05, + "loss": 0.8268, + "step": 203 + }, + { + "epoch": 0.04193647856922603, + "grad_norm": 0.45258304476737976, + "learning_rate": 4.1917808219178085e-05, + "loss": 0.8178, + "step": 204 + }, + { + "epoch": 0.04214204954260459, + "grad_norm": 0.44093188643455505, + "learning_rate": 4.212328767123288e-05, + "loss": 0.8507, + "step": 205 + }, + { + "epoch": 0.04234762051598314, + "grad_norm": 0.38282710313796997, + "learning_rate": 4.232876712328767e-05, + "loss": 0.823, + "step": 206 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 0.21601058542728424, + "learning_rate": 4.253424657534247e-05, + "loss": 0.6133, + "step": 207 + }, + { + "epoch": 0.04275876246274026, + "grad_norm": 0.6589162945747375, + "learning_rate": 4.273972602739726e-05, + "loss": 0.8517, + "step": 208 + }, + { + "epoch": 0.04296433343611882, + "grad_norm": 0.39537516236305237, + "learning_rate": 4.294520547945206e-05, + "loss": 0.8297, + "step": 209 + }, + { + "epoch": 0.043169904409497376, + "grad_norm": 0.5449748039245605, + "learning_rate": 4.315068493150685e-05, + "loss": 0.8329, + "step": 210 + }, + { + "epoch": 0.043375475382875936, + "grad_norm": 0.4801601767539978, + "learning_rate": 4.335616438356165e-05, + "loss": 0.8263, + "step": 211 + }, + { + "epoch": 0.043581046356254496, + "grad_norm": 0.3884707987308502, + "learning_rate": 4.356164383561644e-05, + "loss": 0.8392, + "step": 212 + }, + { + "epoch": 0.04378661732963306, + "grad_norm": 0.4665462374687195, + "learning_rate": 4.376712328767123e-05, + "loss": 0.8319, + "step": 213 + }, + { + "epoch": 0.04399218830301162, + "grad_norm": 0.3869108557701111, + "learning_rate": 4.397260273972603e-05, + "loss": 0.8207, + "step": 214 + }, + { + "epoch": 0.04419775927639017, + "grad_norm": 0.38586127758026123, + "learning_rate": 4.417808219178082e-05, + "loss": 0.8035, + "step": 215 + }, + { + "epoch": 0.04440333024976873, + "grad_norm": 0.41265037655830383, + "learning_rate": 4.438356164383562e-05, + "loss": 0.8578, + "step": 216 + }, + { + "epoch": 0.04460890122314729, + "grad_norm": 0.3726780116558075, + "learning_rate": 4.458904109589041e-05, + "loss": 0.8103, + "step": 217 + }, + { + "epoch": 0.04481447219652585, + "grad_norm": 0.21903295814990997, + "learning_rate": 4.479452054794521e-05, + "loss": 0.6149, + "step": 218 + }, + { + "epoch": 0.04502004316990441, + "grad_norm": 0.470803439617157, + "learning_rate": 4.5e-05, + "loss": 0.8187, + "step": 219 + }, + { + "epoch": 0.045225614143282966, + "grad_norm": 0.3907180726528168, + "learning_rate": 4.52054794520548e-05, + "loss": 0.843, + "step": 220 + }, + { + "epoch": 0.04543118511666153, + "grad_norm": 0.3910331726074219, + "learning_rate": 4.54109589041096e-05, + "loss": 0.8228, + "step": 221 + }, + { + "epoch": 0.04563675609004009, + "grad_norm": 0.4238927364349365, + "learning_rate": 4.561643835616439e-05, + "loss": 0.8287, + "step": 222 + }, + { + "epoch": 0.04584232706341865, + "grad_norm": 0.38111889362335205, + "learning_rate": 4.582191780821918e-05, + "loss": 0.8375, + "step": 223 + }, + { + "epoch": 0.0460478980367972, + "grad_norm": 0.17004454135894775, + "learning_rate": 4.602739726027397e-05, + "loss": 0.6103, + "step": 224 + }, + { + "epoch": 0.04625346901017576, + "grad_norm": 0.5066764950752258, + "learning_rate": 4.623287671232877e-05, + "loss": 0.8377, + "step": 225 + }, + { + "epoch": 0.04645903998355432, + "grad_norm": 0.16975145041942596, + "learning_rate": 4.643835616438357e-05, + "loss": 0.6379, + "step": 226 + }, + { + "epoch": 0.04666461095693288, + "grad_norm": 0.17714980244636536, + "learning_rate": 4.6643835616438356e-05, + "loss": 0.6246, + "step": 227 + }, + { + "epoch": 0.04687018193031144, + "grad_norm": 0.44060373306274414, + "learning_rate": 4.684931506849315e-05, + "loss": 0.8455, + "step": 228 + }, + { + "epoch": 0.047075752903689996, + "grad_norm": 0.41871070861816406, + "learning_rate": 4.705479452054795e-05, + "loss": 0.8438, + "step": 229 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 0.20235472917556763, + "learning_rate": 4.726027397260275e-05, + "loss": 0.6155, + "step": 230 + }, + { + "epoch": 0.04748689485044712, + "grad_norm": 0.4988607168197632, + "learning_rate": 4.7465753424657536e-05, + "loss": 0.8098, + "step": 231 + }, + { + "epoch": 0.04769246582382568, + "grad_norm": 0.41510388255119324, + "learning_rate": 4.767123287671233e-05, + "loss": 0.8214, + "step": 232 + }, + { + "epoch": 0.04789803679720424, + "grad_norm": 0.3907022178173065, + "learning_rate": 4.787671232876713e-05, + "loss": 0.8112, + "step": 233 + }, + { + "epoch": 0.04810360777058279, + "grad_norm": 0.40868282318115234, + "learning_rate": 4.808219178082192e-05, + "loss": 0.8161, + "step": 234 + }, + { + "epoch": 0.04830917874396135, + "grad_norm": 0.3888959288597107, + "learning_rate": 4.8287671232876716e-05, + "loss": 0.803, + "step": 235 + }, + { + "epoch": 0.04851474971733991, + "grad_norm": 0.38003799319267273, + "learning_rate": 4.849315068493151e-05, + "loss": 0.8293, + "step": 236 + }, + { + "epoch": 0.04872032069071847, + "grad_norm": 0.2189408391714096, + "learning_rate": 4.869863013698631e-05, + "loss": 0.601, + "step": 237 + }, + { + "epoch": 0.048925891664097027, + "grad_norm": 0.44841453433036804, + "learning_rate": 4.89041095890411e-05, + "loss": 0.8239, + "step": 238 + }, + { + "epoch": 0.04913146263747559, + "grad_norm": 0.41675901412963867, + "learning_rate": 4.9109589041095895e-05, + "loss": 0.8041, + "step": 239 + }, + { + "epoch": 0.04933703361085415, + "grad_norm": 0.3353470265865326, + "learning_rate": 4.9315068493150684e-05, + "loss": 0.8233, + "step": 240 + }, + { + "epoch": 0.04954260458423271, + "grad_norm": 0.38614898920059204, + "learning_rate": 4.9520547945205485e-05, + "loss": 0.8202, + "step": 241 + }, + { + "epoch": 0.04974817555761127, + "grad_norm": 0.3578384220600128, + "learning_rate": 4.972602739726028e-05, + "loss": 0.8155, + "step": 242 + }, + { + "epoch": 0.04995374653098982, + "grad_norm": 0.3806624114513397, + "learning_rate": 4.993150684931507e-05, + "loss": 0.8475, + "step": 243 + }, + { + "epoch": 0.05015931750436838, + "grad_norm": 0.23930180072784424, + "learning_rate": 5.0136986301369863e-05, + "loss": 0.6126, + "step": 244 + }, + { + "epoch": 0.05036488847774694, + "grad_norm": 0.4321422278881073, + "learning_rate": 5.0342465753424665e-05, + "loss": 0.8145, + "step": 245 + }, + { + "epoch": 0.0505704594511255, + "grad_norm": 0.3582285940647125, + "learning_rate": 5.054794520547946e-05, + "loss": 0.8384, + "step": 246 + }, + { + "epoch": 0.05077603042450406, + "grad_norm": 0.3378206491470337, + "learning_rate": 5.075342465753425e-05, + "loss": 0.8189, + "step": 247 + }, + { + "epoch": 0.05098160139788262, + "grad_norm": 0.3585507571697235, + "learning_rate": 5.095890410958904e-05, + "loss": 0.8379, + "step": 248 + }, + { + "epoch": 0.05118717237126118, + "grad_norm": 0.36620137095451355, + "learning_rate": 5.1164383561643845e-05, + "loss": 0.8059, + "step": 249 + }, + { + "epoch": 0.05139274334463974, + "grad_norm": 0.348910391330719, + "learning_rate": 5.136986301369863e-05, + "loss": 0.8231, + "step": 250 + }, + { + "epoch": 0.0515983143180183, + "grad_norm": 0.37466245889663696, + "learning_rate": 5.157534246575343e-05, + "loss": 0.8263, + "step": 251 + }, + { + "epoch": 0.05180388529139685, + "grad_norm": 0.3923078775405884, + "learning_rate": 5.1780821917808216e-05, + "loss": 0.8142, + "step": 252 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 0.3668658435344696, + "learning_rate": 5.1986301369863025e-05, + "loss": 0.815, + "step": 253 + }, + { + "epoch": 0.05221502723815397, + "grad_norm": 0.34352773427963257, + "learning_rate": 5.219178082191781e-05, + "loss": 0.8103, + "step": 254 + }, + { + "epoch": 0.05242059821153253, + "grad_norm": 0.35997268557548523, + "learning_rate": 5.239726027397261e-05, + "loss": 0.8021, + "step": 255 + }, + { + "epoch": 0.052626169184911094, + "grad_norm": 0.4281958043575287, + "learning_rate": 5.2602739726027396e-05, + "loss": 0.613, + "step": 256 + }, + { + "epoch": 0.05283174015828965, + "grad_norm": 0.40191400051116943, + "learning_rate": 5.28082191780822e-05, + "loss": 0.8114, + "step": 257 + }, + { + "epoch": 0.05303731113166821, + "grad_norm": 0.2332005500793457, + "learning_rate": 5.301369863013699e-05, + "loss": 0.6145, + "step": 258 + }, + { + "epoch": 0.05324288210504677, + "grad_norm": 0.3814218044281006, + "learning_rate": 5.321917808219178e-05, + "loss": 0.8322, + "step": 259 + }, + { + "epoch": 0.05344845307842533, + "grad_norm": 0.8000903129577637, + "learning_rate": 5.3424657534246576e-05, + "loss": 0.8061, + "step": 260 + }, + { + "epoch": 0.05365402405180388, + "grad_norm": 0.3613252341747284, + "learning_rate": 5.363013698630138e-05, + "loss": 0.817, + "step": 261 + }, + { + "epoch": 0.05385959502518244, + "grad_norm": 0.3710997998714447, + "learning_rate": 5.383561643835617e-05, + "loss": 0.847, + "step": 262 + }, + { + "epoch": 0.054065165998561, + "grad_norm": 0.36693164706230164, + "learning_rate": 5.404109589041096e-05, + "loss": 0.6174, + "step": 263 + }, + { + "epoch": 0.054270736971939564, + "grad_norm": 0.4523719251155853, + "learning_rate": 5.4246575342465756e-05, + "loss": 0.8234, + "step": 264 + }, + { + "epoch": 0.054476307945318124, + "grad_norm": 0.3696235120296478, + "learning_rate": 5.445205479452056e-05, + "loss": 0.7997, + "step": 265 + }, + { + "epoch": 0.05468187891869668, + "grad_norm": 0.3745763599872589, + "learning_rate": 5.4657534246575346e-05, + "loss": 0.8098, + "step": 266 + }, + { + "epoch": 0.05488744989207524, + "grad_norm": 0.36916518211364746, + "learning_rate": 5.486301369863014e-05, + "loss": 0.788, + "step": 267 + }, + { + "epoch": 0.0550930208654538, + "grad_norm": 0.351854532957077, + "learning_rate": 5.506849315068493e-05, + "loss": 0.8124, + "step": 268 + }, + { + "epoch": 0.05529859183883236, + "grad_norm": 0.3717731535434723, + "learning_rate": 5.527397260273973e-05, + "loss": 0.8166, + "step": 269 + }, + { + "epoch": 0.05550416281221091, + "grad_norm": 0.3277188837528229, + "learning_rate": 5.5479452054794526e-05, + "loss": 0.6006, + "step": 270 + }, + { + "epoch": 0.05570973378558947, + "grad_norm": 0.39217084646224976, + "learning_rate": 5.568493150684932e-05, + "loss": 0.8076, + "step": 271 + }, + { + "epoch": 0.05591530475896803, + "grad_norm": 0.37465596199035645, + "learning_rate": 5.589041095890411e-05, + "loss": 0.8196, + "step": 272 + }, + { + "epoch": 0.056120875732346594, + "grad_norm": 0.37113896012306213, + "learning_rate": 5.609589041095891e-05, + "loss": 0.8206, + "step": 273 + }, + { + "epoch": 0.056326446705725154, + "grad_norm": 0.3641659915447235, + "learning_rate": 5.6301369863013706e-05, + "loss": 0.8372, + "step": 274 + }, + { + "epoch": 0.05653201767910371, + "grad_norm": 0.3738704025745392, + "learning_rate": 5.6506849315068494e-05, + "loss": 0.8201, + "step": 275 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 0.35747018456459045, + "learning_rate": 5.671232876712329e-05, + "loss": 0.8082, + "step": 276 + }, + { + "epoch": 0.05694315962586083, + "grad_norm": 0.29701605439186096, + "learning_rate": 5.691780821917809e-05, + "loss": 0.6105, + "step": 277 + }, + { + "epoch": 0.05714873059923939, + "grad_norm": 0.4180268347263336, + "learning_rate": 5.712328767123288e-05, + "loss": 0.8325, + "step": 278 + }, + { + "epoch": 0.05735430157261795, + "grad_norm": 0.36010023951530457, + "learning_rate": 5.7328767123287674e-05, + "loss": 0.8403, + "step": 279 + }, + { + "epoch": 0.0575598725459965, + "grad_norm": 0.35812970995903015, + "learning_rate": 5.753424657534247e-05, + "loss": 0.8201, + "step": 280 + }, + { + "epoch": 0.05776544351937506, + "grad_norm": 0.35655659437179565, + "learning_rate": 5.773972602739727e-05, + "loss": 0.8104, + "step": 281 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.3628866970539093, + "learning_rate": 5.794520547945206e-05, + "loss": 0.8011, + "step": 282 + }, + { + "epoch": 0.058176585466132184, + "grad_norm": 0.33707040548324585, + "learning_rate": 5.8150684931506854e-05, + "loss": 0.7863, + "step": 283 + }, + { + "epoch": 0.05838215643951074, + "grad_norm": 0.25686392188072205, + "learning_rate": 5.835616438356164e-05, + "loss": 0.605, + "step": 284 + }, + { + "epoch": 0.0585877274128893, + "grad_norm": 0.4549000859260559, + "learning_rate": 5.8561643835616444e-05, + "loss": 0.7871, + "step": 285 + }, + { + "epoch": 0.05879329838626786, + "grad_norm": 0.17129164934158325, + "learning_rate": 5.876712328767124e-05, + "loss": 0.6043, + "step": 286 + }, + { + "epoch": 0.05899886935964642, + "grad_norm": 0.4582807719707489, + "learning_rate": 5.8972602739726033e-05, + "loss": 0.7943, + "step": 287 + }, + { + "epoch": 0.05920444033302498, + "grad_norm": 0.3587150573730469, + "learning_rate": 5.917808219178082e-05, + "loss": 0.818, + "step": 288 + }, + { + "epoch": 0.05941001130640353, + "grad_norm": 0.35766854882240295, + "learning_rate": 5.9383561643835623e-05, + "loss": 0.8084, + "step": 289 + }, + { + "epoch": 0.059615582279782094, + "grad_norm": 0.24981027841567993, + "learning_rate": 5.958904109589042e-05, + "loss": 0.6123, + "step": 290 + }, + { + "epoch": 0.059821153253160654, + "grad_norm": 0.4611298143863678, + "learning_rate": 5.9794520547945207e-05, + "loss": 0.7859, + "step": 291 + }, + { + "epoch": 0.060026724226539215, + "grad_norm": 0.1829315423965454, + "learning_rate": 6e-05, + "loss": 0.6047, + "step": 292 + }, + { + "epoch": 0.060232295199917775, + "grad_norm": 0.432064026594162, + "learning_rate": 6.02054794520548e-05, + "loss": 0.8252, + "step": 293 + }, + { + "epoch": 0.06043786617329633, + "grad_norm": 0.3626839518547058, + "learning_rate": 6.041095890410959e-05, + "loss": 0.8004, + "step": 294 + }, + { + "epoch": 0.06064343714667489, + "grad_norm": 0.3860291838645935, + "learning_rate": 6.0616438356164386e-05, + "loss": 0.8287, + "step": 295 + }, + { + "epoch": 0.06084900812005345, + "grad_norm": 0.2607959806919098, + "learning_rate": 6.082191780821918e-05, + "loss": 0.617, + "step": 296 + }, + { + "epoch": 0.06105457909343201, + "grad_norm": 0.494211882352829, + "learning_rate": 6.102739726027398e-05, + "loss": 0.8062, + "step": 297 + }, + { + "epoch": 0.06126015006681056, + "grad_norm": 0.37032371759414673, + "learning_rate": 6.123287671232878e-05, + "loss": 0.7842, + "step": 298 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 0.3706514835357666, + "learning_rate": 6.143835616438357e-05, + "loss": 0.8076, + "step": 299 + }, + { + "epoch": 0.061671292013567684, + "grad_norm": 0.41590166091918945, + "learning_rate": 6.164383561643835e-05, + "loss": 0.8142, + "step": 300 + }, + { + "epoch": 0.061876862986946245, + "grad_norm": 0.4085366129875183, + "learning_rate": 6.184931506849316e-05, + "loss": 0.8583, + "step": 301 + }, + { + "epoch": 0.062082433960324805, + "grad_norm": 0.3671876788139343, + "learning_rate": 6.205479452054794e-05, + "loss": 0.7891, + "step": 302 + }, + { + "epoch": 0.06228800493370336, + "grad_norm": 0.39252158999443054, + "learning_rate": 6.226027397260275e-05, + "loss": 0.8023, + "step": 303 + }, + { + "epoch": 0.06249357590708192, + "grad_norm": 0.35324522852897644, + "learning_rate": 6.246575342465753e-05, + "loss": 0.7921, + "step": 304 + }, + { + "epoch": 0.06269914688046048, + "grad_norm": 0.28854769468307495, + "learning_rate": 6.267123287671234e-05, + "loss": 0.6309, + "step": 305 + }, + { + "epoch": 0.06290471785383904, + "grad_norm": 0.48670095205307007, + "learning_rate": 6.287671232876712e-05, + "loss": 0.7814, + "step": 306 + }, + { + "epoch": 0.0631102888272176, + "grad_norm": 0.3746386170387268, + "learning_rate": 6.308219178082193e-05, + "loss": 0.8142, + "step": 307 + }, + { + "epoch": 0.06331585980059616, + "grad_norm": 0.42179784178733826, + "learning_rate": 6.328767123287671e-05, + "loss": 0.8312, + "step": 308 + }, + { + "epoch": 0.06352143077397472, + "grad_norm": 0.37425556778907776, + "learning_rate": 6.349315068493152e-05, + "loss": 0.8397, + "step": 309 + }, + { + "epoch": 0.06372700174735327, + "grad_norm": 0.42048847675323486, + "learning_rate": 6.36986301369863e-05, + "loss": 0.7864, + "step": 310 + }, + { + "epoch": 0.06393257272073183, + "grad_norm": 0.34095990657806396, + "learning_rate": 6.390410958904109e-05, + "loss": 0.8275, + "step": 311 + }, + { + "epoch": 0.06413814369411039, + "grad_norm": 0.3992113769054413, + "learning_rate": 6.41095890410959e-05, + "loss": 0.8037, + "step": 312 + }, + { + "epoch": 0.06434371466748895, + "grad_norm": 0.3752027451992035, + "learning_rate": 6.43150684931507e-05, + "loss": 0.8096, + "step": 313 + }, + { + "epoch": 0.06454928564086751, + "grad_norm": 0.3788531422615051, + "learning_rate": 6.452054794520548e-05, + "loss": 0.8148, + "step": 314 + }, + { + "epoch": 0.06475485661424607, + "grad_norm": 0.34858015179634094, + "learning_rate": 6.472602739726027e-05, + "loss": 0.7865, + "step": 315 + }, + { + "epoch": 0.06496042758762463, + "grad_norm": 0.3562847375869751, + "learning_rate": 6.493150684931507e-05, + "loss": 0.7953, + "step": 316 + }, + { + "epoch": 0.06516599856100319, + "grad_norm": 0.3146650493144989, + "learning_rate": 6.513698630136988e-05, + "loss": 0.5924, + "step": 317 + }, + { + "epoch": 0.06537156953438175, + "grad_norm": 0.21578195691108704, + "learning_rate": 6.534246575342466e-05, + "loss": 0.6165, + "step": 318 + }, + { + "epoch": 0.0655771405077603, + "grad_norm": 0.19480906426906586, + "learning_rate": 6.554794520547945e-05, + "loss": 0.6254, + "step": 319 + }, + { + "epoch": 0.06578271148113886, + "grad_norm": 0.8668273091316223, + "learning_rate": 6.575342465753425e-05, + "loss": 0.8364, + "step": 320 + }, + { + "epoch": 0.06598828245451742, + "grad_norm": 0.5889570116996765, + "learning_rate": 6.595890410958906e-05, + "loss": 0.8205, + "step": 321 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 0.3477165102958679, + "learning_rate": 6.616438356164384e-05, + "loss": 0.6104, + "step": 322 + }, + { + "epoch": 0.06639942440127454, + "grad_norm": 1.1917229890823364, + "learning_rate": 6.636986301369863e-05, + "loss": 0.8402, + "step": 323 + }, + { + "epoch": 0.0666049953746531, + "grad_norm": 0.5916200876235962, + "learning_rate": 6.657534246575342e-05, + "loss": 0.8265, + "step": 324 + }, + { + "epoch": 0.06681056634803166, + "grad_norm": 0.6326993107795715, + "learning_rate": 6.678082191780822e-05, + "loss": 0.822, + "step": 325 + }, + { + "epoch": 0.06701613732141022, + "grad_norm": 0.545361340045929, + "learning_rate": 6.698630136986302e-05, + "loss": 0.8369, + "step": 326 + }, + { + "epoch": 0.06722170829478878, + "grad_norm": 0.5392776727676392, + "learning_rate": 6.719178082191781e-05, + "loss": 0.8009, + "step": 327 + }, + { + "epoch": 0.06742727926816733, + "grad_norm": 0.2618131637573242, + "learning_rate": 6.73972602739726e-05, + "loss": 0.6182, + "step": 328 + }, + { + "epoch": 0.06763285024154589, + "grad_norm": 0.6088753342628479, + "learning_rate": 6.76027397260274e-05, + "loss": 0.8189, + "step": 329 + }, + { + "epoch": 0.06783842121492445, + "grad_norm": 0.5107940435409546, + "learning_rate": 6.78082191780822e-05, + "loss": 0.8304, + "step": 330 + }, + { + "epoch": 0.06804399218830301, + "grad_norm": 0.38624778389930725, + "learning_rate": 6.801369863013699e-05, + "loss": 0.8361, + "step": 331 + }, + { + "epoch": 0.06824956316168157, + "grad_norm": 0.41758957505226135, + "learning_rate": 6.821917808219178e-05, + "loss": 0.7881, + "step": 332 + }, + { + "epoch": 0.06845513413506013, + "grad_norm": 0.41675320267677307, + "learning_rate": 6.842465753424658e-05, + "loss": 0.8297, + "step": 333 + }, + { + "epoch": 0.06866070510843869, + "grad_norm": 0.3944019079208374, + "learning_rate": 6.863013698630137e-05, + "loss": 0.8154, + "step": 334 + }, + { + "epoch": 0.06886627608181725, + "grad_norm": 0.3403918743133545, + "learning_rate": 6.883561643835617e-05, + "loss": 0.6183, + "step": 335 + }, + { + "epoch": 0.06907184705519581, + "grad_norm": 0.5603693127632141, + "learning_rate": 6.904109589041096e-05, + "loss": 0.8398, + "step": 336 + }, + { + "epoch": 0.06927741802857436, + "grad_norm": 0.3981553912162781, + "learning_rate": 6.924657534246576e-05, + "loss": 0.8122, + "step": 337 + }, + { + "epoch": 0.06948298900195292, + "grad_norm": 0.4603327214717865, + "learning_rate": 6.945205479452055e-05, + "loss": 0.8305, + "step": 338 + }, + { + "epoch": 0.06968855997533148, + "grad_norm": 0.43689751625061035, + "learning_rate": 6.965753424657535e-05, + "loss": 0.828, + "step": 339 + }, + { + "epoch": 0.06989413094871004, + "grad_norm": 0.41511690616607666, + "learning_rate": 6.986301369863014e-05, + "loss": 0.7844, + "step": 340 + }, + { + "epoch": 0.0700997019220886, + "grad_norm": 0.3534780740737915, + "learning_rate": 7.006849315068494e-05, + "loss": 0.7882, + "step": 341 + }, + { + "epoch": 0.07030527289546716, + "grad_norm": 0.33764714002609253, + "learning_rate": 7.027397260273973e-05, + "loss": 0.6009, + "step": 342 + }, + { + "epoch": 0.07051084386884572, + "grad_norm": 0.4741517901420593, + "learning_rate": 7.047945205479452e-05, + "loss": 0.7903, + "step": 343 + }, + { + "epoch": 0.07071641484222428, + "grad_norm": 0.19411741197109222, + "learning_rate": 7.068493150684932e-05, + "loss": 0.6019, + "step": 344 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 0.2023278921842575, + "learning_rate": 7.089041095890412e-05, + "loss": 0.6041, + "step": 345 + }, + { + "epoch": 0.0711275567889814, + "grad_norm": 0.18110667169094086, + "learning_rate": 7.109589041095891e-05, + "loss": 0.6082, + "step": 346 + }, + { + "epoch": 0.07133312776235995, + "grad_norm": 0.6595879197120667, + "learning_rate": 7.13013698630137e-05, + "loss": 0.8487, + "step": 347 + }, + { + "epoch": 0.07153869873573851, + "grad_norm": 0.3792790472507477, + "learning_rate": 7.15068493150685e-05, + "loss": 0.8155, + "step": 348 + }, + { + "epoch": 0.07174426970911707, + "grad_norm": 0.553161084651947, + "learning_rate": 7.17123287671233e-05, + "loss": 0.8172, + "step": 349 + }, + { + "epoch": 0.07194984068249563, + "grad_norm": 0.3672430217266083, + "learning_rate": 7.191780821917809e-05, + "loss": 0.7855, + "step": 350 + }, + { + "epoch": 0.07215541165587419, + "grad_norm": 0.5036430358886719, + "learning_rate": 7.212328767123288e-05, + "loss": 0.8164, + "step": 351 + }, + { + "epoch": 0.07236098262925275, + "grad_norm": 0.3772536814212799, + "learning_rate": 7.232876712328767e-05, + "loss": 0.7894, + "step": 352 + }, + { + "epoch": 0.07256655360263131, + "grad_norm": 0.37201905250549316, + "learning_rate": 7.253424657534247e-05, + "loss": 0.8306, + "step": 353 + }, + { + "epoch": 0.07277212457600987, + "grad_norm": 0.4128398597240448, + "learning_rate": 7.273972602739727e-05, + "loss": 0.8272, + "step": 354 + }, + { + "epoch": 0.07297769554938843, + "grad_norm": 0.3522986173629761, + "learning_rate": 7.294520547945206e-05, + "loss": 0.8075, + "step": 355 + }, + { + "epoch": 0.07318326652276698, + "grad_norm": 0.3743478059768677, + "learning_rate": 7.315068493150685e-05, + "loss": 0.8188, + "step": 356 + }, + { + "epoch": 0.07338883749614554, + "grad_norm": 0.4586912989616394, + "learning_rate": 7.335616438356165e-05, + "loss": 0.6061, + "step": 357 + }, + { + "epoch": 0.0735944084695241, + "grad_norm": 0.21246209740638733, + "learning_rate": 7.356164383561645e-05, + "loss": 0.6243, + "step": 358 + }, + { + "epoch": 0.07379997944290266, + "grad_norm": 0.5889565944671631, + "learning_rate": 7.376712328767124e-05, + "loss": 0.8188, + "step": 359 + }, + { + "epoch": 0.07400555041628122, + "grad_norm": 0.37973251938819885, + "learning_rate": 7.397260273972603e-05, + "loss": 0.8092, + "step": 360 + }, + { + "epoch": 0.07421112138965978, + "grad_norm": 0.45936939120292664, + "learning_rate": 7.417808219178083e-05, + "loss": 0.6085, + "step": 361 + }, + { + "epoch": 0.07441669236303834, + "grad_norm": 0.33185017108917236, + "learning_rate": 7.438356164383562e-05, + "loss": 0.5758, + "step": 362 + }, + { + "epoch": 0.0746222633364169, + "grad_norm": 0.7869192361831665, + "learning_rate": 7.458904109589042e-05, + "loss": 0.8316, + "step": 363 + }, + { + "epoch": 0.07482783430979546, + "grad_norm": 0.5039427876472473, + "learning_rate": 7.47945205479452e-05, + "loss": 0.8197, + "step": 364 + }, + { + "epoch": 0.07503340528317401, + "grad_norm": 0.4809415340423584, + "learning_rate": 7.500000000000001e-05, + "loss": 0.8023, + "step": 365 + }, + { + "epoch": 0.07523897625655257, + "grad_norm": 0.5067195296287537, + "learning_rate": 7.52054794520548e-05, + "loss": 0.8258, + "step": 366 + }, + { + "epoch": 0.07544454722993113, + "grad_norm": 0.44106048345565796, + "learning_rate": 7.54109589041096e-05, + "loss": 0.8063, + "step": 367 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 0.40639805793762207, + "learning_rate": 7.561643835616439e-05, + "loss": 0.8315, + "step": 368 + }, + { + "epoch": 0.07585568917668825, + "grad_norm": 0.44400423765182495, + "learning_rate": 7.582191780821919e-05, + "loss": 0.8053, + "step": 369 + }, + { + "epoch": 0.07606126015006681, + "grad_norm": 0.3997926414012909, + "learning_rate": 7.602739726027398e-05, + "loss": 0.8118, + "step": 370 + }, + { + "epoch": 0.07626683112344537, + "grad_norm": 0.36897820234298706, + "learning_rate": 7.623287671232878e-05, + "loss": 0.8377, + "step": 371 + }, + { + "epoch": 0.07647240209682393, + "grad_norm": 0.40449821949005127, + "learning_rate": 7.643835616438356e-05, + "loss": 0.8115, + "step": 372 + }, + { + "epoch": 0.0766779730702025, + "grad_norm": 0.39014002680778503, + "learning_rate": 7.664383561643837e-05, + "loss": 0.8149, + "step": 373 + }, + { + "epoch": 0.07688354404358104, + "grad_norm": 0.3730955421924591, + "learning_rate": 7.684931506849315e-05, + "loss": 0.8019, + "step": 374 + }, + { + "epoch": 0.0770891150169596, + "grad_norm": 0.36292803287506104, + "learning_rate": 7.705479452054794e-05, + "loss": 0.8305, + "step": 375 + }, + { + "epoch": 0.07729468599033816, + "grad_norm": 0.8635247349739075, + "learning_rate": 7.726027397260274e-05, + "loss": 0.6601, + "step": 376 + }, + { + "epoch": 0.07750025696371672, + "grad_norm": 0.4957028925418854, + "learning_rate": 7.746575342465755e-05, + "loss": 0.8365, + "step": 377 + }, + { + "epoch": 0.07770582793709528, + "grad_norm": 0.400206983089447, + "learning_rate": 7.767123287671233e-05, + "loss": 0.8128, + "step": 378 + }, + { + "epoch": 0.07791139891047384, + "grad_norm": 0.3647255301475525, + "learning_rate": 7.787671232876712e-05, + "loss": 0.7968, + "step": 379 + }, + { + "epoch": 0.0781169698838524, + "grad_norm": 0.39965569972991943, + "learning_rate": 7.808219178082192e-05, + "loss": 0.8015, + "step": 380 + }, + { + "epoch": 0.07832254085723096, + "grad_norm": 0.3467910885810852, + "learning_rate": 7.828767123287673e-05, + "loss": 0.7904, + "step": 381 + }, + { + "epoch": 0.07852811183060952, + "grad_norm": 0.33436062932014465, + "learning_rate": 7.849315068493151e-05, + "loss": 0.7647, + "step": 382 + }, + { + "epoch": 0.07873368280398808, + "grad_norm": 0.3548223376274109, + "learning_rate": 7.86986301369863e-05, + "loss": 0.7939, + "step": 383 + }, + { + "epoch": 0.07893925377736663, + "grad_norm": 0.7502946853637695, + "learning_rate": 7.890410958904109e-05, + "loss": 0.6747, + "step": 384 + }, + { + "epoch": 0.07914482475074519, + "grad_norm": 0.3931428790092468, + "learning_rate": 7.910958904109589e-05, + "loss": 0.8237, + "step": 385 + }, + { + "epoch": 0.07935039572412375, + "grad_norm": 0.30833980441093445, + "learning_rate": 7.93150684931507e-05, + "loss": 0.64, + "step": 386 + }, + { + "epoch": 0.07955596669750231, + "grad_norm": 0.43092408776283264, + "learning_rate": 7.952054794520548e-05, + "loss": 0.8138, + "step": 387 + }, + { + "epoch": 0.07976153767088087, + "grad_norm": 0.26460933685302734, + "learning_rate": 7.972602739726027e-05, + "loss": 0.6153, + "step": 388 + }, + { + "epoch": 0.07996710864425943, + "grad_norm": 0.4149387776851654, + "learning_rate": 7.993150684931507e-05, + "loss": 0.7809, + "step": 389 + }, + { + "epoch": 0.08017267961763799, + "grad_norm": 0.35397103428840637, + "learning_rate": 8.013698630136987e-05, + "loss": 0.8249, + "step": 390 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 0.34258702397346497, + "learning_rate": 8.034246575342466e-05, + "loss": 0.8259, + "step": 391 + }, + { + "epoch": 0.08058382156439511, + "grad_norm": 0.3488398790359497, + "learning_rate": 8.054794520547945e-05, + "loss": 0.7772, + "step": 392 + }, + { + "epoch": 0.08078939253777366, + "grad_norm": 0.3264416456222534, + "learning_rate": 8.075342465753425e-05, + "loss": 0.7751, + "step": 393 + }, + { + "epoch": 0.08099496351115222, + "grad_norm": 0.3270927965641022, + "learning_rate": 8.095890410958904e-05, + "loss": 0.7992, + "step": 394 + }, + { + "epoch": 0.08120053448453078, + "grad_norm": 0.2641488313674927, + "learning_rate": 8.116438356164384e-05, + "loss": 0.6224, + "step": 395 + }, + { + "epoch": 0.08140610545790934, + "grad_norm": 0.3740901052951813, + "learning_rate": 8.136986301369863e-05, + "loss": 0.8118, + "step": 396 + }, + { + "epoch": 0.0816116764312879, + "grad_norm": 0.328571081161499, + "learning_rate": 8.157534246575343e-05, + "loss": 0.7969, + "step": 397 + }, + { + "epoch": 0.08181724740466646, + "grad_norm": 0.2278534322977066, + "learning_rate": 8.178082191780822e-05, + "loss": 0.6215, + "step": 398 + }, + { + "epoch": 0.08202281837804502, + "grad_norm": 0.3593691885471344, + "learning_rate": 8.198630136986302e-05, + "loss": 0.7949, + "step": 399 + }, + { + "epoch": 0.08222838935142358, + "grad_norm": 0.3530971109867096, + "learning_rate": 8.219178082191781e-05, + "loss": 0.8042, + "step": 400 + }, + { + "epoch": 0.08243396032480214, + "grad_norm": 0.17606891691684723, + "learning_rate": 8.239726027397261e-05, + "loss": 0.638, + "step": 401 + }, + { + "epoch": 0.08263953129818069, + "grad_norm": 0.1690833419561386, + "learning_rate": 8.26027397260274e-05, + "loss": 0.5895, + "step": 402 + }, + { + "epoch": 0.08284510227155925, + "grad_norm": 0.17045153677463531, + "learning_rate": 8.280821917808219e-05, + "loss": 0.5924, + "step": 403 + }, + { + "epoch": 0.08305067324493781, + "grad_norm": 0.5894138813018799, + "learning_rate": 8.301369863013699e-05, + "loss": 0.8156, + "step": 404 + }, + { + "epoch": 0.08325624421831637, + "grad_norm": 0.3428020477294922, + "learning_rate": 8.321917808219179e-05, + "loss": 0.8131, + "step": 405 + }, + { + "epoch": 0.08346181519169493, + "grad_norm": 0.4333934783935547, + "learning_rate": 8.342465753424658e-05, + "loss": 0.8106, + "step": 406 + }, + { + "epoch": 0.08366738616507349, + "grad_norm": 0.4093782901763916, + "learning_rate": 8.363013698630137e-05, + "loss": 0.8158, + "step": 407 + }, + { + "epoch": 0.08387295713845205, + "grad_norm": 0.3554767668247223, + "learning_rate": 8.383561643835617e-05, + "loss": 0.805, + "step": 408 + }, + { + "epoch": 0.08407852811183061, + "grad_norm": 0.35396429896354675, + "learning_rate": 8.404109589041097e-05, + "loss": 0.787, + "step": 409 + }, + { + "epoch": 0.08428409908520917, + "grad_norm": 0.36389169096946716, + "learning_rate": 8.424657534246576e-05, + "loss": 0.8378, + "step": 410 + }, + { + "epoch": 0.08448967005858772, + "grad_norm": 0.3563280999660492, + "learning_rate": 8.445205479452055e-05, + "loss": 0.7844, + "step": 411 + }, + { + "epoch": 0.08469524103196628, + "grad_norm": 0.340190589427948, + "learning_rate": 8.465753424657534e-05, + "loss": 0.8288, + "step": 412 + }, + { + "epoch": 0.08490081200534484, + "grad_norm": 0.2419368475675583, + "learning_rate": 8.486301369863015e-05, + "loss": 0.6281, + "step": 413 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.37181293964385986, + "learning_rate": 8.506849315068494e-05, + "loss": 0.8016, + "step": 414 + }, + { + "epoch": 0.08531195395210196, + "grad_norm": 0.34155288338661194, + "learning_rate": 8.527397260273973e-05, + "loss": 0.7963, + "step": 415 + }, + { + "epoch": 0.08551752492548052, + "grad_norm": 0.3259139358997345, + "learning_rate": 8.547945205479452e-05, + "loss": 0.8013, + "step": 416 + }, + { + "epoch": 0.08572309589885908, + "grad_norm": 0.3541535437107086, + "learning_rate": 8.568493150684932e-05, + "loss": 0.7988, + "step": 417 + }, + { + "epoch": 0.08592866687223764, + "grad_norm": 0.20659230649471283, + "learning_rate": 8.589041095890412e-05, + "loss": 0.6026, + "step": 418 + }, + { + "epoch": 0.0861342378456162, + "grad_norm": 0.1695416420698166, + "learning_rate": 8.609589041095891e-05, + "loss": 0.5905, + "step": 419 + }, + { + "epoch": 0.08633980881899475, + "grad_norm": 0.48443859815597534, + "learning_rate": 8.63013698630137e-05, + "loss": 0.8179, + "step": 420 + }, + { + "epoch": 0.08654537979237331, + "grad_norm": 0.33505165576934814, + "learning_rate": 8.65068493150685e-05, + "loss": 0.7979, + "step": 421 + }, + { + "epoch": 0.08675095076575187, + "grad_norm": 0.19388127326965332, + "learning_rate": 8.67123287671233e-05, + "loss": 0.6141, + "step": 422 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.19659045338630676, + "learning_rate": 8.691780821917809e-05, + "loss": 0.5968, + "step": 423 + }, + { + "epoch": 0.08716209271250899, + "grad_norm": 0.5674632787704468, + "learning_rate": 8.712328767123288e-05, + "loss": 0.8258, + "step": 424 + }, + { + "epoch": 0.08736766368588755, + "grad_norm": 0.17561140656471252, + "learning_rate": 8.732876712328768e-05, + "loss": 0.5972, + "step": 425 + }, + { + "epoch": 0.08757323465926611, + "grad_norm": 0.48669886589050293, + "learning_rate": 8.753424657534247e-05, + "loss": 0.7975, + "step": 426 + }, + { + "epoch": 0.08777880563264467, + "grad_norm": 0.3487796187400818, + "learning_rate": 8.773972602739727e-05, + "loss": 0.7713, + "step": 427 + }, + { + "epoch": 0.08798437660602323, + "grad_norm": 0.3712750971317291, + "learning_rate": 8.794520547945206e-05, + "loss": 0.7665, + "step": 428 + }, + { + "epoch": 0.0881899475794018, + "grad_norm": 0.23141850531101227, + "learning_rate": 8.815068493150686e-05, + "loss": 0.6171, + "step": 429 + }, + { + "epoch": 0.08839551855278034, + "grad_norm": 0.43884536623954773, + "learning_rate": 8.835616438356165e-05, + "loss": 0.7922, + "step": 430 + }, + { + "epoch": 0.0886010895261589, + "grad_norm": 0.17824266850948334, + "learning_rate": 8.856164383561645e-05, + "loss": 0.616, + "step": 431 + }, + { + "epoch": 0.08880666049953746, + "grad_norm": 0.4101521670818329, + "learning_rate": 8.876712328767124e-05, + "loss": 0.8083, + "step": 432 + }, + { + "epoch": 0.08901223147291602, + "grad_norm": 0.3446323275566101, + "learning_rate": 8.897260273972604e-05, + "loss": 0.813, + "step": 433 + }, + { + "epoch": 0.08921780244629458, + "grad_norm": 0.17695310711860657, + "learning_rate": 8.917808219178083e-05, + "loss": 0.5855, + "step": 434 + }, + { + "epoch": 0.08942337341967314, + "grad_norm": 0.41505882143974304, + "learning_rate": 8.938356164383561e-05, + "loss": 0.7966, + "step": 435 + }, + { + "epoch": 0.0896289443930517, + "grad_norm": 0.3373473286628723, + "learning_rate": 8.958904109589042e-05, + "loss": 0.7981, + "step": 436 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 0.1881159394979477, + "learning_rate": 8.979452054794522e-05, + "loss": 0.5907, + "step": 437 + }, + { + "epoch": 0.09004008633980883, + "grad_norm": 0.570391058921814, + "learning_rate": 9e-05, + "loss": 0.8141, + "step": 438 + }, + { + "epoch": 0.09024565731318737, + "grad_norm": 0.34099552035331726, + "learning_rate": 8.999999889153016e-05, + "loss": 0.7716, + "step": 439 + }, + { + "epoch": 0.09045122828656593, + "grad_norm": 0.4682377576828003, + "learning_rate": 8.999999556612072e-05, + "loss": 0.8084, + "step": 440 + }, + { + "epoch": 0.09065679925994449, + "grad_norm": 0.36160755157470703, + "learning_rate": 8.999999002377183e-05, + "loss": 0.7883, + "step": 441 + }, + { + "epoch": 0.09086237023332305, + "grad_norm": 0.42005038261413574, + "learning_rate": 8.999998226448373e-05, + "loss": 0.794, + "step": 442 + }, + { + "epoch": 0.09106794120670161, + "grad_norm": 0.32100972533226013, + "learning_rate": 8.999997228825685e-05, + "loss": 0.7767, + "step": 443 + }, + { + "epoch": 0.09127351218008017, + "grad_norm": 0.35609909892082214, + "learning_rate": 8.999996009509166e-05, + "loss": 0.7735, + "step": 444 + }, + { + "epoch": 0.09147908315345873, + "grad_norm": 0.3225650191307068, + "learning_rate": 8.999994568498878e-05, + "loss": 0.7805, + "step": 445 + }, + { + "epoch": 0.0916846541268373, + "grad_norm": 0.5321671962738037, + "learning_rate": 8.999992905794889e-05, + "loss": 0.8085, + "step": 446 + }, + { + "epoch": 0.09189022510021586, + "grad_norm": 0.22884899377822876, + "learning_rate": 8.999991021397283e-05, + "loss": 0.6043, + "step": 447 + }, + { + "epoch": 0.0920957960735944, + "grad_norm": 0.4308418333530426, + "learning_rate": 8.999988915306154e-05, + "loss": 0.7922, + "step": 448 + }, + { + "epoch": 0.09230136704697296, + "grad_norm": 0.33842045068740845, + "learning_rate": 8.999986587521601e-05, + "loss": 0.8081, + "step": 449 + }, + { + "epoch": 0.09250693802035152, + "grad_norm": 0.18722039461135864, + "learning_rate": 8.999984038043744e-05, + "loss": 0.5795, + "step": 450 + }, + { + "epoch": 0.09271250899373008, + "grad_norm": 0.4215300679206848, + "learning_rate": 8.999981266872705e-05, + "loss": 0.7982, + "step": 451 + }, + { + "epoch": 0.09291807996710864, + "grad_norm": 0.16856899857521057, + "learning_rate": 8.999978274008622e-05, + "loss": 0.5915, + "step": 452 + }, + { + "epoch": 0.0931236509404872, + "grad_norm": 0.40007540583610535, + "learning_rate": 8.999975059451644e-05, + "loss": 0.7934, + "step": 453 + }, + { + "epoch": 0.09332922191386576, + "grad_norm": 0.3234069049358368, + "learning_rate": 8.999971623201925e-05, + "loss": 0.7963, + "step": 454 + }, + { + "epoch": 0.09353479288724433, + "grad_norm": 0.33642691373825073, + "learning_rate": 8.999967965259639e-05, + "loss": 0.7909, + "step": 455 + }, + { + "epoch": 0.09374036386062289, + "grad_norm": 0.33508196473121643, + "learning_rate": 8.999964085624962e-05, + "loss": 0.7777, + "step": 456 + }, + { + "epoch": 0.09394593483400143, + "grad_norm": 0.2953488826751709, + "learning_rate": 8.999959984298089e-05, + "loss": 0.7596, + "step": 457 + }, + { + "epoch": 0.09415150580737999, + "grad_norm": 0.32082295417785645, + "learning_rate": 8.99995566127922e-05, + "loss": 0.7774, + "step": 458 + }, + { + "epoch": 0.09435707678075855, + "grad_norm": 0.31374961137771606, + "learning_rate": 8.999951116568568e-05, + "loss": 0.7898, + "step": 459 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 0.29701462388038635, + "learning_rate": 8.999946350166357e-05, + "loss": 0.7725, + "step": 460 + }, + { + "epoch": 0.09476821872751567, + "grad_norm": 0.3302834630012512, + "learning_rate": 8.999941362072822e-05, + "loss": 0.7727, + "step": 461 + }, + { + "epoch": 0.09497378970089423, + "grad_norm": 0.28933510184288025, + "learning_rate": 8.99993615228821e-05, + "loss": 0.8082, + "step": 462 + }, + { + "epoch": 0.0951793606742728, + "grad_norm": 0.28469645977020264, + "learning_rate": 8.999930720812776e-05, + "loss": 0.78, + "step": 463 + }, + { + "epoch": 0.09538493164765136, + "grad_norm": 0.30801114439964294, + "learning_rate": 8.999925067646787e-05, + "loss": 0.8154, + "step": 464 + }, + { + "epoch": 0.09559050262102992, + "grad_norm": 0.28879374265670776, + "learning_rate": 8.999919192790524e-05, + "loss": 0.6174, + "step": 465 + }, + { + "epoch": 0.09579607359440848, + "grad_norm": 0.35134953260421753, + "learning_rate": 8.999913096244273e-05, + "loss": 0.7819, + "step": 466 + }, + { + "epoch": 0.09600164456778702, + "grad_norm": 0.31098031997680664, + "learning_rate": 8.999906778008339e-05, + "loss": 0.7876, + "step": 467 + }, + { + "epoch": 0.09620721554116558, + "grad_norm": 0.31209641695022583, + "learning_rate": 8.999900238083028e-05, + "loss": 0.823, + "step": 468 + }, + { + "epoch": 0.09641278651454414, + "grad_norm": 0.3438270688056946, + "learning_rate": 8.999893476468666e-05, + "loss": 0.7994, + "step": 469 + }, + { + "epoch": 0.0966183574879227, + "grad_norm": 0.303815096616745, + "learning_rate": 8.999886493165584e-05, + "loss": 0.8183, + "step": 470 + }, + { + "epoch": 0.09682392846130126, + "grad_norm": 0.31640782952308655, + "learning_rate": 8.999879288174128e-05, + "loss": 0.7947, + "step": 471 + }, + { + "epoch": 0.09702949943467983, + "grad_norm": 0.31044483184814453, + "learning_rate": 8.999871861494651e-05, + "loss": 0.7867, + "step": 472 + }, + { + "epoch": 0.09723507040805839, + "grad_norm": 0.3066295385360718, + "learning_rate": 8.999864213127521e-05, + "loss": 0.782, + "step": 473 + }, + { + "epoch": 0.09744064138143695, + "grad_norm": 0.32025477290153503, + "learning_rate": 8.999856343073111e-05, + "loss": 0.7756, + "step": 474 + }, + { + "epoch": 0.0976462123548155, + "grad_norm": 0.3043205440044403, + "learning_rate": 8.999848251331813e-05, + "loss": 0.8049, + "step": 475 + }, + { + "epoch": 0.09785178332819405, + "grad_norm": 0.3142707943916321, + "learning_rate": 8.999839937904024e-05, + "loss": 0.7967, + "step": 476 + }, + { + "epoch": 0.09805735430157261, + "grad_norm": 0.2932131886482239, + "learning_rate": 8.999831402790153e-05, + "loss": 0.8031, + "step": 477 + }, + { + "epoch": 0.09826292527495117, + "grad_norm": 0.30467313528060913, + "learning_rate": 8.999822645990621e-05, + "loss": 0.7804, + "step": 478 + }, + { + "epoch": 0.09846849624832973, + "grad_norm": 0.2950557768344879, + "learning_rate": 8.99981366750586e-05, + "loss": 0.8004, + "step": 479 + }, + { + "epoch": 0.0986740672217083, + "grad_norm": 0.2995617091655731, + "learning_rate": 8.99980446733631e-05, + "loss": 0.8044, + "step": 480 + }, + { + "epoch": 0.09887963819508686, + "grad_norm": 0.29080766439437866, + "learning_rate": 8.999795045482429e-05, + "loss": 0.7603, + "step": 481 + }, + { + "epoch": 0.09908520916846542, + "grad_norm": 0.29487237334251404, + "learning_rate": 8.999785401944675e-05, + "loss": 0.8036, + "step": 482 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 0.30198103189468384, + "learning_rate": 8.999775536723527e-05, + "loss": 0.7993, + "step": 483 + }, + { + "epoch": 0.09949635111522254, + "grad_norm": 0.30626240372657776, + "learning_rate": 8.999765449819471e-05, + "loss": 0.7928, + "step": 484 + }, + { + "epoch": 0.09970192208860108, + "grad_norm": 0.3268794119358063, + "learning_rate": 8.999755141233002e-05, + "loss": 0.7797, + "step": 485 + }, + { + "epoch": 0.09990749306197964, + "grad_norm": 0.41261476278305054, + "learning_rate": 8.99974461096463e-05, + "loss": 0.628, + "step": 486 + }, + { + "epoch": 0.1001130640353582, + "grad_norm": 0.2068365067243576, + "learning_rate": 8.999733859014873e-05, + "loss": 0.6014, + "step": 487 + }, + { + "epoch": 0.10031863500873676, + "grad_norm": 0.6694285869598389, + "learning_rate": 8.99972288538426e-05, + "loss": 0.8168, + "step": 488 + }, + { + "epoch": 0.10052420598211532, + "grad_norm": 0.3849710524082184, + "learning_rate": 8.999711690073331e-05, + "loss": 0.7958, + "step": 489 + }, + { + "epoch": 0.10072977695549389, + "grad_norm": 0.4657621383666992, + "learning_rate": 8.99970027308264e-05, + "loss": 0.7877, + "step": 490 + }, + { + "epoch": 0.10093534792887245, + "grad_norm": 0.3709288537502289, + "learning_rate": 8.999688634412747e-05, + "loss": 0.781, + "step": 491 + }, + { + "epoch": 0.101140918902251, + "grad_norm": 0.3850356340408325, + "learning_rate": 8.999676774064228e-05, + "loss": 0.7822, + "step": 492 + }, + { + "epoch": 0.10134648987562957, + "grad_norm": 0.32711490988731384, + "learning_rate": 8.999664692037665e-05, + "loss": 0.7903, + "step": 493 + }, + { + "epoch": 0.10155206084900811, + "grad_norm": 0.35332190990448, + "learning_rate": 8.999652388333654e-05, + "loss": 0.7746, + "step": 494 + }, + { + "epoch": 0.10175763182238667, + "grad_norm": 0.6354550719261169, + "learning_rate": 8.999639862952801e-05, + "loss": 0.6377, + "step": 495 + }, + { + "epoch": 0.10196320279576523, + "grad_norm": 0.4530143737792969, + "learning_rate": 8.999627115895724e-05, + "loss": 0.8012, + "step": 496 + }, + { + "epoch": 0.1021687737691438, + "grad_norm": 0.38917437195777893, + "learning_rate": 8.99961414716305e-05, + "loss": 0.7772, + "step": 497 + }, + { + "epoch": 0.10237434474252236, + "grad_norm": 0.3817954361438751, + "learning_rate": 8.999600956755417e-05, + "loss": 0.769, + "step": 498 + }, + { + "epoch": 0.10257991571590092, + "grad_norm": 0.3404269814491272, + "learning_rate": 8.999587544673475e-05, + "loss": 0.7832, + "step": 499 + }, + { + "epoch": 0.10278548668927948, + "grad_norm": 0.29421180486679077, + "learning_rate": 8.99957391091789e-05, + "loss": 0.6173, + "step": 500 + }, + { + "epoch": 0.10299105766265804, + "grad_norm": 0.4653105139732361, + "learning_rate": 8.999560055489324e-05, + "loss": 0.7835, + "step": 501 + }, + { + "epoch": 0.1031966286360366, + "grad_norm": 0.3839401304721832, + "learning_rate": 8.99954597838847e-05, + "loss": 0.7978, + "step": 502 + }, + { + "epoch": 0.10340219960941516, + "grad_norm": 0.3156857192516327, + "learning_rate": 8.999531679616013e-05, + "loss": 0.7589, + "step": 503 + }, + { + "epoch": 0.1036077705827937, + "grad_norm": 0.3422304391860962, + "learning_rate": 8.999517159172662e-05, + "loss": 0.7809, + "step": 504 + }, + { + "epoch": 0.10381334155617226, + "grad_norm": 0.340270072221756, + "learning_rate": 8.999502417059132e-05, + "loss": 0.7981, + "step": 505 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 0.30371013283729553, + "learning_rate": 8.999487453276148e-05, + "loss": 0.7967, + "step": 506 + }, + { + "epoch": 0.10422448350292939, + "grad_norm": 0.2999022901058197, + "learning_rate": 8.999472267824447e-05, + "loss": 0.7964, + "step": 507 + }, + { + "epoch": 0.10443005447630795, + "grad_norm": 0.3306732475757599, + "learning_rate": 8.999456860704778e-05, + "loss": 0.7903, + "step": 508 + }, + { + "epoch": 0.1046356254496865, + "grad_norm": 0.3183232843875885, + "learning_rate": 8.999441231917901e-05, + "loss": 0.7773, + "step": 509 + }, + { + "epoch": 0.10484119642306507, + "grad_norm": 0.29510068893432617, + "learning_rate": 8.999425381464582e-05, + "loss": 0.7812, + "step": 510 + }, + { + "epoch": 0.10504676739644363, + "grad_norm": 0.30512964725494385, + "learning_rate": 8.999409309345609e-05, + "loss": 0.8054, + "step": 511 + }, + { + "epoch": 0.10525233836982219, + "grad_norm": 0.30337393283843994, + "learning_rate": 8.999393015561767e-05, + "loss": 0.767, + "step": 512 + }, + { + "epoch": 0.10545790934320073, + "grad_norm": 0.32128670811653137, + "learning_rate": 8.999376500113861e-05, + "loss": 0.7576, + "step": 513 + }, + { + "epoch": 0.1056634803165793, + "grad_norm": 0.22419625520706177, + "learning_rate": 8.999359763002704e-05, + "loss": 0.6232, + "step": 514 + }, + { + "epoch": 0.10586905128995786, + "grad_norm": 0.35744601488113403, + "learning_rate": 8.999342804229125e-05, + "loss": 0.7999, + "step": 515 + }, + { + "epoch": 0.10607462226333642, + "grad_norm": 0.31676504015922546, + "learning_rate": 8.999325623793952e-05, + "loss": 0.7892, + "step": 516 + }, + { + "epoch": 0.10628019323671498, + "grad_norm": 0.3098521828651428, + "learning_rate": 8.999308221698038e-05, + "loss": 0.7892, + "step": 517 + }, + { + "epoch": 0.10648576421009354, + "grad_norm": 0.32372260093688965, + "learning_rate": 8.999290597942237e-05, + "loss": 0.7697, + "step": 518 + }, + { + "epoch": 0.1066913351834721, + "grad_norm": 0.3482767343521118, + "learning_rate": 8.999272752527417e-05, + "loss": 0.8299, + "step": 519 + }, + { + "epoch": 0.10689690615685066, + "grad_norm": 0.17404678463935852, + "learning_rate": 8.999254685454459e-05, + "loss": 0.5814, + "step": 520 + }, + { + "epoch": 0.10710247713022922, + "grad_norm": 0.36048364639282227, + "learning_rate": 8.999236396724252e-05, + "loss": 0.7881, + "step": 521 + }, + { + "epoch": 0.10730804810360776, + "grad_norm": 0.30838942527770996, + "learning_rate": 8.999217886337696e-05, + "loss": 0.7818, + "step": 522 + }, + { + "epoch": 0.10751361907698632, + "grad_norm": 0.3079747259616852, + "learning_rate": 8.999199154295705e-05, + "loss": 0.7732, + "step": 523 + }, + { + "epoch": 0.10771919005036489, + "grad_norm": 0.3467218577861786, + "learning_rate": 8.9991802005992e-05, + "loss": 0.7969, + "step": 524 + }, + { + "epoch": 0.10792476102374345, + "grad_norm": 0.29866865277290344, + "learning_rate": 8.999161025249117e-05, + "loss": 0.7996, + "step": 525 + }, + { + "epoch": 0.108130331997122, + "grad_norm": 0.17642079293727875, + "learning_rate": 8.999141628246398e-05, + "loss": 0.5753, + "step": 526 + }, + { + "epoch": 0.10833590297050057, + "grad_norm": 0.3251280188560486, + "learning_rate": 8.999122009592002e-05, + "loss": 0.7962, + "step": 527 + }, + { + "epoch": 0.10854147394387913, + "grad_norm": 0.316807359457016, + "learning_rate": 8.999102169286891e-05, + "loss": 0.7592, + "step": 528 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 0.16698336601257324, + "learning_rate": 8.999082107332046e-05, + "loss": 0.5955, + "step": 529 + }, + { + "epoch": 0.10895261589063625, + "grad_norm": 0.30919867753982544, + "learning_rate": 8.999061823728455e-05, + "loss": 0.7481, + "step": 530 + }, + { + "epoch": 0.1091581868640148, + "grad_norm": 0.2959042489528656, + "learning_rate": 8.999041318477114e-05, + "loss": 0.7795, + "step": 531 + }, + { + "epoch": 0.10936375783739335, + "grad_norm": 0.15893301367759705, + "learning_rate": 8.999020591579038e-05, + "loss": 0.5953, + "step": 532 + }, + { + "epoch": 0.10956932881077192, + "grad_norm": 0.16407330334186554, + "learning_rate": 8.998999643035244e-05, + "loss": 0.5873, + "step": 533 + }, + { + "epoch": 0.10977489978415048, + "grad_norm": 0.3498159348964691, + "learning_rate": 8.998978472846768e-05, + "loss": 0.7825, + "step": 534 + }, + { + "epoch": 0.10998047075752904, + "grad_norm": 0.3068999946117401, + "learning_rate": 8.99895708101465e-05, + "loss": 0.8112, + "step": 535 + }, + { + "epoch": 0.1101860417309076, + "grad_norm": 0.28588443994522095, + "learning_rate": 8.998935467539944e-05, + "loss": 0.7778, + "step": 536 + }, + { + "epoch": 0.11039161270428616, + "grad_norm": 0.31996187567710876, + "learning_rate": 8.998913632423716e-05, + "loss": 0.7736, + "step": 537 + }, + { + "epoch": 0.11059718367766472, + "grad_norm": 0.3105761408805847, + "learning_rate": 8.998891575667041e-05, + "loss": 0.7683, + "step": 538 + }, + { + "epoch": 0.11080275465104328, + "grad_norm": 0.3134320378303528, + "learning_rate": 8.998869297271006e-05, + "loss": 0.7877, + "step": 539 + }, + { + "epoch": 0.11100832562442182, + "grad_norm": 0.2837049067020416, + "learning_rate": 8.998846797236708e-05, + "loss": 0.7664, + "step": 540 + }, + { + "epoch": 0.11121389659780039, + "grad_norm": 0.2891695499420166, + "learning_rate": 8.998824075565258e-05, + "loss": 0.7862, + "step": 541 + }, + { + "epoch": 0.11141946757117895, + "grad_norm": 0.2949972450733185, + "learning_rate": 8.99880113225777e-05, + "loss": 0.7551, + "step": 542 + }, + { + "epoch": 0.1116250385445575, + "grad_norm": 0.2788076400756836, + "learning_rate": 8.99877796731538e-05, + "loss": 0.7657, + "step": 543 + }, + { + "epoch": 0.11183060951793607, + "grad_norm": 0.237320676445961, + "learning_rate": 8.998754580739225e-05, + "loss": 0.6081, + "step": 544 + }, + { + "epoch": 0.11203618049131463, + "grad_norm": 0.3368750810623169, + "learning_rate": 8.99873097253046e-05, + "loss": 0.7962, + "step": 545 + }, + { + "epoch": 0.11224175146469319, + "grad_norm": 0.16897863149642944, + "learning_rate": 8.998707142690247e-05, + "loss": 0.5933, + "step": 546 + }, + { + "epoch": 0.11244732243807175, + "grad_norm": 0.31463444232940674, + "learning_rate": 8.99868309121976e-05, + "loss": 0.778, + "step": 547 + }, + { + "epoch": 0.11265289341145031, + "grad_norm": 0.28116437792778015, + "learning_rate": 8.998658818120184e-05, + "loss": 0.7677, + "step": 548 + }, + { + "epoch": 0.11285846438482887, + "grad_norm": 0.2780570685863495, + "learning_rate": 8.998634323392714e-05, + "loss": 0.7736, + "step": 549 + }, + { + "epoch": 0.11306403535820742, + "grad_norm": 0.18777993321418762, + "learning_rate": 8.998609607038558e-05, + "loss": 0.5928, + "step": 550 + }, + { + "epoch": 0.11326960633158598, + "grad_norm": 0.3512813150882721, + "learning_rate": 8.998584669058933e-05, + "loss": 0.7971, + "step": 551 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 0.1571076214313507, + "learning_rate": 8.998559509455066e-05, + "loss": 0.6026, + "step": 552 + }, + { + "epoch": 0.1136807482783431, + "grad_norm": 0.1699524074792862, + "learning_rate": 8.9985341282282e-05, + "loss": 0.5835, + "step": 553 + }, + { + "epoch": 0.11388631925172166, + "grad_norm": 0.38411441445350647, + "learning_rate": 8.998508525379584e-05, + "loss": 0.7829, + "step": 554 + }, + { + "epoch": 0.11409189022510022, + "grad_norm": 0.2952065169811249, + "learning_rate": 8.998482700910478e-05, + "loss": 0.7878, + "step": 555 + }, + { + "epoch": 0.11429746119847878, + "grad_norm": 0.3076973557472229, + "learning_rate": 8.998456654822156e-05, + "loss": 0.7988, + "step": 556 + }, + { + "epoch": 0.11450303217185734, + "grad_norm": 0.30433389544487, + "learning_rate": 8.9984303871159e-05, + "loss": 0.78, + "step": 557 + }, + { + "epoch": 0.1147086031452359, + "grad_norm": 0.30562445521354675, + "learning_rate": 8.998403897793004e-05, + "loss": 0.7832, + "step": 558 + }, + { + "epoch": 0.11491417411861445, + "grad_norm": 0.3120015561580658, + "learning_rate": 8.998377186854774e-05, + "loss": 0.7989, + "step": 559 + }, + { + "epoch": 0.115119745091993, + "grad_norm": 0.26990431547164917, + "learning_rate": 8.998350254302524e-05, + "loss": 0.7471, + "step": 560 + }, + { + "epoch": 0.11532531606537157, + "grad_norm": 0.2938286364078522, + "learning_rate": 8.998323100137585e-05, + "loss": 0.7667, + "step": 561 + }, + { + "epoch": 0.11553088703875013, + "grad_norm": 0.32502278685569763, + "learning_rate": 8.998295724361289e-05, + "loss": 0.7618, + "step": 562 + }, + { + "epoch": 0.11573645801212869, + "grad_norm": 0.296321839094162, + "learning_rate": 8.998268126974988e-05, + "loss": 0.7828, + "step": 563 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.30217137932777405, + "learning_rate": 8.998240307980042e-05, + "loss": 0.765, + "step": 564 + }, + { + "epoch": 0.11614759995888581, + "grad_norm": 0.2876279950141907, + "learning_rate": 8.998212267377822e-05, + "loss": 0.7687, + "step": 565 + }, + { + "epoch": 0.11635317093226437, + "grad_norm": 0.2792581021785736, + "learning_rate": 8.998184005169706e-05, + "loss": 0.785, + "step": 566 + }, + { + "epoch": 0.11655874190564293, + "grad_norm": 0.28941112756729126, + "learning_rate": 8.99815552135709e-05, + "loss": 0.7732, + "step": 567 + }, + { + "epoch": 0.11676431287902148, + "grad_norm": 0.28016045689582825, + "learning_rate": 8.998126815941376e-05, + "loss": 0.8033, + "step": 568 + }, + { + "epoch": 0.11696988385240004, + "grad_norm": 0.27612999081611633, + "learning_rate": 8.998097888923977e-05, + "loss": 0.7811, + "step": 569 + }, + { + "epoch": 0.1171754548257786, + "grad_norm": 0.2725747525691986, + "learning_rate": 8.99806874030632e-05, + "loss": 0.7426, + "step": 570 + }, + { + "epoch": 0.11738102579915716, + "grad_norm": 0.23188281059265137, + "learning_rate": 8.998039370089838e-05, + "loss": 0.6119, + "step": 571 + }, + { + "epoch": 0.11758659677253572, + "grad_norm": 0.329795777797699, + "learning_rate": 8.998009778275982e-05, + "loss": 0.7774, + "step": 572 + }, + { + "epoch": 0.11779216774591428, + "grad_norm": 0.292244017124176, + "learning_rate": 8.997979964866208e-05, + "loss": 0.7684, + "step": 573 + }, + { + "epoch": 0.11799773871929284, + "grad_norm": 0.2874715030193329, + "learning_rate": 8.997949929861984e-05, + "loss": 0.7606, + "step": 574 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 0.3013349175453186, + "learning_rate": 8.99791967326479e-05, + "loss": 0.7686, + "step": 575 + }, + { + "epoch": 0.11840888066604996, + "grad_norm": 0.2986513674259186, + "learning_rate": 8.997889195076117e-05, + "loss": 0.7651, + "step": 576 + }, + { + "epoch": 0.1186144516394285, + "grad_norm": 0.2857048809528351, + "learning_rate": 8.997858495297467e-05, + "loss": 0.7875, + "step": 577 + }, + { + "epoch": 0.11882002261280707, + "grad_norm": 0.27221107482910156, + "learning_rate": 8.997827573930351e-05, + "loss": 0.785, + "step": 578 + }, + { + "epoch": 0.11902559358618563, + "grad_norm": 0.29440751671791077, + "learning_rate": 8.997796430976294e-05, + "loss": 0.7703, + "step": 579 + }, + { + "epoch": 0.11923116455956419, + "grad_norm": 0.28240329027175903, + "learning_rate": 8.99776506643683e-05, + "loss": 0.7901, + "step": 580 + }, + { + "epoch": 0.11943673553294275, + "grad_norm": 0.27463993430137634, + "learning_rate": 8.997733480313503e-05, + "loss": 0.7616, + "step": 581 + }, + { + "epoch": 0.11964230650632131, + "grad_norm": 0.2833562195301056, + "learning_rate": 8.99770167260787e-05, + "loss": 0.7512, + "step": 582 + }, + { + "epoch": 0.11984787747969987, + "grad_norm": 0.22366029024124146, + "learning_rate": 8.997669643321496e-05, + "loss": 0.6235, + "step": 583 + }, + { + "epoch": 0.12005344845307843, + "grad_norm": 0.17241071164608002, + "learning_rate": 8.997637392455963e-05, + "loss": 0.5989, + "step": 584 + }, + { + "epoch": 0.12025901942645699, + "grad_norm": 0.15749235451221466, + "learning_rate": 8.997604920012856e-05, + "loss": 0.5973, + "step": 585 + }, + { + "epoch": 0.12046459039983555, + "grad_norm": 0.42778778076171875, + "learning_rate": 8.997572225993778e-05, + "loss": 0.7722, + "step": 586 + }, + { + "epoch": 0.1206701613732141, + "grad_norm": 0.3165600597858429, + "learning_rate": 8.997539310400337e-05, + "loss": 0.7524, + "step": 587 + }, + { + "epoch": 0.12087573234659266, + "grad_norm": 0.3048163950443268, + "learning_rate": 8.997506173234156e-05, + "loss": 0.7699, + "step": 588 + }, + { + "epoch": 0.12108130331997122, + "grad_norm": 0.3166545331478119, + "learning_rate": 8.997472814496867e-05, + "loss": 0.7819, + "step": 589 + }, + { + "epoch": 0.12128687429334978, + "grad_norm": 0.3150469958782196, + "learning_rate": 8.997439234190113e-05, + "loss": 0.7419, + "step": 590 + }, + { + "epoch": 0.12149244526672834, + "grad_norm": 0.3222194015979767, + "learning_rate": 8.99740543231555e-05, + "loss": 0.7808, + "step": 591 + }, + { + "epoch": 0.1216980162401069, + "grad_norm": 0.3114274740219116, + "learning_rate": 8.99737140887484e-05, + "loss": 0.7859, + "step": 592 + }, + { + "epoch": 0.12190358721348546, + "grad_norm": 0.2929398715496063, + "learning_rate": 8.997337163869665e-05, + "loss": 0.8025, + "step": 593 + }, + { + "epoch": 0.12210915818686402, + "grad_norm": 0.2900030016899109, + "learning_rate": 8.997302697301706e-05, + "loss": 0.7914, + "step": 594 + }, + { + "epoch": 0.12231472916024258, + "grad_norm": 0.2980877459049225, + "learning_rate": 8.997268009172664e-05, + "loss": 0.7548, + "step": 595 + }, + { + "epoch": 0.12252030013362113, + "grad_norm": 0.280519962310791, + "learning_rate": 8.997233099484247e-05, + "loss": 0.7923, + "step": 596 + }, + { + "epoch": 0.12272587110699969, + "grad_norm": 0.27224200963974, + "learning_rate": 8.997197968238175e-05, + "loss": 0.7935, + "step": 597 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 0.2736833691596985, + "learning_rate": 8.99716261543618e-05, + "loss": 0.7409, + "step": 598 + }, + { + "epoch": 0.12313701305375681, + "grad_norm": 0.28164225816726685, + "learning_rate": 8.99712704108e-05, + "loss": 0.7855, + "step": 599 + }, + { + "epoch": 0.12334258402713537, + "grad_norm": 0.27927008271217346, + "learning_rate": 8.997091245171394e-05, + "loss": 0.7768, + "step": 600 + }, + { + "epoch": 0.12354815500051393, + "grad_norm": 0.2606373429298401, + "learning_rate": 8.997055227712119e-05, + "loss": 0.764, + "step": 601 + }, + { + "epoch": 0.12375372597389249, + "grad_norm": 0.32072070240974426, + "learning_rate": 8.997018988703953e-05, + "loss": 0.8124, + "step": 602 + }, + { + "epoch": 0.12395929694727105, + "grad_norm": 0.4943363666534424, + "learning_rate": 8.996982528148682e-05, + "loss": 0.6366, + "step": 603 + }, + { + "epoch": 0.12416486792064961, + "grad_norm": 0.3180435299873352, + "learning_rate": 8.996945846048098e-05, + "loss": 0.7723, + "step": 604 + }, + { + "epoch": 0.12437043889402816, + "grad_norm": 0.29927217960357666, + "learning_rate": 8.996908942404012e-05, + "loss": 0.7608, + "step": 605 + }, + { + "epoch": 0.12457600986740672, + "grad_norm": 0.2776423990726471, + "learning_rate": 8.99687181721824e-05, + "loss": 0.775, + "step": 606 + }, + { + "epoch": 0.12478158084078528, + "grad_norm": 0.3051820397377014, + "learning_rate": 8.996834470492613e-05, + "loss": 0.7923, + "step": 607 + }, + { + "epoch": 0.12498715181416384, + "grad_norm": 0.2759751081466675, + "learning_rate": 8.99679690222897e-05, + "loss": 0.7486, + "step": 608 + }, + { + "epoch": 0.1251927227875424, + "grad_norm": 0.2878243923187256, + "learning_rate": 8.99675911242916e-05, + "loss": 0.7774, + "step": 609 + }, + { + "epoch": 0.12539829376092096, + "grad_norm": 0.2739849090576172, + "learning_rate": 8.996721101095048e-05, + "loss": 0.771, + "step": 610 + }, + { + "epoch": 0.12560386473429952, + "grad_norm": 0.2817218601703644, + "learning_rate": 8.996682868228505e-05, + "loss": 0.761, + "step": 611 + }, + { + "epoch": 0.12580943570767808, + "grad_norm": 0.2750679552555084, + "learning_rate": 8.996644413831412e-05, + "loss": 0.7739, + "step": 612 + }, + { + "epoch": 0.12601500668105664, + "grad_norm": 0.26886436343193054, + "learning_rate": 8.996605737905669e-05, + "loss": 0.7585, + "step": 613 + }, + { + "epoch": 0.1262205776544352, + "grad_norm": 0.2675554156303406, + "learning_rate": 8.996566840453178e-05, + "loss": 0.7639, + "step": 614 + }, + { + "epoch": 0.12642614862781376, + "grad_norm": 0.2672448456287384, + "learning_rate": 8.996527721475855e-05, + "loss": 0.7687, + "step": 615 + }, + { + "epoch": 0.12663171960119232, + "grad_norm": 0.27541592717170715, + "learning_rate": 8.996488380975626e-05, + "loss": 0.7702, + "step": 616 + }, + { + "epoch": 0.12683729057457088, + "grad_norm": 1.3074686527252197, + "learning_rate": 8.996448818954434e-05, + "loss": 0.6375, + "step": 617 + }, + { + "epoch": 0.12704286154794944, + "grad_norm": 0.2855135202407837, + "learning_rate": 8.996409035414224e-05, + "loss": 0.7633, + "step": 618 + }, + { + "epoch": 0.12724843252132798, + "grad_norm": 0.6012619137763977, + "learning_rate": 8.996369030356957e-05, + "loss": 0.6213, + "step": 619 + }, + { + "epoch": 0.12745400349470654, + "grad_norm": 0.30922386050224304, + "learning_rate": 8.996328803784604e-05, + "loss": 0.7827, + "step": 620 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 0.29752808809280396, + "learning_rate": 8.996288355699146e-05, + "loss": 0.773, + "step": 621 + }, + { + "epoch": 0.12786514544146366, + "grad_norm": 0.31884685158729553, + "learning_rate": 8.996247686102577e-05, + "loss": 0.7656, + "step": 622 + }, + { + "epoch": 0.12807071641484222, + "grad_norm": 0.2772408425807953, + "learning_rate": 8.996206794996899e-05, + "loss": 0.7898, + "step": 623 + }, + { + "epoch": 0.12827628738822078, + "grad_norm": 0.2835623323917389, + "learning_rate": 8.996165682384129e-05, + "loss": 0.7631, + "step": 624 + }, + { + "epoch": 0.12848185836159934, + "grad_norm": 0.3379913568496704, + "learning_rate": 8.996124348266291e-05, + "loss": 0.7805, + "step": 625 + }, + { + "epoch": 0.1286874293349779, + "grad_norm": 0.26578038930892944, + "learning_rate": 8.996082792645419e-05, + "loss": 0.608, + "step": 626 + }, + { + "epoch": 0.12889300030835646, + "grad_norm": 0.29912567138671875, + "learning_rate": 8.996041015523563e-05, + "loss": 0.7565, + "step": 627 + }, + { + "epoch": 0.12909857128173502, + "grad_norm": 0.3043285608291626, + "learning_rate": 8.995999016902781e-05, + "loss": 0.7787, + "step": 628 + }, + { + "epoch": 0.12930414225511358, + "grad_norm": 0.1923503428697586, + "learning_rate": 8.995956796785143e-05, + "loss": 0.6051, + "step": 629 + }, + { + "epoch": 0.12950971322849214, + "grad_norm": 0.29241567850112915, + "learning_rate": 8.995914355172726e-05, + "loss": 0.7742, + "step": 630 + }, + { + "epoch": 0.1297152842018707, + "grad_norm": 0.1634470671415329, + "learning_rate": 8.995871692067622e-05, + "loss": 0.6009, + "step": 631 + }, + { + "epoch": 0.12992085517524926, + "grad_norm": 0.1948513388633728, + "learning_rate": 8.995828807471935e-05, + "loss": 0.6038, + "step": 632 + }, + { + "epoch": 0.13012642614862782, + "grad_norm": 0.34593167901039124, + "learning_rate": 8.995785701387774e-05, + "loss": 0.7712, + "step": 633 + }, + { + "epoch": 0.13033199712200638, + "grad_norm": 0.2905696630477905, + "learning_rate": 8.995742373817268e-05, + "loss": 0.7745, + "step": 634 + }, + { + "epoch": 0.13053756809538494, + "grad_norm": 0.28553932905197144, + "learning_rate": 8.995698824762547e-05, + "loss": 0.779, + "step": 635 + }, + { + "epoch": 0.1307431390687635, + "grad_norm": 0.18538178503513336, + "learning_rate": 8.995655054225757e-05, + "loss": 0.623, + "step": 636 + }, + { + "epoch": 0.13094871004214204, + "grad_norm": 0.32950466871261597, + "learning_rate": 8.995611062209054e-05, + "loss": 0.7682, + "step": 637 + }, + { + "epoch": 0.1311542810155206, + "grad_norm": 0.28783705830574036, + "learning_rate": 8.995566848714609e-05, + "loss": 0.7534, + "step": 638 + }, + { + "epoch": 0.13135985198889916, + "grad_norm": 0.2871015667915344, + "learning_rate": 8.995522413744596e-05, + "loss": 0.7315, + "step": 639 + }, + { + "epoch": 0.13156542296227772, + "grad_norm": 0.18547143042087555, + "learning_rate": 8.995477757301207e-05, + "loss": 0.5805, + "step": 640 + }, + { + "epoch": 0.13177099393565628, + "grad_norm": 0.34090474247932434, + "learning_rate": 8.99543287938664e-05, + "loss": 0.7783, + "step": 641 + }, + { + "epoch": 0.13197656490903484, + "grad_norm": 0.2930915355682373, + "learning_rate": 8.995387780003107e-05, + "loss": 0.768, + "step": 642 + }, + { + "epoch": 0.1321821358824134, + "grad_norm": 0.28531643748283386, + "learning_rate": 8.995342459152827e-05, + "loss": 0.7627, + "step": 643 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 0.2844246029853821, + "learning_rate": 8.995296916838038e-05, + "loss": 0.7588, + "step": 644 + }, + { + "epoch": 0.13259327782917052, + "grad_norm": 0.2866900861263275, + "learning_rate": 8.99525115306098e-05, + "loss": 0.7569, + "step": 645 + }, + { + "epoch": 0.13279884880254908, + "grad_norm": 0.2860448360443115, + "learning_rate": 8.995205167823908e-05, + "loss": 0.7614, + "step": 646 + }, + { + "epoch": 0.13300441977592764, + "grad_norm": 0.2673685848712921, + "learning_rate": 8.995158961129088e-05, + "loss": 0.7753, + "step": 647 + }, + { + "epoch": 0.1332099907493062, + "grad_norm": 0.2862294316291809, + "learning_rate": 8.995112532978798e-05, + "loss": 0.7682, + "step": 648 + }, + { + "epoch": 0.13341556172268476, + "grad_norm": 0.27633753418922424, + "learning_rate": 8.995065883375321e-05, + "loss": 0.7726, + "step": 649 + }, + { + "epoch": 0.13362113269606332, + "grad_norm": 0.26780807971954346, + "learning_rate": 8.995019012320959e-05, + "loss": 0.8017, + "step": 650 + }, + { + "epoch": 0.13382670366944188, + "grad_norm": 0.27239716053009033, + "learning_rate": 8.99497191981802e-05, + "loss": 0.7479, + "step": 651 + }, + { + "epoch": 0.13403227464282044, + "grad_norm": 0.2104814648628235, + "learning_rate": 8.994924605868824e-05, + "loss": 0.5866, + "step": 652 + }, + { + "epoch": 0.134237845616199, + "grad_norm": 0.30780890583992004, + "learning_rate": 8.994877070475701e-05, + "loss": 0.7577, + "step": 653 + }, + { + "epoch": 0.13444341658957756, + "grad_norm": 0.2910194993019104, + "learning_rate": 8.994829313640995e-05, + "loss": 0.779, + "step": 654 + }, + { + "epoch": 0.13464898756295612, + "grad_norm": 0.277893602848053, + "learning_rate": 8.994781335367057e-05, + "loss": 0.77, + "step": 655 + }, + { + "epoch": 0.13485455853633466, + "grad_norm": 0.28844013810157776, + "learning_rate": 8.994733135656252e-05, + "loss": 0.7746, + "step": 656 + }, + { + "epoch": 0.13506012950971322, + "grad_norm": 0.28865233063697815, + "learning_rate": 8.994684714510954e-05, + "loss": 0.7825, + "step": 657 + }, + { + "epoch": 0.13526570048309178, + "grad_norm": 0.3075569272041321, + "learning_rate": 8.994636071933546e-05, + "loss": 0.753, + "step": 658 + }, + { + "epoch": 0.13547127145647034, + "grad_norm": 0.2790246903896332, + "learning_rate": 8.994587207926429e-05, + "loss": 0.7341, + "step": 659 + }, + { + "epoch": 0.1356768424298489, + "grad_norm": 0.27742037177085876, + "learning_rate": 8.994538122492006e-05, + "loss": 0.7631, + "step": 660 + }, + { + "epoch": 0.13588241340322746, + "grad_norm": 0.266181617975235, + "learning_rate": 8.994488815632699e-05, + "loss": 0.7381, + "step": 661 + }, + { + "epoch": 0.13608798437660602, + "grad_norm": 0.2639121413230896, + "learning_rate": 8.994439287350932e-05, + "loss": 0.7634, + "step": 662 + }, + { + "epoch": 0.13629355534998458, + "grad_norm": 0.271953821182251, + "learning_rate": 8.994389537649151e-05, + "loss": 0.7902, + "step": 663 + }, + { + "epoch": 0.13649912632336314, + "grad_norm": 0.2754836082458496, + "learning_rate": 8.994339566529804e-05, + "loss": 0.7708, + "step": 664 + }, + { + "epoch": 0.1367046972967417, + "grad_norm": 0.30965548753738403, + "learning_rate": 8.994289373995352e-05, + "loss": 0.7607, + "step": 665 + }, + { + "epoch": 0.13691026827012026, + "grad_norm": 0.28129950165748596, + "learning_rate": 8.99423896004827e-05, + "loss": 0.7701, + "step": 666 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 0.23147864639759064, + "learning_rate": 8.99418832469104e-05, + "loss": 0.6085, + "step": 667 + }, + { + "epoch": 0.13732141021687738, + "grad_norm": 0.3050214648246765, + "learning_rate": 8.994137467926156e-05, + "loss": 0.7704, + "step": 668 + }, + { + "epoch": 0.13752698119025594, + "grad_norm": 0.15223456919193268, + "learning_rate": 8.994086389756126e-05, + "loss": 0.6074, + "step": 669 + }, + { + "epoch": 0.1377325521636345, + "grad_norm": 0.2975500226020813, + "learning_rate": 8.994035090183464e-05, + "loss": 0.7422, + "step": 670 + }, + { + "epoch": 0.13793812313701306, + "grad_norm": 0.28416451811790466, + "learning_rate": 8.993983569210698e-05, + "loss": 0.7575, + "step": 671 + }, + { + "epoch": 0.13814369411039162, + "grad_norm": 0.25423794984817505, + "learning_rate": 8.993931826840368e-05, + "loss": 0.7617, + "step": 672 + }, + { + "epoch": 0.13834926508377018, + "grad_norm": 0.2733759582042694, + "learning_rate": 8.993879863075019e-05, + "loss": 0.7478, + "step": 673 + }, + { + "epoch": 0.13855483605714872, + "grad_norm": 0.2590562105178833, + "learning_rate": 8.993827677917215e-05, + "loss": 0.7578, + "step": 674 + }, + { + "epoch": 0.13876040703052728, + "grad_norm": 0.26819926500320435, + "learning_rate": 8.993775271369525e-05, + "loss": 0.7485, + "step": 675 + }, + { + "epoch": 0.13896597800390584, + "grad_norm": 0.261787474155426, + "learning_rate": 8.993722643434532e-05, + "loss": 0.7623, + "step": 676 + }, + { + "epoch": 0.1391715489772844, + "grad_norm": 0.27696770429611206, + "learning_rate": 8.993669794114828e-05, + "loss": 0.5995, + "step": 677 + }, + { + "epoch": 0.13937711995066296, + "grad_norm": 0.1687610000371933, + "learning_rate": 8.993616723413015e-05, + "loss": 0.5993, + "step": 678 + }, + { + "epoch": 0.13958269092404152, + "grad_norm": 0.34388282895088196, + "learning_rate": 8.993563431331711e-05, + "loss": 0.7844, + "step": 679 + }, + { + "epoch": 0.13978826189742008, + "grad_norm": 0.3012101948261261, + "learning_rate": 8.993509917873539e-05, + "loss": 0.806, + "step": 680 + }, + { + "epoch": 0.13999383287079864, + "grad_norm": 0.27226656675338745, + "learning_rate": 8.993456183041135e-05, + "loss": 0.7302, + "step": 681 + }, + { + "epoch": 0.1401994038441772, + "grad_norm": 0.2889186143875122, + "learning_rate": 8.993402226837148e-05, + "loss": 0.7609, + "step": 682 + }, + { + "epoch": 0.14040497481755576, + "grad_norm": 0.33441823720932007, + "learning_rate": 8.993348049264235e-05, + "loss": 0.6023, + "step": 683 + }, + { + "epoch": 0.14061054579093432, + "grad_norm": 0.21067148447036743, + "learning_rate": 8.993293650325066e-05, + "loss": 0.6154, + "step": 684 + }, + { + "epoch": 0.14081611676431288, + "grad_norm": 0.4340059459209442, + "learning_rate": 8.99323903002232e-05, + "loss": 0.7965, + "step": 685 + }, + { + "epoch": 0.14102168773769144, + "grad_norm": 0.3370809853076935, + "learning_rate": 8.993184188358688e-05, + "loss": 0.7557, + "step": 686 + }, + { + "epoch": 0.14122725871107, + "grad_norm": 0.31289970874786377, + "learning_rate": 8.993129125336873e-05, + "loss": 0.7804, + "step": 687 + }, + { + "epoch": 0.14143282968444856, + "grad_norm": 0.31972143054008484, + "learning_rate": 8.993073840959587e-05, + "loss": 0.7438, + "step": 688 + }, + { + "epoch": 0.14163840065782712, + "grad_norm": 0.31906935572624207, + "learning_rate": 8.993018335229552e-05, + "loss": 0.7564, + "step": 689 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 0.3015035390853882, + "learning_rate": 8.992962608149505e-05, + "loss": 0.7668, + "step": 690 + }, + { + "epoch": 0.14204954260458424, + "grad_norm": 0.3022618591785431, + "learning_rate": 8.99290665972219e-05, + "loss": 0.775, + "step": 691 + }, + { + "epoch": 0.1422551135779628, + "grad_norm": 0.3151668906211853, + "learning_rate": 8.992850489950365e-05, + "loss": 0.7715, + "step": 692 + }, + { + "epoch": 0.14246068455134134, + "grad_norm": 0.29301926493644714, + "learning_rate": 8.992794098836794e-05, + "loss": 0.7472, + "step": 693 + }, + { + "epoch": 0.1426662555247199, + "grad_norm": 0.2793315649032593, + "learning_rate": 8.992737486384257e-05, + "loss": 0.7795, + "step": 694 + }, + { + "epoch": 0.14287182649809846, + "grad_norm": 0.28239625692367554, + "learning_rate": 8.992680652595544e-05, + "loss": 0.7649, + "step": 695 + }, + { + "epoch": 0.14307739747147702, + "grad_norm": 0.2796134352684021, + "learning_rate": 8.992623597473455e-05, + "loss": 0.7207, + "step": 696 + }, + { + "epoch": 0.14328296844485558, + "grad_norm": 0.2902660369873047, + "learning_rate": 8.992566321020799e-05, + "loss": 0.767, + "step": 697 + }, + { + "epoch": 0.14348853941823414, + "grad_norm": 0.28000608086586, + "learning_rate": 8.992508823240397e-05, + "loss": 0.7655, + "step": 698 + }, + { + "epoch": 0.1436941103916127, + "grad_norm": 0.28330516815185547, + "learning_rate": 8.992451104135084e-05, + "loss": 0.787, + "step": 699 + }, + { + "epoch": 0.14389968136499126, + "grad_norm": 0.28026729822158813, + "learning_rate": 8.992393163707704e-05, + "loss": 0.774, + "step": 700 + }, + { + "epoch": 0.14410525233836982, + "grad_norm": 0.5302313566207886, + "learning_rate": 8.99233500196111e-05, + "loss": 0.6421, + "step": 701 + }, + { + "epoch": 0.14431082331174838, + "grad_norm": 0.3879426419734955, + "learning_rate": 8.992276618898167e-05, + "loss": 0.7804, + "step": 702 + }, + { + "epoch": 0.14451639428512694, + "grad_norm": 0.34966281056404114, + "learning_rate": 8.992218014521752e-05, + "loss": 0.7597, + "step": 703 + }, + { + "epoch": 0.1447219652585055, + "grad_norm": 0.31454893946647644, + "learning_rate": 8.99215918883475e-05, + "loss": 0.7709, + "step": 704 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.3002963066101074, + "learning_rate": 8.992100141840064e-05, + "loss": 0.7689, + "step": 705 + }, + { + "epoch": 0.14513310720526262, + "grad_norm": 0.2704041600227356, + "learning_rate": 8.992040873540599e-05, + "loss": 0.5956, + "step": 706 + }, + { + "epoch": 0.14533867817864118, + "grad_norm": 0.37959620356559753, + "learning_rate": 8.991981383939275e-05, + "loss": 0.7709, + "step": 707 + }, + { + "epoch": 0.14554424915201974, + "grad_norm": 0.21092139184474945, + "learning_rate": 8.991921673039024e-05, + "loss": 0.6133, + "step": 708 + }, + { + "epoch": 0.1457498201253983, + "grad_norm": 0.3205825686454773, + "learning_rate": 8.991861740842789e-05, + "loss": 0.7759, + "step": 709 + }, + { + "epoch": 0.14595539109877687, + "grad_norm": 0.3055117428302765, + "learning_rate": 8.99180158735352e-05, + "loss": 0.7601, + "step": 710 + }, + { + "epoch": 0.1461609620721554, + "grad_norm": 0.2790381908416748, + "learning_rate": 8.991741212574182e-05, + "loss": 0.7473, + "step": 711 + }, + { + "epoch": 0.14636653304553396, + "grad_norm": 0.22031188011169434, + "learning_rate": 8.991680616507747e-05, + "loss": 0.6042, + "step": 712 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 0.18893392384052277, + "learning_rate": 8.991619799157203e-05, + "loss": 0.579, + "step": 713 + }, + { + "epoch": 0.14677767499229108, + "grad_norm": 0.409572571516037, + "learning_rate": 8.991558760525546e-05, + "loss": 0.7456, + "step": 714 + }, + { + "epoch": 0.14698324596566964, + "grad_norm": 0.30903562903404236, + "learning_rate": 8.991497500615781e-05, + "loss": 0.7597, + "step": 715 + }, + { + "epoch": 0.1471888169390482, + "grad_norm": 0.3029564917087555, + "learning_rate": 8.991436019430928e-05, + "loss": 0.7574, + "step": 716 + }, + { + "epoch": 0.14739438791242676, + "grad_norm": 0.40293097496032715, + "learning_rate": 8.991374316974016e-05, + "loss": 0.7726, + "step": 717 + }, + { + "epoch": 0.14759995888580532, + "grad_norm": 0.2837783992290497, + "learning_rate": 8.991312393248083e-05, + "loss": 0.7345, + "step": 718 + }, + { + "epoch": 0.14780552985918388, + "grad_norm": 0.31906503438949585, + "learning_rate": 8.991250248256181e-05, + "loss": 0.7493, + "step": 719 + }, + { + "epoch": 0.14801110083256244, + "grad_norm": 0.28739094734191895, + "learning_rate": 8.991187882001371e-05, + "loss": 0.7527, + "step": 720 + }, + { + "epoch": 0.148216671805941, + "grad_norm": 0.28792694211006165, + "learning_rate": 8.991125294486727e-05, + "loss": 0.7758, + "step": 721 + }, + { + "epoch": 0.14842224277931956, + "grad_norm": 0.30004221200942993, + "learning_rate": 8.99106248571533e-05, + "loss": 0.774, + "step": 722 + }, + { + "epoch": 0.14862781375269812, + "grad_norm": 0.2681220471858978, + "learning_rate": 8.990999455690276e-05, + "loss": 0.7636, + "step": 723 + }, + { + "epoch": 0.14883338472607668, + "grad_norm": 0.2687060534954071, + "learning_rate": 8.990936204414669e-05, + "loss": 0.7763, + "step": 724 + }, + { + "epoch": 0.14903895569945524, + "grad_norm": 0.3481808602809906, + "learning_rate": 8.990872731891628e-05, + "loss": 0.6129, + "step": 725 + }, + { + "epoch": 0.1492445266728338, + "grad_norm": 0.31415244936943054, + "learning_rate": 8.990809038124275e-05, + "loss": 0.7789, + "step": 726 + }, + { + "epoch": 0.14945009764621237, + "grad_norm": 0.2992306649684906, + "learning_rate": 8.990745123115752e-05, + "loss": 0.7361, + "step": 727 + }, + { + "epoch": 0.14965566861959093, + "grad_norm": 0.2780331075191498, + "learning_rate": 8.990680986869206e-05, + "loss": 0.7657, + "step": 728 + }, + { + "epoch": 0.1498612395929695, + "grad_norm": 0.20312556624412537, + "learning_rate": 8.990616629387798e-05, + "loss": 0.5755, + "step": 729 + }, + { + "epoch": 0.15006681056634802, + "grad_norm": 0.32418328523635864, + "learning_rate": 8.990552050674697e-05, + "loss": 0.7537, + "step": 730 + }, + { + "epoch": 0.15027238153972658, + "grad_norm": 0.30750200152397156, + "learning_rate": 8.990487250733086e-05, + "loss": 0.7585, + "step": 731 + }, + { + "epoch": 0.15047795251310514, + "grad_norm": 0.2661309242248535, + "learning_rate": 8.990422229566156e-05, + "loss": 0.7454, + "step": 732 + }, + { + "epoch": 0.1506835234864837, + "grad_norm": 0.2825012803077698, + "learning_rate": 8.99035698717711e-05, + "loss": 0.7466, + "step": 733 + }, + { + "epoch": 0.15088909445986226, + "grad_norm": 0.27984434366226196, + "learning_rate": 8.990291523569166e-05, + "loss": 0.7558, + "step": 734 + }, + { + "epoch": 0.15109466543324082, + "grad_norm": 0.20815995335578918, + "learning_rate": 8.990225838745544e-05, + "loss": 0.6112, + "step": 735 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 0.3687712848186493, + "learning_rate": 8.990159932709483e-05, + "loss": 0.7705, + "step": 736 + }, + { + "epoch": 0.15150580737999794, + "grad_norm": 0.28203409910202026, + "learning_rate": 8.990093805464227e-05, + "loss": 0.7658, + "step": 737 + }, + { + "epoch": 0.1517113783533765, + "grad_norm": 0.26725029945373535, + "learning_rate": 8.990027457013039e-05, + "loss": 0.7545, + "step": 738 + }, + { + "epoch": 0.15191694932675506, + "grad_norm": 0.27890896797180176, + "learning_rate": 8.989960887359183e-05, + "loss": 0.7713, + "step": 739 + }, + { + "epoch": 0.15212252030013362, + "grad_norm": 0.2642592191696167, + "learning_rate": 8.98989409650594e-05, + "loss": 0.7418, + "step": 740 + }, + { + "epoch": 0.15232809127351218, + "grad_norm": 0.28167617321014404, + "learning_rate": 8.9898270844566e-05, + "loss": 0.7641, + "step": 741 + }, + { + "epoch": 0.15253366224689074, + "grad_norm": 0.2627207338809967, + "learning_rate": 8.989759851214465e-05, + "loss": 0.7453, + "step": 742 + }, + { + "epoch": 0.1527392332202693, + "grad_norm": 0.28408879041671753, + "learning_rate": 8.98969239678285e-05, + "loss": 0.7596, + "step": 743 + }, + { + "epoch": 0.15294480419364787, + "grad_norm": 0.2735441327095032, + "learning_rate": 8.989624721165072e-05, + "loss": 0.7715, + "step": 744 + }, + { + "epoch": 0.15315037516702643, + "grad_norm": 0.18697437644004822, + "learning_rate": 8.989556824364469e-05, + "loss": 0.5824, + "step": 745 + }, + { + "epoch": 0.153355946140405, + "grad_norm": 0.2745780646800995, + "learning_rate": 8.989488706384386e-05, + "loss": 0.7615, + "step": 746 + }, + { + "epoch": 0.15356151711378355, + "grad_norm": 0.14835397899150848, + "learning_rate": 8.989420367228179e-05, + "loss": 0.5817, + "step": 747 + }, + { + "epoch": 0.15376708808716208, + "grad_norm": 0.272223562002182, + "learning_rate": 8.989351806899213e-05, + "loss": 0.7756, + "step": 748 + }, + { + "epoch": 0.15397265906054064, + "grad_norm": 0.1476040929555893, + "learning_rate": 8.989283025400868e-05, + "loss": 0.5714, + "step": 749 + }, + { + "epoch": 0.1541782300339192, + "grad_norm": 0.29153406620025635, + "learning_rate": 8.98921402273653e-05, + "loss": 0.766, + "step": 750 + }, + { + "epoch": 0.15438380100729776, + "grad_norm": 0.1418268382549286, + "learning_rate": 8.989144798909598e-05, + "loss": 0.6128, + "step": 751 + }, + { + "epoch": 0.15458937198067632, + "grad_norm": 0.2692977786064148, + "learning_rate": 8.989075353923487e-05, + "loss": 0.7622, + "step": 752 + }, + { + "epoch": 0.15479494295405488, + "grad_norm": 0.26004138588905334, + "learning_rate": 8.989005687781615e-05, + "loss": 0.7816, + "step": 753 + }, + { + "epoch": 0.15500051392743344, + "grad_norm": 0.2757778465747833, + "learning_rate": 8.988935800487412e-05, + "loss": 0.7434, + "step": 754 + }, + { + "epoch": 0.155206084900812, + "grad_norm": 0.255287766456604, + "learning_rate": 8.988865692044326e-05, + "loss": 0.7624, + "step": 755 + }, + { + "epoch": 0.15541165587419056, + "grad_norm": 0.25884950160980225, + "learning_rate": 8.988795362455807e-05, + "loss": 0.7563, + "step": 756 + }, + { + "epoch": 0.15561722684756912, + "grad_norm": 0.2563144266605377, + "learning_rate": 8.988724811725321e-05, + "loss": 0.7714, + "step": 757 + }, + { + "epoch": 0.15582279782094768, + "grad_norm": 0.2678104639053345, + "learning_rate": 8.988654039856344e-05, + "loss": 0.7474, + "step": 758 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 0.24936316907405853, + "learning_rate": 8.98858304685236e-05, + "loss": 0.7673, + "step": 759 + }, + { + "epoch": 0.1562339397677048, + "grad_norm": 0.26165440678596497, + "learning_rate": 8.988511832716873e-05, + "loss": 0.7601, + "step": 760 + }, + { + "epoch": 0.15643951074108337, + "grad_norm": 0.26390373706817627, + "learning_rate": 8.988440397453385e-05, + "loss": 0.771, + "step": 761 + }, + { + "epoch": 0.15664508171446193, + "grad_norm": 0.2585375905036926, + "learning_rate": 8.988368741065418e-05, + "loss": 0.7544, + "step": 762 + }, + { + "epoch": 0.15685065268784049, + "grad_norm": 0.2905960977077484, + "learning_rate": 8.9882968635565e-05, + "loss": 0.7778, + "step": 763 + }, + { + "epoch": 0.15705622366121905, + "grad_norm": 0.25519707798957825, + "learning_rate": 8.988224764930176e-05, + "loss": 0.7575, + "step": 764 + }, + { + "epoch": 0.1572617946345976, + "grad_norm": 0.19228395819664001, + "learning_rate": 8.988152445189995e-05, + "loss": 0.5991, + "step": 765 + }, + { + "epoch": 0.15746736560797617, + "grad_norm": 0.3007056713104248, + "learning_rate": 8.988079904339521e-05, + "loss": 0.7521, + "step": 766 + }, + { + "epoch": 0.1576729365813547, + "grad_norm": 0.2646825611591339, + "learning_rate": 8.988007142382328e-05, + "loss": 0.7681, + "step": 767 + }, + { + "epoch": 0.15787850755473326, + "grad_norm": 0.25301775336265564, + "learning_rate": 8.987934159321998e-05, + "loss": 0.7559, + "step": 768 + }, + { + "epoch": 0.15808407852811182, + "grad_norm": 0.2603342533111572, + "learning_rate": 8.987860955162129e-05, + "loss": 0.7328, + "step": 769 + }, + { + "epoch": 0.15828964950149038, + "grad_norm": 0.2716013491153717, + "learning_rate": 8.987787529906327e-05, + "loss": 0.7904, + "step": 770 + }, + { + "epoch": 0.15849522047486894, + "grad_norm": 0.2763035297393799, + "learning_rate": 8.98771388355821e-05, + "loss": 0.7466, + "step": 771 + }, + { + "epoch": 0.1587007914482475, + "grad_norm": 0.20483554899692535, + "learning_rate": 8.987640016121405e-05, + "loss": 0.6064, + "step": 772 + }, + { + "epoch": 0.15890636242162606, + "grad_norm": 0.2952456474304199, + "learning_rate": 8.987565927599552e-05, + "loss": 0.7767, + "step": 773 + }, + { + "epoch": 0.15911193339500462, + "grad_norm": 0.262829452753067, + "learning_rate": 8.9874916179963e-05, + "loss": 0.7453, + "step": 774 + }, + { + "epoch": 0.15931750436838318, + "grad_norm": 0.27599036693573, + "learning_rate": 8.987417087315311e-05, + "loss": 0.7633, + "step": 775 + }, + { + "epoch": 0.15952307534176174, + "grad_norm": 0.2878960371017456, + "learning_rate": 8.987342335560257e-05, + "loss": 0.7264, + "step": 776 + }, + { + "epoch": 0.1597286463151403, + "grad_norm": 0.27682632207870483, + "learning_rate": 8.98726736273482e-05, + "loss": 0.7599, + "step": 777 + }, + { + "epoch": 0.15993421728851887, + "grad_norm": 0.28773486614227295, + "learning_rate": 8.98719216884269e-05, + "loss": 0.749, + "step": 778 + }, + { + "epoch": 0.16013978826189743, + "grad_norm": 0.18678279221057892, + "learning_rate": 8.987116753887578e-05, + "loss": 0.5898, + "step": 779 + }, + { + "epoch": 0.16034535923527599, + "grad_norm": 0.2946769595146179, + "learning_rate": 8.987041117873195e-05, + "loss": 0.7631, + "step": 780 + }, + { + "epoch": 0.16055093020865455, + "grad_norm": 0.2669578492641449, + "learning_rate": 8.98696526080327e-05, + "loss": 0.7401, + "step": 781 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 0.2495296746492386, + "learning_rate": 8.986889182681537e-05, + "loss": 0.7548, + "step": 782 + }, + { + "epoch": 0.16096207215541167, + "grad_norm": 0.1537548452615738, + "learning_rate": 8.986812883511746e-05, + "loss": 0.5952, + "step": 783 + }, + { + "epoch": 0.16116764312879023, + "grad_norm": 0.3242528736591339, + "learning_rate": 8.986736363297657e-05, + "loss": 0.7621, + "step": 784 + }, + { + "epoch": 0.16137321410216876, + "grad_norm": 0.2763916254043579, + "learning_rate": 8.986659622043038e-05, + "loss": 0.7518, + "step": 785 + }, + { + "epoch": 0.16157878507554732, + "grad_norm": 0.27918627858161926, + "learning_rate": 8.986582659751668e-05, + "loss": 0.759, + "step": 786 + }, + { + "epoch": 0.16178435604892588, + "grad_norm": 0.1745089888572693, + "learning_rate": 8.986505476427342e-05, + "loss": 0.6015, + "step": 787 + }, + { + "epoch": 0.16198992702230444, + "grad_norm": 0.2984016239643097, + "learning_rate": 8.986428072073861e-05, + "loss": 0.7422, + "step": 788 + }, + { + "epoch": 0.162195497995683, + "grad_norm": 0.27629682421684265, + "learning_rate": 8.986350446695038e-05, + "loss": 0.7691, + "step": 789 + }, + { + "epoch": 0.16240106896906156, + "grad_norm": 0.15922513604164124, + "learning_rate": 8.986272600294698e-05, + "loss": 0.594, + "step": 790 + }, + { + "epoch": 0.16260663994244012, + "grad_norm": 0.14948177337646484, + "learning_rate": 8.986194532876676e-05, + "loss": 0.5879, + "step": 791 + }, + { + "epoch": 0.16281221091581868, + "grad_norm": 0.33852294087409973, + "learning_rate": 8.986116244444816e-05, + "loss": 0.788, + "step": 792 + }, + { + "epoch": 0.16301778188919724, + "grad_norm": 0.29658934473991394, + "learning_rate": 8.986037735002979e-05, + "loss": 0.7502, + "step": 793 + }, + { + "epoch": 0.1632233528625758, + "grad_norm": 0.27061983942985535, + "learning_rate": 8.98595900455503e-05, + "loss": 0.7444, + "step": 794 + }, + { + "epoch": 0.16342892383595437, + "grad_norm": 0.28159090876579285, + "learning_rate": 8.985880053104848e-05, + "loss": 0.7497, + "step": 795 + }, + { + "epoch": 0.16363449480933293, + "grad_norm": 0.27150630950927734, + "learning_rate": 8.985800880656322e-05, + "loss": 0.7283, + "step": 796 + }, + { + "epoch": 0.16384006578271149, + "grad_norm": 0.26862168312072754, + "learning_rate": 8.985721487213353e-05, + "loss": 0.7492, + "step": 797 + }, + { + "epoch": 0.16404563675609005, + "grad_norm": 0.284452885389328, + "learning_rate": 8.985641872779853e-05, + "loss": 0.7864, + "step": 798 + }, + { + "epoch": 0.1642512077294686, + "grad_norm": 0.19958379864692688, + "learning_rate": 8.985562037359745e-05, + "loss": 0.585, + "step": 799 + }, + { + "epoch": 0.16445677870284717, + "grad_norm": 0.1591620147228241, + "learning_rate": 8.985481980956959e-05, + "loss": 0.5937, + "step": 800 + }, + { + "epoch": 0.16466234967622573, + "grad_norm": 0.15034611523151398, + "learning_rate": 8.985401703575444e-05, + "loss": 0.6034, + "step": 801 + }, + { + "epoch": 0.1648679206496043, + "grad_norm": 0.4189755618572235, + "learning_rate": 8.985321205219149e-05, + "loss": 0.7696, + "step": 802 + }, + { + "epoch": 0.16507349162298285, + "grad_norm": 0.17588938772678375, + "learning_rate": 8.985240485892043e-05, + "loss": 0.5819, + "step": 803 + }, + { + "epoch": 0.16527906259636138, + "grad_norm": 0.17400261759757996, + "learning_rate": 8.985159545598102e-05, + "loss": 0.5878, + "step": 804 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 0.5819520354270935, + "learning_rate": 8.985078384341314e-05, + "loss": 0.7724, + "step": 805 + }, + { + "epoch": 0.1656902045431185, + "grad_norm": 0.3000738322734833, + "learning_rate": 8.984997002125677e-05, + "loss": 0.7544, + "step": 806 + }, + { + "epoch": 0.16589577551649706, + "grad_norm": 0.5194309949874878, + "learning_rate": 8.984915398955201e-05, + "loss": 0.7717, + "step": 807 + }, + { + "epoch": 0.16610134648987562, + "grad_norm": 0.24588865041732788, + "learning_rate": 8.984833574833905e-05, + "loss": 0.5959, + "step": 808 + }, + { + "epoch": 0.16630691746325418, + "grad_norm": 0.3617485761642456, + "learning_rate": 8.984751529765823e-05, + "loss": 0.7641, + "step": 809 + }, + { + "epoch": 0.16651248843663274, + "grad_norm": 0.1757216602563858, + "learning_rate": 8.984669263754993e-05, + "loss": 0.5963, + "step": 810 + }, + { + "epoch": 0.1667180594100113, + "grad_norm": 0.37562620639801025, + "learning_rate": 8.98458677680547e-05, + "loss": 0.7765, + "step": 811 + }, + { + "epoch": 0.16692363038338986, + "grad_norm": 0.19446802139282227, + "learning_rate": 8.984504068921317e-05, + "loss": 0.5991, + "step": 812 + }, + { + "epoch": 0.16712920135676843, + "grad_norm": 0.2953244149684906, + "learning_rate": 8.98442114010661e-05, + "loss": 0.7816, + "step": 813 + }, + { + "epoch": 0.16733477233014699, + "grad_norm": 0.3022470772266388, + "learning_rate": 8.984337990365433e-05, + "loss": 0.7426, + "step": 814 + }, + { + "epoch": 0.16754034330352555, + "grad_norm": 0.268697053194046, + "learning_rate": 8.984254619701882e-05, + "loss": 0.7798, + "step": 815 + }, + { + "epoch": 0.1677459142769041, + "grad_norm": 0.2634507119655609, + "learning_rate": 8.984171028120066e-05, + "loss": 0.7499, + "step": 816 + }, + { + "epoch": 0.16795148525028267, + "grad_norm": 0.2637363374233246, + "learning_rate": 8.984087215624102e-05, + "loss": 0.7244, + "step": 817 + }, + { + "epoch": 0.16815705622366123, + "grad_norm": 0.25045761466026306, + "learning_rate": 8.984003182218121e-05, + "loss": 0.7206, + "step": 818 + }, + { + "epoch": 0.1683626271970398, + "grad_norm": 0.24836835265159607, + "learning_rate": 8.983918927906259e-05, + "loss": 0.7381, + "step": 819 + }, + { + "epoch": 0.16856819817041835, + "grad_norm": 0.26156720519065857, + "learning_rate": 8.983834452692671e-05, + "loss": 0.748, + "step": 820 + }, + { + "epoch": 0.1687737691437969, + "grad_norm": 0.2660123407840729, + "learning_rate": 8.983749756581517e-05, + "loss": 0.7349, + "step": 821 + }, + { + "epoch": 0.16897934011717544, + "grad_norm": 0.20181813836097717, + "learning_rate": 8.983664839576969e-05, + "loss": 0.6089, + "step": 822 + }, + { + "epoch": 0.169184911090554, + "grad_norm": 0.16823935508728027, + "learning_rate": 8.98357970168321e-05, + "loss": 0.6203, + "step": 823 + }, + { + "epoch": 0.16939048206393256, + "grad_norm": 0.36333969235420227, + "learning_rate": 8.983494342904437e-05, + "loss": 0.7704, + "step": 824 + }, + { + "epoch": 0.16959605303731112, + "grad_norm": 0.2901283800601959, + "learning_rate": 8.983408763244853e-05, + "loss": 0.7484, + "step": 825 + }, + { + "epoch": 0.16980162401068968, + "grad_norm": 0.2594255805015564, + "learning_rate": 8.983322962708673e-05, + "loss": 0.7726, + "step": 826 + }, + { + "epoch": 0.17000719498406824, + "grad_norm": 0.2951291799545288, + "learning_rate": 8.983236941300128e-05, + "loss": 0.743, + "step": 827 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.23186159133911133, + "learning_rate": 8.983150699023453e-05, + "loss": 0.6015, + "step": 828 + }, + { + "epoch": 0.17041833693082536, + "grad_norm": 0.2974048852920532, + "learning_rate": 8.983064235882896e-05, + "loss": 0.7689, + "step": 829 + }, + { + "epoch": 0.17062390790420393, + "grad_norm": 0.2741788327693939, + "learning_rate": 8.982977551882719e-05, + "loss": 0.7825, + "step": 830 + }, + { + "epoch": 0.17082947887758249, + "grad_norm": 0.2528201639652252, + "learning_rate": 8.982890647027191e-05, + "loss": 0.7549, + "step": 831 + }, + { + "epoch": 0.17103504985096105, + "grad_norm": 0.27328386902809143, + "learning_rate": 8.982803521320593e-05, + "loss": 0.7433, + "step": 832 + }, + { + "epoch": 0.1712406208243396, + "grad_norm": 0.18332356214523315, + "learning_rate": 8.98271617476722e-05, + "loss": 0.6056, + "step": 833 + }, + { + "epoch": 0.17144619179771817, + "grad_norm": 0.2897491753101349, + "learning_rate": 8.982628607371373e-05, + "loss": 0.7229, + "step": 834 + }, + { + "epoch": 0.17165176277109673, + "grad_norm": 0.27189579606056213, + "learning_rate": 8.982540819137363e-05, + "loss": 0.7409, + "step": 835 + }, + { + "epoch": 0.1718573337444753, + "grad_norm": 0.2686000168323517, + "learning_rate": 8.982452810069521e-05, + "loss": 0.7622, + "step": 836 + }, + { + "epoch": 0.17206290471785385, + "grad_norm": 0.2843405306339264, + "learning_rate": 8.98236458017218e-05, + "loss": 0.7774, + "step": 837 + }, + { + "epoch": 0.1722684756912324, + "grad_norm": 0.249932661652565, + "learning_rate": 8.982276129449687e-05, + "loss": 0.758, + "step": 838 + }, + { + "epoch": 0.17247404666461097, + "grad_norm": 0.1650909036397934, + "learning_rate": 8.982187457906399e-05, + "loss": 0.6026, + "step": 839 + }, + { + "epoch": 0.1726796176379895, + "grad_norm": 0.2688060700893402, + "learning_rate": 8.982098565546684e-05, + "loss": 0.74, + "step": 840 + }, + { + "epoch": 0.17288518861136806, + "grad_norm": 0.2702515423297882, + "learning_rate": 8.982009452374921e-05, + "loss": 0.7454, + "step": 841 + }, + { + "epoch": 0.17309075958474662, + "grad_norm": 0.2621611952781677, + "learning_rate": 8.981920118395502e-05, + "loss": 0.741, + "step": 842 + }, + { + "epoch": 0.17329633055812518, + "grad_norm": 0.26395297050476074, + "learning_rate": 8.981830563612828e-05, + "loss": 0.7634, + "step": 843 + }, + { + "epoch": 0.17350190153150374, + "grad_norm": 0.1796771138906479, + "learning_rate": 8.981740788031309e-05, + "loss": 0.5774, + "step": 844 + }, + { + "epoch": 0.1737074725048823, + "grad_norm": 0.28493568301200867, + "learning_rate": 8.98165079165537e-05, + "loss": 0.744, + "step": 845 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.14998356997966766, + "learning_rate": 8.981560574489442e-05, + "loss": 0.583, + "step": 846 + }, + { + "epoch": 0.17411861445163943, + "grad_norm": 0.28660815954208374, + "learning_rate": 8.981470136537973e-05, + "loss": 0.7648, + "step": 847 + }, + { + "epoch": 0.17432418542501799, + "grad_norm": 0.26909613609313965, + "learning_rate": 8.981379477805416e-05, + "loss": 0.7621, + "step": 848 + }, + { + "epoch": 0.17452975639839655, + "grad_norm": 0.2543969750404358, + "learning_rate": 8.981288598296238e-05, + "loss": 0.7383, + "step": 849 + }, + { + "epoch": 0.1747353273717751, + "grad_norm": 0.27695950865745544, + "learning_rate": 8.981197498014916e-05, + "loss": 0.7567, + "step": 850 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 0.2635768949985504, + "learning_rate": 8.98110617696594e-05, + "loss": 0.7627, + "step": 851 + }, + { + "epoch": 0.17514646931853223, + "grad_norm": 0.31927260756492615, + "learning_rate": 8.981014635153806e-05, + "loss": 0.7376, + "step": 852 + }, + { + "epoch": 0.1753520402919108, + "grad_norm": 0.25446754693984985, + "learning_rate": 8.980922872583025e-05, + "loss": 0.7415, + "step": 853 + }, + { + "epoch": 0.17555761126528935, + "grad_norm": 0.2923116683959961, + "learning_rate": 8.980830889258118e-05, + "loss": 0.7375, + "step": 854 + }, + { + "epoch": 0.1757631822386679, + "grad_norm": 0.17673562467098236, + "learning_rate": 8.980738685183617e-05, + "loss": 0.5944, + "step": 855 + }, + { + "epoch": 0.17596875321204647, + "grad_norm": 0.2569844424724579, + "learning_rate": 8.980646260364063e-05, + "loss": 0.7681, + "step": 856 + }, + { + "epoch": 0.17617432418542503, + "grad_norm": 0.2668174207210541, + "learning_rate": 8.98055361480401e-05, + "loss": 0.753, + "step": 857 + }, + { + "epoch": 0.1763798951588036, + "grad_norm": 0.15782947838306427, + "learning_rate": 8.980460748508023e-05, + "loss": 0.5973, + "step": 858 + }, + { + "epoch": 0.17658546613218212, + "grad_norm": 0.27562811970710754, + "learning_rate": 8.980367661480678e-05, + "loss": 0.7613, + "step": 859 + }, + { + "epoch": 0.17679103710556068, + "grad_norm": 0.2562348544597626, + "learning_rate": 8.980274353726556e-05, + "loss": 0.7451, + "step": 860 + }, + { + "epoch": 0.17699660807893924, + "grad_norm": 0.25293174386024475, + "learning_rate": 8.980180825250261e-05, + "loss": 0.7285, + "step": 861 + }, + { + "epoch": 0.1772021790523178, + "grad_norm": 0.2638672888278961, + "learning_rate": 8.980087076056394e-05, + "loss": 0.7539, + "step": 862 + }, + { + "epoch": 0.17740775002569636, + "grad_norm": 0.1891278624534607, + "learning_rate": 8.979993106149579e-05, + "loss": 0.58, + "step": 863 + }, + { + "epoch": 0.17761332099907493, + "grad_norm": 0.27774450182914734, + "learning_rate": 8.979898915534442e-05, + "loss": 0.7754, + "step": 864 + }, + { + "epoch": 0.17781889197245349, + "grad_norm": 0.26496121287345886, + "learning_rate": 8.979804504215624e-05, + "loss": 0.7595, + "step": 865 + }, + { + "epoch": 0.17802446294583205, + "grad_norm": 0.26245352625846863, + "learning_rate": 8.979709872197778e-05, + "loss": 0.7565, + "step": 866 + }, + { + "epoch": 0.1782300339192106, + "grad_norm": 0.2624642252922058, + "learning_rate": 8.979615019485564e-05, + "loss": 0.7556, + "step": 867 + }, + { + "epoch": 0.17843560489258917, + "grad_norm": 0.16684101521968842, + "learning_rate": 8.979519946083656e-05, + "loss": 0.6104, + "step": 868 + }, + { + "epoch": 0.17864117586596773, + "grad_norm": 0.26087847352027893, + "learning_rate": 8.979424651996738e-05, + "loss": 0.7496, + "step": 869 + }, + { + "epoch": 0.1788467468393463, + "grad_norm": 0.2627946436405182, + "learning_rate": 8.979329137229502e-05, + "loss": 0.7471, + "step": 870 + }, + { + "epoch": 0.17905231781272485, + "grad_norm": 0.2528480887413025, + "learning_rate": 8.979233401786657e-05, + "loss": 0.7645, + "step": 871 + }, + { + "epoch": 0.1792578887861034, + "grad_norm": 0.26880887150764465, + "learning_rate": 8.97913744567292e-05, + "loss": 0.7492, + "step": 872 + }, + { + "epoch": 0.17946345975948197, + "grad_norm": 0.25951650738716125, + "learning_rate": 8.979041268893014e-05, + "loss": 0.7428, + "step": 873 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 0.15437857806682587, + "learning_rate": 8.97894487145168e-05, + "loss": 0.5812, + "step": 874 + }, + { + "epoch": 0.1798746017062391, + "grad_norm": 0.28139808773994446, + "learning_rate": 8.978848253353668e-05, + "loss": 0.7438, + "step": 875 + }, + { + "epoch": 0.18008017267961765, + "grad_norm": 0.14730799198150635, + "learning_rate": 8.978751414603735e-05, + "loss": 0.5816, + "step": 876 + }, + { + "epoch": 0.18028574365299618, + "grad_norm": 0.2632145285606384, + "learning_rate": 8.978654355206654e-05, + "loss": 0.7467, + "step": 877 + }, + { + "epoch": 0.18049131462637474, + "grad_norm": 0.2908996045589447, + "learning_rate": 8.978557075167206e-05, + "loss": 0.74, + "step": 878 + }, + { + "epoch": 0.1806968855997533, + "grad_norm": 0.24691736698150635, + "learning_rate": 8.978459574490184e-05, + "loss": 0.7718, + "step": 879 + }, + { + "epoch": 0.18090245657313186, + "grad_norm": 0.25215819478034973, + "learning_rate": 8.978361853180392e-05, + "loss": 0.7481, + "step": 880 + }, + { + "epoch": 0.18110802754651043, + "grad_norm": 0.2547704577445984, + "learning_rate": 8.978263911242642e-05, + "loss": 0.7508, + "step": 881 + }, + { + "epoch": 0.18131359851988899, + "grad_norm": 0.184767946600914, + "learning_rate": 8.97816574868176e-05, + "loss": 0.5983, + "step": 882 + }, + { + "epoch": 0.18151916949326755, + "grad_norm": 0.1742323487997055, + "learning_rate": 8.978067365502583e-05, + "loss": 0.6079, + "step": 883 + }, + { + "epoch": 0.1817247404666461, + "grad_norm": 0.15977798402309418, + "learning_rate": 8.977968761709958e-05, + "loss": 0.5984, + "step": 884 + }, + { + "epoch": 0.18193031144002467, + "grad_norm": 0.36065980792045593, + "learning_rate": 8.977869937308742e-05, + "loss": 0.7727, + "step": 885 + }, + { + "epoch": 0.18213588241340323, + "grad_norm": 0.28331291675567627, + "learning_rate": 8.977770892303802e-05, + "loss": 0.753, + "step": 886 + }, + { + "epoch": 0.1823414533867818, + "grad_norm": 0.2905336022377014, + "learning_rate": 8.977671626700021e-05, + "loss": 0.7554, + "step": 887 + }, + { + "epoch": 0.18254702436016035, + "grad_norm": 0.2962552309036255, + "learning_rate": 8.977572140502286e-05, + "loss": 0.7432, + "step": 888 + }, + { + "epoch": 0.1827525953335389, + "grad_norm": 0.2991376519203186, + "learning_rate": 8.977472433715502e-05, + "loss": 0.7562, + "step": 889 + }, + { + "epoch": 0.18295816630691747, + "grad_norm": 0.22425773739814758, + "learning_rate": 8.977372506344578e-05, + "loss": 0.5851, + "step": 890 + }, + { + "epoch": 0.18316373728029603, + "grad_norm": 0.32990381121635437, + "learning_rate": 8.977272358394437e-05, + "loss": 0.7482, + "step": 891 + }, + { + "epoch": 0.1833693082536746, + "grad_norm": 0.17806373536586761, + "learning_rate": 8.977171989870013e-05, + "loss": 0.6074, + "step": 892 + }, + { + "epoch": 0.18357487922705315, + "grad_norm": 0.318367063999176, + "learning_rate": 8.977071400776253e-05, + "loss": 0.7526, + "step": 893 + }, + { + "epoch": 0.1837804502004317, + "grad_norm": 0.17434534430503845, + "learning_rate": 8.97697059111811e-05, + "loss": 0.5821, + "step": 894 + }, + { + "epoch": 0.18398602117381027, + "grad_norm": 0.29355406761169434, + "learning_rate": 8.976869560900552e-05, + "loss": 0.7531, + "step": 895 + }, + { + "epoch": 0.1841915921471888, + "grad_norm": 0.2709575593471527, + "learning_rate": 8.976768310128555e-05, + "loss": 0.7768, + "step": 896 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 0.252112478017807, + "learning_rate": 8.976666838807107e-05, + "loss": 0.7173, + "step": 897 + }, + { + "epoch": 0.18460273409394592, + "grad_norm": 0.2750721573829651, + "learning_rate": 8.976565146941209e-05, + "loss": 0.7365, + "step": 898 + }, + { + "epoch": 0.18480830506732449, + "grad_norm": 0.2349645495414734, + "learning_rate": 8.97646323453587e-05, + "loss": 0.5986, + "step": 899 + }, + { + "epoch": 0.18501387604070305, + "grad_norm": 0.268477201461792, + "learning_rate": 8.976361101596108e-05, + "loss": 0.7779, + "step": 900 + }, + { + "epoch": 0.1852194470140816, + "grad_norm": 0.2666422426700592, + "learning_rate": 8.976258748126959e-05, + "loss": 0.7536, + "step": 901 + }, + { + "epoch": 0.18542501798746017, + "grad_norm": 0.2692512571811676, + "learning_rate": 8.976156174133462e-05, + "loss": 0.7737, + "step": 902 + }, + { + "epoch": 0.18563058896083873, + "grad_norm": 0.25315481424331665, + "learning_rate": 8.976053379620673e-05, + "loss": 0.7359, + "step": 903 + }, + { + "epoch": 0.1858361599342173, + "grad_norm": 0.2516801953315735, + "learning_rate": 8.975950364593655e-05, + "loss": 0.7381, + "step": 904 + }, + { + "epoch": 0.18604173090759585, + "grad_norm": 0.2789689600467682, + "learning_rate": 8.975847129057482e-05, + "loss": 0.7466, + "step": 905 + }, + { + "epoch": 0.1862473018809744, + "grad_norm": 0.1855190098285675, + "learning_rate": 8.975743673017243e-05, + "loss": 0.5948, + "step": 906 + }, + { + "epoch": 0.18645287285435297, + "grad_norm": 0.27560868859291077, + "learning_rate": 8.975639996478032e-05, + "loss": 0.737, + "step": 907 + }, + { + "epoch": 0.18665844382773153, + "grad_norm": 0.26743271946907043, + "learning_rate": 8.975536099444957e-05, + "loss": 0.7585, + "step": 908 + }, + { + "epoch": 0.1868640148011101, + "grad_norm": 0.2512650191783905, + "learning_rate": 8.975431981923137e-05, + "loss": 0.7318, + "step": 909 + }, + { + "epoch": 0.18706958577448865, + "grad_norm": 0.2596076726913452, + "learning_rate": 8.9753276439177e-05, + "loss": 0.7641, + "step": 910 + }, + { + "epoch": 0.1872751567478672, + "grad_norm": 0.20333601534366608, + "learning_rate": 8.97522308543379e-05, + "loss": 0.6016, + "step": 911 + }, + { + "epoch": 0.18748072772124577, + "grad_norm": 0.2744527757167816, + "learning_rate": 8.975118306476554e-05, + "loss": 0.7522, + "step": 912 + }, + { + "epoch": 0.18768629869462433, + "grad_norm": 0.2788070738315582, + "learning_rate": 8.975013307051157e-05, + "loss": 0.7487, + "step": 913 + }, + { + "epoch": 0.18789186966800286, + "grad_norm": 0.25242358446121216, + "learning_rate": 8.97490808716277e-05, + "loss": 0.7345, + "step": 914 + }, + { + "epoch": 0.18809744064138142, + "grad_norm": 0.2651404142379761, + "learning_rate": 8.974802646816578e-05, + "loss": 0.7281, + "step": 915 + }, + { + "epoch": 0.18830301161475999, + "grad_norm": 0.2696022689342499, + "learning_rate": 8.974696986017773e-05, + "loss": 0.7516, + "step": 916 + }, + { + "epoch": 0.18850858258813855, + "grad_norm": 0.24874137341976166, + "learning_rate": 8.974591104771564e-05, + "loss": 0.7413, + "step": 917 + }, + { + "epoch": 0.1887141535615171, + "grad_norm": 0.2631874084472656, + "learning_rate": 8.974485003083164e-05, + "loss": 0.7562, + "step": 918 + }, + { + "epoch": 0.18891972453489567, + "grad_norm": 0.26414451003074646, + "learning_rate": 8.974378680957802e-05, + "loss": 0.5997, + "step": 919 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 0.28355100750923157, + "learning_rate": 8.974272138400716e-05, + "loss": 0.756, + "step": 920 + }, + { + "epoch": 0.1893308664816528, + "grad_norm": 0.26617303490638733, + "learning_rate": 8.974165375417155e-05, + "loss": 0.7841, + "step": 921 + }, + { + "epoch": 0.18953643745503135, + "grad_norm": 0.2054712474346161, + "learning_rate": 8.974058392012375e-05, + "loss": 0.575, + "step": 922 + }, + { + "epoch": 0.1897420084284099, + "grad_norm": 0.27742794156074524, + "learning_rate": 8.973951188191652e-05, + "loss": 0.7585, + "step": 923 + }, + { + "epoch": 0.18994757940178847, + "grad_norm": 0.1530211716890335, + "learning_rate": 8.973843763960267e-05, + "loss": 0.5826, + "step": 924 + }, + { + "epoch": 0.19015315037516703, + "grad_norm": 0.2896377444267273, + "learning_rate": 8.973736119323508e-05, + "loss": 0.7741, + "step": 925 + }, + { + "epoch": 0.1903587213485456, + "grad_norm": 0.16760393977165222, + "learning_rate": 8.973628254286681e-05, + "loss": 0.5857, + "step": 926 + }, + { + "epoch": 0.19056429232192415, + "grad_norm": 0.26283350586891174, + "learning_rate": 8.9735201688551e-05, + "loss": 0.7505, + "step": 927 + }, + { + "epoch": 0.1907698632953027, + "grad_norm": 0.24747183918952942, + "learning_rate": 8.97341186303409e-05, + "loss": 0.7227, + "step": 928 + }, + { + "epoch": 0.19097543426868127, + "grad_norm": 0.27605384588241577, + "learning_rate": 8.973303336828985e-05, + "loss": 0.7628, + "step": 929 + }, + { + "epoch": 0.19118100524205983, + "grad_norm": 0.2601989507675171, + "learning_rate": 8.973194590245132e-05, + "loss": 0.7559, + "step": 930 + }, + { + "epoch": 0.1913865762154384, + "grad_norm": 0.18584440648555756, + "learning_rate": 8.973085623287892e-05, + "loss": 0.5884, + "step": 931 + }, + { + "epoch": 0.19159214718881695, + "grad_norm": 0.17022742331027985, + "learning_rate": 8.972976435962629e-05, + "loss": 0.5944, + "step": 932 + }, + { + "epoch": 0.19179771816219549, + "grad_norm": 0.34249716997146606, + "learning_rate": 8.972867028274723e-05, + "loss": 0.767, + "step": 933 + }, + { + "epoch": 0.19200328913557405, + "grad_norm": 0.26959505677223206, + "learning_rate": 8.972757400229565e-05, + "loss": 0.7707, + "step": 934 + }, + { + "epoch": 0.1922088601089526, + "grad_norm": 0.2650569975376129, + "learning_rate": 8.972647551832556e-05, + "loss": 0.7181, + "step": 935 + }, + { + "epoch": 0.19241443108233117, + "grad_norm": 0.20763760805130005, + "learning_rate": 8.972537483089107e-05, + "loss": 0.5857, + "step": 936 + }, + { + "epoch": 0.19262000205570973, + "grad_norm": 0.1736496388912201, + "learning_rate": 8.97242719400464e-05, + "loss": 0.5943, + "step": 937 + }, + { + "epoch": 0.1928255730290883, + "grad_norm": 0.3711773157119751, + "learning_rate": 8.97231668458459e-05, + "loss": 0.7748, + "step": 938 + }, + { + "epoch": 0.19303114400246685, + "grad_norm": 0.2923683226108551, + "learning_rate": 8.9722059548344e-05, + "loss": 0.756, + "step": 939 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.2692539393901825, + "learning_rate": 8.972095004759527e-05, + "loss": 0.7795, + "step": 940 + }, + { + "epoch": 0.19344228594922397, + "grad_norm": 0.2933458387851715, + "learning_rate": 8.971983834365434e-05, + "loss": 0.7411, + "step": 941 + }, + { + "epoch": 0.19364785692260253, + "grad_norm": 0.25706520676612854, + "learning_rate": 8.9718724436576e-05, + "loss": 0.6123, + "step": 942 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 0.2033473253250122, + "learning_rate": 8.971760832641513e-05, + "loss": 0.5855, + "step": 943 + }, + { + "epoch": 0.19405899886935965, + "grad_norm": 0.3263876140117645, + "learning_rate": 8.97164900132267e-05, + "loss": 0.7315, + "step": 944 + }, + { + "epoch": 0.1942645698427382, + "grad_norm": 0.3143511414527893, + "learning_rate": 8.971536949706582e-05, + "loss": 0.761, + "step": 945 + }, + { + "epoch": 0.19447014081611677, + "grad_norm": 0.26773688197135925, + "learning_rate": 8.971424677798768e-05, + "loss": 0.7457, + "step": 946 + }, + { + "epoch": 0.19467571178949533, + "grad_norm": 0.29603666067123413, + "learning_rate": 8.971312185604759e-05, + "loss": 0.7635, + "step": 947 + }, + { + "epoch": 0.1948812827628739, + "grad_norm": 0.27570641040802, + "learning_rate": 8.971199473130097e-05, + "loss": 0.7524, + "step": 948 + }, + { + "epoch": 0.19508685373625245, + "grad_norm": 0.2680298984050751, + "learning_rate": 8.971086540380337e-05, + "loss": 0.723, + "step": 949 + }, + { + "epoch": 0.195292424709631, + "grad_norm": 0.2859373390674591, + "learning_rate": 8.970973387361039e-05, + "loss": 0.7422, + "step": 950 + }, + { + "epoch": 0.19549799568300955, + "grad_norm": 0.28261512517929077, + "learning_rate": 8.97086001407778e-05, + "loss": 0.7666, + "step": 951 + }, + { + "epoch": 0.1957035666563881, + "grad_norm": 0.2570000886917114, + "learning_rate": 8.970746420536146e-05, + "loss": 0.7278, + "step": 952 + }, + { + "epoch": 0.19590913762976667, + "grad_norm": 0.25783413648605347, + "learning_rate": 8.97063260674173e-05, + "loss": 0.7684, + "step": 953 + }, + { + "epoch": 0.19611470860314523, + "grad_norm": 0.36918801069259644, + "learning_rate": 8.970518572700143e-05, + "loss": 0.6265, + "step": 954 + }, + { + "epoch": 0.1963202795765238, + "grad_norm": 0.32823050022125244, + "learning_rate": 8.970404318417e-05, + "loss": 0.7552, + "step": 955 + }, + { + "epoch": 0.19652585054990235, + "grad_norm": 0.31253358721733093, + "learning_rate": 8.970289843897933e-05, + "loss": 0.7588, + "step": 956 + }, + { + "epoch": 0.1967314215232809, + "grad_norm": 0.25706982612609863, + "learning_rate": 8.970175149148577e-05, + "loss": 0.7432, + "step": 957 + }, + { + "epoch": 0.19693699249665947, + "grad_norm": 0.2800324559211731, + "learning_rate": 8.970060234174586e-05, + "loss": 0.7389, + "step": 958 + }, + { + "epoch": 0.19714256347003803, + "grad_norm": 0.29499801993370056, + "learning_rate": 8.969945098981621e-05, + "loss": 0.7663, + "step": 959 + }, + { + "epoch": 0.1973481344434166, + "grad_norm": 0.2643605172634125, + "learning_rate": 8.969829743575351e-05, + "loss": 0.7446, + "step": 960 + }, + { + "epoch": 0.19755370541679515, + "grad_norm": 0.2712821662425995, + "learning_rate": 8.969714167961463e-05, + "loss": 0.7657, + "step": 961 + }, + { + "epoch": 0.1977592763901737, + "grad_norm": 0.31495070457458496, + "learning_rate": 8.96959837214565e-05, + "loss": 0.6014, + "step": 962 + }, + { + "epoch": 0.19796484736355227, + "grad_norm": 0.2913089394569397, + "learning_rate": 8.969482356133615e-05, + "loss": 0.7527, + "step": 963 + }, + { + "epoch": 0.19817041833693083, + "grad_norm": 0.172258198261261, + "learning_rate": 8.969366119931075e-05, + "loss": 0.6048, + "step": 964 + }, + { + "epoch": 0.1983759893103094, + "grad_norm": 0.29237228631973267, + "learning_rate": 8.969249663543756e-05, + "loss": 0.7519, + "step": 965 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 0.27603963017463684, + "learning_rate": 8.969132986977396e-05, + "loss": 0.731, + "step": 966 + }, + { + "epoch": 0.1987871312570665, + "grad_norm": 0.2580612003803253, + "learning_rate": 8.969016090237742e-05, + "loss": 0.723, + "step": 967 + }, + { + "epoch": 0.19899270223044507, + "grad_norm": 0.27025994658470154, + "learning_rate": 8.968898973330552e-05, + "loss": 0.7453, + "step": 968 + }, + { + "epoch": 0.19919827320382363, + "grad_norm": 0.27253222465515137, + "learning_rate": 8.968781636261599e-05, + "loss": 0.7455, + "step": 969 + }, + { + "epoch": 0.19940384417720217, + "grad_norm": 0.25386548042297363, + "learning_rate": 8.96866407903666e-05, + "loss": 0.753, + "step": 970 + }, + { + "epoch": 0.19960941515058073, + "grad_norm": 0.2759700417518616, + "learning_rate": 8.96854630166153e-05, + "loss": 0.5741, + "step": 971 + }, + { + "epoch": 0.1998149861239593, + "grad_norm": 0.28211307525634766, + "learning_rate": 8.96842830414201e-05, + "loss": 0.7339, + "step": 972 + }, + { + "epoch": 0.20002055709733785, + "grad_norm": 0.27216947078704834, + "learning_rate": 8.96831008648391e-05, + "loss": 0.7405, + "step": 973 + }, + { + "epoch": 0.2002261280707164, + "grad_norm": 0.24992568790912628, + "learning_rate": 8.96819164869306e-05, + "loss": 0.7186, + "step": 974 + }, + { + "epoch": 0.20043169904409497, + "grad_norm": 0.181453675031662, + "learning_rate": 8.96807299077529e-05, + "loss": 0.5892, + "step": 975 + }, + { + "epoch": 0.20063727001747353, + "grad_norm": 0.2908715307712555, + "learning_rate": 8.967954112736448e-05, + "loss": 0.7462, + "step": 976 + }, + { + "epoch": 0.2008428409908521, + "grad_norm": 0.2695624828338623, + "learning_rate": 8.96783501458239e-05, + "loss": 0.7669, + "step": 977 + }, + { + "epoch": 0.20104841196423065, + "grad_norm": 0.2560322880744934, + "learning_rate": 8.967715696318983e-05, + "loss": 0.7682, + "step": 978 + }, + { + "epoch": 0.2012539829376092, + "grad_norm": 0.25563281774520874, + "learning_rate": 8.967596157952106e-05, + "loss": 0.7246, + "step": 979 + }, + { + "epoch": 0.20145955391098777, + "grad_norm": 0.24063649773597717, + "learning_rate": 8.967476399487649e-05, + "loss": 0.7328, + "step": 980 + }, + { + "epoch": 0.20166512488436633, + "grad_norm": 0.2495402842760086, + "learning_rate": 8.967356420931509e-05, + "loss": 0.722, + "step": 981 + }, + { + "epoch": 0.2018706958577449, + "grad_norm": 0.25746145844459534, + "learning_rate": 8.9672362222896e-05, + "loss": 0.7357, + "step": 982 + }, + { + "epoch": 0.20207626683112345, + "grad_norm": 0.2592317461967468, + "learning_rate": 8.96711580356784e-05, + "loss": 0.746, + "step": 983 + }, + { + "epoch": 0.202281837804502, + "grad_norm": 0.25513893365859985, + "learning_rate": 8.966995164772166e-05, + "loss": 0.7486, + "step": 984 + }, + { + "epoch": 0.20248740877788057, + "grad_norm": 0.40953561663627625, + "learning_rate": 8.966874305908516e-05, + "loss": 0.5957, + "step": 985 + }, + { + "epoch": 0.20269297975125913, + "grad_norm": 0.255729079246521, + "learning_rate": 8.96675322698285e-05, + "loss": 0.748, + "step": 986 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 0.26324090361595154, + "learning_rate": 8.966631928001129e-05, + "loss": 0.7387, + "step": 987 + }, + { + "epoch": 0.20310412169801623, + "grad_norm": 0.24772094190120697, + "learning_rate": 8.966510408969329e-05, + "loss": 0.7252, + "step": 988 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 0.27024003863334656, + "learning_rate": 8.96638866989344e-05, + "loss": 0.7716, + "step": 989 + }, + { + "epoch": 0.20351526364477335, + "grad_norm": 0.2622278928756714, + "learning_rate": 8.966266710779454e-05, + "loss": 0.7678, + "step": 990 + }, + { + "epoch": 0.2037208346181519, + "grad_norm": 0.252861350774765, + "learning_rate": 8.966144531633384e-05, + "loss": 0.7769, + "step": 991 + }, + { + "epoch": 0.20392640559153047, + "grad_norm": 0.3397926390171051, + "learning_rate": 8.966022132461248e-05, + "loss": 0.742, + "step": 992 + }, + { + "epoch": 0.20413197656490903, + "grad_norm": 0.2550930380821228, + "learning_rate": 8.965899513269076e-05, + "loss": 0.7205, + "step": 993 + }, + { + "epoch": 0.2043375475382876, + "grad_norm": 0.2502458393573761, + "learning_rate": 8.965776674062906e-05, + "loss": 0.7368, + "step": 994 + }, + { + "epoch": 0.20454311851166615, + "grad_norm": 0.25033867359161377, + "learning_rate": 8.965653614848793e-05, + "loss": 0.758, + "step": 995 + }, + { + "epoch": 0.2047486894850447, + "grad_norm": 0.24429009854793549, + "learning_rate": 8.965530335632801e-05, + "loss": 0.7466, + "step": 996 + }, + { + "epoch": 0.20495426045842327, + "grad_norm": 0.24865779280662537, + "learning_rate": 8.965406836421e-05, + "loss": 0.7741, + "step": 997 + }, + { + "epoch": 0.20515983143180183, + "grad_norm": 0.2573890686035156, + "learning_rate": 8.965283117219475e-05, + "loss": 0.7486, + "step": 998 + }, + { + "epoch": 0.2053654024051804, + "grad_norm": 0.2486078292131424, + "learning_rate": 8.965159178034322e-05, + "loss": 0.7277, + "step": 999 + }, + { + "epoch": 0.20557097337855895, + "grad_norm": 0.2717074155807495, + "learning_rate": 8.965035018871647e-05, + "loss": 0.6086, + "step": 1000 + }, + { + "epoch": 0.2057765443519375, + "grad_norm": 0.2679359018802643, + "learning_rate": 8.964910639737566e-05, + "loss": 0.7664, + "step": 1001 + }, + { + "epoch": 0.20598211532531607, + "grad_norm": 0.16115225851535797, + "learning_rate": 8.964786040638205e-05, + "loss": 0.5977, + "step": 1002 + }, + { + "epoch": 0.20618768629869463, + "grad_norm": 0.2807529866695404, + "learning_rate": 8.964661221579706e-05, + "loss": 0.7348, + "step": 1003 + }, + { + "epoch": 0.2063932572720732, + "grad_norm": 0.25754019618034363, + "learning_rate": 8.964536182568215e-05, + "loss": 0.7283, + "step": 1004 + }, + { + "epoch": 0.20659882824545175, + "grad_norm": 0.2526054382324219, + "learning_rate": 8.964410923609894e-05, + "loss": 0.7144, + "step": 1005 + }, + { + "epoch": 0.20680439921883031, + "grad_norm": 0.2148108184337616, + "learning_rate": 8.964285444710914e-05, + "loss": 0.5871, + "step": 1006 + }, + { + "epoch": 0.20700997019220885, + "grad_norm": 0.18252213299274445, + "learning_rate": 8.964159745877456e-05, + "loss": 0.5956, + "step": 1007 + }, + { + "epoch": 0.2072155411655874, + "grad_norm": 0.3090805113315582, + "learning_rate": 8.964033827115713e-05, + "loss": 0.7496, + "step": 1008 + }, + { + "epoch": 0.20742111213896597, + "grad_norm": 0.2703743278980255, + "learning_rate": 8.963907688431887e-05, + "loss": 0.7492, + "step": 1009 + }, + { + "epoch": 0.20762668311234453, + "grad_norm": 0.26899415254592896, + "learning_rate": 8.963781329832194e-05, + "loss": 0.7468, + "step": 1010 + }, + { + "epoch": 0.2078322540857231, + "grad_norm": 0.2887749969959259, + "learning_rate": 8.963654751322858e-05, + "loss": 0.765, + "step": 1011 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 0.2602989077568054, + "learning_rate": 8.963527952910116e-05, + "loss": 0.7749, + "step": 1012 + }, + { + "epoch": 0.2082433960324802, + "grad_norm": 0.22857093811035156, + "learning_rate": 8.963400934600215e-05, + "loss": 0.5989, + "step": 1013 + }, + { + "epoch": 0.20844896700585877, + "grad_norm": 0.29049423336982727, + "learning_rate": 8.963273696399411e-05, + "loss": 0.7406, + "step": 1014 + }, + { + "epoch": 0.20865453797923733, + "grad_norm": 0.27531930804252625, + "learning_rate": 8.963146238313975e-05, + "loss": 0.7575, + "step": 1015 + }, + { + "epoch": 0.2088601089526159, + "grad_norm": 0.2683233320713043, + "learning_rate": 8.963018560350181e-05, + "loss": 0.7572, + "step": 1016 + }, + { + "epoch": 0.20906567992599445, + "grad_norm": 0.26720771193504333, + "learning_rate": 8.962890662514325e-05, + "loss": 0.7537, + "step": 1017 + }, + { + "epoch": 0.209271250899373, + "grad_norm": 0.26178407669067383, + "learning_rate": 8.962762544812705e-05, + "loss": 0.7226, + "step": 1018 + }, + { + "epoch": 0.20947682187275157, + "grad_norm": 0.25852060317993164, + "learning_rate": 8.962634207251633e-05, + "loss": 0.7401, + "step": 1019 + }, + { + "epoch": 0.20968239284613013, + "grad_norm": 0.25970616936683655, + "learning_rate": 8.962505649837432e-05, + "loss": 0.7277, + "step": 1020 + }, + { + "epoch": 0.2098879638195087, + "grad_norm": 0.2682318687438965, + "learning_rate": 8.962376872576436e-05, + "loss": 0.7638, + "step": 1021 + }, + { + "epoch": 0.21009353479288725, + "grad_norm": 0.24570779502391815, + "learning_rate": 8.962247875474989e-05, + "loss": 0.7256, + "step": 1022 + }, + { + "epoch": 0.21029910576626581, + "grad_norm": 0.2523082196712494, + "learning_rate": 8.962118658539446e-05, + "loss": 0.7288, + "step": 1023 + }, + { + "epoch": 0.21050467673964438, + "grad_norm": 0.24562524259090424, + "learning_rate": 8.96198922177617e-05, + "loss": 0.7292, + "step": 1024 + }, + { + "epoch": 0.2107102477130229, + "grad_norm": 0.23262366652488708, + "learning_rate": 8.961859565191543e-05, + "loss": 0.7401, + "step": 1025 + }, + { + "epoch": 0.21091581868640147, + "grad_norm": 0.21075837314128876, + "learning_rate": 8.961729688791949e-05, + "loss": 0.5854, + "step": 1026 + }, + { + "epoch": 0.21112138965978003, + "grad_norm": 0.2659233808517456, + "learning_rate": 8.961599592583785e-05, + "loss": 0.742, + "step": 1027 + }, + { + "epoch": 0.2113269606331586, + "grad_norm": 0.2612632215023041, + "learning_rate": 8.961469276573466e-05, + "loss": 0.7212, + "step": 1028 + }, + { + "epoch": 0.21153253160653715, + "grad_norm": 0.24459590017795563, + "learning_rate": 8.961338740767407e-05, + "loss": 0.7445, + "step": 1029 + }, + { + "epoch": 0.2117381025799157, + "grad_norm": 0.2455456703901291, + "learning_rate": 8.96120798517204e-05, + "loss": 0.7469, + "step": 1030 + }, + { + "epoch": 0.21194367355329427, + "grad_norm": 0.25947311520576477, + "learning_rate": 8.961077009793809e-05, + "loss": 0.7578, + "step": 1031 + }, + { + "epoch": 0.21214924452667283, + "grad_norm": 0.26415055990219116, + "learning_rate": 8.960945814639162e-05, + "loss": 0.7453, + "step": 1032 + }, + { + "epoch": 0.2123548155000514, + "grad_norm": 0.2478688508272171, + "learning_rate": 8.960814399714568e-05, + "loss": 0.7246, + "step": 1033 + }, + { + "epoch": 0.21256038647342995, + "grad_norm": 0.21988952159881592, + "learning_rate": 8.960682765026497e-05, + "loss": 0.6062, + "step": 1034 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 0.16625165939331055, + "learning_rate": 8.960550910581436e-05, + "loss": 0.5704, + "step": 1035 + }, + { + "epoch": 0.21297152842018707, + "grad_norm": 0.2860580086708069, + "learning_rate": 8.960418836385879e-05, + "loss": 0.747, + "step": 1036 + }, + { + "epoch": 0.21317709939356563, + "grad_norm": 0.2644577920436859, + "learning_rate": 8.960286542446335e-05, + "loss": 0.7268, + "step": 1037 + }, + { + "epoch": 0.2133826703669442, + "grad_norm": 0.2598789930343628, + "learning_rate": 8.960154028769319e-05, + "loss": 0.7645, + "step": 1038 + }, + { + "epoch": 0.21358824134032275, + "grad_norm": 0.2992006540298462, + "learning_rate": 8.960021295361363e-05, + "loss": 0.5999, + "step": 1039 + }, + { + "epoch": 0.21379381231370131, + "grad_norm": 0.27868691086769104, + "learning_rate": 8.959888342229001e-05, + "loss": 0.7472, + "step": 1040 + }, + { + "epoch": 0.21399938328707988, + "grad_norm": 0.2707647979259491, + "learning_rate": 8.959755169378788e-05, + "loss": 0.7158, + "step": 1041 + }, + { + "epoch": 0.21420495426045844, + "grad_norm": 0.2671177089214325, + "learning_rate": 8.959621776817281e-05, + "loss": 0.7573, + "step": 1042 + }, + { + "epoch": 0.214410525233837, + "grad_norm": 0.24762409925460815, + "learning_rate": 8.959488164551055e-05, + "loss": 0.7353, + "step": 1043 + }, + { + "epoch": 0.21461609620721553, + "grad_norm": 0.24137498438358307, + "learning_rate": 8.959354332586689e-05, + "loss": 0.7476, + "step": 1044 + }, + { + "epoch": 0.2148216671805941, + "grad_norm": 0.2598249614238739, + "learning_rate": 8.959220280930779e-05, + "loss": 0.7397, + "step": 1045 + }, + { + "epoch": 0.21502723815397265, + "grad_norm": 0.2500339448451996, + "learning_rate": 8.959086009589929e-05, + "loss": 0.7525, + "step": 1046 + }, + { + "epoch": 0.2152328091273512, + "grad_norm": 0.25262802839279175, + "learning_rate": 8.958951518570753e-05, + "loss": 0.759, + "step": 1047 + }, + { + "epoch": 0.21543838010072977, + "grad_norm": 0.2515556216239929, + "learning_rate": 8.958816807879875e-05, + "loss": 0.7321, + "step": 1048 + }, + { + "epoch": 0.21564395107410833, + "grad_norm": 0.24297581613063812, + "learning_rate": 8.958681877523935e-05, + "loss": 0.7444, + "step": 1049 + }, + { + "epoch": 0.2158495220474869, + "grad_norm": 0.2649231255054474, + "learning_rate": 8.958546727509578e-05, + "loss": 0.7458, + "step": 1050 + }, + { + "epoch": 0.21605509302086545, + "grad_norm": 0.2701459527015686, + "learning_rate": 8.958411357843461e-05, + "loss": 0.595, + "step": 1051 + }, + { + "epoch": 0.216260663994244, + "grad_norm": 0.2653101682662964, + "learning_rate": 8.958275768532258e-05, + "loss": 0.7544, + "step": 1052 + }, + { + "epoch": 0.21646623496762257, + "grad_norm": 0.2633649408817291, + "learning_rate": 8.958139959582645e-05, + "loss": 0.7403, + "step": 1053 + }, + { + "epoch": 0.21667180594100113, + "grad_norm": 0.25117960572242737, + "learning_rate": 8.958003931001312e-05, + "loss": 0.7427, + "step": 1054 + }, + { + "epoch": 0.2168773769143797, + "grad_norm": 0.24553567171096802, + "learning_rate": 8.957867682794963e-05, + "loss": 0.7264, + "step": 1055 + }, + { + "epoch": 0.21708294788775825, + "grad_norm": 0.23510022461414337, + "learning_rate": 8.95773121497031e-05, + "loss": 0.7413, + "step": 1056 + }, + { + "epoch": 0.21728851886113681, + "grad_norm": 0.2532014846801758, + "learning_rate": 8.957594527534075e-05, + "loss": 0.735, + "step": 1057 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 0.25079968571662903, + "learning_rate": 8.957457620492993e-05, + "loss": 0.7478, + "step": 1058 + }, + { + "epoch": 0.21769966080789394, + "grad_norm": 0.23813451826572418, + "learning_rate": 8.957320493853805e-05, + "loss": 0.7238, + "step": 1059 + }, + { + "epoch": 0.2179052317812725, + "grad_norm": 0.24865779280662537, + "learning_rate": 8.957183147623273e-05, + "loss": 0.7369, + "step": 1060 + }, + { + "epoch": 0.21811080275465106, + "grad_norm": 0.24684272706508636, + "learning_rate": 8.957045581808159e-05, + "loss": 0.7008, + "step": 1061 + }, + { + "epoch": 0.2183163737280296, + "grad_norm": 0.24000217020511627, + "learning_rate": 8.956907796415241e-05, + "loss": 0.5949, + "step": 1062 + }, + { + "epoch": 0.21852194470140815, + "grad_norm": 0.266008198261261, + "learning_rate": 8.956769791451309e-05, + "loss": 0.7161, + "step": 1063 + }, + { + "epoch": 0.2187275156747867, + "grad_norm": 0.14858698844909668, + "learning_rate": 8.956631566923159e-05, + "loss": 0.5948, + "step": 1064 + }, + { + "epoch": 0.21893308664816527, + "grad_norm": 0.2638164162635803, + "learning_rate": 8.956493122837601e-05, + "loss": 0.7347, + "step": 1065 + }, + { + "epoch": 0.21913865762154383, + "grad_norm": 0.2497703582048416, + "learning_rate": 8.956354459201459e-05, + "loss": 0.7458, + "step": 1066 + }, + { + "epoch": 0.2193442285949224, + "grad_norm": 0.22499538958072662, + "learning_rate": 8.95621557602156e-05, + "loss": 0.5748, + "step": 1067 + }, + { + "epoch": 0.21954979956830095, + "grad_norm": 0.2625332176685333, + "learning_rate": 8.956076473304748e-05, + "loss": 0.748, + "step": 1068 + }, + { + "epoch": 0.2197553705416795, + "grad_norm": 0.2666896879673004, + "learning_rate": 8.955937151057876e-05, + "loss": 0.7547, + "step": 1069 + }, + { + "epoch": 0.21996094151505807, + "grad_norm": 0.25993168354034424, + "learning_rate": 8.955797609287807e-05, + "loss": 0.7593, + "step": 1070 + }, + { + "epoch": 0.22016651248843663, + "grad_norm": 0.248934805393219, + "learning_rate": 8.955657848001417e-05, + "loss": 0.753, + "step": 1071 + }, + { + "epoch": 0.2203720834618152, + "grad_norm": 0.24592526257038116, + "learning_rate": 8.95551786720559e-05, + "loss": 0.7335, + "step": 1072 + }, + { + "epoch": 0.22057765443519375, + "grad_norm": 0.2522546052932739, + "learning_rate": 8.955377666907224e-05, + "loss": 0.7287, + "step": 1073 + }, + { + "epoch": 0.22078322540857231, + "grad_norm": 0.24097007513046265, + "learning_rate": 8.955237247113222e-05, + "loss": 0.7178, + "step": 1074 + }, + { + "epoch": 0.22098879638195087, + "grad_norm": 0.26036760210990906, + "learning_rate": 8.955096607830506e-05, + "loss": 0.7528, + "step": 1075 + }, + { + "epoch": 0.22119436735532944, + "grad_norm": 0.2414807826280594, + "learning_rate": 8.954955749066005e-05, + "loss": 0.7121, + "step": 1076 + }, + { + "epoch": 0.221399938328708, + "grad_norm": 0.2436942607164383, + "learning_rate": 8.954814670826654e-05, + "loss": 0.744, + "step": 1077 + }, + { + "epoch": 0.22160550930208656, + "grad_norm": 0.2534603774547577, + "learning_rate": 8.954673373119407e-05, + "loss": 0.7627, + "step": 1078 + }, + { + "epoch": 0.22181108027546512, + "grad_norm": 0.21081526577472687, + "learning_rate": 8.954531855951224e-05, + "loss": 0.5921, + "step": 1079 + }, + { + "epoch": 0.22201665124884365, + "grad_norm": 0.26541346311569214, + "learning_rate": 8.954390119329077e-05, + "loss": 0.7452, + "step": 1080 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.24794277548789978, + "learning_rate": 8.954248163259949e-05, + "loss": 0.7196, + "step": 1081 + }, + { + "epoch": 0.22242779319560077, + "grad_norm": 0.25889837741851807, + "learning_rate": 8.954105987750832e-05, + "loss": 0.7674, + "step": 1082 + }, + { + "epoch": 0.22263336416897933, + "grad_norm": 0.24961018562316895, + "learning_rate": 8.953963592808733e-05, + "loss": 0.7232, + "step": 1083 + }, + { + "epoch": 0.2228389351423579, + "grad_norm": 0.2539832293987274, + "learning_rate": 8.953820978440664e-05, + "loss": 0.7559, + "step": 1084 + }, + { + "epoch": 0.22304450611573645, + "grad_norm": 0.23905551433563232, + "learning_rate": 8.953678144653653e-05, + "loss": 0.7211, + "step": 1085 + }, + { + "epoch": 0.223250077089115, + "grad_norm": 0.24047812819480896, + "learning_rate": 8.953535091454735e-05, + "loss": 0.7367, + "step": 1086 + }, + { + "epoch": 0.22345564806249357, + "grad_norm": 0.25583919882774353, + "learning_rate": 8.953391818850961e-05, + "loss": 0.7573, + "step": 1087 + }, + { + "epoch": 0.22366121903587213, + "grad_norm": 0.20065194368362427, + "learning_rate": 8.953248326849386e-05, + "loss": 0.5804, + "step": 1088 + }, + { + "epoch": 0.2238667900092507, + "grad_norm": 0.18610531091690063, + "learning_rate": 8.953104615457081e-05, + "loss": 0.5888, + "step": 1089 + }, + { + "epoch": 0.22407236098262925, + "grad_norm": 0.15629194676876068, + "learning_rate": 8.952960684681125e-05, + "loss": 0.5884, + "step": 1090 + }, + { + "epoch": 0.22427793195600781, + "grad_norm": 0.3306218683719635, + "learning_rate": 8.952816534528609e-05, + "loss": 0.7454, + "step": 1091 + }, + { + "epoch": 0.22448350292938637, + "grad_norm": 0.26848849654197693, + "learning_rate": 8.952672165006635e-05, + "loss": 0.7336, + "step": 1092 + }, + { + "epoch": 0.22468907390276494, + "grad_norm": 0.20548087358474731, + "learning_rate": 8.952527576122315e-05, + "loss": 0.5992, + "step": 1093 + }, + { + "epoch": 0.2248946448761435, + "grad_norm": 0.18607185781002045, + "learning_rate": 8.952382767882773e-05, + "loss": 0.5666, + "step": 1094 + }, + { + "epoch": 0.22510021584952206, + "grad_norm": 0.16436809301376343, + "learning_rate": 8.952237740295141e-05, + "loss": 0.595, + "step": 1095 + }, + { + "epoch": 0.22530578682290062, + "grad_norm": 0.46899160742759705, + "learning_rate": 8.952092493366567e-05, + "loss": 0.7777, + "step": 1096 + }, + { + "epoch": 0.22551135779627918, + "grad_norm": 0.2985895276069641, + "learning_rate": 8.951947027104205e-05, + "loss": 0.7495, + "step": 1097 + }, + { + "epoch": 0.22571692876965774, + "grad_norm": 0.319159597158432, + "learning_rate": 8.95180134151522e-05, + "loss": 0.7469, + "step": 1098 + }, + { + "epoch": 0.22592249974303627, + "grad_norm": 0.324747771024704, + "learning_rate": 8.95165543660679e-05, + "loss": 0.7239, + "step": 1099 + }, + { + "epoch": 0.22612807071641483, + "grad_norm": 0.5259039402008057, + "learning_rate": 8.951509312386105e-05, + "loss": 0.6189, + "step": 1100 + }, + { + "epoch": 0.2263336416897934, + "grad_norm": 0.2236146181821823, + "learning_rate": 8.951362968860361e-05, + "loss": 0.5996, + "step": 1101 + }, + { + "epoch": 0.22653921266317195, + "grad_norm": 0.4835422933101654, + "learning_rate": 8.95121640603677e-05, + "loss": 0.7662, + "step": 1102 + }, + { + "epoch": 0.2267447836365505, + "grad_norm": 0.371629923582077, + "learning_rate": 8.951069623922552e-05, + "loss": 0.7393, + "step": 1103 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 0.2967519164085388, + "learning_rate": 8.950922622524938e-05, + "loss": 0.7547, + "step": 1104 + }, + { + "epoch": 0.22715592558330763, + "grad_norm": 0.3473425507545471, + "learning_rate": 8.950775401851169e-05, + "loss": 0.7603, + "step": 1105 + }, + { + "epoch": 0.2273614965566862, + "grad_norm": 0.3515138030052185, + "learning_rate": 8.950627961908499e-05, + "loss": 0.729, + "step": 1106 + }, + { + "epoch": 0.22756706753006475, + "grad_norm": 0.3210054039955139, + "learning_rate": 8.950480302704193e-05, + "loss": 0.7565, + "step": 1107 + }, + { + "epoch": 0.22777263850344331, + "grad_norm": 0.5195302367210388, + "learning_rate": 8.950332424245522e-05, + "loss": 0.6351, + "step": 1108 + }, + { + "epoch": 0.22797820947682187, + "grad_norm": 0.3467387557029724, + "learning_rate": 8.950184326539775e-05, + "loss": 0.7554, + "step": 1109 + }, + { + "epoch": 0.22818378045020044, + "grad_norm": 0.33716848492622375, + "learning_rate": 8.950036009594245e-05, + "loss": 0.7558, + "step": 1110 + }, + { + "epoch": 0.228389351423579, + "grad_norm": 0.27896901965141296, + "learning_rate": 8.94988747341624e-05, + "loss": 0.7455, + "step": 1111 + }, + { + "epoch": 0.22859492239695756, + "grad_norm": 0.27595579624176025, + "learning_rate": 8.949738718013078e-05, + "loss": 0.7425, + "step": 1112 + }, + { + "epoch": 0.22880049337033612, + "grad_norm": 0.29621824622154236, + "learning_rate": 8.949589743392089e-05, + "loss": 0.7416, + "step": 1113 + }, + { + "epoch": 0.22900606434371468, + "grad_norm": 0.28054726123809814, + "learning_rate": 8.94944054956061e-05, + "loss": 0.7538, + "step": 1114 + }, + { + "epoch": 0.22921163531709324, + "grad_norm": 0.25396206974983215, + "learning_rate": 8.949291136525991e-05, + "loss": 0.7479, + "step": 1115 + }, + { + "epoch": 0.2294172062904718, + "grad_norm": 0.2706109881401062, + "learning_rate": 8.949141504295594e-05, + "loss": 0.7475, + "step": 1116 + }, + { + "epoch": 0.22962277726385033, + "grad_norm": 0.26184260845184326, + "learning_rate": 8.94899165287679e-05, + "loss": 0.7383, + "step": 1117 + }, + { + "epoch": 0.2298283482372289, + "grad_norm": 0.2610413134098053, + "learning_rate": 8.948841582276963e-05, + "loss": 0.7384, + "step": 1118 + }, + { + "epoch": 0.23003391921060745, + "grad_norm": 0.2537980079650879, + "learning_rate": 8.948691292503504e-05, + "loss": 0.7444, + "step": 1119 + }, + { + "epoch": 0.230239490183986, + "grad_norm": 0.2602024972438812, + "learning_rate": 8.948540783563817e-05, + "loss": 0.7306, + "step": 1120 + }, + { + "epoch": 0.23044506115736457, + "grad_norm": 0.3567192256450653, + "learning_rate": 8.94839005546532e-05, + "loss": 0.604, + "step": 1121 + }, + { + "epoch": 0.23065063213074313, + "grad_norm": 0.49138790369033813, + "learning_rate": 8.948239108215437e-05, + "loss": 0.7303, + "step": 1122 + }, + { + "epoch": 0.2308562031041217, + "grad_norm": 0.30943894386291504, + "learning_rate": 8.948087941821603e-05, + "loss": 0.7535, + "step": 1123 + }, + { + "epoch": 0.23106177407750025, + "grad_norm": 0.25115516781806946, + "learning_rate": 8.947936556291267e-05, + "loss": 0.7416, + "step": 1124 + }, + { + "epoch": 0.23126734505087881, + "grad_norm": 0.24797074496746063, + "learning_rate": 8.947784951631886e-05, + "loss": 0.7328, + "step": 1125 + }, + { + "epoch": 0.23147291602425737, + "grad_norm": 0.25195595622062683, + "learning_rate": 8.94763312785093e-05, + "loss": 0.7375, + "step": 1126 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 0.20428021252155304, + "learning_rate": 8.947481084955877e-05, + "loss": 0.61, + "step": 1127 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.27424702048301697, + "learning_rate": 8.947328822954218e-05, + "loss": 0.7512, + "step": 1128 + }, + { + "epoch": 0.23208962894439306, + "grad_norm": 0.26351961493492126, + "learning_rate": 8.947176341853455e-05, + "loss": 0.7584, + "step": 1129 + }, + { + "epoch": 0.23229519991777162, + "grad_norm": 0.25228413939476013, + "learning_rate": 8.947023641661101e-05, + "loss": 0.7629, + "step": 1130 + }, + { + "epoch": 0.23250077089115018, + "grad_norm": 0.24488292634487152, + "learning_rate": 8.946870722384676e-05, + "loss": 0.7501, + "step": 1131 + }, + { + "epoch": 0.23270634186452874, + "grad_norm": 0.2597258388996124, + "learning_rate": 8.946717584031716e-05, + "loss": 0.7408, + "step": 1132 + }, + { + "epoch": 0.2329119128379073, + "grad_norm": 0.25343239307403564, + "learning_rate": 8.946564226609764e-05, + "loss": 0.7186, + "step": 1133 + }, + { + "epoch": 0.23311748381128586, + "grad_norm": 0.24788786470890045, + "learning_rate": 8.946410650126376e-05, + "loss": 0.6838, + "step": 1134 + }, + { + "epoch": 0.23332305478466442, + "grad_norm": 0.18649965524673462, + "learning_rate": 8.946256854589118e-05, + "loss": 0.6325, + "step": 1135 + }, + { + "epoch": 0.23352862575804295, + "grad_norm": 0.26197314262390137, + "learning_rate": 8.946102840005568e-05, + "loss": 0.7428, + "step": 1136 + }, + { + "epoch": 0.2337341967314215, + "grad_norm": 0.25486642122268677, + "learning_rate": 8.94594860638331e-05, + "loss": 0.7505, + "step": 1137 + }, + { + "epoch": 0.23393976770480007, + "grad_norm": 0.2388404756784439, + "learning_rate": 8.945794153729945e-05, + "loss": 0.7296, + "step": 1138 + }, + { + "epoch": 0.23414533867817863, + "grad_norm": 0.2506440579891205, + "learning_rate": 8.945639482053081e-05, + "loss": 0.7501, + "step": 1139 + }, + { + "epoch": 0.2343509096515572, + "grad_norm": 0.2521236538887024, + "learning_rate": 8.94548459136034e-05, + "loss": 0.7488, + "step": 1140 + }, + { + "epoch": 0.23455648062493575, + "grad_norm": 0.25158312916755676, + "learning_rate": 8.94532948165935e-05, + "loss": 0.7274, + "step": 1141 + }, + { + "epoch": 0.23476205159831431, + "grad_norm": 0.23634850978851318, + "learning_rate": 8.945174152957755e-05, + "loss": 0.7306, + "step": 1142 + }, + { + "epoch": 0.23496762257169287, + "grad_norm": 0.1795545369386673, + "learning_rate": 8.945018605263205e-05, + "loss": 0.5908, + "step": 1143 + }, + { + "epoch": 0.23517319354507144, + "grad_norm": 0.26744595170021057, + "learning_rate": 8.944862838583364e-05, + "loss": 0.747, + "step": 1144 + }, + { + "epoch": 0.23537876451845, + "grad_norm": 0.23531249165534973, + "learning_rate": 8.944706852925908e-05, + "loss": 0.7097, + "step": 1145 + }, + { + "epoch": 0.23558433549182856, + "grad_norm": 0.2423231452703476, + "learning_rate": 8.944550648298519e-05, + "loss": 0.7536, + "step": 1146 + }, + { + "epoch": 0.23578990646520712, + "grad_norm": 0.24406969547271729, + "learning_rate": 8.944394224708892e-05, + "loss": 0.7459, + "step": 1147 + }, + { + "epoch": 0.23599547743858568, + "grad_norm": 0.2516055405139923, + "learning_rate": 8.944237582164736e-05, + "loss": 0.748, + "step": 1148 + }, + { + "epoch": 0.23620104841196424, + "grad_norm": 0.23662374913692474, + "learning_rate": 8.944080720673766e-05, + "loss": 0.7272, + "step": 1149 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 0.25914058089256287, + "learning_rate": 8.943923640243712e-05, + "loss": 0.7286, + "step": 1150 + }, + { + "epoch": 0.23661219035872136, + "grad_norm": 0.16088080406188965, + "learning_rate": 8.943766340882309e-05, + "loss": 0.5913, + "step": 1151 + }, + { + "epoch": 0.23681776133209992, + "grad_norm": 0.15930064022541046, + "learning_rate": 8.943608822597309e-05, + "loss": 0.5927, + "step": 1152 + }, + { + "epoch": 0.23702333230547848, + "grad_norm": 0.2877768576145172, + "learning_rate": 8.943451085396473e-05, + "loss": 0.7462, + "step": 1153 + }, + { + "epoch": 0.237228903278857, + "grad_norm": 0.2618594169616699, + "learning_rate": 8.94329312928757e-05, + "loss": 0.7506, + "step": 1154 + }, + { + "epoch": 0.23743447425223557, + "grad_norm": 0.24599005281925201, + "learning_rate": 8.943134954278383e-05, + "loss": 0.7052, + "step": 1155 + }, + { + "epoch": 0.23764004522561413, + "grad_norm": 0.2675454318523407, + "learning_rate": 8.942976560376703e-05, + "loss": 0.7396, + "step": 1156 + }, + { + "epoch": 0.2378456161989927, + "grad_norm": 0.2358483374118805, + "learning_rate": 8.942817947590333e-05, + "loss": 0.7131, + "step": 1157 + }, + { + "epoch": 0.23805118717237125, + "grad_norm": 0.24510863423347473, + "learning_rate": 8.94265911592709e-05, + "loss": 0.735, + "step": 1158 + }, + { + "epoch": 0.23825675814574981, + "grad_norm": 0.24396325647830963, + "learning_rate": 8.942500065394798e-05, + "loss": 0.7286, + "step": 1159 + }, + { + "epoch": 0.23846232911912837, + "grad_norm": 0.24989542365074158, + "learning_rate": 8.942340796001291e-05, + "loss": 0.7614, + "step": 1160 + }, + { + "epoch": 0.23866790009250693, + "grad_norm": 0.22477596998214722, + "learning_rate": 8.942181307754416e-05, + "loss": 0.7065, + "step": 1161 + }, + { + "epoch": 0.2388734710658855, + "grad_norm": 0.27181369066238403, + "learning_rate": 8.942021600662033e-05, + "loss": 0.7612, + "step": 1162 + }, + { + "epoch": 0.23907904203926406, + "grad_norm": 0.2516171336174011, + "learning_rate": 8.941861674732005e-05, + "loss": 0.7506, + "step": 1163 + }, + { + "epoch": 0.23928461301264262, + "grad_norm": 0.23005805909633636, + "learning_rate": 8.941701529972216e-05, + "loss": 0.7287, + "step": 1164 + }, + { + "epoch": 0.23949018398602118, + "grad_norm": 0.24049928784370422, + "learning_rate": 8.941541166390549e-05, + "loss": 0.7337, + "step": 1165 + }, + { + "epoch": 0.23969575495939974, + "grad_norm": 0.2356685847043991, + "learning_rate": 8.941380583994912e-05, + "loss": 0.7066, + "step": 1166 + }, + { + "epoch": 0.2399013259327783, + "grad_norm": 0.21500107645988464, + "learning_rate": 8.941219782793211e-05, + "loss": 0.5845, + "step": 1167 + }, + { + "epoch": 0.24010689690615686, + "grad_norm": 0.24245062470436096, + "learning_rate": 8.941058762793371e-05, + "loss": 0.7339, + "step": 1168 + }, + { + "epoch": 0.24031246787953542, + "grad_norm": 0.24114523828029633, + "learning_rate": 8.940897524003322e-05, + "loss": 0.7167, + "step": 1169 + }, + { + "epoch": 0.24051803885291398, + "grad_norm": 0.2341417521238327, + "learning_rate": 8.94073606643101e-05, + "loss": 0.7557, + "step": 1170 + }, + { + "epoch": 0.24072360982629254, + "grad_norm": 0.24253100156784058, + "learning_rate": 8.940574390084385e-05, + "loss": 0.7522, + "step": 1171 + }, + { + "epoch": 0.2409291807996711, + "grad_norm": 0.17679694294929504, + "learning_rate": 8.940412494971418e-05, + "loss": 0.5978, + "step": 1172 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 0.2966403067111969, + "learning_rate": 8.940250381100081e-05, + "loss": 0.7489, + "step": 1173 + }, + { + "epoch": 0.2413403227464282, + "grad_norm": 0.2602713108062744, + "learning_rate": 8.94008804847836e-05, + "loss": 0.737, + "step": 1174 + }, + { + "epoch": 0.24154589371980675, + "grad_norm": 0.24620187282562256, + "learning_rate": 8.939925497114255e-05, + "loss": 0.7612, + "step": 1175 + }, + { + "epoch": 0.24175146469318531, + "grad_norm": 1.3907586336135864, + "learning_rate": 8.939762727015773e-05, + "loss": 0.7424, + "step": 1176 + }, + { + "epoch": 0.24195703566656387, + "grad_norm": 0.25489339232444763, + "learning_rate": 8.939599738190933e-05, + "loss": 0.7292, + "step": 1177 + }, + { + "epoch": 0.24216260663994243, + "grad_norm": 0.24630793929100037, + "learning_rate": 8.939436530647765e-05, + "loss": 0.7201, + "step": 1178 + }, + { + "epoch": 0.242368177613321, + "grad_norm": 0.2420111447572708, + "learning_rate": 8.939273104394307e-05, + "loss": 0.7593, + "step": 1179 + }, + { + "epoch": 0.24257374858669956, + "grad_norm": 0.24446842074394226, + "learning_rate": 8.939109459438614e-05, + "loss": 0.7191, + "step": 1180 + }, + { + "epoch": 0.24277931956007812, + "grad_norm": 0.2652778625488281, + "learning_rate": 8.938945595788746e-05, + "loss": 0.7417, + "step": 1181 + }, + { + "epoch": 0.24298489053345668, + "grad_norm": 0.2472565621137619, + "learning_rate": 8.938781513452775e-05, + "loss": 0.7128, + "step": 1182 + }, + { + "epoch": 0.24319046150683524, + "grad_norm": 0.25744304060935974, + "learning_rate": 8.938617212438786e-05, + "loss": 0.7433, + "step": 1183 + }, + { + "epoch": 0.2433960324802138, + "grad_norm": 0.2481434941291809, + "learning_rate": 8.938452692754874e-05, + "loss": 0.6043, + "step": 1184 + }, + { + "epoch": 0.24360160345359236, + "grad_norm": 0.27799829840660095, + "learning_rate": 8.938287954409143e-05, + "loss": 0.7457, + "step": 1185 + }, + { + "epoch": 0.24380717442697092, + "grad_norm": 0.1753695160150528, + "learning_rate": 8.938122997409709e-05, + "loss": 0.5978, + "step": 1186 + }, + { + "epoch": 0.24401274540034948, + "grad_norm": 0.16633495688438416, + "learning_rate": 8.937957821764698e-05, + "loss": 0.6047, + "step": 1187 + }, + { + "epoch": 0.24421831637372804, + "grad_norm": 0.2707998752593994, + "learning_rate": 8.937792427482249e-05, + "loss": 0.7181, + "step": 1188 + }, + { + "epoch": 0.2444238873471066, + "grad_norm": 0.1617717742919922, + "learning_rate": 8.937626814570507e-05, + "loss": 0.6032, + "step": 1189 + }, + { + "epoch": 0.24462945832048516, + "grad_norm": 0.15513579547405243, + "learning_rate": 8.937460983037636e-05, + "loss": 0.5983, + "step": 1190 + }, + { + "epoch": 0.2448350292938637, + "grad_norm": 0.2588478624820709, + "learning_rate": 8.9372949328918e-05, + "loss": 0.7395, + "step": 1191 + }, + { + "epoch": 0.24504060026724225, + "grad_norm": 0.2583847939968109, + "learning_rate": 8.937128664141184e-05, + "loss": 0.7442, + "step": 1192 + }, + { + "epoch": 0.2452461712406208, + "grad_norm": 0.23951515555381775, + "learning_rate": 8.936962176793979e-05, + "loss": 0.7309, + "step": 1193 + }, + { + "epoch": 0.24545174221399937, + "grad_norm": 0.23284120857715607, + "learning_rate": 8.936795470858385e-05, + "loss": 0.7122, + "step": 1194 + }, + { + "epoch": 0.24565731318737793, + "grad_norm": 0.2364392876625061, + "learning_rate": 8.936628546342617e-05, + "loss": 0.7452, + "step": 1195 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 0.19968503713607788, + "learning_rate": 8.936461403254895e-05, + "loss": 0.6054, + "step": 1196 + }, + { + "epoch": 0.24606845513413506, + "grad_norm": 0.25698399543762207, + "learning_rate": 8.936294041603457e-05, + "loss": 0.7542, + "step": 1197 + }, + { + "epoch": 0.24627402610751362, + "grad_norm": 0.2551160454750061, + "learning_rate": 8.936126461396545e-05, + "loss": 0.729, + "step": 1198 + }, + { + "epoch": 0.24647959708089218, + "grad_norm": 0.2407594472169876, + "learning_rate": 8.935958662642419e-05, + "loss": 0.7331, + "step": 1199 + }, + { + "epoch": 0.24668516805427074, + "grad_norm": 0.19667823612689972, + "learning_rate": 8.935790645349342e-05, + "loss": 0.5818, + "step": 1200 + }, + { + "epoch": 0.2468907390276493, + "grad_norm": 0.25005340576171875, + "learning_rate": 8.935622409525593e-05, + "loss": 0.7355, + "step": 1201 + }, + { + "epoch": 0.24709631000102786, + "grad_norm": 0.15851576626300812, + "learning_rate": 8.93545395517946e-05, + "loss": 0.6147, + "step": 1202 + }, + { + "epoch": 0.24730188097440642, + "grad_norm": 0.2595955431461334, + "learning_rate": 8.935285282319242e-05, + "loss": 0.7344, + "step": 1203 + }, + { + "epoch": 0.24750745194778498, + "grad_norm": 0.2531373202800751, + "learning_rate": 8.935116390953249e-05, + "loss": 0.7206, + "step": 1204 + }, + { + "epoch": 0.24771302292116354, + "grad_norm": 0.2330513596534729, + "learning_rate": 8.9349472810898e-05, + "loss": 0.7487, + "step": 1205 + }, + { + "epoch": 0.2479185938945421, + "grad_norm": 0.23262523114681244, + "learning_rate": 8.934777952737228e-05, + "loss": 0.7268, + "step": 1206 + }, + { + "epoch": 0.24812416486792066, + "grad_norm": 0.2461225688457489, + "learning_rate": 8.934608405903875e-05, + "loss": 0.7272, + "step": 1207 + }, + { + "epoch": 0.24832973584129922, + "grad_norm": 0.23531411588191986, + "learning_rate": 8.934438640598092e-05, + "loss": 0.7249, + "step": 1208 + }, + { + "epoch": 0.24853530681467778, + "grad_norm": 0.19100695848464966, + "learning_rate": 8.934268656828244e-05, + "loss": 0.6049, + "step": 1209 + }, + { + "epoch": 0.2487408777880563, + "grad_norm": 0.25513240694999695, + "learning_rate": 8.934098454602704e-05, + "loss": 0.7281, + "step": 1210 + }, + { + "epoch": 0.24894644876143487, + "grad_norm": 0.24409835040569305, + "learning_rate": 8.93392803392986e-05, + "loss": 0.7533, + "step": 1211 + }, + { + "epoch": 0.24915201973481343, + "grad_norm": 0.24540594220161438, + "learning_rate": 8.933757394818104e-05, + "loss": 0.7218, + "step": 1212 + }, + { + "epoch": 0.249357590708192, + "grad_norm": 0.24975821375846863, + "learning_rate": 8.933586537275846e-05, + "loss": 0.7528, + "step": 1213 + }, + { + "epoch": 0.24956316168157056, + "grad_norm": 0.17961885035037994, + "learning_rate": 8.933415461311502e-05, + "loss": 0.5881, + "step": 1214 + }, + { + "epoch": 0.24976873265494912, + "grad_norm": 0.26504039764404297, + "learning_rate": 8.9332441669335e-05, + "loss": 0.7393, + "step": 1215 + }, + { + "epoch": 0.24997430362832768, + "grad_norm": 0.24959856271743774, + "learning_rate": 8.933072654150277e-05, + "loss": 0.7333, + "step": 1216 + }, + { + "epoch": 0.25017987460170626, + "grad_norm": 0.25788456201553345, + "learning_rate": 8.932900922970287e-05, + "loss": 0.7524, + "step": 1217 + }, + { + "epoch": 0.2503854455750848, + "grad_norm": 0.2299453467130661, + "learning_rate": 8.932728973401986e-05, + "loss": 0.7532, + "step": 1218 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 0.23602120578289032, + "learning_rate": 8.932556805453847e-05, + "loss": 0.7446, + "step": 1219 + }, + { + "epoch": 0.2507965875218419, + "grad_norm": 0.24988947808742523, + "learning_rate": 8.932384419134352e-05, + "loss": 0.7275, + "step": 1220 + }, + { + "epoch": 0.25100215849522045, + "grad_norm": 0.22750410437583923, + "learning_rate": 8.932211814451995e-05, + "loss": 0.7284, + "step": 1221 + }, + { + "epoch": 0.25120772946859904, + "grad_norm": 0.22385790944099426, + "learning_rate": 8.932038991415277e-05, + "loss": 0.753, + "step": 1222 + }, + { + "epoch": 0.25141330044197757, + "grad_norm": 0.22648993134498596, + "learning_rate": 8.931865950032713e-05, + "loss": 0.7171, + "step": 1223 + }, + { + "epoch": 0.25161887141535616, + "grad_norm": 0.22896623611450195, + "learning_rate": 8.931692690312828e-05, + "loss": 0.7164, + "step": 1224 + }, + { + "epoch": 0.2518244423887347, + "grad_norm": 0.2378738969564438, + "learning_rate": 8.931519212264157e-05, + "loss": 0.6969, + "step": 1225 + }, + { + "epoch": 0.2520300133621133, + "grad_norm": 0.23377791047096252, + "learning_rate": 8.931345515895248e-05, + "loss": 0.7102, + "step": 1226 + }, + { + "epoch": 0.2522355843354918, + "grad_norm": 0.23156873881816864, + "learning_rate": 8.93117160121466e-05, + "loss": 0.7426, + "step": 1227 + }, + { + "epoch": 0.2524411553088704, + "grad_norm": 0.2447620928287506, + "learning_rate": 8.930997468230956e-05, + "loss": 0.7254, + "step": 1228 + }, + { + "epoch": 0.25264672628224893, + "grad_norm": 0.24257569015026093, + "learning_rate": 8.930823116952717e-05, + "loss": 0.7551, + "step": 1229 + }, + { + "epoch": 0.2528522972556275, + "grad_norm": 0.23060962557792664, + "learning_rate": 8.930648547388534e-05, + "loss": 0.7411, + "step": 1230 + }, + { + "epoch": 0.25305786822900606, + "grad_norm": 0.23297728598117828, + "learning_rate": 8.930473759547005e-05, + "loss": 0.731, + "step": 1231 + }, + { + "epoch": 0.25326343920238464, + "grad_norm": 0.18401369452476501, + "learning_rate": 8.930298753436741e-05, + "loss": 0.6025, + "step": 1232 + }, + { + "epoch": 0.2534690101757632, + "grad_norm": 0.25541701912879944, + "learning_rate": 8.930123529066365e-05, + "loss": 0.7314, + "step": 1233 + }, + { + "epoch": 0.25367458114914176, + "grad_norm": 0.2430264949798584, + "learning_rate": 8.929948086444512e-05, + "loss": 0.7115, + "step": 1234 + }, + { + "epoch": 0.2538801521225203, + "grad_norm": 0.2397884875535965, + "learning_rate": 8.929772425579818e-05, + "loss": 0.7065, + "step": 1235 + }, + { + "epoch": 0.2540857230958989, + "grad_norm": 0.2442830502986908, + "learning_rate": 8.929596546480944e-05, + "loss": 0.7252, + "step": 1236 + }, + { + "epoch": 0.2542912940692774, + "grad_norm": 0.2494584023952484, + "learning_rate": 8.92942044915655e-05, + "loss": 0.7292, + "step": 1237 + }, + { + "epoch": 0.25449686504265595, + "grad_norm": 0.23975245654582977, + "learning_rate": 8.929244133615314e-05, + "loss": 0.7256, + "step": 1238 + }, + { + "epoch": 0.25470243601603454, + "grad_norm": 0.24557578563690186, + "learning_rate": 8.929067599865924e-05, + "loss": 0.7126, + "step": 1239 + }, + { + "epoch": 0.25490800698941307, + "grad_norm": 0.2466876208782196, + "learning_rate": 8.928890847917073e-05, + "loss": 0.7397, + "step": 1240 + }, + { + "epoch": 0.25511357796279166, + "grad_norm": 0.236251562833786, + "learning_rate": 8.92871387777747e-05, + "loss": 0.7578, + "step": 1241 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.23271340131759644, + "learning_rate": 8.928536689455835e-05, + "loss": 0.7126, + "step": 1242 + }, + { + "epoch": 0.2555247199095488, + "grad_norm": 0.2597436010837555, + "learning_rate": 8.928359282960896e-05, + "loss": 0.7506, + "step": 1243 + }, + { + "epoch": 0.2557302908829273, + "grad_norm": 0.2491491734981537, + "learning_rate": 8.928181658301394e-05, + "loss": 0.7396, + "step": 1244 + }, + { + "epoch": 0.2559358618563059, + "grad_norm": 0.2302912026643753, + "learning_rate": 8.928003815486078e-05, + "loss": 0.7074, + "step": 1245 + }, + { + "epoch": 0.25614143282968443, + "grad_norm": 0.22792287170886993, + "learning_rate": 8.927825754523711e-05, + "loss": 0.705, + "step": 1246 + }, + { + "epoch": 0.256347003803063, + "grad_norm": 0.20026971399784088, + "learning_rate": 8.927647475423064e-05, + "loss": 0.597, + "step": 1247 + }, + { + "epoch": 0.25655257477644156, + "grad_norm": 0.2631547749042511, + "learning_rate": 8.92746897819292e-05, + "loss": 0.7552, + "step": 1248 + }, + { + "epoch": 0.25675814574982014, + "grad_norm": 0.24641458690166473, + "learning_rate": 8.927290262842075e-05, + "loss": 0.7049, + "step": 1249 + }, + { + "epoch": 0.2569637167231987, + "grad_norm": 0.24111877381801605, + "learning_rate": 8.927111329379331e-05, + "loss": 0.7467, + "step": 1250 + }, + { + "epoch": 0.25716928769657726, + "grad_norm": 0.23682504892349243, + "learning_rate": 8.926932177813505e-05, + "loss": 0.7529, + "step": 1251 + }, + { + "epoch": 0.2573748586699558, + "grad_norm": 0.2335578352212906, + "learning_rate": 8.92675280815342e-05, + "loss": 0.7186, + "step": 1252 + }, + { + "epoch": 0.2575804296433344, + "grad_norm": 0.25901028513908386, + "learning_rate": 8.926573220407918e-05, + "loss": 0.7339, + "step": 1253 + }, + { + "epoch": 0.2577860006167129, + "grad_norm": 0.2469077706336975, + "learning_rate": 8.92639341458584e-05, + "loss": 0.744, + "step": 1254 + }, + { + "epoch": 0.2579915715900915, + "grad_norm": 0.17402611672878265, + "learning_rate": 8.926213390696048e-05, + "loss": 0.5948, + "step": 1255 + }, + { + "epoch": 0.25819714256347004, + "grad_norm": 0.2638707160949707, + "learning_rate": 8.926033148747412e-05, + "loss": 0.7456, + "step": 1256 + }, + { + "epoch": 0.25840271353684857, + "grad_norm": 0.15191468596458435, + "learning_rate": 8.925852688748808e-05, + "loss": 0.6055, + "step": 1257 + }, + { + "epoch": 0.25860828451022716, + "grad_norm": 0.25375521183013916, + "learning_rate": 8.92567201070913e-05, + "loss": 0.7441, + "step": 1258 + }, + { + "epoch": 0.2588138554836057, + "grad_norm": 0.24398963153362274, + "learning_rate": 8.925491114637277e-05, + "loss": 0.7551, + "step": 1259 + }, + { + "epoch": 0.2590194264569843, + "grad_norm": 0.15817205607891083, + "learning_rate": 8.925310000542161e-05, + "loss": 0.5987, + "step": 1260 + }, + { + "epoch": 0.2592249974303628, + "grad_norm": 0.15531690418720245, + "learning_rate": 8.925128668432705e-05, + "loss": 0.5948, + "step": 1261 + }, + { + "epoch": 0.2594305684037414, + "grad_norm": 0.25315144658088684, + "learning_rate": 8.924947118317844e-05, + "loss": 0.7374, + "step": 1262 + }, + { + "epoch": 0.25963613937711993, + "grad_norm": 0.24230562150478363, + "learning_rate": 8.924765350206519e-05, + "loss": 0.7363, + "step": 1263 + }, + { + "epoch": 0.2598417103504985, + "grad_norm": 0.22478878498077393, + "learning_rate": 8.924583364107687e-05, + "loss": 0.7269, + "step": 1264 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 0.24388407170772552, + "learning_rate": 8.924401160030313e-05, + "loss": 0.7349, + "step": 1265 + }, + { + "epoch": 0.26025285229725564, + "grad_norm": 0.24955937266349792, + "learning_rate": 8.924218737983373e-05, + "loss": 0.73, + "step": 1266 + }, + { + "epoch": 0.2604584232706342, + "grad_norm": 0.24500887095928192, + "learning_rate": 8.924036097975856e-05, + "loss": 0.7247, + "step": 1267 + }, + { + "epoch": 0.26066399424401276, + "grad_norm": 0.20046253502368927, + "learning_rate": 8.923853240016757e-05, + "loss": 0.5842, + "step": 1268 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.25663238763809204, + "learning_rate": 8.923670164115087e-05, + "loss": 0.7296, + "step": 1269 + }, + { + "epoch": 0.2610751361907699, + "grad_norm": 0.25753530859947205, + "learning_rate": 8.923486870279863e-05, + "loss": 0.7367, + "step": 1270 + }, + { + "epoch": 0.2612807071641484, + "grad_norm": 0.23126912117004395, + "learning_rate": 8.923303358520117e-05, + "loss": 0.7257, + "step": 1271 + }, + { + "epoch": 0.261486278137527, + "grad_norm": 0.24083848297595978, + "learning_rate": 8.923119628844889e-05, + "loss": 0.7335, + "step": 1272 + }, + { + "epoch": 0.26169184911090554, + "grad_norm": 0.17281857132911682, + "learning_rate": 8.92293568126323e-05, + "loss": 0.5799, + "step": 1273 + }, + { + "epoch": 0.26189742008428407, + "grad_norm": 0.1575915813446045, + "learning_rate": 8.922751515784204e-05, + "loss": 0.5796, + "step": 1274 + }, + { + "epoch": 0.26210299105766266, + "grad_norm": 0.31265151500701904, + "learning_rate": 8.922567132416881e-05, + "loss": 0.7426, + "step": 1275 + }, + { + "epoch": 0.2623085620310412, + "grad_norm": 0.257569819688797, + "learning_rate": 8.922382531170347e-05, + "loss": 0.7183, + "step": 1276 + }, + { + "epoch": 0.2625141330044198, + "grad_norm": 0.23766203224658966, + "learning_rate": 8.922197712053697e-05, + "loss": 0.7331, + "step": 1277 + }, + { + "epoch": 0.2627197039777983, + "grad_norm": 0.25914183259010315, + "learning_rate": 8.922012675076034e-05, + "loss": 0.7342, + "step": 1278 + }, + { + "epoch": 0.2629252749511769, + "grad_norm": 0.26477503776550293, + "learning_rate": 8.921827420246473e-05, + "loss": 0.7313, + "step": 1279 + }, + { + "epoch": 0.26313084592455543, + "grad_norm": 0.3233232796192169, + "learning_rate": 8.921641947574145e-05, + "loss": 0.7345, + "step": 1280 + }, + { + "epoch": 0.263336416897934, + "grad_norm": 0.20394398272037506, + "learning_rate": 8.921456257068186e-05, + "loss": 0.5848, + "step": 1281 + }, + { + "epoch": 0.26354198787131256, + "grad_norm": 0.28951147198677063, + "learning_rate": 8.921270348737741e-05, + "loss": 0.7507, + "step": 1282 + }, + { + "epoch": 0.26374755884469114, + "grad_norm": 0.26492390036582947, + "learning_rate": 8.921084222591971e-05, + "loss": 0.7124, + "step": 1283 + }, + { + "epoch": 0.2639531298180697, + "grad_norm": 0.2661970555782318, + "learning_rate": 8.920897878640046e-05, + "loss": 0.7556, + "step": 1284 + }, + { + "epoch": 0.26415870079144826, + "grad_norm": 0.17668524384498596, + "learning_rate": 8.920711316891145e-05, + "loss": 0.5874, + "step": 1285 + }, + { + "epoch": 0.2643642717648268, + "grad_norm": 0.2812560796737671, + "learning_rate": 8.92052453735446e-05, + "loss": 0.744, + "step": 1286 + }, + { + "epoch": 0.2645698427382054, + "grad_norm": 0.25487664341926575, + "learning_rate": 8.920337540039193e-05, + "loss": 0.7414, + "step": 1287 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 0.26109081506729126, + "learning_rate": 8.920150324954557e-05, + "loss": 0.7305, + "step": 1288 + }, + { + "epoch": 0.2649809846849625, + "grad_norm": 0.2654556334018707, + "learning_rate": 8.919962892109772e-05, + "loss": 0.7105, + "step": 1289 + }, + { + "epoch": 0.26518655565834104, + "grad_norm": 0.25440090894699097, + "learning_rate": 8.919775241514075e-05, + "loss": 0.7567, + "step": 1290 + }, + { + "epoch": 0.2653921266317196, + "grad_norm": 0.26158374547958374, + "learning_rate": 8.91958737317671e-05, + "loss": 0.7656, + "step": 1291 + }, + { + "epoch": 0.26559769760509816, + "grad_norm": 0.25178900361061096, + "learning_rate": 8.919399287106933e-05, + "loss": 0.7342, + "step": 1292 + }, + { + "epoch": 0.2658032685784767, + "grad_norm": 0.2315172553062439, + "learning_rate": 8.91921098331401e-05, + "loss": 0.7527, + "step": 1293 + }, + { + "epoch": 0.2660088395518553, + "grad_norm": 0.2387528419494629, + "learning_rate": 8.919022461807215e-05, + "loss": 0.7414, + "step": 1294 + }, + { + "epoch": 0.2662144105252338, + "grad_norm": 0.24964243173599243, + "learning_rate": 8.918833722595838e-05, + "loss": 0.7538, + "step": 1295 + }, + { + "epoch": 0.2664199814986124, + "grad_norm": 0.43933603167533875, + "learning_rate": 8.918644765689179e-05, + "loss": 0.738, + "step": 1296 + }, + { + "epoch": 0.26662555247199093, + "grad_norm": 0.23242905735969543, + "learning_rate": 8.918455591096543e-05, + "loss": 0.7456, + "step": 1297 + }, + { + "epoch": 0.2668311234453695, + "grad_norm": 0.2441163808107376, + "learning_rate": 8.918266198827252e-05, + "loss": 0.7278, + "step": 1298 + }, + { + "epoch": 0.26703669441874806, + "grad_norm": 0.2470923811197281, + "learning_rate": 8.918076588890637e-05, + "loss": 0.7274, + "step": 1299 + }, + { + "epoch": 0.26724226539212664, + "grad_norm": 0.23086468875408173, + "learning_rate": 8.917886761296039e-05, + "loss": 0.7503, + "step": 1300 + }, + { + "epoch": 0.2674478363655052, + "grad_norm": 0.24466407299041748, + "learning_rate": 8.917696716052808e-05, + "loss": 0.6128, + "step": 1301 + }, + { + "epoch": 0.26765340733888376, + "grad_norm": 0.24658440053462982, + "learning_rate": 8.91750645317031e-05, + "loss": 0.7356, + "step": 1302 + }, + { + "epoch": 0.2678589783122623, + "grad_norm": 0.24751920998096466, + "learning_rate": 8.917315972657915e-05, + "loss": 0.7394, + "step": 1303 + }, + { + "epoch": 0.2680645492856409, + "grad_norm": 0.2545618414878845, + "learning_rate": 8.91712527452501e-05, + "loss": 0.7412, + "step": 1304 + }, + { + "epoch": 0.2682701202590194, + "grad_norm": 0.23690831661224365, + "learning_rate": 8.916934358780986e-05, + "loss": 0.7224, + "step": 1305 + }, + { + "epoch": 0.268475691232398, + "grad_norm": 0.24612128734588623, + "learning_rate": 8.916743225435252e-05, + "loss": 0.7441, + "step": 1306 + }, + { + "epoch": 0.26868126220577654, + "grad_norm": 0.24375763535499573, + "learning_rate": 8.916551874497223e-05, + "loss": 0.735, + "step": 1307 + }, + { + "epoch": 0.2688868331791551, + "grad_norm": 0.22968213260173798, + "learning_rate": 8.916360305976326e-05, + "loss": 0.7453, + "step": 1308 + }, + { + "epoch": 0.26909240415253366, + "grad_norm": 0.23660656809806824, + "learning_rate": 8.916168519881999e-05, + "loss": 0.7201, + "step": 1309 + }, + { + "epoch": 0.26929797512591225, + "grad_norm": 0.2977808713912964, + "learning_rate": 8.915976516223691e-05, + "loss": 0.6098, + "step": 1310 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 0.2509056031703949, + "learning_rate": 8.915784295010859e-05, + "loss": 0.7539, + "step": 1311 + }, + { + "epoch": 0.2697091170726693, + "grad_norm": 0.2543947696685791, + "learning_rate": 8.915591856252973e-05, + "loss": 0.7508, + "step": 1312 + }, + { + "epoch": 0.2699146880460479, + "grad_norm": 0.24036121368408203, + "learning_rate": 8.915399199959516e-05, + "loss": 0.7149, + "step": 1313 + }, + { + "epoch": 0.27012025901942643, + "grad_norm": 0.2512202560901642, + "learning_rate": 8.915206326139978e-05, + "loss": 0.6823, + "step": 1314 + }, + { + "epoch": 0.270325829992805, + "grad_norm": 0.24787308275699615, + "learning_rate": 8.915013234803863e-05, + "loss": 0.7399, + "step": 1315 + }, + { + "epoch": 0.27053140096618356, + "grad_norm": 0.24503572285175323, + "learning_rate": 8.914819925960679e-05, + "loss": 0.7347, + "step": 1316 + }, + { + "epoch": 0.27073697193956214, + "grad_norm": 0.23503392934799194, + "learning_rate": 8.914626399619951e-05, + "loss": 0.7262, + "step": 1317 + }, + { + "epoch": 0.2709425429129407, + "grad_norm": 0.23490577936172485, + "learning_rate": 8.914432655791217e-05, + "loss": 0.7333, + "step": 1318 + }, + { + "epoch": 0.27114811388631926, + "grad_norm": 0.2428707480430603, + "learning_rate": 8.914238694484016e-05, + "loss": 0.7087, + "step": 1319 + }, + { + "epoch": 0.2713536848596978, + "grad_norm": 0.24492257833480835, + "learning_rate": 8.91404451570791e-05, + "loss": 0.7164, + "step": 1320 + }, + { + "epoch": 0.2715592558330764, + "grad_norm": 0.2504068911075592, + "learning_rate": 8.913850119472461e-05, + "loss": 0.7406, + "step": 1321 + }, + { + "epoch": 0.2717648268064549, + "grad_norm": 0.24984775483608246, + "learning_rate": 8.913655505787246e-05, + "loss": 0.7324, + "step": 1322 + }, + { + "epoch": 0.2719703977798335, + "grad_norm": 0.23938335478305817, + "learning_rate": 8.913460674661854e-05, + "loss": 0.7147, + "step": 1323 + }, + { + "epoch": 0.27217596875321204, + "grad_norm": 0.24494026601314545, + "learning_rate": 8.913265626105883e-05, + "loss": 0.7476, + "step": 1324 + }, + { + "epoch": 0.2723815397265906, + "grad_norm": 0.23465509712696075, + "learning_rate": 8.913070360128941e-05, + "loss": 0.7203, + "step": 1325 + }, + { + "epoch": 0.27258711069996916, + "grad_norm": 0.2233608067035675, + "learning_rate": 8.912874876740651e-05, + "loss": 0.7189, + "step": 1326 + }, + { + "epoch": 0.27279268167334775, + "grad_norm": 0.23633797466754913, + "learning_rate": 8.912679175950641e-05, + "loss": 0.7257, + "step": 1327 + }, + { + "epoch": 0.2729982526467263, + "grad_norm": 0.22821030020713806, + "learning_rate": 8.912483257768551e-05, + "loss": 0.726, + "step": 1328 + }, + { + "epoch": 0.27320382362010487, + "grad_norm": 0.2244369387626648, + "learning_rate": 8.912287122204038e-05, + "loss": 0.709, + "step": 1329 + }, + { + "epoch": 0.2734093945934834, + "grad_norm": 0.23471800982952118, + "learning_rate": 8.912090769266758e-05, + "loss": 0.7163, + "step": 1330 + }, + { + "epoch": 0.27361496556686193, + "grad_norm": 0.23954612016677856, + "learning_rate": 8.911894198966391e-05, + "loss": 0.7477, + "step": 1331 + }, + { + "epoch": 0.2738205365402405, + "grad_norm": 0.33054718375205994, + "learning_rate": 8.911697411312616e-05, + "loss": 0.616, + "step": 1332 + }, + { + "epoch": 0.27402610751361905, + "grad_norm": 0.26455309987068176, + "learning_rate": 8.91150040631513e-05, + "loss": 0.7477, + "step": 1333 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 0.15511548519134521, + "learning_rate": 8.911303183983639e-05, + "loss": 0.5804, + "step": 1334 + }, + { + "epoch": 0.2744372494603762, + "grad_norm": 0.2723095715045929, + "learning_rate": 8.911105744327858e-05, + "loss": 0.7527, + "step": 1335 + }, + { + "epoch": 0.27464282043375476, + "grad_norm": 0.2615657150745392, + "learning_rate": 8.910908087357515e-05, + "loss": 0.7228, + "step": 1336 + }, + { + "epoch": 0.2748483914071333, + "grad_norm": 0.2343035191297531, + "learning_rate": 8.910710213082346e-05, + "loss": 0.7435, + "step": 1337 + }, + { + "epoch": 0.2750539623805119, + "grad_norm": 0.27343472838401794, + "learning_rate": 8.910512121512101e-05, + "loss": 0.7415, + "step": 1338 + }, + { + "epoch": 0.2752595333538904, + "grad_norm": 0.2690789997577667, + "learning_rate": 8.910313812656539e-05, + "loss": 0.7301, + "step": 1339 + }, + { + "epoch": 0.275465104327269, + "grad_norm": 0.23863738775253296, + "learning_rate": 8.910115286525428e-05, + "loss": 0.7114, + "step": 1340 + }, + { + "epoch": 0.27567067530064754, + "grad_norm": 0.26206308603286743, + "learning_rate": 8.909916543128551e-05, + "loss": 0.5967, + "step": 1341 + }, + { + "epoch": 0.2758762462740261, + "grad_norm": 0.27798014879226685, + "learning_rate": 8.909717582475695e-05, + "loss": 0.7337, + "step": 1342 + }, + { + "epoch": 0.27608181724740466, + "grad_norm": 0.23681025207042694, + "learning_rate": 8.909518404576668e-05, + "loss": 0.7287, + "step": 1343 + }, + { + "epoch": 0.27628738822078325, + "grad_norm": 0.2664317786693573, + "learning_rate": 8.90931900944128e-05, + "loss": 0.7151, + "step": 1344 + }, + { + "epoch": 0.2764929591941618, + "grad_norm": 0.2881788909435272, + "learning_rate": 8.909119397079349e-05, + "loss": 0.7289, + "step": 1345 + }, + { + "epoch": 0.27669853016754037, + "grad_norm": 0.248192697763443, + "learning_rate": 8.908919567500718e-05, + "loss": 0.7233, + "step": 1346 + }, + { + "epoch": 0.2769041011409189, + "grad_norm": 0.2383420318365097, + "learning_rate": 8.908719520715224e-05, + "loss": 0.7178, + "step": 1347 + }, + { + "epoch": 0.27710967211429743, + "grad_norm": 0.23679983615875244, + "learning_rate": 8.908519256732727e-05, + "loss": 0.717, + "step": 1348 + }, + { + "epoch": 0.277315243087676, + "grad_norm": 0.2335837185382843, + "learning_rate": 8.908318775563092e-05, + "loss": 0.7167, + "step": 1349 + }, + { + "epoch": 0.27752081406105455, + "grad_norm": 0.247580885887146, + "learning_rate": 8.908118077216194e-05, + "loss": 0.7467, + "step": 1350 + }, + { + "epoch": 0.27772638503443314, + "grad_norm": 0.24042358994483948, + "learning_rate": 8.907917161701923e-05, + "loss": 0.7615, + "step": 1351 + }, + { + "epoch": 0.2779319560078117, + "grad_norm": 0.24658474326133728, + "learning_rate": 8.907716029030174e-05, + "loss": 0.7096, + "step": 1352 + }, + { + "epoch": 0.27813752698119026, + "grad_norm": 0.24043896794319153, + "learning_rate": 8.90751467921086e-05, + "loss": 0.735, + "step": 1353 + }, + { + "epoch": 0.2783430979545688, + "grad_norm": 0.2515980303287506, + "learning_rate": 8.907313112253898e-05, + "loss": 0.7167, + "step": 1354 + }, + { + "epoch": 0.2785486689279474, + "grad_norm": 0.23116926848888397, + "learning_rate": 8.907111328169219e-05, + "loss": 0.6996, + "step": 1355 + }, + { + "epoch": 0.2787542399013259, + "grad_norm": 0.23852792382240295, + "learning_rate": 8.906909326966762e-05, + "loss": 0.7252, + "step": 1356 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 0.2699477970600128, + "learning_rate": 8.906707108656481e-05, + "loss": 0.5933, + "step": 1357 + }, + { + "epoch": 0.27916538184808304, + "grad_norm": 0.171479269862175, + "learning_rate": 8.906504673248338e-05, + "loss": 0.583, + "step": 1358 + }, + { + "epoch": 0.2793709528214616, + "grad_norm": 0.1635981947183609, + "learning_rate": 8.906302020752306e-05, + "loss": 0.592, + "step": 1359 + }, + { + "epoch": 0.27957652379484016, + "grad_norm": 0.3277224898338318, + "learning_rate": 8.906099151178368e-05, + "loss": 0.7403, + "step": 1360 + }, + { + "epoch": 0.27978209476821875, + "grad_norm": 0.27374133467674255, + "learning_rate": 8.905896064536519e-05, + "loss": 0.7438, + "step": 1361 + }, + { + "epoch": 0.2799876657415973, + "grad_norm": 0.2909560203552246, + "learning_rate": 8.905692760836765e-05, + "loss": 0.5838, + "step": 1362 + }, + { + "epoch": 0.28019323671497587, + "grad_norm": 0.34569621086120605, + "learning_rate": 8.905489240089119e-05, + "loss": 0.7456, + "step": 1363 + }, + { + "epoch": 0.2803988076883544, + "grad_norm": 0.32318931818008423, + "learning_rate": 8.90528550230361e-05, + "loss": 0.7337, + "step": 1364 + }, + { + "epoch": 0.280604378661733, + "grad_norm": 0.24782495200634003, + "learning_rate": 8.905081547490276e-05, + "loss": 0.7135, + "step": 1365 + }, + { + "epoch": 0.2808099496351115, + "grad_norm": 0.25972336530685425, + "learning_rate": 8.904877375659163e-05, + "loss": 0.7076, + "step": 1366 + }, + { + "epoch": 0.28101552060849005, + "grad_norm": 0.28636348247528076, + "learning_rate": 8.904672986820328e-05, + "loss": 0.7406, + "step": 1367 + }, + { + "epoch": 0.28122109158186864, + "grad_norm": 0.21100643277168274, + "learning_rate": 8.904468380983843e-05, + "loss": 0.6081, + "step": 1368 + }, + { + "epoch": 0.2814266625552472, + "grad_norm": 0.2907034456729889, + "learning_rate": 8.904263558159788e-05, + "loss": 0.7046, + "step": 1369 + }, + { + "epoch": 0.28163223352862576, + "grad_norm": 0.2622237205505371, + "learning_rate": 8.904058518358253e-05, + "loss": 0.7578, + "step": 1370 + }, + { + "epoch": 0.2818378045020043, + "grad_norm": 0.2604566812515259, + "learning_rate": 8.903853261589339e-05, + "loss": 0.75, + "step": 1371 + }, + { + "epoch": 0.2820433754753829, + "grad_norm": 0.27299514412879944, + "learning_rate": 8.90364778786316e-05, + "loss": 0.7491, + "step": 1372 + }, + { + "epoch": 0.2822489464487614, + "grad_norm": 0.25931867957115173, + "learning_rate": 8.903442097189835e-05, + "loss": 0.6978, + "step": 1373 + }, + { + "epoch": 0.28245451742214, + "grad_norm": 0.2450464367866516, + "learning_rate": 8.9032361895795e-05, + "loss": 0.7276, + "step": 1374 + }, + { + "epoch": 0.28266008839551854, + "grad_norm": 0.20911885797977448, + "learning_rate": 8.903030065042298e-05, + "loss": 0.5984, + "step": 1375 + }, + { + "epoch": 0.2828656593688971, + "grad_norm": 0.2976955473423004, + "learning_rate": 8.902823723588385e-05, + "loss": 0.7332, + "step": 1376 + }, + { + "epoch": 0.28307123034227566, + "grad_norm": 0.2745811641216278, + "learning_rate": 8.902617165227928e-05, + "loss": 0.7369, + "step": 1377 + }, + { + "epoch": 0.28327680131565425, + "grad_norm": 0.23596425354480743, + "learning_rate": 8.902410389971099e-05, + "loss": 0.7253, + "step": 1378 + }, + { + "epoch": 0.2834823722890328, + "grad_norm": 0.25958871841430664, + "learning_rate": 8.902203397828086e-05, + "loss": 0.7494, + "step": 1379 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 0.2587198317050934, + "learning_rate": 8.901996188809088e-05, + "loss": 0.7001, + "step": 1380 + }, + { + "epoch": 0.2838935142357899, + "grad_norm": 0.2621273696422577, + "learning_rate": 8.901788762924313e-05, + "loss": 0.728, + "step": 1381 + }, + { + "epoch": 0.2840990852091685, + "grad_norm": 0.18734264373779297, + "learning_rate": 8.901581120183979e-05, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.284304656182547, + "grad_norm": 0.16175542771816254, + "learning_rate": 8.901373260598317e-05, + "loss": 0.6072, + "step": 1383 + }, + { + "epoch": 0.2845102271559256, + "grad_norm": 0.30578863620758057, + "learning_rate": 8.901165184177567e-05, + "loss": 0.7373, + "step": 1384 + }, + { + "epoch": 0.28471579812930414, + "grad_norm": 0.26835259795188904, + "learning_rate": 8.900956890931979e-05, + "loss": 0.7249, + "step": 1385 + }, + { + "epoch": 0.2849213691026827, + "grad_norm": 0.221610888838768, + "learning_rate": 8.900748380871814e-05, + "loss": 0.5865, + "step": 1386 + }, + { + "epoch": 0.28512694007606126, + "grad_norm": 0.27838990092277527, + "learning_rate": 8.900539654007346e-05, + "loss": 0.7224, + "step": 1387 + }, + { + "epoch": 0.2853325110494398, + "grad_norm": 0.24998264014720917, + "learning_rate": 8.900330710348857e-05, + "loss": 0.7112, + "step": 1388 + }, + { + "epoch": 0.2855380820228184, + "grad_norm": 0.2573053240776062, + "learning_rate": 8.900121549906642e-05, + "loss": 0.7395, + "step": 1389 + }, + { + "epoch": 0.2857436529961969, + "grad_norm": 0.24121756851673126, + "learning_rate": 8.899912172691004e-05, + "loss": 0.747, + "step": 1390 + }, + { + "epoch": 0.2859492239695755, + "grad_norm": 0.2541133463382721, + "learning_rate": 8.899702578712256e-05, + "loss": 0.7226, + "step": 1391 + }, + { + "epoch": 0.28615479494295404, + "grad_norm": 0.24340660870075226, + "learning_rate": 8.899492767980729e-05, + "loss": 0.698, + "step": 1392 + }, + { + "epoch": 0.2863603659163326, + "grad_norm": 0.24495667219161987, + "learning_rate": 8.899282740506756e-05, + "loss": 0.7535, + "step": 1393 + }, + { + "epoch": 0.28656593688971116, + "grad_norm": 0.2280047982931137, + "learning_rate": 8.899072496300684e-05, + "loss": 0.7219, + "step": 1394 + }, + { + "epoch": 0.28677150786308975, + "grad_norm": 0.23093637824058533, + "learning_rate": 8.898862035372872e-05, + "loss": 0.7135, + "step": 1395 + }, + { + "epoch": 0.2869770788364683, + "grad_norm": 0.24832944571971893, + "learning_rate": 8.898651357733686e-05, + "loss": 0.7522, + "step": 1396 + }, + { + "epoch": 0.28718264980984687, + "grad_norm": 0.23297333717346191, + "learning_rate": 8.898440463393508e-05, + "loss": 0.7546, + "step": 1397 + }, + { + "epoch": 0.2873882207832254, + "grad_norm": 0.21482457220554352, + "learning_rate": 8.898229352362727e-05, + "loss": 0.5847, + "step": 1398 + }, + { + "epoch": 0.287593791756604, + "grad_norm": 0.16317768394947052, + "learning_rate": 8.898018024651742e-05, + "loss": 0.5954, + "step": 1399 + }, + { + "epoch": 0.2877993627299825, + "grad_norm": 0.3127588629722595, + "learning_rate": 8.897806480270967e-05, + "loss": 0.7413, + "step": 1400 + }, + { + "epoch": 0.2880049337033611, + "grad_norm": 0.2599581182003021, + "learning_rate": 8.897594719230821e-05, + "loss": 0.7315, + "step": 1401 + }, + { + "epoch": 0.28821050467673964, + "grad_norm": 0.23986676335334778, + "learning_rate": 8.897382741541737e-05, + "loss": 0.7528, + "step": 1402 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 0.2908901870250702, + "learning_rate": 8.897170547214159e-05, + "loss": 0.7404, + "step": 1403 + }, + { + "epoch": 0.28862164662349676, + "grad_norm": 0.3151310682296753, + "learning_rate": 8.896958136258541e-05, + "loss": 0.6033, + "step": 1404 + }, + { + "epoch": 0.2888272175968753, + "grad_norm": 0.2576965391635895, + "learning_rate": 8.896745508685346e-05, + "loss": 0.7326, + "step": 1405 + }, + { + "epoch": 0.2890327885702539, + "grad_norm": 0.2626875340938568, + "learning_rate": 8.896532664505051e-05, + "loss": 0.7408, + "step": 1406 + }, + { + "epoch": 0.2892383595436324, + "grad_norm": 0.24406549334526062, + "learning_rate": 8.896319603728141e-05, + "loss": 0.7326, + "step": 1407 + }, + { + "epoch": 0.289443930517011, + "grad_norm": 0.24385593831539154, + "learning_rate": 8.896106326365112e-05, + "loss": 0.7503, + "step": 1408 + }, + { + "epoch": 0.28964950149038954, + "grad_norm": 0.24427802860736847, + "learning_rate": 8.89589283242647e-05, + "loss": 0.7341, + "step": 1409 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.24131245911121368, + "learning_rate": 8.895679121922738e-05, + "loss": 0.7313, + "step": 1410 + }, + { + "epoch": 0.29006064343714666, + "grad_norm": 0.24251912534236908, + "learning_rate": 8.895465194864439e-05, + "loss": 0.7138, + "step": 1411 + }, + { + "epoch": 0.29026621441052525, + "grad_norm": 0.22263044118881226, + "learning_rate": 8.895251051262115e-05, + "loss": 0.6891, + "step": 1412 + }, + { + "epoch": 0.2904717853839038, + "grad_norm": 0.23494918644428253, + "learning_rate": 8.895036691126314e-05, + "loss": 0.732, + "step": 1413 + }, + { + "epoch": 0.29067735635728237, + "grad_norm": 0.22686836123466492, + "learning_rate": 8.894822114467598e-05, + "loss": 0.7274, + "step": 1414 + }, + { + "epoch": 0.2908829273306609, + "grad_norm": 0.24379804730415344, + "learning_rate": 8.894607321296538e-05, + "loss": 0.74, + "step": 1415 + }, + { + "epoch": 0.2910884983040395, + "grad_norm": 0.23114730417728424, + "learning_rate": 8.894392311623714e-05, + "loss": 0.7377, + "step": 1416 + }, + { + "epoch": 0.291294069277418, + "grad_norm": 0.23655329644680023, + "learning_rate": 8.894177085459722e-05, + "loss": 0.7493, + "step": 1417 + }, + { + "epoch": 0.2914996402507966, + "grad_norm": 0.2256159633398056, + "learning_rate": 8.893961642815163e-05, + "loss": 0.6974, + "step": 1418 + }, + { + "epoch": 0.29170521122417514, + "grad_norm": 0.20934060215950012, + "learning_rate": 8.893745983700652e-05, + "loss": 0.5891, + "step": 1419 + }, + { + "epoch": 0.29191078219755373, + "grad_norm": 0.1600976139307022, + "learning_rate": 8.893530108126811e-05, + "loss": 0.6138, + "step": 1420 + }, + { + "epoch": 0.29211635317093226, + "grad_norm": 0.1524209976196289, + "learning_rate": 8.893314016104278e-05, + "loss": 0.5702, + "step": 1421 + }, + { + "epoch": 0.2923219241443108, + "grad_norm": 0.31443774700164795, + "learning_rate": 8.893097707643697e-05, + "loss": 0.6969, + "step": 1422 + }, + { + "epoch": 0.2925274951176894, + "grad_norm": 0.2652696669101715, + "learning_rate": 8.892881182755727e-05, + "loss": 0.7177, + "step": 1423 + }, + { + "epoch": 0.2927330660910679, + "grad_norm": 0.23116344213485718, + "learning_rate": 8.892664441451031e-05, + "loss": 0.6064, + "step": 1424 + }, + { + "epoch": 0.2929386370644465, + "grad_norm": 0.2783909738063812, + "learning_rate": 8.892447483740291e-05, + "loss": 0.7301, + "step": 1425 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 0.2517321705818176, + "learning_rate": 8.892230309634192e-05, + "loss": 0.7447, + "step": 1426 + }, + { + "epoch": 0.2933497790112036, + "grad_norm": 0.2492847889661789, + "learning_rate": 8.892012919143436e-05, + "loss": 0.7529, + "step": 1427 + }, + { + "epoch": 0.29355534998458216, + "grad_norm": 0.23372922837734222, + "learning_rate": 8.891795312278732e-05, + "loss": 0.7302, + "step": 1428 + }, + { + "epoch": 0.29376092095796075, + "grad_norm": 0.260433167219162, + "learning_rate": 8.8915774890508e-05, + "loss": 0.7388, + "step": 1429 + }, + { + "epoch": 0.2939664919313393, + "grad_norm": 0.24735549092292786, + "learning_rate": 8.89135944947037e-05, + "loss": 0.6851, + "step": 1430 + }, + { + "epoch": 0.29417206290471787, + "grad_norm": 0.24530264735221863, + "learning_rate": 8.891141193548188e-05, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.2943776338780964, + "grad_norm": 0.24232807755470276, + "learning_rate": 8.890922721295e-05, + "loss": 0.7272, + "step": 1432 + }, + { + "epoch": 0.294583204851475, + "grad_norm": 0.23810634016990662, + "learning_rate": 8.890704032721575e-05, + "loss": 0.6853, + "step": 1433 + }, + { + "epoch": 0.2947887758248535, + "grad_norm": 0.23144571483135223, + "learning_rate": 8.890485127838684e-05, + "loss": 0.7317, + "step": 1434 + }, + { + "epoch": 0.2949943467982321, + "grad_norm": 0.23867613077163696, + "learning_rate": 8.890266006657111e-05, + "loss": 0.7378, + "step": 1435 + }, + { + "epoch": 0.29519991777161064, + "grad_norm": 0.2355402261018753, + "learning_rate": 8.890046669187653e-05, + "loss": 0.7183, + "step": 1436 + }, + { + "epoch": 0.29540548874498923, + "grad_norm": 0.2344846874475479, + "learning_rate": 8.889827115441114e-05, + "loss": 0.6113, + "step": 1437 + }, + { + "epoch": 0.29561105971836776, + "grad_norm": 0.25104036927223206, + "learning_rate": 8.88960734542831e-05, + "loss": 0.716, + "step": 1438 + }, + { + "epoch": 0.29581663069174635, + "grad_norm": 0.2465832382440567, + "learning_rate": 8.88938735916007e-05, + "loss": 0.7588, + "step": 1439 + }, + { + "epoch": 0.2960222016651249, + "grad_norm": 0.24674251675605774, + "learning_rate": 8.889167156647231e-05, + "loss": 0.7221, + "step": 1440 + }, + { + "epoch": 0.2962277726385034, + "grad_norm": 0.25955334305763245, + "learning_rate": 8.888946737900642e-05, + "loss": 0.742, + "step": 1441 + }, + { + "epoch": 0.296433343611882, + "grad_norm": 0.2384418547153473, + "learning_rate": 8.888726102931159e-05, + "loss": 0.7298, + "step": 1442 + }, + { + "epoch": 0.29663891458526054, + "grad_norm": 0.2418283224105835, + "learning_rate": 8.888505251749655e-05, + "loss": 0.7149, + "step": 1443 + }, + { + "epoch": 0.2968444855586391, + "grad_norm": 0.2591508626937866, + "learning_rate": 8.88828418436701e-05, + "loss": 0.7281, + "step": 1444 + }, + { + "epoch": 0.29705005653201766, + "grad_norm": 0.2347528338432312, + "learning_rate": 8.888062900794113e-05, + "loss": 0.741, + "step": 1445 + }, + { + "epoch": 0.29725562750539625, + "grad_norm": 0.22745028138160706, + "learning_rate": 8.887841401041865e-05, + "loss": 0.7347, + "step": 1446 + }, + { + "epoch": 0.2974611984787748, + "grad_norm": 0.236216738820076, + "learning_rate": 8.887619685121183e-05, + "loss": 0.7229, + "step": 1447 + }, + { + "epoch": 0.29766676945215337, + "grad_norm": 0.22409434616565704, + "learning_rate": 8.887397753042985e-05, + "loss": 0.5921, + "step": 1448 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 0.24046771228313446, + "learning_rate": 8.887175604818206e-05, + "loss": 0.6934, + "step": 1449 + }, + { + "epoch": 0.2980779113989105, + "grad_norm": 0.25511425733566284, + "learning_rate": 8.886953240457791e-05, + "loss": 0.7177, + "step": 1450 + }, + { + "epoch": 0.298283482372289, + "grad_norm": 0.23517939448356628, + "learning_rate": 8.886730659972696e-05, + "loss": 0.744, + "step": 1451 + }, + { + "epoch": 0.2984890533456676, + "grad_norm": 0.23165474832057953, + "learning_rate": 8.886507863373883e-05, + "loss": 0.72, + "step": 1452 + }, + { + "epoch": 0.29869462431904614, + "grad_norm": 0.22487609088420868, + "learning_rate": 8.88628485067233e-05, + "loss": 0.6993, + "step": 1453 + }, + { + "epoch": 0.29890019529242473, + "grad_norm": 0.2359279990196228, + "learning_rate": 8.886061621879024e-05, + "loss": 0.7148, + "step": 1454 + }, + { + "epoch": 0.29910576626580326, + "grad_norm": 0.23191282153129578, + "learning_rate": 8.885838177004964e-05, + "loss": 0.73, + "step": 1455 + }, + { + "epoch": 0.29931133723918185, + "grad_norm": 0.2255670130252838, + "learning_rate": 8.885614516061156e-05, + "loss": 0.7192, + "step": 1456 + }, + { + "epoch": 0.2995169082125604, + "grad_norm": 0.21794365346431732, + "learning_rate": 8.885390639058617e-05, + "loss": 0.7126, + "step": 1457 + }, + { + "epoch": 0.299722479185939, + "grad_norm": 0.22137753665447235, + "learning_rate": 8.88516654600838e-05, + "loss": 0.6953, + "step": 1458 + }, + { + "epoch": 0.2999280501593175, + "grad_norm": 0.23347578942775726, + "learning_rate": 8.884942236921483e-05, + "loss": 0.7275, + "step": 1459 + }, + { + "epoch": 0.30013362113269604, + "grad_norm": 0.22592391073703766, + "learning_rate": 8.884717711808976e-05, + "loss": 0.7011, + "step": 1460 + }, + { + "epoch": 0.3003391921060746, + "grad_norm": 0.2333751916885376, + "learning_rate": 8.884492970681924e-05, + "loss": 0.5993, + "step": 1461 + }, + { + "epoch": 0.30054476307945316, + "grad_norm": 0.23949290812015533, + "learning_rate": 8.884268013551395e-05, + "loss": 0.7246, + "step": 1462 + }, + { + "epoch": 0.30075033405283175, + "grad_norm": 0.22439618408679962, + "learning_rate": 8.884042840428473e-05, + "loss": 0.7257, + "step": 1463 + }, + { + "epoch": 0.3009559050262103, + "grad_norm": 0.2332451343536377, + "learning_rate": 8.883817451324253e-05, + "loss": 0.7344, + "step": 1464 + }, + { + "epoch": 0.30116147599958887, + "grad_norm": 0.2470991313457489, + "learning_rate": 8.883591846249834e-05, + "loss": 0.7396, + "step": 1465 + }, + { + "epoch": 0.3013670469729674, + "grad_norm": 0.23062336444854736, + "learning_rate": 8.883366025216336e-05, + "loss": 0.715, + "step": 1466 + }, + { + "epoch": 0.301572617946346, + "grad_norm": 0.2705153226852417, + "learning_rate": 8.88313998823488e-05, + "loss": 0.7202, + "step": 1467 + }, + { + "epoch": 0.3017781889197245, + "grad_norm": 0.2432517409324646, + "learning_rate": 8.882913735316604e-05, + "loss": 0.7346, + "step": 1468 + }, + { + "epoch": 0.3019837598931031, + "grad_norm": 0.20731572806835175, + "learning_rate": 8.882687266472655e-05, + "loss": 0.6029, + "step": 1469 + }, + { + "epoch": 0.30218933086648164, + "grad_norm": 0.24890613555908203, + "learning_rate": 8.882460581714188e-05, + "loss": 0.743, + "step": 1470 + }, + { + "epoch": 0.30239490183986023, + "grad_norm": 0.23934966325759888, + "learning_rate": 8.882233681052371e-05, + "loss": 0.7102, + "step": 1471 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 0.2529708743095398, + "learning_rate": 8.882006564498385e-05, + "loss": 0.7366, + "step": 1472 + }, + { + "epoch": 0.30280604378661735, + "grad_norm": 0.22400988638401031, + "learning_rate": 8.881779232063416e-05, + "loss": 0.7295, + "step": 1473 + }, + { + "epoch": 0.3030116147599959, + "grad_norm": 0.23044519126415253, + "learning_rate": 8.881551683758664e-05, + "loss": 0.7332, + "step": 1474 + }, + { + "epoch": 0.3032171857333745, + "grad_norm": 0.2295847088098526, + "learning_rate": 8.881323919595341e-05, + "loss": 0.6939, + "step": 1475 + }, + { + "epoch": 0.303422756706753, + "grad_norm": 0.22964751720428467, + "learning_rate": 8.881095939584667e-05, + "loss": 0.7197, + "step": 1476 + }, + { + "epoch": 0.30362832768013154, + "grad_norm": 0.2278130203485489, + "learning_rate": 8.880867743737873e-05, + "loss": 0.7366, + "step": 1477 + }, + { + "epoch": 0.3038338986535101, + "grad_norm": 0.17138256132602692, + "learning_rate": 8.8806393320662e-05, + "loss": 0.585, + "step": 1478 + }, + { + "epoch": 0.30403946962688866, + "grad_norm": 0.23692992329597473, + "learning_rate": 8.880410704580904e-05, + "loss": 0.7368, + "step": 1479 + }, + { + "epoch": 0.30424504060026725, + "grad_norm": 0.23937001824378967, + "learning_rate": 8.880181861293245e-05, + "loss": 0.7465, + "step": 1480 + }, + { + "epoch": 0.3044506115736458, + "grad_norm": 0.2425798624753952, + "learning_rate": 8.879952802214498e-05, + "loss": 0.7235, + "step": 1481 + }, + { + "epoch": 0.30465618254702437, + "grad_norm": 0.22199256718158722, + "learning_rate": 8.87972352735595e-05, + "loss": 0.7266, + "step": 1482 + }, + { + "epoch": 0.3048617535204029, + "grad_norm": 0.22652393579483032, + "learning_rate": 8.879494036728895e-05, + "loss": 0.7196, + "step": 1483 + }, + { + "epoch": 0.3050673244937815, + "grad_norm": 0.23339220881462097, + "learning_rate": 8.879264330344637e-05, + "loss": 0.6907, + "step": 1484 + }, + { + "epoch": 0.30527289546716, + "grad_norm": 0.17793652415275574, + "learning_rate": 8.879034408214495e-05, + "loss": 0.5843, + "step": 1485 + }, + { + "epoch": 0.3054784664405386, + "grad_norm": 0.14778107404708862, + "learning_rate": 8.878804270349794e-05, + "loss": 0.5915, + "step": 1486 + }, + { + "epoch": 0.30568403741391714, + "grad_norm": 0.25510430335998535, + "learning_rate": 8.878573916761875e-05, + "loss": 0.7359, + "step": 1487 + }, + { + "epoch": 0.30588960838729573, + "grad_norm": 0.245680570602417, + "learning_rate": 8.878343347462083e-05, + "loss": 0.7232, + "step": 1488 + }, + { + "epoch": 0.30609517936067426, + "grad_norm": 0.22665980458259583, + "learning_rate": 8.878112562461781e-05, + "loss": 0.72, + "step": 1489 + }, + { + "epoch": 0.30630075033405285, + "grad_norm": 0.23110273480415344, + "learning_rate": 8.877881561772334e-05, + "loss": 0.7333, + "step": 1490 + }, + { + "epoch": 0.3065063213074314, + "grad_norm": 0.2374107986688614, + "learning_rate": 8.877650345405124e-05, + "loss": 0.7047, + "step": 1491 + }, + { + "epoch": 0.30671189228081, + "grad_norm": 0.23222175240516663, + "learning_rate": 8.877418913371543e-05, + "loss": 0.7247, + "step": 1492 + }, + { + "epoch": 0.3069174632541885, + "grad_norm": 0.2248169332742691, + "learning_rate": 8.877187265682993e-05, + "loss": 0.731, + "step": 1493 + }, + { + "epoch": 0.3071230342275671, + "grad_norm": 0.22877496480941772, + "learning_rate": 8.876955402350885e-05, + "loss": 0.7317, + "step": 1494 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 0.23524411022663116, + "learning_rate": 8.876723323386642e-05, + "loss": 0.7243, + "step": 1495 + }, + { + "epoch": 0.30753417617432416, + "grad_norm": 0.23392078280448914, + "learning_rate": 8.876491028801698e-05, + "loss": 0.7291, + "step": 1496 + }, + { + "epoch": 0.30773974714770275, + "grad_norm": 0.23218654096126556, + "learning_rate": 8.876258518607496e-05, + "loss": 0.7185, + "step": 1497 + }, + { + "epoch": 0.3079453181210813, + "grad_norm": 0.22467701137065887, + "learning_rate": 8.876025792815493e-05, + "loss": 0.6027, + "step": 1498 + }, + { + "epoch": 0.30815088909445987, + "grad_norm": 0.16272898018360138, + "learning_rate": 8.875792851437153e-05, + "loss": 0.5879, + "step": 1499 + }, + { + "epoch": 0.3083564600678384, + "grad_norm": 0.3116845190525055, + "learning_rate": 8.875559694483949e-05, + "loss": 0.7104, + "step": 1500 + }, + { + "epoch": 0.308562031041217, + "grad_norm": 0.27991852164268494, + "learning_rate": 8.875326321967371e-05, + "loss": 0.7103, + "step": 1501 + }, + { + "epoch": 0.3087676020145955, + "grad_norm": 0.2318386435508728, + "learning_rate": 8.875092733898917e-05, + "loss": 0.7377, + "step": 1502 + }, + { + "epoch": 0.3089731729879741, + "grad_norm": 0.2598876655101776, + "learning_rate": 8.874858930290091e-05, + "loss": 0.6015, + "step": 1503 + }, + { + "epoch": 0.30917874396135264, + "grad_norm": 0.29758408665657043, + "learning_rate": 8.874624911152415e-05, + "loss": 0.7181, + "step": 1504 + }, + { + "epoch": 0.30938431493473123, + "grad_norm": 0.27736955881118774, + "learning_rate": 8.874390676497416e-05, + "loss": 0.7206, + "step": 1505 + }, + { + "epoch": 0.30958988590810976, + "grad_norm": 0.2458835244178772, + "learning_rate": 8.874156226336634e-05, + "loss": 0.7499, + "step": 1506 + }, + { + "epoch": 0.30979545688148835, + "grad_norm": 0.22762452065944672, + "learning_rate": 8.873921560681619e-05, + "loss": 0.5821, + "step": 1507 + }, + { + "epoch": 0.3100010278548669, + "grad_norm": 0.27454984188079834, + "learning_rate": 8.873686679543934e-05, + "loss": 0.7146, + "step": 1508 + }, + { + "epoch": 0.3102065988282455, + "grad_norm": 0.26772287487983704, + "learning_rate": 8.873451582935148e-05, + "loss": 0.7536, + "step": 1509 + }, + { + "epoch": 0.310412169801624, + "grad_norm": 0.23362015187740326, + "learning_rate": 8.873216270866843e-05, + "loss": 0.6984, + "step": 1510 + }, + { + "epoch": 0.3106177407750026, + "grad_norm": 0.23610959947109222, + "learning_rate": 8.872980743350613e-05, + "loss": 0.7171, + "step": 1511 + }, + { + "epoch": 0.3108233117483811, + "grad_norm": 0.25894349813461304, + "learning_rate": 8.872745000398062e-05, + "loss": 0.7187, + "step": 1512 + }, + { + "epoch": 0.3110288827217597, + "grad_norm": 0.26054081320762634, + "learning_rate": 8.872509042020803e-05, + "loss": 0.7203, + "step": 1513 + }, + { + "epoch": 0.31123445369513825, + "grad_norm": 0.2335205376148224, + "learning_rate": 8.872272868230461e-05, + "loss": 0.7009, + "step": 1514 + }, + { + "epoch": 0.3114400246685168, + "grad_norm": 0.24587051570415497, + "learning_rate": 8.872036479038669e-05, + "loss": 0.7399, + "step": 1515 + }, + { + "epoch": 0.31164559564189537, + "grad_norm": 0.24924126267433167, + "learning_rate": 8.871799874457075e-05, + "loss": 0.7493, + "step": 1516 + }, + { + "epoch": 0.3118511666152739, + "grad_norm": 0.24950510263442993, + "learning_rate": 8.871563054497335e-05, + "loss": 0.7178, + "step": 1517 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 0.25369346141815186, + "learning_rate": 8.871326019171117e-05, + "loss": 0.6963, + "step": 1518 + }, + { + "epoch": 0.312262308562031, + "grad_norm": 0.2488810122013092, + "learning_rate": 8.871088768490098e-05, + "loss": 0.7619, + "step": 1519 + }, + { + "epoch": 0.3124678795354096, + "grad_norm": 0.24383045732975006, + "learning_rate": 8.870851302465962e-05, + "loss": 0.711, + "step": 1520 + }, + { + "epoch": 0.31267345050878814, + "grad_norm": 0.2425009161233902, + "learning_rate": 8.870613621110415e-05, + "loss": 0.7177, + "step": 1521 + }, + { + "epoch": 0.31287902148216673, + "grad_norm": 0.240753635764122, + "learning_rate": 8.870375724435162e-05, + "loss": 0.7244, + "step": 1522 + }, + { + "epoch": 0.31308459245554526, + "grad_norm": 0.23214225471019745, + "learning_rate": 8.870137612451926e-05, + "loss": 0.7576, + "step": 1523 + }, + { + "epoch": 0.31329016342892385, + "grad_norm": 0.2381378412246704, + "learning_rate": 8.869899285172435e-05, + "loss": 0.7379, + "step": 1524 + }, + { + "epoch": 0.3134957344023024, + "grad_norm": 0.24119152128696442, + "learning_rate": 8.869660742608429e-05, + "loss": 0.5884, + "step": 1525 + }, + { + "epoch": 0.31370130537568097, + "grad_norm": 0.1588635891675949, + "learning_rate": 8.869421984771664e-05, + "loss": 0.5977, + "step": 1526 + }, + { + "epoch": 0.3139068763490595, + "grad_norm": 0.30175936222076416, + "learning_rate": 8.869183011673899e-05, + "loss": 0.7523, + "step": 1527 + }, + { + "epoch": 0.3141124473224381, + "grad_norm": 0.2720763385295868, + "learning_rate": 8.868943823326911e-05, + "loss": 0.7369, + "step": 1528 + }, + { + "epoch": 0.3143180182958166, + "grad_norm": 0.25000452995300293, + "learning_rate": 8.868704419742477e-05, + "loss": 0.7248, + "step": 1529 + }, + { + "epoch": 0.3145235892691952, + "grad_norm": 0.24794606864452362, + "learning_rate": 8.8684648009324e-05, + "loss": 0.716, + "step": 1530 + }, + { + "epoch": 0.31472916024257375, + "grad_norm": 0.2837069630622864, + "learning_rate": 8.868224966908477e-05, + "loss": 0.7167, + "step": 1531 + }, + { + "epoch": 0.31493473121595233, + "grad_norm": 0.2553151845932007, + "learning_rate": 8.867984917682529e-05, + "loss": 0.728, + "step": 1532 + }, + { + "epoch": 0.31514030218933087, + "grad_norm": 0.2584458589553833, + "learning_rate": 8.86774465326638e-05, + "loss": 0.7546, + "step": 1533 + }, + { + "epoch": 0.3153458731627094, + "grad_norm": 0.3400932252407074, + "learning_rate": 8.867504173671866e-05, + "loss": 0.6503, + "step": 1534 + }, + { + "epoch": 0.315551444136088, + "grad_norm": 0.22265098989009857, + "learning_rate": 8.867263478910834e-05, + "loss": 0.6126, + "step": 1535 + }, + { + "epoch": 0.3157570151094665, + "grad_norm": 0.3153107464313507, + "learning_rate": 8.867022568995144e-05, + "loss": 0.7263, + "step": 1536 + }, + { + "epoch": 0.3159625860828451, + "grad_norm": 0.2766020596027374, + "learning_rate": 8.866781443936664e-05, + "loss": 0.7219, + "step": 1537 + }, + { + "epoch": 0.31616815705622364, + "grad_norm": 0.24225422739982605, + "learning_rate": 8.866540103747273e-05, + "loss": 0.7171, + "step": 1538 + }, + { + "epoch": 0.31637372802960223, + "grad_norm": 0.25176170468330383, + "learning_rate": 8.866298548438859e-05, + "loss": 0.7344, + "step": 1539 + }, + { + "epoch": 0.31657929900298076, + "grad_norm": 0.25651928782463074, + "learning_rate": 8.866056778023322e-05, + "loss": 0.7413, + "step": 1540 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 0.2334342896938324, + "learning_rate": 8.865814792512578e-05, + "loss": 0.7253, + "step": 1541 + }, + { + "epoch": 0.3169904409497379, + "grad_norm": 0.2274434119462967, + "learning_rate": 8.865572591918542e-05, + "loss": 0.7159, + "step": 1542 + }, + { + "epoch": 0.31719601192311647, + "grad_norm": 0.2403416633605957, + "learning_rate": 8.86533017625315e-05, + "loss": 0.7181, + "step": 1543 + }, + { + "epoch": 0.317401582896495, + "grad_norm": 0.4360656142234802, + "learning_rate": 8.865087545528343e-05, + "loss": 0.621, + "step": 1544 + }, + { + "epoch": 0.3176071538698736, + "grad_norm": 0.267894983291626, + "learning_rate": 8.864844699756077e-05, + "loss": 0.7211, + "step": 1545 + }, + { + "epoch": 0.3178127248432521, + "grad_norm": 0.28000763058662415, + "learning_rate": 8.864601638948313e-05, + "loss": 0.7417, + "step": 1546 + }, + { + "epoch": 0.3180182958166307, + "grad_norm": 0.25448542833328247, + "learning_rate": 8.864358363117026e-05, + "loss": 0.7456, + "step": 1547 + }, + { + "epoch": 0.31822386679000925, + "grad_norm": 0.22277960181236267, + "learning_rate": 8.864114872274201e-05, + "loss": 0.7509, + "step": 1548 + }, + { + "epoch": 0.31842943776338783, + "grad_norm": 0.25154295563697815, + "learning_rate": 8.863871166431835e-05, + "loss": 0.7561, + "step": 1549 + }, + { + "epoch": 0.31863500873676637, + "grad_norm": 0.24481630325317383, + "learning_rate": 8.863627245601933e-05, + "loss": 0.7205, + "step": 1550 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 0.2636171877384186, + "learning_rate": 8.863383109796514e-05, + "loss": 0.6225, + "step": 1551 + }, + { + "epoch": 0.3190461506835235, + "grad_norm": 0.24895146489143372, + "learning_rate": 8.863138759027601e-05, + "loss": 0.713, + "step": 1552 + }, + { + "epoch": 0.319251721656902, + "grad_norm": 0.23717238008975983, + "learning_rate": 8.862894193307234e-05, + "loss": 0.7009, + "step": 1553 + }, + { + "epoch": 0.3194572926302806, + "grad_norm": 0.17063067853450775, + "learning_rate": 8.862649412647463e-05, + "loss": 0.609, + "step": 1554 + }, + { + "epoch": 0.31966286360365914, + "grad_norm": 0.24430248141288757, + "learning_rate": 8.862404417060348e-05, + "loss": 0.7329, + "step": 1555 + }, + { + "epoch": 0.31986843457703773, + "grad_norm": 0.22696368396282196, + "learning_rate": 8.862159206557955e-05, + "loss": 0.7189, + "step": 1556 + }, + { + "epoch": 0.32007400555041626, + "grad_norm": 0.23269693553447723, + "learning_rate": 8.861913781152368e-05, + "loss": 0.72, + "step": 1557 + }, + { + "epoch": 0.32027957652379485, + "grad_norm": 0.23606634140014648, + "learning_rate": 8.861668140855677e-05, + "loss": 0.7273, + "step": 1558 + }, + { + "epoch": 0.3204851474971734, + "grad_norm": 0.2232600301504135, + "learning_rate": 8.861422285679982e-05, + "loss": 0.7271, + "step": 1559 + }, + { + "epoch": 0.32069071847055197, + "grad_norm": 0.22926129400730133, + "learning_rate": 8.861176215637396e-05, + "loss": 0.7046, + "step": 1560 + }, + { + "epoch": 0.3208962894439305, + "grad_norm": 0.21815744042396545, + "learning_rate": 8.860929930740043e-05, + "loss": 0.7145, + "step": 1561 + }, + { + "epoch": 0.3211018604173091, + "grad_norm": 0.2220899611711502, + "learning_rate": 8.860683431000055e-05, + "loss": 0.7517, + "step": 1562 + }, + { + "epoch": 0.3213074313906876, + "grad_norm": 0.23148676753044128, + "learning_rate": 8.860436716429576e-05, + "loss": 0.7425, + "step": 1563 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 0.2475571632385254, + "learning_rate": 8.86018978704076e-05, + "loss": 0.7373, + "step": 1564 + }, + { + "epoch": 0.32171857333744475, + "grad_norm": 0.22201502323150635, + "learning_rate": 8.859942642845773e-05, + "loss": 0.739, + "step": 1565 + }, + { + "epoch": 0.32192414431082333, + "grad_norm": 0.23228532075881958, + "learning_rate": 8.859695283856791e-05, + "loss": 0.7181, + "step": 1566 + }, + { + "epoch": 0.32212971528420187, + "grad_norm": 0.22633086144924164, + "learning_rate": 8.859447710085998e-05, + "loss": 0.7264, + "step": 1567 + }, + { + "epoch": 0.32233528625758046, + "grad_norm": 0.2289307564496994, + "learning_rate": 8.859199921545595e-05, + "loss": 0.6861, + "step": 1568 + }, + { + "epoch": 0.322540857230959, + "grad_norm": 0.2249763160943985, + "learning_rate": 8.858951918247784e-05, + "loss": 0.7251, + "step": 1569 + }, + { + "epoch": 0.3227464282043375, + "grad_norm": 0.21789641678333282, + "learning_rate": 8.858703700204787e-05, + "loss": 0.5872, + "step": 1570 + }, + { + "epoch": 0.3229519991777161, + "grad_norm": 0.32843679189682007, + "learning_rate": 8.85845526742883e-05, + "loss": 0.7297, + "step": 1571 + }, + { + "epoch": 0.32315757015109464, + "grad_norm": 0.2552517354488373, + "learning_rate": 8.858206619932154e-05, + "loss": 0.7297, + "step": 1572 + }, + { + "epoch": 0.32336314112447323, + "grad_norm": 0.1595383882522583, + "learning_rate": 8.857957757727008e-05, + "loss": 0.5928, + "step": 1573 + }, + { + "epoch": 0.32356871209785176, + "grad_norm": 0.23427622020244598, + "learning_rate": 8.857708680825654e-05, + "loss": 0.7416, + "step": 1574 + }, + { + "epoch": 0.32377428307123035, + "grad_norm": 0.2303827553987503, + "learning_rate": 8.85745938924036e-05, + "loss": 0.7506, + "step": 1575 + }, + { + "epoch": 0.3239798540446089, + "grad_norm": 0.2222229540348053, + "learning_rate": 8.857209882983408e-05, + "loss": 0.7212, + "step": 1576 + }, + { + "epoch": 0.32418542501798747, + "grad_norm": 0.21901166439056396, + "learning_rate": 8.856960162067091e-05, + "loss": 0.7307, + "step": 1577 + }, + { + "epoch": 0.324390995991366, + "grad_norm": 1.646615743637085, + "learning_rate": 8.85671022650371e-05, + "loss": 0.7284, + "step": 1578 + }, + { + "epoch": 0.3245965669647446, + "grad_norm": 0.22739437222480774, + "learning_rate": 8.856460076305581e-05, + "loss": 0.7468, + "step": 1579 + }, + { + "epoch": 0.3248021379381231, + "grad_norm": 0.22001872956752777, + "learning_rate": 8.856209711485026e-05, + "loss": 0.6801, + "step": 1580 + }, + { + "epoch": 0.3250077089115017, + "grad_norm": 0.2490796595811844, + "learning_rate": 8.855959132054379e-05, + "loss": 0.7225, + "step": 1581 + }, + { + "epoch": 0.32521327988488025, + "grad_norm": 0.23509925603866577, + "learning_rate": 8.855708338025985e-05, + "loss": 0.7126, + "step": 1582 + }, + { + "epoch": 0.32541885085825883, + "grad_norm": 0.26781192421913147, + "learning_rate": 8.8554573294122e-05, + "loss": 0.7345, + "step": 1583 + }, + { + "epoch": 0.32562442183163737, + "grad_norm": 0.23214460909366608, + "learning_rate": 8.85520610622539e-05, + "loss": 0.7287, + "step": 1584 + }, + { + "epoch": 0.32582999280501596, + "grad_norm": 0.24188122153282166, + "learning_rate": 8.854954668477931e-05, + "loss": 0.7169, + "step": 1585 + }, + { + "epoch": 0.3260355637783945, + "grad_norm": 0.22148127853870392, + "learning_rate": 8.85470301618221e-05, + "loss": 0.7128, + "step": 1586 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 0.6666994690895081, + "learning_rate": 8.854451149350625e-05, + "loss": 0.6192, + "step": 1587 + }, + { + "epoch": 0.3264467057251516, + "grad_norm": 0.24034947156906128, + "learning_rate": 8.854199067995585e-05, + "loss": 0.724, + "step": 1588 + }, + { + "epoch": 0.32665227669853014, + "grad_norm": 0.23072193562984467, + "learning_rate": 8.85394677212951e-05, + "loss": 0.727, + "step": 1589 + }, + { + "epoch": 0.32685784767190873, + "grad_norm": 0.23429062962532043, + "learning_rate": 8.853694261764826e-05, + "loss": 0.7165, + "step": 1590 + }, + { + "epoch": 0.32706341864528726, + "grad_norm": 0.23310211300849915, + "learning_rate": 8.853441536913976e-05, + "loss": 0.7284, + "step": 1591 + }, + { + "epoch": 0.32726898961866585, + "grad_norm": 0.2373618483543396, + "learning_rate": 8.853188597589409e-05, + "loss": 0.7347, + "step": 1592 + }, + { + "epoch": 0.3274745605920444, + "grad_norm": 0.22494561970233917, + "learning_rate": 8.852935443803587e-05, + "loss": 0.73, + "step": 1593 + }, + { + "epoch": 0.32768013156542297, + "grad_norm": 0.22148995101451874, + "learning_rate": 8.85268207556898e-05, + "loss": 0.7105, + "step": 1594 + }, + { + "epoch": 0.3278857025388015, + "grad_norm": 0.23605044186115265, + "learning_rate": 8.852428492898071e-05, + "loss": 0.7147, + "step": 1595 + }, + { + "epoch": 0.3280912735121801, + "grad_norm": 0.23167657852172852, + "learning_rate": 8.852174695803355e-05, + "loss": 0.7129, + "step": 1596 + }, + { + "epoch": 0.3282968444855586, + "grad_norm": 0.2309151291847229, + "learning_rate": 8.851920684297333e-05, + "loss": 0.7087, + "step": 1597 + }, + { + "epoch": 0.3285024154589372, + "grad_norm": 0.22455458343029022, + "learning_rate": 8.85166645839252e-05, + "loss": 0.7316, + "step": 1598 + }, + { + "epoch": 0.32870798643231575, + "grad_norm": 0.2276565134525299, + "learning_rate": 8.85141201810144e-05, + "loss": 0.719, + "step": 1599 + }, + { + "epoch": 0.32891355740569433, + "grad_norm": 0.23086774349212646, + "learning_rate": 8.851157363436628e-05, + "loss": 0.7065, + "step": 1600 + }, + { + "epoch": 0.32911912837907287, + "grad_norm": 0.23493504524230957, + "learning_rate": 8.850902494410631e-05, + "loss": 0.7245, + "step": 1601 + }, + { + "epoch": 0.32932469935245146, + "grad_norm": 0.24357451498508453, + "learning_rate": 8.850647411036003e-05, + "loss": 0.7151, + "step": 1602 + }, + { + "epoch": 0.32953027032583, + "grad_norm": 0.24102084338665009, + "learning_rate": 8.850392113325312e-05, + "loss": 0.7389, + "step": 1603 + }, + { + "epoch": 0.3297358412992086, + "grad_norm": 0.2216963768005371, + "learning_rate": 8.850136601291137e-05, + "loss": 0.703, + "step": 1604 + }, + { + "epoch": 0.3299414122725871, + "grad_norm": 0.22978007793426514, + "learning_rate": 8.849880874946062e-05, + "loss": 0.7402, + "step": 1605 + }, + { + "epoch": 0.3301469832459657, + "grad_norm": 0.23540645837783813, + "learning_rate": 8.849624934302689e-05, + "loss": 0.6975, + "step": 1606 + }, + { + "epoch": 0.33035255421934423, + "grad_norm": 1.370906949043274, + "learning_rate": 8.849368779373625e-05, + "loss": 0.8282, + "step": 1607 + }, + { + "epoch": 0.33055812519272276, + "grad_norm": 0.2301483154296875, + "learning_rate": 8.84911241017149e-05, + "loss": 0.7083, + "step": 1608 + }, + { + "epoch": 0.33076369616610135, + "grad_norm": 0.24278217554092407, + "learning_rate": 8.848855826708914e-05, + "loss": 0.724, + "step": 1609 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 0.25511378049850464, + "learning_rate": 8.848599028998538e-05, + "loss": 0.7214, + "step": 1610 + }, + { + "epoch": 0.33117483811285847, + "grad_norm": 0.2384072244167328, + "learning_rate": 8.848342017053015e-05, + "loss": 0.7211, + "step": 1611 + }, + { + "epoch": 0.331380409086237, + "grad_norm": 0.31351780891418457, + "learning_rate": 8.848084790885003e-05, + "loss": 0.6297, + "step": 1612 + }, + { + "epoch": 0.3315859800596156, + "grad_norm": 0.262350469827652, + "learning_rate": 8.847827350507177e-05, + "loss": 0.7176, + "step": 1613 + }, + { + "epoch": 0.3317915510329941, + "grad_norm": 0.2178378701210022, + "learning_rate": 8.847569695932219e-05, + "loss": 0.5897, + "step": 1614 + }, + { + "epoch": 0.3319971220063727, + "grad_norm": 0.2447414994239807, + "learning_rate": 8.847311827172822e-05, + "loss": 0.7119, + "step": 1615 + }, + { + "epoch": 0.33220269297975125, + "grad_norm": 0.23930813372135162, + "learning_rate": 8.84705374424169e-05, + "loss": 0.7297, + "step": 1616 + }, + { + "epoch": 0.33240826395312983, + "grad_norm": 0.18309295177459717, + "learning_rate": 8.846795447151539e-05, + "loss": 0.6059, + "step": 1617 + }, + { + "epoch": 0.33261383492650837, + "grad_norm": 0.23922927677631378, + "learning_rate": 8.846536935915093e-05, + "loss": 0.709, + "step": 1618 + }, + { + "epoch": 0.33281940589988696, + "grad_norm": 0.24151726067066193, + "learning_rate": 8.846278210545089e-05, + "loss": 0.7009, + "step": 1619 + }, + { + "epoch": 0.3330249768732655, + "grad_norm": 0.23320122063159943, + "learning_rate": 8.846019271054272e-05, + "loss": 0.702, + "step": 1620 + }, + { + "epoch": 0.3332305478466441, + "grad_norm": 0.24178290367126465, + "learning_rate": 8.845760117455397e-05, + "loss": 0.7359, + "step": 1621 + }, + { + "epoch": 0.3334361188200226, + "grad_norm": 0.6629179120063782, + "learning_rate": 8.845500749761233e-05, + "loss": 0.7394, + "step": 1622 + }, + { + "epoch": 0.3336416897934012, + "grad_norm": 0.2403455376625061, + "learning_rate": 8.84524116798456e-05, + "loss": 0.7285, + "step": 1623 + }, + { + "epoch": 0.33384726076677973, + "grad_norm": 0.19743573665618896, + "learning_rate": 8.844981372138162e-05, + "loss": 0.6283, + "step": 1624 + }, + { + "epoch": 0.33405283174015826, + "grad_norm": 0.2429579198360443, + "learning_rate": 8.844721362234841e-05, + "loss": 0.7409, + "step": 1625 + }, + { + "epoch": 0.33425840271353685, + "grad_norm": 0.24667932093143463, + "learning_rate": 8.844461138287406e-05, + "loss": 0.7242, + "step": 1626 + }, + { + "epoch": 0.3344639736869154, + "grad_norm": 0.2274756133556366, + "learning_rate": 8.844200700308677e-05, + "loss": 0.7241, + "step": 1627 + }, + { + "epoch": 0.33466954466029397, + "grad_norm": 0.24319452047348022, + "learning_rate": 8.843940048311484e-05, + "loss": 0.7248, + "step": 1628 + }, + { + "epoch": 0.3348751156336725, + "grad_norm": 0.23962891101837158, + "learning_rate": 8.843679182308668e-05, + "loss": 0.7236, + "step": 1629 + }, + { + "epoch": 0.3350806866070511, + "grad_norm": 0.23430408537387848, + "learning_rate": 8.84341810231308e-05, + "loss": 0.7255, + "step": 1630 + }, + { + "epoch": 0.3352862575804296, + "grad_norm": 0.2336353361606598, + "learning_rate": 8.843156808337585e-05, + "loss": 0.7229, + "step": 1631 + }, + { + "epoch": 0.3354918285538082, + "grad_norm": 0.22381432354450226, + "learning_rate": 8.842895300395054e-05, + "loss": 0.7248, + "step": 1632 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 0.2316228300333023, + "learning_rate": 8.842633578498368e-05, + "loss": 0.7343, + "step": 1633 + }, + { + "epoch": 0.33590297050056533, + "grad_norm": 0.22491221129894257, + "learning_rate": 8.842371642660424e-05, + "loss": 0.718, + "step": 1634 + }, + { + "epoch": 0.33610854147394387, + "grad_norm": 0.2314968854188919, + "learning_rate": 8.842109492894127e-05, + "loss": 0.7289, + "step": 1635 + }, + { + "epoch": 0.33631411244732246, + "grad_norm": 0.23885907232761383, + "learning_rate": 8.841847129212389e-05, + "loss": 0.7338, + "step": 1636 + }, + { + "epoch": 0.336519683420701, + "grad_norm": 0.22755815088748932, + "learning_rate": 8.841584551628136e-05, + "loss": 0.7238, + "step": 1637 + }, + { + "epoch": 0.3367252543940796, + "grad_norm": 0.2223365604877472, + "learning_rate": 8.841321760154306e-05, + "loss": 0.729, + "step": 1638 + }, + { + "epoch": 0.3369308253674581, + "grad_norm": 0.23648889362812042, + "learning_rate": 8.841058754803844e-05, + "loss": 0.7479, + "step": 1639 + }, + { + "epoch": 0.3371363963408367, + "grad_norm": 0.22464527189731598, + "learning_rate": 8.840795535589706e-05, + "loss": 0.7364, + "step": 1640 + }, + { + "epoch": 0.33734196731421523, + "grad_norm": 0.22983680665493011, + "learning_rate": 8.840532102524861e-05, + "loss": 0.7288, + "step": 1641 + }, + { + "epoch": 0.3375475382875938, + "grad_norm": 0.22532789409160614, + "learning_rate": 8.840268455622288e-05, + "loss": 0.7626, + "step": 1642 + }, + { + "epoch": 0.33775310926097235, + "grad_norm": 0.22486740350723267, + "learning_rate": 8.840004594894974e-05, + "loss": 0.7198, + "step": 1643 + }, + { + "epoch": 0.3379586802343509, + "grad_norm": 0.220737487077713, + "learning_rate": 8.839740520355918e-05, + "loss": 0.7467, + "step": 1644 + }, + { + "epoch": 0.33816425120772947, + "grad_norm": 0.23781028389930725, + "learning_rate": 8.839476232018131e-05, + "loss": 0.7162, + "step": 1645 + }, + { + "epoch": 0.338369822181108, + "grad_norm": 0.22306212782859802, + "learning_rate": 8.839211729894634e-05, + "loss": 0.7024, + "step": 1646 + }, + { + "epoch": 0.3385753931544866, + "grad_norm": 0.22637905180454254, + "learning_rate": 8.838947013998454e-05, + "loss": 0.7227, + "step": 1647 + }, + { + "epoch": 0.3387809641278651, + "grad_norm": 0.21539071202278137, + "learning_rate": 8.838682084342637e-05, + "loss": 0.715, + "step": 1648 + }, + { + "epoch": 0.3389865351012437, + "grad_norm": 0.21236176788806915, + "learning_rate": 8.838416940940232e-05, + "loss": 0.6935, + "step": 1649 + }, + { + "epoch": 0.33919210607462225, + "grad_norm": 0.21903282403945923, + "learning_rate": 8.838151583804302e-05, + "loss": 0.6875, + "step": 1650 + }, + { + "epoch": 0.33939767704800083, + "grad_norm": 0.22233720123767853, + "learning_rate": 8.83788601294792e-05, + "loss": 0.7196, + "step": 1651 + }, + { + "epoch": 0.33960324802137937, + "grad_norm": 0.21296600997447968, + "learning_rate": 8.837620228384169e-05, + "loss": 0.7383, + "step": 1652 + }, + { + "epoch": 0.33980881899475796, + "grad_norm": 0.21336333453655243, + "learning_rate": 8.837354230126144e-05, + "loss": 0.7222, + "step": 1653 + }, + { + "epoch": 0.3400143899681365, + "grad_norm": 0.22977587580680847, + "learning_rate": 8.837088018186948e-05, + "loss": 0.7053, + "step": 1654 + }, + { + "epoch": 0.3402199609415151, + "grad_norm": 0.22435788810253143, + "learning_rate": 8.836821592579697e-05, + "loss": 0.6154, + "step": 1655 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.23182466626167297, + "learning_rate": 8.836554953317518e-05, + "loss": 0.7294, + "step": 1656 + }, + { + "epoch": 0.3406311028882722, + "grad_norm": 0.2296569049358368, + "learning_rate": 8.836288100413543e-05, + "loss": 0.7147, + "step": 1657 + }, + { + "epoch": 0.34083667386165073, + "grad_norm": 0.22955302894115448, + "learning_rate": 8.836021033880922e-05, + "loss": 0.7228, + "step": 1658 + }, + { + "epoch": 0.3410422448350293, + "grad_norm": 0.28406065702438354, + "learning_rate": 8.83575375373281e-05, + "loss": 0.722, + "step": 1659 + }, + { + "epoch": 0.34124781580840785, + "grad_norm": 0.22933915257453918, + "learning_rate": 8.835486259982378e-05, + "loss": 0.7365, + "step": 1660 + }, + { + "epoch": 0.34145338678178644, + "grad_norm": 0.18561038374900818, + "learning_rate": 8.835218552642801e-05, + "loss": 0.6073, + "step": 1661 + }, + { + "epoch": 0.34165895775516497, + "grad_norm": 0.22962850332260132, + "learning_rate": 8.834950631727269e-05, + "loss": 0.7329, + "step": 1662 + }, + { + "epoch": 0.3418645287285435, + "grad_norm": 0.22192583978176117, + "learning_rate": 8.83468249724898e-05, + "loss": 0.6966, + "step": 1663 + }, + { + "epoch": 0.3420700997019221, + "grad_norm": 0.2303367406129837, + "learning_rate": 8.834414149221145e-05, + "loss": 0.7083, + "step": 1664 + }, + { + "epoch": 0.3422756706753006, + "grad_norm": 0.21235564351081848, + "learning_rate": 8.834145587656984e-05, + "loss": 0.7054, + "step": 1665 + }, + { + "epoch": 0.3424812416486792, + "grad_norm": 0.22414252161979675, + "learning_rate": 8.833876812569728e-05, + "loss": 0.7094, + "step": 1666 + }, + { + "epoch": 0.34268681262205775, + "grad_norm": 0.21854104101657867, + "learning_rate": 8.833607823972617e-05, + "loss": 0.7009, + "step": 1667 + }, + { + "epoch": 0.34289238359543633, + "grad_norm": 0.21945634484291077, + "learning_rate": 8.833338621878904e-05, + "loss": 0.7214, + "step": 1668 + }, + { + "epoch": 0.34309795456881487, + "grad_norm": 0.22008635103702545, + "learning_rate": 8.833069206301852e-05, + "loss": 0.7231, + "step": 1669 + }, + { + "epoch": 0.34330352554219346, + "grad_norm": 0.2222408652305603, + "learning_rate": 8.832799577254734e-05, + "loss": 0.7249, + "step": 1670 + }, + { + "epoch": 0.343509096515572, + "grad_norm": 0.22058893740177155, + "learning_rate": 8.83252973475083e-05, + "loss": 0.7196, + "step": 1671 + }, + { + "epoch": 0.3437146674889506, + "grad_norm": 0.2201676368713379, + "learning_rate": 8.832259678803437e-05, + "loss": 0.7226, + "step": 1672 + }, + { + "epoch": 0.3439202384623291, + "grad_norm": 0.21815598011016846, + "learning_rate": 8.831989409425857e-05, + "loss": 0.6943, + "step": 1673 + }, + { + "epoch": 0.3441258094357077, + "grad_norm": 0.22216841578483582, + "learning_rate": 8.831718926631409e-05, + "loss": 0.7259, + "step": 1674 + }, + { + "epoch": 0.34433138040908623, + "grad_norm": 0.21504633128643036, + "learning_rate": 8.831448230433415e-05, + "loss": 0.7286, + "step": 1675 + }, + { + "epoch": 0.3445369513824648, + "grad_norm": 0.20685335993766785, + "learning_rate": 8.83117732084521e-05, + "loss": 0.6891, + "step": 1676 + }, + { + "epoch": 0.34474252235584335, + "grad_norm": 0.1763618290424347, + "learning_rate": 8.830906197880146e-05, + "loss": 0.6218, + "step": 1677 + }, + { + "epoch": 0.34494809332922194, + "grad_norm": 0.24009843170642853, + "learning_rate": 8.830634861551573e-05, + "loss": 0.7337, + "step": 1678 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 0.21924906969070435, + "learning_rate": 8.830363311872862e-05, + "loss": 0.7194, + "step": 1679 + }, + { + "epoch": 0.345359235275979, + "grad_norm": 0.22524218261241913, + "learning_rate": 8.830091548857392e-05, + "loss": 0.728, + "step": 1680 + }, + { + "epoch": 0.3455648062493576, + "grad_norm": 0.15049724280834198, + "learning_rate": 8.829819572518549e-05, + "loss": 0.5879, + "step": 1681 + }, + { + "epoch": 0.3457703772227361, + "grad_norm": 0.23018436133861542, + "learning_rate": 8.829547382869734e-05, + "loss": 0.7318, + "step": 1682 + }, + { + "epoch": 0.3459759481961147, + "grad_norm": 0.14980974793434143, + "learning_rate": 8.829274979924355e-05, + "loss": 0.6082, + "step": 1683 + }, + { + "epoch": 0.34618151916949325, + "grad_norm": 0.23299898207187653, + "learning_rate": 8.829002363695834e-05, + "loss": 0.6979, + "step": 1684 + }, + { + "epoch": 0.34638709014287183, + "grad_norm": 0.22874654829502106, + "learning_rate": 8.828729534197599e-05, + "loss": 0.7117, + "step": 1685 + }, + { + "epoch": 0.34659266111625037, + "grad_norm": 0.14617690443992615, + "learning_rate": 8.828456491443093e-05, + "loss": 0.5823, + "step": 1686 + }, + { + "epoch": 0.34679823208962895, + "grad_norm": 0.14507731795310974, + "learning_rate": 8.828183235445767e-05, + "loss": 0.6002, + "step": 1687 + }, + { + "epoch": 0.3470038030630075, + "grad_norm": 0.15053583681583405, + "learning_rate": 8.827909766219082e-05, + "loss": 0.6047, + "step": 1688 + }, + { + "epoch": 0.3472093740363861, + "grad_norm": 0.1374531388282776, + "learning_rate": 8.827636083776512e-05, + "loss": 0.6148, + "step": 1689 + }, + { + "epoch": 0.3474149450097646, + "grad_norm": 0.2662424147129059, + "learning_rate": 8.827362188131539e-05, + "loss": 0.7147, + "step": 1690 + }, + { + "epoch": 0.3476205159831432, + "grad_norm": 0.24824592471122742, + "learning_rate": 8.827088079297658e-05, + "loss": 0.749, + "step": 1691 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.17181143164634705, + "learning_rate": 8.826813757288371e-05, + "loss": 0.605, + "step": 1692 + }, + { + "epoch": 0.3480316579299003, + "grad_norm": 0.2484540492296219, + "learning_rate": 8.826539222117195e-05, + "loss": 0.7012, + "step": 1693 + }, + { + "epoch": 0.34823722890327885, + "grad_norm": 0.17473895847797394, + "learning_rate": 8.826264473797651e-05, + "loss": 0.5969, + "step": 1694 + }, + { + "epoch": 0.34844279987665744, + "grad_norm": 0.14865082502365112, + "learning_rate": 8.825989512343281e-05, + "loss": 0.6109, + "step": 1695 + }, + { + "epoch": 0.34864837085003597, + "grad_norm": 0.26978155970573425, + "learning_rate": 8.825714337767625e-05, + "loss": 0.7122, + "step": 1696 + }, + { + "epoch": 0.34885394182341456, + "grad_norm": 0.15846404433250427, + "learning_rate": 8.825438950084241e-05, + "loss": 0.5924, + "step": 1697 + }, + { + "epoch": 0.3490595127967931, + "grad_norm": 0.23453454673290253, + "learning_rate": 8.8251633493067e-05, + "loss": 0.7328, + "step": 1698 + }, + { + "epoch": 0.3492650837701716, + "grad_norm": 0.22266656160354614, + "learning_rate": 8.824887535448574e-05, + "loss": 0.7041, + "step": 1699 + }, + { + "epoch": 0.3494706547435502, + "grad_norm": 0.2392280548810959, + "learning_rate": 8.824611508523455e-05, + "loss": 0.7133, + "step": 1700 + }, + { + "epoch": 0.34967622571692875, + "grad_norm": 0.22809362411499023, + "learning_rate": 8.82433526854494e-05, + "loss": 0.7258, + "step": 1701 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 0.2222517728805542, + "learning_rate": 8.824058815526637e-05, + "loss": 0.7114, + "step": 1702 + }, + { + "epoch": 0.35008736766368587, + "grad_norm": 0.23900644481182098, + "learning_rate": 8.823782149482169e-05, + "loss": 0.7146, + "step": 1703 + }, + { + "epoch": 0.35029293863706445, + "grad_norm": 0.2216804325580597, + "learning_rate": 8.823505270425162e-05, + "loss": 0.712, + "step": 1704 + }, + { + "epoch": 0.350498509610443, + "grad_norm": 0.22626622021198273, + "learning_rate": 8.823228178369259e-05, + "loss": 0.7145, + "step": 1705 + }, + { + "epoch": 0.3507040805838216, + "grad_norm": 0.23051661252975464, + "learning_rate": 8.82295087332811e-05, + "loss": 0.7246, + "step": 1706 + }, + { + "epoch": 0.3509096515572001, + "grad_norm": 0.19165797531604767, + "learning_rate": 8.822673355315376e-05, + "loss": 0.6022, + "step": 1707 + }, + { + "epoch": 0.3511152225305787, + "grad_norm": 0.15455321967601776, + "learning_rate": 8.822395624344733e-05, + "loss": 0.5952, + "step": 1708 + }, + { + "epoch": 0.35132079350395723, + "grad_norm": 0.25851893424987793, + "learning_rate": 8.822117680429856e-05, + "loss": 0.7155, + "step": 1709 + }, + { + "epoch": 0.3515263644773358, + "grad_norm": 0.14911410212516785, + "learning_rate": 8.821839523584446e-05, + "loss": 0.6002, + "step": 1710 + }, + { + "epoch": 0.35173193545071435, + "grad_norm": 0.2250581830739975, + "learning_rate": 8.821561153822202e-05, + "loss": 0.694, + "step": 1711 + }, + { + "epoch": 0.35193750642409294, + "grad_norm": 0.17733228206634521, + "learning_rate": 8.821282571156838e-05, + "loss": 0.5743, + "step": 1712 + }, + { + "epoch": 0.35214307739747147, + "grad_norm": 0.23851247131824493, + "learning_rate": 8.82100377560208e-05, + "loss": 0.7278, + "step": 1713 + }, + { + "epoch": 0.35234864837085006, + "grad_norm": 0.23099485039710999, + "learning_rate": 8.820724767171662e-05, + "loss": 0.7387, + "step": 1714 + }, + { + "epoch": 0.3525542193442286, + "grad_norm": 0.22473661601543427, + "learning_rate": 8.82044554587933e-05, + "loss": 0.7185, + "step": 1715 + }, + { + "epoch": 0.3527597903176072, + "grad_norm": 0.22726485133171082, + "learning_rate": 8.820166111738839e-05, + "loss": 0.7141, + "step": 1716 + }, + { + "epoch": 0.3529653612909857, + "grad_norm": 0.2528528869152069, + "learning_rate": 8.819886464763958e-05, + "loss": 0.725, + "step": 1717 + }, + { + "epoch": 0.35317093226436425, + "grad_norm": 0.1892632395029068, + "learning_rate": 8.81960660496846e-05, + "loss": 0.5938, + "step": 1718 + }, + { + "epoch": 0.35337650323774283, + "grad_norm": 0.22239932417869568, + "learning_rate": 8.819326532366134e-05, + "loss": 0.7044, + "step": 1719 + }, + { + "epoch": 0.35358207421112137, + "grad_norm": 0.22476689517498016, + "learning_rate": 8.81904624697078e-05, + "loss": 0.7243, + "step": 1720 + }, + { + "epoch": 0.35378764518449995, + "grad_norm": 0.2231576144695282, + "learning_rate": 8.818765748796204e-05, + "loss": 0.7159, + "step": 1721 + }, + { + "epoch": 0.3539932161578785, + "grad_norm": 0.21081259846687317, + "learning_rate": 8.818485037856224e-05, + "loss": 0.7144, + "step": 1722 + }, + { + "epoch": 0.3541987871312571, + "grad_norm": 0.22331789135932922, + "learning_rate": 8.818204114164673e-05, + "loss": 0.7398, + "step": 1723 + }, + { + "epoch": 0.3544043581046356, + "grad_norm": 0.1838466078042984, + "learning_rate": 8.817922977735387e-05, + "loss": 0.6238, + "step": 1724 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 0.2340015321969986, + "learning_rate": 8.81764162858222e-05, + "loss": 0.7226, + "step": 1725 + }, + { + "epoch": 0.35481550005139273, + "grad_norm": 0.14466704428195953, + "learning_rate": 8.817360066719027e-05, + "loss": 0.5699, + "step": 1726 + }, + { + "epoch": 0.3550210710247713, + "grad_norm": 0.23499037325382233, + "learning_rate": 8.817078292159686e-05, + "loss": 0.71, + "step": 1727 + }, + { + "epoch": 0.35522664199814985, + "grad_norm": 0.24169334769248962, + "learning_rate": 8.816796304918072e-05, + "loss": 0.7195, + "step": 1728 + }, + { + "epoch": 0.35543221297152844, + "grad_norm": 0.16424809396266937, + "learning_rate": 8.816514105008086e-05, + "loss": 0.5792, + "step": 1729 + }, + { + "epoch": 0.35563778394490697, + "grad_norm": 0.2632940113544464, + "learning_rate": 8.816231692443621e-05, + "loss": 0.7313, + "step": 1730 + }, + { + "epoch": 0.35584335491828556, + "grad_norm": 0.23430821299552917, + "learning_rate": 8.815949067238596e-05, + "loss": 0.7073, + "step": 1731 + }, + { + "epoch": 0.3560489258916641, + "grad_norm": 0.22487561404705048, + "learning_rate": 8.815666229406932e-05, + "loss": 0.7182, + "step": 1732 + }, + { + "epoch": 0.3562544968650427, + "grad_norm": 0.24197392165660858, + "learning_rate": 8.815383178962566e-05, + "loss": 0.7196, + "step": 1733 + }, + { + "epoch": 0.3564600678384212, + "grad_norm": 0.22599098086357117, + "learning_rate": 8.81509991591944e-05, + "loss": 0.7165, + "step": 1734 + }, + { + "epoch": 0.3566656388117998, + "grad_norm": 0.22369571030139923, + "learning_rate": 8.814816440291509e-05, + "loss": 0.7385, + "step": 1735 + }, + { + "epoch": 0.35687120978517833, + "grad_norm": 0.23025518655776978, + "learning_rate": 8.81453275209274e-05, + "loss": 0.7184, + "step": 1736 + }, + { + "epoch": 0.35707678075855687, + "grad_norm": 0.22964996099472046, + "learning_rate": 8.81424885133711e-05, + "loss": 0.7192, + "step": 1737 + }, + { + "epoch": 0.35728235173193545, + "grad_norm": 0.19159770011901855, + "learning_rate": 8.813964738038602e-05, + "loss": 0.6025, + "step": 1738 + }, + { + "epoch": 0.357487922705314, + "grad_norm": 0.2504747211933136, + "learning_rate": 8.813680412211216e-05, + "loss": 0.6964, + "step": 1739 + }, + { + "epoch": 0.3576934936786926, + "grad_norm": 0.23766383528709412, + "learning_rate": 8.813395873868956e-05, + "loss": 0.7021, + "step": 1740 + }, + { + "epoch": 0.3578990646520711, + "grad_norm": 0.2447771579027176, + "learning_rate": 8.813111123025844e-05, + "loss": 0.7185, + "step": 1741 + }, + { + "epoch": 0.3581046356254497, + "grad_norm": 0.23200775682926178, + "learning_rate": 8.812826159695907e-05, + "loss": 0.7188, + "step": 1742 + }, + { + "epoch": 0.35831020659882823, + "grad_norm": 0.22907336056232452, + "learning_rate": 8.812540983893181e-05, + "loss": 0.6909, + "step": 1743 + }, + { + "epoch": 0.3585157775722068, + "grad_norm": 0.22600993514060974, + "learning_rate": 8.812255595631719e-05, + "loss": 0.7074, + "step": 1744 + }, + { + "epoch": 0.35872134854558535, + "grad_norm": 0.2269076704978943, + "learning_rate": 8.811969994925578e-05, + "loss": 0.6814, + "step": 1745 + }, + { + "epoch": 0.35892691951896394, + "grad_norm": 0.21256834268569946, + "learning_rate": 8.811684181788831e-05, + "loss": 0.7353, + "step": 1746 + }, + { + "epoch": 0.35913249049234247, + "grad_norm": 0.22337260842323303, + "learning_rate": 8.811398156235557e-05, + "loss": 0.7398, + "step": 1747 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 0.2335451990365982, + "learning_rate": 8.811111918279847e-05, + "loss": 0.7205, + "step": 1748 + }, + { + "epoch": 0.3595436324390996, + "grad_norm": 0.21998728811740875, + "learning_rate": 8.810825467935802e-05, + "loss": 0.6947, + "step": 1749 + }, + { + "epoch": 0.3597492034124782, + "grad_norm": 0.272847443819046, + "learning_rate": 8.810538805217535e-05, + "loss": 0.6987, + "step": 1750 + }, + { + "epoch": 0.3599547743858567, + "grad_norm": 0.22549496591091156, + "learning_rate": 8.810251930139169e-05, + "loss": 0.7159, + "step": 1751 + }, + { + "epoch": 0.3601603453592353, + "grad_norm": 0.21950645744800568, + "learning_rate": 8.809964842714837e-05, + "loss": 0.7493, + "step": 1752 + }, + { + "epoch": 0.36036591633261383, + "grad_norm": 0.21935752034187317, + "learning_rate": 8.809677542958681e-05, + "loss": 0.6923, + "step": 1753 + }, + { + "epoch": 0.36057148730599237, + "grad_norm": 0.2425873726606369, + "learning_rate": 8.809390030884856e-05, + "loss": 0.7055, + "step": 1754 + }, + { + "epoch": 0.36077705827937095, + "grad_norm": 0.21217839419841766, + "learning_rate": 8.809102306507527e-05, + "loss": 0.7261, + "step": 1755 + }, + { + "epoch": 0.3609826292527495, + "grad_norm": 0.22305883467197418, + "learning_rate": 8.808814369840867e-05, + "loss": 0.6804, + "step": 1756 + }, + { + "epoch": 0.3611882002261281, + "grad_norm": 0.23050794005393982, + "learning_rate": 8.808526220899063e-05, + "loss": 0.7209, + "step": 1757 + }, + { + "epoch": 0.3613937711995066, + "grad_norm": 0.21624812483787537, + "learning_rate": 8.80823785969631e-05, + "loss": 0.733, + "step": 1758 + }, + { + "epoch": 0.3615993421728852, + "grad_norm": 0.2256494164466858, + "learning_rate": 8.807949286246814e-05, + "loss": 0.7133, + "step": 1759 + }, + { + "epoch": 0.36180491314626373, + "grad_norm": 0.2232973873615265, + "learning_rate": 8.807660500564793e-05, + "loss": 0.7099, + "step": 1760 + }, + { + "epoch": 0.3620104841196423, + "grad_norm": 0.21484389901161194, + "learning_rate": 8.807371502664473e-05, + "loss": 0.7089, + "step": 1761 + }, + { + "epoch": 0.36221605509302085, + "grad_norm": 0.22121310234069824, + "learning_rate": 8.807082292560089e-05, + "loss": 0.7098, + "step": 1762 + }, + { + "epoch": 0.36242162606639944, + "grad_norm": 0.22262440621852875, + "learning_rate": 8.806792870265895e-05, + "loss": 0.7494, + "step": 1763 + }, + { + "epoch": 0.36262719703977797, + "grad_norm": 0.22367548942565918, + "learning_rate": 8.806503235796145e-05, + "loss": 0.7334, + "step": 1764 + }, + { + "epoch": 0.36283276801315656, + "grad_norm": 0.22336241602897644, + "learning_rate": 8.806213389165109e-05, + "loss": 0.7028, + "step": 1765 + }, + { + "epoch": 0.3630383389865351, + "grad_norm": 0.21695300936698914, + "learning_rate": 8.805923330387067e-05, + "loss": 0.7131, + "step": 1766 + }, + { + "epoch": 0.3632439099599137, + "grad_norm": 0.2211865484714508, + "learning_rate": 8.805633059476307e-05, + "loss": 0.7493, + "step": 1767 + }, + { + "epoch": 0.3634494809332922, + "grad_norm": 0.2145841121673584, + "learning_rate": 8.80534257644713e-05, + "loss": 0.5885, + "step": 1768 + }, + { + "epoch": 0.3636550519066708, + "grad_norm": 0.23112855851650238, + "learning_rate": 8.805051881313849e-05, + "loss": 0.6836, + "step": 1769 + }, + { + "epoch": 0.36386062288004933, + "grad_norm": 0.226564422249794, + "learning_rate": 8.804760974090785e-05, + "loss": 0.7297, + "step": 1770 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 0.15169551968574524, + "learning_rate": 8.804469854792266e-05, + "loss": 0.6113, + "step": 1771 + }, + { + "epoch": 0.36427176482680645, + "grad_norm": 0.23821888864040375, + "learning_rate": 8.804178523432637e-05, + "loss": 0.7175, + "step": 1772 + }, + { + "epoch": 0.364477335800185, + "grad_norm": 0.23416121304035187, + "learning_rate": 8.80388698002625e-05, + "loss": 0.7214, + "step": 1773 + }, + { + "epoch": 0.3646829067735636, + "grad_norm": 0.1642165631055832, + "learning_rate": 8.803595224587467e-05, + "loss": 0.5792, + "step": 1774 + }, + { + "epoch": 0.3648884777469421, + "grad_norm": 0.2228156477212906, + "learning_rate": 8.803303257130662e-05, + "loss": 0.7051, + "step": 1775 + }, + { + "epoch": 0.3650940487203207, + "grad_norm": 0.2340465635061264, + "learning_rate": 8.80301107767022e-05, + "loss": 0.7373, + "step": 1776 + }, + { + "epoch": 0.36529961969369923, + "grad_norm": 0.2198680192232132, + "learning_rate": 8.802718686220535e-05, + "loss": 0.71, + "step": 1777 + }, + { + "epoch": 0.3655051906670778, + "grad_norm": 0.2116042524576187, + "learning_rate": 8.80242608279601e-05, + "loss": 0.7465, + "step": 1778 + }, + { + "epoch": 0.36571076164045635, + "grad_norm": 0.22121259570121765, + "learning_rate": 8.802133267411062e-05, + "loss": 0.7352, + "step": 1779 + }, + { + "epoch": 0.36591633261383494, + "grad_norm": 0.23157210648059845, + "learning_rate": 8.801840240080117e-05, + "loss": 0.6896, + "step": 1780 + }, + { + "epoch": 0.36612190358721347, + "grad_norm": 0.22456520795822144, + "learning_rate": 8.801547000817609e-05, + "loss": 0.7449, + "step": 1781 + }, + { + "epoch": 0.36632747456059206, + "grad_norm": 0.15871234238147736, + "learning_rate": 8.801253549637985e-05, + "loss": 0.5766, + "step": 1782 + }, + { + "epoch": 0.3665330455339706, + "grad_norm": 0.23135414719581604, + "learning_rate": 8.800959886555704e-05, + "loss": 0.7021, + "step": 1783 + }, + { + "epoch": 0.3667386165073492, + "grad_norm": 0.1448424756526947, + "learning_rate": 8.80066601158523e-05, + "loss": 0.6072, + "step": 1784 + }, + { + "epoch": 0.3669441874807277, + "grad_norm": 0.14697474241256714, + "learning_rate": 8.800371924741044e-05, + "loss": 0.6064, + "step": 1785 + }, + { + "epoch": 0.3671497584541063, + "grad_norm": 0.22950981557369232, + "learning_rate": 8.800077626037634e-05, + "loss": 0.7119, + "step": 1786 + }, + { + "epoch": 0.36735532942748483, + "grad_norm": 0.21077360212802887, + "learning_rate": 8.799783115489497e-05, + "loss": 0.7119, + "step": 1787 + }, + { + "epoch": 0.3675609004008634, + "grad_norm": 0.21831658482551575, + "learning_rate": 8.799488393111144e-05, + "loss": 0.6915, + "step": 1788 + }, + { + "epoch": 0.36776647137424195, + "grad_norm": 0.2097778469324112, + "learning_rate": 8.799193458917092e-05, + "loss": 0.7103, + "step": 1789 + }, + { + "epoch": 0.36797204234762054, + "grad_norm": 0.21712899208068848, + "learning_rate": 8.798898312921874e-05, + "loss": 0.7155, + "step": 1790 + }, + { + "epoch": 0.3681776133209991, + "grad_norm": 0.21277742087841034, + "learning_rate": 8.798602955140029e-05, + "loss": 0.7349, + "step": 1791 + }, + { + "epoch": 0.3683831842943776, + "grad_norm": 0.2360071986913681, + "learning_rate": 8.798307385586107e-05, + "loss": 0.7345, + "step": 1792 + }, + { + "epoch": 0.3685887552677562, + "grad_norm": 0.200873002409935, + "learning_rate": 8.798011604274671e-05, + "loss": 0.5943, + "step": 1793 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 0.23026502132415771, + "learning_rate": 8.797715611220293e-05, + "loss": 0.7188, + "step": 1794 + }, + { + "epoch": 0.3689998972145133, + "grad_norm": 0.22256635129451752, + "learning_rate": 8.797419406437553e-05, + "loss": 0.7152, + "step": 1795 + }, + { + "epoch": 0.36920546818789185, + "grad_norm": 0.21542035043239594, + "learning_rate": 8.797122989941045e-05, + "loss": 0.7055, + "step": 1796 + }, + { + "epoch": 0.36941103916127044, + "grad_norm": 0.22514380514621735, + "learning_rate": 8.796826361745374e-05, + "loss": 0.7028, + "step": 1797 + }, + { + "epoch": 0.36961661013464897, + "grad_norm": 0.2098117172718048, + "learning_rate": 8.796529521865149e-05, + "loss": 0.7223, + "step": 1798 + }, + { + "epoch": 0.36982218110802756, + "grad_norm": 0.2132442593574524, + "learning_rate": 8.796232470314997e-05, + "loss": 0.6883, + "step": 1799 + }, + { + "epoch": 0.3700277520814061, + "grad_norm": 0.17681948840618134, + "learning_rate": 8.795935207109552e-05, + "loss": 0.5999, + "step": 1800 + }, + { + "epoch": 0.3702333230547847, + "grad_norm": 0.23800528049468994, + "learning_rate": 8.795637732263459e-05, + "loss": 0.7058, + "step": 1801 + }, + { + "epoch": 0.3704388940281632, + "grad_norm": 0.2394934594631195, + "learning_rate": 8.795340045791371e-05, + "loss": 0.7371, + "step": 1802 + }, + { + "epoch": 0.3706444650015418, + "grad_norm": 0.21029235422611237, + "learning_rate": 8.795042147707957e-05, + "loss": 0.6879, + "step": 1803 + }, + { + "epoch": 0.37085003597492033, + "grad_norm": 0.2209658920764923, + "learning_rate": 8.79474403802789e-05, + "loss": 0.7145, + "step": 1804 + }, + { + "epoch": 0.3710556069482989, + "grad_norm": 0.2265157699584961, + "learning_rate": 8.79444571676586e-05, + "loss": 0.7329, + "step": 1805 + }, + { + "epoch": 0.37126117792167745, + "grad_norm": 0.17411258816719055, + "learning_rate": 8.79414718393656e-05, + "loss": 0.5861, + "step": 1806 + }, + { + "epoch": 0.37146674889505604, + "grad_norm": 0.235770583152771, + "learning_rate": 8.793848439554699e-05, + "loss": 0.7168, + "step": 1807 + }, + { + "epoch": 0.3716723198684346, + "grad_norm": 0.24390238523483276, + "learning_rate": 8.793549483634995e-05, + "loss": 0.7242, + "step": 1808 + }, + { + "epoch": 0.37187789084181316, + "grad_norm": 0.22740136086940765, + "learning_rate": 8.793250316192175e-05, + "loss": 0.7064, + "step": 1809 + }, + { + "epoch": 0.3720834618151917, + "grad_norm": 0.1567818820476532, + "learning_rate": 8.79295093724098e-05, + "loss": 0.6035, + "step": 1810 + }, + { + "epoch": 0.37228903278857023, + "grad_norm": 0.23284457623958588, + "learning_rate": 8.792651346796157e-05, + "loss": 0.7145, + "step": 1811 + }, + { + "epoch": 0.3724946037619488, + "grad_norm": 0.21928593516349792, + "learning_rate": 8.792351544872467e-05, + "loss": 0.7015, + "step": 1812 + }, + { + "epoch": 0.37270017473532735, + "grad_norm": 0.2226940542459488, + "learning_rate": 8.792051531484678e-05, + "loss": 0.7032, + "step": 1813 + }, + { + "epoch": 0.37290574570870594, + "grad_norm": 0.1569989025592804, + "learning_rate": 8.791751306647572e-05, + "loss": 0.6043, + "step": 1814 + }, + { + "epoch": 0.37311131668208447, + "grad_norm": 0.231995090842247, + "learning_rate": 8.791450870375936e-05, + "loss": 0.7066, + "step": 1815 + }, + { + "epoch": 0.37331688765546306, + "grad_norm": 0.2193315476179123, + "learning_rate": 8.791150222684576e-05, + "loss": 0.7099, + "step": 1816 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 0.2191406637430191, + "learning_rate": 8.790849363588301e-05, + "loss": 0.7082, + "step": 1817 + }, + { + "epoch": 0.3737280296022202, + "grad_norm": 0.24114836752414703, + "learning_rate": 8.790548293101932e-05, + "loss": 0.6951, + "step": 1818 + }, + { + "epoch": 0.3739336005755987, + "grad_norm": 0.21961726248264313, + "learning_rate": 8.790247011240304e-05, + "loss": 0.7007, + "step": 1819 + }, + { + "epoch": 0.3741391715489773, + "grad_norm": 0.22864870727062225, + "learning_rate": 8.789945518018259e-05, + "loss": 0.7172, + "step": 1820 + }, + { + "epoch": 0.37434474252235583, + "grad_norm": 0.2318045198917389, + "learning_rate": 8.789643813450647e-05, + "loss": 0.7168, + "step": 1821 + }, + { + "epoch": 0.3745503134957344, + "grad_norm": 0.21737788617610931, + "learning_rate": 8.789341897552336e-05, + "loss": 0.7251, + "step": 1822 + }, + { + "epoch": 0.37475588446911295, + "grad_norm": 0.21853739023208618, + "learning_rate": 8.789039770338197e-05, + "loss": 0.7059, + "step": 1823 + }, + { + "epoch": 0.37496145544249154, + "grad_norm": 0.21663320064544678, + "learning_rate": 8.788737431823116e-05, + "loss": 0.6916, + "step": 1824 + }, + { + "epoch": 0.3751670264158701, + "grad_norm": 0.17142772674560547, + "learning_rate": 8.788434882021987e-05, + "loss": 0.594, + "step": 1825 + }, + { + "epoch": 0.37537259738924866, + "grad_norm": 0.2234950065612793, + "learning_rate": 8.788132120949716e-05, + "loss": 0.7175, + "step": 1826 + }, + { + "epoch": 0.3755781683626272, + "grad_norm": 0.21172864735126495, + "learning_rate": 8.787829148621218e-05, + "loss": 0.6872, + "step": 1827 + }, + { + "epoch": 0.37578373933600573, + "grad_norm": 0.22408267855644226, + "learning_rate": 8.787525965051418e-05, + "loss": 0.7375, + "step": 1828 + }, + { + "epoch": 0.3759893103093843, + "grad_norm": 0.21300190687179565, + "learning_rate": 8.787222570255256e-05, + "loss": 0.7224, + "step": 1829 + }, + { + "epoch": 0.37619488128276285, + "grad_norm": 0.22645267844200134, + "learning_rate": 8.786918964247674e-05, + "loss": 0.6957, + "step": 1830 + }, + { + "epoch": 0.37640045225614144, + "grad_norm": 0.1754547655582428, + "learning_rate": 8.786615147043633e-05, + "loss": 0.5798, + "step": 1831 + }, + { + "epoch": 0.37660602322951997, + "grad_norm": 0.2161412239074707, + "learning_rate": 8.786311118658097e-05, + "loss": 0.7041, + "step": 1832 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 0.14319129288196564, + "learning_rate": 8.78600687910605e-05, + "loss": 0.5844, + "step": 1833 + }, + { + "epoch": 0.3770171651762771, + "grad_norm": 0.22690434753894806, + "learning_rate": 8.785702428402475e-05, + "loss": 0.7024, + "step": 1834 + }, + { + "epoch": 0.3772227361496557, + "grad_norm": 0.2214747965335846, + "learning_rate": 8.785397766562371e-05, + "loss": 0.7269, + "step": 1835 + }, + { + "epoch": 0.3774283071230342, + "grad_norm": 0.2654751241207123, + "learning_rate": 8.785092893600751e-05, + "loss": 0.7037, + "step": 1836 + }, + { + "epoch": 0.3776338780964128, + "grad_norm": 0.21953707933425903, + "learning_rate": 8.784787809532632e-05, + "loss": 0.7217, + "step": 1837 + }, + { + "epoch": 0.37783944906979133, + "grad_norm": 0.22590485215187073, + "learning_rate": 8.784482514373045e-05, + "loss": 0.7056, + "step": 1838 + }, + { + "epoch": 0.3780450200431699, + "grad_norm": 0.22106105089187622, + "learning_rate": 8.78417700813703e-05, + "loss": 0.7058, + "step": 1839 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 0.1933329850435257, + "learning_rate": 8.783871290839637e-05, + "loss": 0.5885, + "step": 1840 + }, + { + "epoch": 0.37845616198992704, + "grad_norm": 0.18652944266796112, + "learning_rate": 8.78356536249593e-05, + "loss": 0.5857, + "step": 1841 + }, + { + "epoch": 0.3786617329633056, + "grad_norm": 0.2601449191570282, + "learning_rate": 8.783259223120979e-05, + "loss": 0.7123, + "step": 1842 + }, + { + "epoch": 0.37886730393668416, + "grad_norm": 0.246074840426445, + "learning_rate": 8.782952872729864e-05, + "loss": 0.7277, + "step": 1843 + }, + { + "epoch": 0.3790728749100627, + "grad_norm": 0.2558608949184418, + "learning_rate": 8.78264631133768e-05, + "loss": 0.7006, + "step": 1844 + }, + { + "epoch": 0.3792784458834413, + "grad_norm": 0.21807844936847687, + "learning_rate": 8.78233953895953e-05, + "loss": 0.5869, + "step": 1845 + }, + { + "epoch": 0.3794840168568198, + "grad_norm": 0.25354549288749695, + "learning_rate": 8.782032555610526e-05, + "loss": 0.7129, + "step": 1846 + }, + { + "epoch": 0.37968958783019835, + "grad_norm": 0.243685781955719, + "learning_rate": 8.781725361305793e-05, + "loss": 0.7217, + "step": 1847 + }, + { + "epoch": 0.37989515880357694, + "grad_norm": 0.16930992901325226, + "learning_rate": 8.781417956060464e-05, + "loss": 0.6007, + "step": 1848 + }, + { + "epoch": 0.38010072977695547, + "grad_norm": 0.24475498497486115, + "learning_rate": 8.781110339889682e-05, + "loss": 0.7114, + "step": 1849 + }, + { + "epoch": 0.38030630075033406, + "grad_norm": 0.24792300164699554, + "learning_rate": 8.780802512808605e-05, + "loss": 0.7409, + "step": 1850 + }, + { + "epoch": 0.3805118717237126, + "grad_norm": 0.2320515662431717, + "learning_rate": 8.780494474832395e-05, + "loss": 0.7163, + "step": 1851 + }, + { + "epoch": 0.3807174426970912, + "grad_norm": 0.24166975915431976, + "learning_rate": 8.780186225976232e-05, + "loss": 0.7304, + "step": 1852 + }, + { + "epoch": 0.3809230136704697, + "grad_norm": 0.23629960417747498, + "learning_rate": 8.779877766255297e-05, + "loss": 0.7155, + "step": 1853 + }, + { + "epoch": 0.3811285846438483, + "grad_norm": 0.22916334867477417, + "learning_rate": 8.77956909568479e-05, + "loss": 0.7263, + "step": 1854 + }, + { + "epoch": 0.38133415561722683, + "grad_norm": 0.24127478897571564, + "learning_rate": 8.779260214279915e-05, + "loss": 0.6936, + "step": 1855 + }, + { + "epoch": 0.3815397265906054, + "grad_norm": 0.22905930876731873, + "learning_rate": 8.778951122055891e-05, + "loss": 0.718, + "step": 1856 + }, + { + "epoch": 0.38174529756398395, + "grad_norm": 0.21907439827919006, + "learning_rate": 8.778641819027946e-05, + "loss": 0.7082, + "step": 1857 + }, + { + "epoch": 0.38195086853736254, + "grad_norm": 0.2231978327035904, + "learning_rate": 8.778332305211315e-05, + "loss": 0.5978, + "step": 1858 + }, + { + "epoch": 0.3821564395107411, + "grad_norm": 0.2434241622686386, + "learning_rate": 8.778022580621249e-05, + "loss": 0.7043, + "step": 1859 + }, + { + "epoch": 0.38236201048411966, + "grad_norm": 0.22279253602027893, + "learning_rate": 8.777712645273005e-05, + "loss": 0.728, + "step": 1860 + }, + { + "epoch": 0.3825675814574982, + "grad_norm": 0.22146545350551605, + "learning_rate": 8.777402499181854e-05, + "loss": 0.7035, + "step": 1861 + }, + { + "epoch": 0.3827731524308768, + "grad_norm": 0.1629379838705063, + "learning_rate": 8.777092142363074e-05, + "loss": 0.5911, + "step": 1862 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 0.24716326594352722, + "learning_rate": 8.776781574831956e-05, + "loss": 0.7466, + "step": 1863 + }, + { + "epoch": 0.3831842943776339, + "grad_norm": 0.21958030760288239, + "learning_rate": 8.776470796603799e-05, + "loss": 0.7112, + "step": 1864 + }, + { + "epoch": 0.38338986535101244, + "grad_norm": 0.22167621552944183, + "learning_rate": 8.776159807693914e-05, + "loss": 0.7076, + "step": 1865 + }, + { + "epoch": 0.38359543632439097, + "grad_norm": 0.22505022585391998, + "learning_rate": 8.775848608117621e-05, + "loss": 0.7383, + "step": 1866 + }, + { + "epoch": 0.38380100729776956, + "grad_norm": 0.2208850234746933, + "learning_rate": 8.775537197890254e-05, + "loss": 0.7371, + "step": 1867 + }, + { + "epoch": 0.3840065782711481, + "grad_norm": 0.21698389947414398, + "learning_rate": 8.775225577027154e-05, + "loss": 0.7226, + "step": 1868 + }, + { + "epoch": 0.3842121492445267, + "grad_norm": 0.22070789337158203, + "learning_rate": 8.774913745543668e-05, + "loss": 0.712, + "step": 1869 + }, + { + "epoch": 0.3844177202179052, + "grad_norm": 0.22153621912002563, + "learning_rate": 8.774601703455166e-05, + "loss": 0.7102, + "step": 1870 + }, + { + "epoch": 0.3846232911912838, + "grad_norm": 0.21667127311229706, + "learning_rate": 8.774289450777017e-05, + "loss": 0.705, + "step": 1871 + }, + { + "epoch": 0.38482886216466233, + "grad_norm": 0.22485022246837616, + "learning_rate": 8.773976987524604e-05, + "loss": 0.7232, + "step": 1872 + }, + { + "epoch": 0.3850344331380409, + "grad_norm": 0.19532062113285065, + "learning_rate": 8.77366431371332e-05, + "loss": 0.61, + "step": 1873 + }, + { + "epoch": 0.38524000411141945, + "grad_norm": 0.2282322347164154, + "learning_rate": 8.773351429358574e-05, + "loss": 0.721, + "step": 1874 + }, + { + "epoch": 0.38544557508479804, + "grad_norm": 0.141531839966774, + "learning_rate": 8.773038334475774e-05, + "loss": 0.5959, + "step": 1875 + }, + { + "epoch": 0.3856511460581766, + "grad_norm": 0.22724571824073792, + "learning_rate": 8.772725029080349e-05, + "loss": 0.7027, + "step": 1876 + }, + { + "epoch": 0.38585671703155516, + "grad_norm": 0.22629983723163605, + "learning_rate": 8.772411513187731e-05, + "loss": 0.7021, + "step": 1877 + }, + { + "epoch": 0.3860622880049337, + "grad_norm": 0.18585380911827087, + "learning_rate": 8.772097786813368e-05, + "loss": 0.5524, + "step": 1878 + }, + { + "epoch": 0.3862678589783123, + "grad_norm": 0.25130245089530945, + "learning_rate": 8.771783849972714e-05, + "loss": 0.7274, + "step": 1879 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.22500745952129364, + "learning_rate": 8.771469702681236e-05, + "loss": 0.725, + "step": 1880 + }, + { + "epoch": 0.3866790009250694, + "grad_norm": 0.20772625505924225, + "learning_rate": 8.771155344954412e-05, + "loss": 0.7155, + "step": 1881 + }, + { + "epoch": 0.38688457189844794, + "grad_norm": 0.2251315712928772, + "learning_rate": 8.770840776807726e-05, + "loss": 0.6973, + "step": 1882 + }, + { + "epoch": 0.3870901428718265, + "grad_norm": 0.2260076254606247, + "learning_rate": 8.770525998256677e-05, + "loss": 0.7128, + "step": 1883 + }, + { + "epoch": 0.38729571384520506, + "grad_norm": 0.16973739862442017, + "learning_rate": 8.770211009316772e-05, + "loss": 0.5794, + "step": 1884 + }, + { + "epoch": 0.3875012848185836, + "grad_norm": 0.2505844831466675, + "learning_rate": 8.76989581000353e-05, + "loss": 0.7359, + "step": 1885 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 0.28069007396698, + "learning_rate": 8.769580400332479e-05, + "loss": 0.7233, + "step": 1886 + }, + { + "epoch": 0.3879124267653407, + "grad_norm": 0.13608971238136292, + "learning_rate": 8.769264780319158e-05, + "loss": 0.5905, + "step": 1887 + }, + { + "epoch": 0.3881179977387193, + "grad_norm": 0.25234588980674744, + "learning_rate": 8.768948949979116e-05, + "loss": 0.7122, + "step": 1888 + }, + { + "epoch": 0.38832356871209783, + "grad_norm": 0.234871044754982, + "learning_rate": 8.768632909327912e-05, + "loss": 0.7299, + "step": 1889 + }, + { + "epoch": 0.3885291396854764, + "grad_norm": 0.2207827866077423, + "learning_rate": 8.768316658381114e-05, + "loss": 0.7086, + "step": 1890 + }, + { + "epoch": 0.38873471065885495, + "grad_norm": 0.25734445452690125, + "learning_rate": 8.768000197154306e-05, + "loss": 0.7071, + "step": 1891 + }, + { + "epoch": 0.38894028163223354, + "grad_norm": 0.2389577329158783, + "learning_rate": 8.767683525663077e-05, + "loss": 0.733, + "step": 1892 + }, + { + "epoch": 0.3891458526056121, + "grad_norm": 0.17553114891052246, + "learning_rate": 8.767366643923028e-05, + "loss": 0.5974, + "step": 1893 + }, + { + "epoch": 0.38935142357899066, + "grad_norm": 0.23687125742435455, + "learning_rate": 8.76704955194977e-05, + "loss": 0.7148, + "step": 1894 + }, + { + "epoch": 0.3895569945523692, + "grad_norm": 0.25215673446655273, + "learning_rate": 8.766732249758925e-05, + "loss": 0.7338, + "step": 1895 + }, + { + "epoch": 0.3897625655257478, + "grad_norm": 0.16502924263477325, + "learning_rate": 8.766414737366124e-05, + "loss": 0.584, + "step": 1896 + }, + { + "epoch": 0.3899681364991263, + "grad_norm": 0.1485537588596344, + "learning_rate": 8.76609701478701e-05, + "loss": 0.6049, + "step": 1897 + }, + { + "epoch": 0.3901737074725049, + "grad_norm": 0.3515810966491699, + "learning_rate": 8.765779082037235e-05, + "loss": 0.7529, + "step": 1898 + }, + { + "epoch": 0.39037927844588344, + "grad_norm": 0.23719021677970886, + "learning_rate": 8.765460939132464e-05, + "loss": 0.728, + "step": 1899 + }, + { + "epoch": 0.390584849419262, + "grad_norm": 0.17814306914806366, + "learning_rate": 8.76514258608837e-05, + "loss": 0.601, + "step": 1900 + }, + { + "epoch": 0.39079042039264056, + "grad_norm": 0.4228149652481079, + "learning_rate": 8.764824022920636e-05, + "loss": 0.7195, + "step": 1901 + }, + { + "epoch": 0.3909959913660191, + "grad_norm": 0.16185280680656433, + "learning_rate": 8.764505249644953e-05, + "loss": 0.5728, + "step": 1902 + }, + { + "epoch": 0.3912015623393977, + "grad_norm": 0.23503097891807556, + "learning_rate": 8.764186266277032e-05, + "loss": 0.71, + "step": 1903 + }, + { + "epoch": 0.3914071333127762, + "grad_norm": 0.23683130741119385, + "learning_rate": 8.763867072832583e-05, + "loss": 0.7351, + "step": 1904 + }, + { + "epoch": 0.3916127042861548, + "grad_norm": 0.2431173473596573, + "learning_rate": 8.763547669327334e-05, + "loss": 0.72, + "step": 1905 + }, + { + "epoch": 0.39181827525953333, + "grad_norm": 0.2246868759393692, + "learning_rate": 8.763228055777016e-05, + "loss": 0.7136, + "step": 1906 + }, + { + "epoch": 0.3920238462329119, + "grad_norm": 0.17881381511688232, + "learning_rate": 8.762908232197379e-05, + "loss": 0.6021, + "step": 1907 + }, + { + "epoch": 0.39222941720629045, + "grad_norm": 0.25456559658050537, + "learning_rate": 8.76258819860418e-05, + "loss": 0.7192, + "step": 1908 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 0.2538883686065674, + "learning_rate": 8.762267955013185e-05, + "loss": 0.6971, + "step": 1909 + }, + { + "epoch": 0.3926405591530476, + "grad_norm": 0.21888628602027893, + "learning_rate": 8.761947501440166e-05, + "loss": 0.7097, + "step": 1910 + }, + { + "epoch": 0.39284613012642616, + "grad_norm": 0.2221071869134903, + "learning_rate": 8.761626837900916e-05, + "loss": 0.7004, + "step": 1911 + }, + { + "epoch": 0.3930517010998047, + "grad_norm": 0.23489388823509216, + "learning_rate": 8.761305964411228e-05, + "loss": 0.6935, + "step": 1912 + }, + { + "epoch": 0.3932572720731833, + "grad_norm": 0.23386436700820923, + "learning_rate": 8.760984880986915e-05, + "loss": 0.695, + "step": 1913 + }, + { + "epoch": 0.3934628430465618, + "grad_norm": 0.22081080079078674, + "learning_rate": 8.760663587643792e-05, + "loss": 0.6939, + "step": 1914 + }, + { + "epoch": 0.3936684140199404, + "grad_norm": 0.2191271334886551, + "learning_rate": 8.760342084397688e-05, + "loss": 0.7055, + "step": 1915 + }, + { + "epoch": 0.39387398499331894, + "grad_norm": 0.16592054069042206, + "learning_rate": 8.760020371264442e-05, + "loss": 0.5968, + "step": 1916 + }, + { + "epoch": 0.3940795559666975, + "grad_norm": 0.2341727763414383, + "learning_rate": 8.759698448259905e-05, + "loss": 0.7216, + "step": 1917 + }, + { + "epoch": 0.39428512694007606, + "grad_norm": 0.2350844144821167, + "learning_rate": 8.759376315399935e-05, + "loss": 0.7036, + "step": 1918 + }, + { + "epoch": 0.39449069791345465, + "grad_norm": 0.1551404446363449, + "learning_rate": 8.759053972700401e-05, + "loss": 0.6018, + "step": 1919 + }, + { + "epoch": 0.3946962688868332, + "grad_norm": 0.2272733896970749, + "learning_rate": 8.758731420177186e-05, + "loss": 0.7132, + "step": 1920 + }, + { + "epoch": 0.3949018398602117, + "grad_norm": 0.22375091910362244, + "learning_rate": 8.758408657846177e-05, + "loss": 0.6917, + "step": 1921 + }, + { + "epoch": 0.3951074108335903, + "grad_norm": 0.14521102607250214, + "learning_rate": 8.758085685723279e-05, + "loss": 0.5774, + "step": 1922 + }, + { + "epoch": 0.39531298180696883, + "grad_norm": 0.2234261929988861, + "learning_rate": 8.757762503824401e-05, + "loss": 0.7322, + "step": 1923 + }, + { + "epoch": 0.3955185527803474, + "grad_norm": 0.2137596607208252, + "learning_rate": 8.757439112165465e-05, + "loss": 0.7094, + "step": 1924 + }, + { + "epoch": 0.39572412375372595, + "grad_norm": 0.15637266635894775, + "learning_rate": 8.757115510762404e-05, + "loss": 0.599, + "step": 1925 + }, + { + "epoch": 0.39592969472710454, + "grad_norm": 0.21594415605068207, + "learning_rate": 8.756791699631159e-05, + "loss": 0.7096, + "step": 1926 + }, + { + "epoch": 0.3961352657004831, + "grad_norm": 0.21532535552978516, + "learning_rate": 8.756467678787683e-05, + "loss": 0.7331, + "step": 1927 + }, + { + "epoch": 0.39634083667386166, + "grad_norm": 0.14360411465168, + "learning_rate": 8.756143448247938e-05, + "loss": 0.5832, + "step": 1928 + }, + { + "epoch": 0.3965464076472402, + "grad_norm": 0.2573210597038269, + "learning_rate": 8.7558190080279e-05, + "loss": 0.7116, + "step": 1929 + }, + { + "epoch": 0.3967519786206188, + "grad_norm": 0.22037194669246674, + "learning_rate": 8.755494358143552e-05, + "loss": 0.6988, + "step": 1930 + }, + { + "epoch": 0.3969575495939973, + "grad_norm": 0.1471826732158661, + "learning_rate": 8.755169498610885e-05, + "loss": 0.6081, + "step": 1931 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 0.24475279450416565, + "learning_rate": 8.754844429445906e-05, + "loss": 0.7527, + "step": 1932 + }, + { + "epoch": 0.39736869154075444, + "grad_norm": 0.21802780032157898, + "learning_rate": 8.754519150664629e-05, + "loss": 0.6628, + "step": 1933 + }, + { + "epoch": 0.397574262514133, + "grad_norm": 0.14480328559875488, + "learning_rate": 8.75419366228308e-05, + "loss": 0.597, + "step": 1934 + }, + { + "epoch": 0.39777983348751156, + "grad_norm": 0.21927644312381744, + "learning_rate": 8.753867964317292e-05, + "loss": 0.7108, + "step": 1935 + }, + { + "epoch": 0.39798540446089015, + "grad_norm": 0.20976369082927704, + "learning_rate": 8.753542056783312e-05, + "loss": 0.7283, + "step": 1936 + }, + { + "epoch": 0.3981909754342687, + "grad_norm": 0.22052782773971558, + "learning_rate": 8.753215939697198e-05, + "loss": 0.7261, + "step": 1937 + }, + { + "epoch": 0.39839654640764727, + "grad_norm": 0.21982043981552124, + "learning_rate": 8.752889613075012e-05, + "loss": 0.6902, + "step": 1938 + }, + { + "epoch": 0.3986021173810258, + "grad_norm": 0.24831879138946533, + "learning_rate": 8.752563076932833e-05, + "loss": 0.7175, + "step": 1939 + }, + { + "epoch": 0.39880768835440433, + "grad_norm": 0.22775912284851074, + "learning_rate": 8.75223633128675e-05, + "loss": 0.7126, + "step": 1940 + }, + { + "epoch": 0.3990132593277829, + "grad_norm": 0.21900929510593414, + "learning_rate": 8.751909376152854e-05, + "loss": 0.6947, + "step": 1941 + }, + { + "epoch": 0.39921883030116145, + "grad_norm": 0.22170402109622955, + "learning_rate": 8.751582211547259e-05, + "loss": 0.7201, + "step": 1942 + }, + { + "epoch": 0.39942440127454004, + "grad_norm": 0.22413894534111023, + "learning_rate": 8.751254837486079e-05, + "loss": 0.7205, + "step": 1943 + }, + { + "epoch": 0.3996299722479186, + "grad_norm": 0.22276797890663147, + "learning_rate": 8.750927253985443e-05, + "loss": 0.714, + "step": 1944 + }, + { + "epoch": 0.39983554322129716, + "grad_norm": 0.21520061790943146, + "learning_rate": 8.750599461061492e-05, + "loss": 0.7147, + "step": 1945 + }, + { + "epoch": 0.4000411141946757, + "grad_norm": 0.16708485782146454, + "learning_rate": 8.750271458730372e-05, + "loss": 0.5976, + "step": 1946 + }, + { + "epoch": 0.4002466851680543, + "grad_norm": 0.24202388525009155, + "learning_rate": 8.74994324700824e-05, + "loss": 0.7329, + "step": 1947 + }, + { + "epoch": 0.4004522561414328, + "grad_norm": 0.13979558646678925, + "learning_rate": 8.749614825911274e-05, + "loss": 0.5932, + "step": 1948 + }, + { + "epoch": 0.4006578271148114, + "grad_norm": 0.13720543682575226, + "learning_rate": 8.749286195455645e-05, + "loss": 0.564, + "step": 1949 + }, + { + "epoch": 0.40086339808818994, + "grad_norm": 0.22568507492542267, + "learning_rate": 8.748957355657546e-05, + "loss": 0.7259, + "step": 1950 + }, + { + "epoch": 0.4010689690615685, + "grad_norm": 0.2142673283815384, + "learning_rate": 8.748628306533178e-05, + "loss": 0.7024, + "step": 1951 + }, + { + "epoch": 0.40127454003494706, + "grad_norm": 0.2180175483226776, + "learning_rate": 8.748299048098751e-05, + "loss": 0.7488, + "step": 1952 + }, + { + "epoch": 0.40148011100832565, + "grad_norm": 0.21027667820453644, + "learning_rate": 8.747969580370488e-05, + "loss": 0.708, + "step": 1953 + }, + { + "epoch": 0.4016856819817042, + "grad_norm": 0.21340122818946838, + "learning_rate": 8.747639903364617e-05, + "loss": 0.7076, + "step": 1954 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 0.22183535993099213, + "learning_rate": 8.747310017097382e-05, + "loss": 0.6994, + "step": 1955 + }, + { + "epoch": 0.4020968239284613, + "grad_norm": 0.21292465925216675, + "learning_rate": 8.746979921585035e-05, + "loss": 0.675, + "step": 1956 + }, + { + "epoch": 0.40230239490183983, + "grad_norm": 0.2158004343509674, + "learning_rate": 8.746649616843837e-05, + "loss": 0.727, + "step": 1957 + }, + { + "epoch": 0.4025079658752184, + "grad_norm": 0.20767906308174133, + "learning_rate": 8.746319102890061e-05, + "loss": 0.7034, + "step": 1958 + }, + { + "epoch": 0.40271353684859695, + "grad_norm": 0.21342967450618744, + "learning_rate": 8.74598837973999e-05, + "loss": 0.7249, + "step": 1959 + }, + { + "epoch": 0.40291910782197554, + "grad_norm": 0.22150453925132751, + "learning_rate": 8.745657447409917e-05, + "loss": 0.7209, + "step": 1960 + }, + { + "epoch": 0.4031246787953541, + "grad_norm": 0.20457392930984497, + "learning_rate": 8.745326305916145e-05, + "loss": 0.6967, + "step": 1961 + }, + { + "epoch": 0.40333024976873266, + "grad_norm": 0.2096332609653473, + "learning_rate": 8.744994955274992e-05, + "loss": 0.7295, + "step": 1962 + }, + { + "epoch": 0.4035358207421112, + "grad_norm": 0.20849314332008362, + "learning_rate": 8.744663395502776e-05, + "loss": 0.6962, + "step": 1963 + }, + { + "epoch": 0.4037413917154898, + "grad_norm": 0.21918678283691406, + "learning_rate": 8.744331626615835e-05, + "loss": 0.6026, + "step": 1964 + }, + { + "epoch": 0.4039469626888683, + "grad_norm": 0.21508684754371643, + "learning_rate": 8.743999648630511e-05, + "loss": 0.7116, + "step": 1965 + }, + { + "epoch": 0.4041525336622469, + "grad_norm": 0.23266804218292236, + "learning_rate": 8.743667461563161e-05, + "loss": 0.7314, + "step": 1966 + }, + { + "epoch": 0.40435810463562544, + "grad_norm": 0.21796725690364838, + "learning_rate": 8.743335065430151e-05, + "loss": 0.7151, + "step": 1967 + }, + { + "epoch": 0.404563675609004, + "grad_norm": 0.21634382009506226, + "learning_rate": 8.743002460247855e-05, + "loss": 0.7272, + "step": 1968 + }, + { + "epoch": 0.40476924658238256, + "grad_norm": 0.21737129986286163, + "learning_rate": 8.74266964603266e-05, + "loss": 0.748, + "step": 1969 + }, + { + "epoch": 0.40497481755576115, + "grad_norm": 0.20188266038894653, + "learning_rate": 8.742336622800962e-05, + "loss": 0.6833, + "step": 1970 + }, + { + "epoch": 0.4051803885291397, + "grad_norm": 0.21718573570251465, + "learning_rate": 8.742003390569166e-05, + "loss": 0.7016, + "step": 1971 + }, + { + "epoch": 0.40538595950251827, + "grad_norm": 0.2084118276834488, + "learning_rate": 8.741669949353692e-05, + "loss": 0.6989, + "step": 1972 + }, + { + "epoch": 0.4055915304758968, + "grad_norm": 0.21882924437522888, + "learning_rate": 8.741336299170963e-05, + "loss": 0.6893, + "step": 1973 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.2056969553232193, + "learning_rate": 8.741002440037421e-05, + "loss": 0.7163, + "step": 1974 + }, + { + "epoch": 0.4060026724226539, + "grad_norm": 0.22237667441368103, + "learning_rate": 8.740668371969509e-05, + "loss": 0.7379, + "step": 1975 + }, + { + "epoch": 0.40620824339603245, + "grad_norm": 0.2131538689136505, + "learning_rate": 8.740334094983688e-05, + "loss": 0.7185, + "step": 1976 + }, + { + "epoch": 0.40641381436941104, + "grad_norm": 0.20948132872581482, + "learning_rate": 8.739999609096425e-05, + "loss": 0.5797, + "step": 1977 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 0.1722819209098816, + "learning_rate": 8.7396649143242e-05, + "loss": 0.5985, + "step": 1978 + }, + { + "epoch": 0.40682495631616816, + "grad_norm": 0.15967948734760284, + "learning_rate": 8.739330010683498e-05, + "loss": 0.5984, + "step": 1979 + }, + { + "epoch": 0.4070305272895467, + "grad_norm": 0.29981619119644165, + "learning_rate": 8.738994898190825e-05, + "loss": 0.6891, + "step": 1980 + }, + { + "epoch": 0.4072360982629253, + "grad_norm": 0.17661848664283752, + "learning_rate": 8.738659576862684e-05, + "loss": 0.5816, + "step": 1981 + }, + { + "epoch": 0.4074416692363038, + "grad_norm": 0.23567262291908264, + "learning_rate": 8.738324046715597e-05, + "loss": 0.6944, + "step": 1982 + }, + { + "epoch": 0.4076472402096824, + "grad_norm": 0.23192854225635529, + "learning_rate": 8.737988307766094e-05, + "loss": 0.7268, + "step": 1983 + }, + { + "epoch": 0.40785281118306094, + "grad_norm": 0.2210889458656311, + "learning_rate": 8.737652360030715e-05, + "loss": 0.711, + "step": 1984 + }, + { + "epoch": 0.4080583821564395, + "grad_norm": 0.22944270074367523, + "learning_rate": 8.737316203526013e-05, + "loss": 0.7187, + "step": 1985 + }, + { + "epoch": 0.40826395312981806, + "grad_norm": 0.2202499508857727, + "learning_rate": 8.736979838268545e-05, + "loss": 0.6949, + "step": 1986 + }, + { + "epoch": 0.40846952410319665, + "grad_norm": 0.22138486802577972, + "learning_rate": 8.736643264274885e-05, + "loss": 0.7328, + "step": 1987 + }, + { + "epoch": 0.4086750950765752, + "grad_norm": 0.22516939043998718, + "learning_rate": 8.736306481561613e-05, + "loss": 0.7106, + "step": 1988 + }, + { + "epoch": 0.40888066604995377, + "grad_norm": 0.22086863219738007, + "learning_rate": 8.735969490145321e-05, + "loss": 0.6854, + "step": 1989 + }, + { + "epoch": 0.4090862370233323, + "grad_norm": 0.2156277447938919, + "learning_rate": 8.73563229004261e-05, + "loss": 0.7179, + "step": 1990 + }, + { + "epoch": 0.4092918079967109, + "grad_norm": 0.26995977759361267, + "learning_rate": 8.735294881270095e-05, + "loss": 0.59, + "step": 1991 + }, + { + "epoch": 0.4094973789700894, + "grad_norm": 0.2523725926876068, + "learning_rate": 8.734957263844397e-05, + "loss": 0.7057, + "step": 1992 + }, + { + "epoch": 0.409702949943468, + "grad_norm": 0.2281750589609146, + "learning_rate": 8.734619437782148e-05, + "loss": 0.7269, + "step": 1993 + }, + { + "epoch": 0.40990852091684654, + "grad_norm": 0.23070600628852844, + "learning_rate": 8.734281403099992e-05, + "loss": 0.724, + "step": 1994 + }, + { + "epoch": 0.4101140918902251, + "grad_norm": 0.22441944479942322, + "learning_rate": 8.733943159814583e-05, + "loss": 0.7058, + "step": 1995 + }, + { + "epoch": 0.41031966286360366, + "grad_norm": 0.1988096684217453, + "learning_rate": 8.733604707942584e-05, + "loss": 0.5961, + "step": 1996 + }, + { + "epoch": 0.4105252338369822, + "grad_norm": 0.16709105670452118, + "learning_rate": 8.733266047500667e-05, + "loss": 0.5956, + "step": 1997 + }, + { + "epoch": 0.4107308048103608, + "grad_norm": 0.258070170879364, + "learning_rate": 8.73292717850552e-05, + "loss": 0.6959, + "step": 1998 + }, + { + "epoch": 0.4109363757837393, + "grad_norm": 0.24676097929477692, + "learning_rate": 8.732588100973834e-05, + "loss": 0.7152, + "step": 1999 + }, + { + "epoch": 0.4111419467571179, + "grad_norm": 0.2049533575773239, + "learning_rate": 8.732248814922317e-05, + "loss": 0.603, + "step": 2000 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 0.24677561223506927, + "learning_rate": 8.73190932036768e-05, + "loss": 0.7021, + "step": 2001 + }, + { + "epoch": 0.411553088703875, + "grad_norm": 0.24673065543174744, + "learning_rate": 8.731569617326652e-05, + "loss": 0.7424, + "step": 2002 + }, + { + "epoch": 0.41175865967725356, + "grad_norm": 0.23665191233158112, + "learning_rate": 8.731229705815968e-05, + "loss": 0.7199, + "step": 2003 + }, + { + "epoch": 0.41196423065063215, + "grad_norm": 0.21852630376815796, + "learning_rate": 8.730889585852371e-05, + "loss": 0.7065, + "step": 2004 + }, + { + "epoch": 0.4121698016240107, + "grad_norm": 0.22494211792945862, + "learning_rate": 8.730549257452622e-05, + "loss": 0.7032, + "step": 2005 + }, + { + "epoch": 0.41237537259738927, + "grad_norm": 0.21385926008224487, + "learning_rate": 8.730208720633483e-05, + "loss": 0.6929, + "step": 2006 + }, + { + "epoch": 0.4125809435707678, + "grad_norm": 0.19130924344062805, + "learning_rate": 8.729867975411734e-05, + "loss": 0.5725, + "step": 2007 + }, + { + "epoch": 0.4127865145441464, + "grad_norm": 0.226227268576622, + "learning_rate": 8.729527021804158e-05, + "loss": 0.6859, + "step": 2008 + }, + { + "epoch": 0.4129920855175249, + "grad_norm": 0.22433815896511078, + "learning_rate": 8.729185859827555e-05, + "loss": 0.7239, + "step": 2009 + }, + { + "epoch": 0.4131976564909035, + "grad_norm": 0.2165122628211975, + "learning_rate": 8.728844489498733e-05, + "loss": 0.7045, + "step": 2010 + }, + { + "epoch": 0.41340322746428204, + "grad_norm": 0.21789471805095673, + "learning_rate": 8.728502910834506e-05, + "loss": 0.7185, + "step": 2011 + }, + { + "epoch": 0.41360879843766063, + "grad_norm": 0.2177097499370575, + "learning_rate": 8.728161123851708e-05, + "loss": 0.7074, + "step": 2012 + }, + { + "epoch": 0.41381436941103916, + "grad_norm": 0.22537820041179657, + "learning_rate": 8.727819128567171e-05, + "loss": 0.706, + "step": 2013 + }, + { + "epoch": 0.4140199403844177, + "grad_norm": 0.21425795555114746, + "learning_rate": 8.727476924997747e-05, + "loss": 0.6974, + "step": 2014 + }, + { + "epoch": 0.4142255113577963, + "grad_norm": 0.23247577250003815, + "learning_rate": 8.727134513160296e-05, + "loss": 0.7111, + "step": 2015 + }, + { + "epoch": 0.4144310823311748, + "grad_norm": 0.21180875599384308, + "learning_rate": 8.726791893071683e-05, + "loss": 0.6801, + "step": 2016 + }, + { + "epoch": 0.4146366533045534, + "grad_norm": 0.21250028908252716, + "learning_rate": 8.72644906474879e-05, + "loss": 0.7447, + "step": 2017 + }, + { + "epoch": 0.41484222427793194, + "grad_norm": 0.21931192278862, + "learning_rate": 8.726106028208505e-05, + "loss": 0.7272, + "step": 2018 + }, + { + "epoch": 0.4150477952513105, + "grad_norm": 0.21856500208377838, + "learning_rate": 8.72576278346773e-05, + "loss": 0.7224, + "step": 2019 + }, + { + "epoch": 0.41525336622468906, + "grad_norm": 0.21037447452545166, + "learning_rate": 8.725419330543373e-05, + "loss": 0.727, + "step": 2020 + }, + { + "epoch": 0.41545893719806765, + "grad_norm": 0.21209198236465454, + "learning_rate": 8.725075669452356e-05, + "loss": 0.7019, + "step": 2021 + }, + { + "epoch": 0.4156645081714462, + "grad_norm": 0.20165219902992249, + "learning_rate": 8.724731800211608e-05, + "loss": 0.6149, + "step": 2022 + }, + { + "epoch": 0.41587007914482477, + "grad_norm": 0.22927507758140564, + "learning_rate": 8.72438772283807e-05, + "loss": 0.7089, + "step": 2023 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 0.2256333827972412, + "learning_rate": 8.724043437348695e-05, + "loss": 0.7093, + "step": 2024 + }, + { + "epoch": 0.4162812210915819, + "grad_norm": 0.21047276258468628, + "learning_rate": 8.723698943760443e-05, + "loss": 0.7246, + "step": 2025 + }, + { + "epoch": 0.4164867920649604, + "grad_norm": 0.21218207478523254, + "learning_rate": 8.723354242090285e-05, + "loss": 0.6883, + "step": 2026 + }, + { + "epoch": 0.416692363038339, + "grad_norm": 0.21619375050067902, + "learning_rate": 8.723009332355203e-05, + "loss": 0.7068, + "step": 2027 + }, + { + "epoch": 0.41689793401171754, + "grad_norm": 0.215839222073555, + "learning_rate": 8.72266421457219e-05, + "loss": 0.6964, + "step": 2028 + }, + { + "epoch": 0.41710350498509613, + "grad_norm": 0.22797274589538574, + "learning_rate": 8.722318888758248e-05, + "loss": 0.6966, + "step": 2029 + }, + { + "epoch": 0.41730907595847466, + "grad_norm": 0.2232465296983719, + "learning_rate": 8.72197335493039e-05, + "loss": 0.611, + "step": 2030 + }, + { + "epoch": 0.4175146469318532, + "grad_norm": 0.2285899519920349, + "learning_rate": 8.721627613105637e-05, + "loss": 0.7202, + "step": 2031 + }, + { + "epoch": 0.4177202179052318, + "grad_norm": 0.23706313967704773, + "learning_rate": 8.721281663301024e-05, + "loss": 0.7267, + "step": 2032 + }, + { + "epoch": 0.4179257888786103, + "grad_norm": 0.21476082503795624, + "learning_rate": 8.720935505533593e-05, + "loss": 0.7026, + "step": 2033 + }, + { + "epoch": 0.4181313598519889, + "grad_norm": 0.20751173794269562, + "learning_rate": 8.720589139820399e-05, + "loss": 0.726, + "step": 2034 + }, + { + "epoch": 0.41833693082536744, + "grad_norm": 0.19482995569705963, + "learning_rate": 8.720242566178504e-05, + "loss": 0.5893, + "step": 2035 + }, + { + "epoch": 0.418542501798746, + "grad_norm": 0.2433481514453888, + "learning_rate": 8.719895784624985e-05, + "loss": 0.6991, + "step": 2036 + }, + { + "epoch": 0.41874807277212456, + "grad_norm": 0.22105759382247925, + "learning_rate": 8.719548795176922e-05, + "loss": 0.7016, + "step": 2037 + }, + { + "epoch": 0.41895364374550315, + "grad_norm": 0.14366379380226135, + "learning_rate": 8.719201597851414e-05, + "loss": 0.5847, + "step": 2038 + }, + { + "epoch": 0.4191592147188817, + "grad_norm": 0.15119072794914246, + "learning_rate": 8.718854192665563e-05, + "loss": 0.599, + "step": 2039 + }, + { + "epoch": 0.41936478569226027, + "grad_norm": 0.2527151107788086, + "learning_rate": 8.718506579636484e-05, + "loss": 0.6794, + "step": 2040 + }, + { + "epoch": 0.4195703566656388, + "grad_norm": 0.1412784457206726, + "learning_rate": 8.718158758781305e-05, + "loss": 0.5728, + "step": 2041 + }, + { + "epoch": 0.4197759276390174, + "grad_norm": 0.2282373160123825, + "learning_rate": 8.717810730117158e-05, + "loss": 0.7497, + "step": 2042 + }, + { + "epoch": 0.4199814986123959, + "grad_norm": 0.2128640115261078, + "learning_rate": 8.717462493661192e-05, + "loss": 0.7085, + "step": 2043 + }, + { + "epoch": 0.4201870695857745, + "grad_norm": 0.22235573828220367, + "learning_rate": 8.717114049430558e-05, + "loss": 0.7508, + "step": 2044 + }, + { + "epoch": 0.42039264055915304, + "grad_norm": 0.21980416774749756, + "learning_rate": 8.716765397442428e-05, + "loss": 0.7091, + "step": 2045 + }, + { + "epoch": 0.42059821153253163, + "grad_norm": 0.20546141266822815, + "learning_rate": 8.716416537713978e-05, + "loss": 0.7008, + "step": 2046 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 0.2216566503047943, + "learning_rate": 8.71606747026239e-05, + "loss": 0.6921, + "step": 2047 + }, + { + "epoch": 0.42100935347928875, + "grad_norm": 0.2280108779668808, + "learning_rate": 8.715718195104863e-05, + "loss": 0.7094, + "step": 2048 + }, + { + "epoch": 0.4212149244526673, + "grad_norm": 0.18423175811767578, + "learning_rate": 8.715368712258605e-05, + "loss": 0.6069, + "step": 2049 + }, + { + "epoch": 0.4214204954260458, + "grad_norm": 0.22304539382457733, + "learning_rate": 8.715019021740834e-05, + "loss": 0.7094, + "step": 2050 + }, + { + "epoch": 0.4216260663994244, + "grad_norm": 0.2160019874572754, + "learning_rate": 8.714669123568776e-05, + "loss": 0.7204, + "step": 2051 + }, + { + "epoch": 0.42183163737280294, + "grad_norm": 0.21349206566810608, + "learning_rate": 8.714319017759671e-05, + "loss": 0.7041, + "step": 2052 + }, + { + "epoch": 0.4220372083461815, + "grad_norm": 0.2105959951877594, + "learning_rate": 8.713968704330766e-05, + "loss": 0.7152, + "step": 2053 + }, + { + "epoch": 0.42224277931956006, + "grad_norm": 0.21072207391262054, + "learning_rate": 8.713618183299318e-05, + "loss": 0.7148, + "step": 2054 + }, + { + "epoch": 0.42244835029293865, + "grad_norm": 0.2207954227924347, + "learning_rate": 8.713267454682595e-05, + "loss": 0.7272, + "step": 2055 + }, + { + "epoch": 0.4226539212663172, + "grad_norm": 0.21951311826705933, + "learning_rate": 8.712916518497877e-05, + "loss": 0.7121, + "step": 2056 + }, + { + "epoch": 0.42285949223969577, + "grad_norm": 0.21501171588897705, + "learning_rate": 8.712565374762456e-05, + "loss": 0.7086, + "step": 2057 + }, + { + "epoch": 0.4230650632130743, + "grad_norm": 0.21046118438243866, + "learning_rate": 8.712214023493628e-05, + "loss": 0.6967, + "step": 2058 + }, + { + "epoch": 0.4232706341864529, + "grad_norm": 0.1807229071855545, + "learning_rate": 8.711862464708701e-05, + "loss": 0.5913, + "step": 2059 + }, + { + "epoch": 0.4234762051598314, + "grad_norm": 0.22645685076713562, + "learning_rate": 8.711510698424999e-05, + "loss": 0.7036, + "step": 2060 + }, + { + "epoch": 0.42368177613321, + "grad_norm": 0.22503720223903656, + "learning_rate": 8.711158724659848e-05, + "loss": 0.7092, + "step": 2061 + }, + { + "epoch": 0.42388734710658854, + "grad_norm": 0.21952955424785614, + "learning_rate": 8.71080654343059e-05, + "loss": 0.7028, + "step": 2062 + }, + { + "epoch": 0.42409291807996713, + "grad_norm": 0.21978265047073364, + "learning_rate": 8.710454154754574e-05, + "loss": 0.6954, + "step": 2063 + }, + { + "epoch": 0.42429848905334566, + "grad_norm": 0.21806906163692474, + "learning_rate": 8.710101558649162e-05, + "loss": 0.6992, + "step": 2064 + }, + { + "epoch": 0.42450406002672425, + "grad_norm": 0.14885424077510834, + "learning_rate": 8.709748755131724e-05, + "loss": 0.5892, + "step": 2065 + }, + { + "epoch": 0.4247096310001028, + "grad_norm": 0.230007603764534, + "learning_rate": 8.709395744219641e-05, + "loss": 0.7061, + "step": 2066 + }, + { + "epoch": 0.42491520197348137, + "grad_norm": 0.21456275880336761, + "learning_rate": 8.709042525930305e-05, + "loss": 0.699, + "step": 2067 + }, + { + "epoch": 0.4251207729468599, + "grad_norm": 0.21649466454982758, + "learning_rate": 8.708689100281116e-05, + "loss": 0.6888, + "step": 2068 + }, + { + "epoch": 0.42532634392023844, + "grad_norm": 0.2111383080482483, + "learning_rate": 8.708335467289487e-05, + "loss": 0.7007, + "step": 2069 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.2149335891008377, + "learning_rate": 8.707981626972839e-05, + "loss": 0.6819, + "step": 2070 + }, + { + "epoch": 0.42573748586699556, + "grad_norm": 0.14442218840122223, + "learning_rate": 8.707627579348605e-05, + "loss": 0.5817, + "step": 2071 + }, + { + "epoch": 0.42594305684037415, + "grad_norm": 0.21797578036785126, + "learning_rate": 8.707273324434225e-05, + "loss": 0.693, + "step": 2072 + }, + { + "epoch": 0.4261486278137527, + "grad_norm": 0.2137763351202011, + "learning_rate": 8.706918862247155e-05, + "loss": 0.7087, + "step": 2073 + }, + { + "epoch": 0.42635419878713127, + "grad_norm": 0.21722511947155, + "learning_rate": 8.706564192804854e-05, + "loss": 0.7327, + "step": 2074 + }, + { + "epoch": 0.4265597697605098, + "grad_norm": 0.21744219958782196, + "learning_rate": 8.706209316124798e-05, + "loss": 0.7024, + "step": 2075 + }, + { + "epoch": 0.4267653407338884, + "grad_norm": 0.21922947466373444, + "learning_rate": 8.705854232224467e-05, + "loss": 0.7089, + "step": 2076 + }, + { + "epoch": 0.4269709117072669, + "grad_norm": 0.20731019973754883, + "learning_rate": 8.705498941121357e-05, + "loss": 0.7112, + "step": 2077 + }, + { + "epoch": 0.4271764826806455, + "grad_norm": 0.15655431151390076, + "learning_rate": 8.705143442832973e-05, + "loss": 0.5976, + "step": 2078 + }, + { + "epoch": 0.42738205365402404, + "grad_norm": 0.22649213671684265, + "learning_rate": 8.704787737376822e-05, + "loss": 0.7271, + "step": 2079 + }, + { + "epoch": 0.42758762462740263, + "grad_norm": 0.2306176871061325, + "learning_rate": 8.704431824770436e-05, + "loss": 0.7294, + "step": 2080 + }, + { + "epoch": 0.42779319560078116, + "grad_norm": 0.21303272247314453, + "learning_rate": 8.704075705031344e-05, + "loss": 0.703, + "step": 2081 + }, + { + "epoch": 0.42799876657415975, + "grad_norm": 0.2082429826259613, + "learning_rate": 8.70371937817709e-05, + "loss": 0.7122, + "step": 2082 + }, + { + "epoch": 0.4282043375475383, + "grad_norm": 0.21812103688716888, + "learning_rate": 8.703362844225233e-05, + "loss": 0.6854, + "step": 2083 + }, + { + "epoch": 0.42840990852091687, + "grad_norm": 0.22010985016822815, + "learning_rate": 8.703006103193334e-05, + "loss": 0.7085, + "step": 2084 + }, + { + "epoch": 0.4286154794942954, + "grad_norm": 0.21230296790599823, + "learning_rate": 8.70264915509897e-05, + "loss": 0.6915, + "step": 2085 + }, + { + "epoch": 0.428821050467674, + "grad_norm": 0.22726766765117645, + "learning_rate": 8.702291999959725e-05, + "loss": 0.7325, + "step": 2086 + }, + { + "epoch": 0.4290266214410525, + "grad_norm": 0.22241102159023285, + "learning_rate": 8.701934637793194e-05, + "loss": 0.7029, + "step": 2087 + }, + { + "epoch": 0.42923219241443106, + "grad_norm": 0.1587475687265396, + "learning_rate": 8.701577068616984e-05, + "loss": 0.5836, + "step": 2088 + }, + { + "epoch": 0.42943776338780965, + "grad_norm": 0.2406635880470276, + "learning_rate": 8.701219292448708e-05, + "loss": 0.6863, + "step": 2089 + }, + { + "epoch": 0.4296433343611882, + "grad_norm": 0.21944580972194672, + "learning_rate": 8.700861309305995e-05, + "loss": 0.6938, + "step": 2090 + }, + { + "epoch": 0.42984890533456677, + "grad_norm": 0.21135850250720978, + "learning_rate": 8.700503119206481e-05, + "loss": 0.685, + "step": 2091 + }, + { + "epoch": 0.4300544763079453, + "grad_norm": 0.20949722826480865, + "learning_rate": 8.700144722167811e-05, + "loss": 0.6967, + "step": 2092 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 0.21594803035259247, + "learning_rate": 8.699786118207642e-05, + "loss": 0.7037, + "step": 2093 + }, + { + "epoch": 0.4304656182547024, + "grad_norm": 0.16418609023094177, + "learning_rate": 8.69942730734364e-05, + "loss": 0.5692, + "step": 2094 + }, + { + "epoch": 0.430671189228081, + "grad_norm": 0.23615112900733948, + "learning_rate": 8.699068289593483e-05, + "loss": 0.7278, + "step": 2095 + }, + { + "epoch": 0.43087676020145954, + "grad_norm": 0.22218084335327148, + "learning_rate": 8.698709064974858e-05, + "loss": 0.677, + "step": 2096 + }, + { + "epoch": 0.43108233117483813, + "grad_norm": 0.21628277003765106, + "learning_rate": 8.698349633505462e-05, + "loss": 0.6902, + "step": 2097 + }, + { + "epoch": 0.43128790214821666, + "grad_norm": 0.21895258128643036, + "learning_rate": 8.697989995203002e-05, + "loss": 0.6952, + "step": 2098 + }, + { + "epoch": 0.43149347312159525, + "grad_norm": 0.21633300185203552, + "learning_rate": 8.697630150085197e-05, + "loss": 0.7332, + "step": 2099 + }, + { + "epoch": 0.4316990440949738, + "grad_norm": 0.2174568474292755, + "learning_rate": 8.697270098169774e-05, + "loss": 0.6904, + "step": 2100 + }, + { + "epoch": 0.43190461506835237, + "grad_norm": 0.22629016637802124, + "learning_rate": 8.696909839474473e-05, + "loss": 0.7198, + "step": 2101 + }, + { + "epoch": 0.4321101860417309, + "grad_norm": 0.20996680855751038, + "learning_rate": 8.696549374017038e-05, + "loss": 0.6932, + "step": 2102 + }, + { + "epoch": 0.4323157570151095, + "grad_norm": 0.20978742837905884, + "learning_rate": 8.696188701815231e-05, + "loss": 0.684, + "step": 2103 + }, + { + "epoch": 0.432521327988488, + "grad_norm": 0.21533238887786865, + "learning_rate": 8.695827822886818e-05, + "loss": 0.7218, + "step": 2104 + }, + { + "epoch": 0.43272689896186656, + "grad_norm": 0.20759303867816925, + "learning_rate": 8.695466737249582e-05, + "loss": 0.6742, + "step": 2105 + }, + { + "epoch": 0.43293246993524515, + "grad_norm": 0.17055755853652954, + "learning_rate": 8.695105444921307e-05, + "loss": 0.5937, + "step": 2106 + }, + { + "epoch": 0.4331380409086237, + "grad_norm": 0.1438744068145752, + "learning_rate": 8.694743945919796e-05, + "loss": 0.5962, + "step": 2107 + }, + { + "epoch": 0.43334361188200227, + "grad_norm": 0.23514226078987122, + "learning_rate": 8.694382240262857e-05, + "loss": 0.7071, + "step": 2108 + }, + { + "epoch": 0.4335491828553808, + "grad_norm": 0.16390731930732727, + "learning_rate": 8.694020327968309e-05, + "loss": 0.597, + "step": 2109 + }, + { + "epoch": 0.4337547538287594, + "grad_norm": 0.21311801671981812, + "learning_rate": 8.693658209053983e-05, + "loss": 0.7061, + "step": 2110 + }, + { + "epoch": 0.4339603248021379, + "grad_norm": 0.21026752889156342, + "learning_rate": 8.693295883537717e-05, + "loss": 0.7125, + "step": 2111 + }, + { + "epoch": 0.4341658957755165, + "grad_norm": 0.21940794587135315, + "learning_rate": 8.692933351437362e-05, + "loss": 0.7429, + "step": 2112 + }, + { + "epoch": 0.43437146674889504, + "grad_norm": 0.22087624669075012, + "learning_rate": 8.69257061277078e-05, + "loss": 0.7089, + "step": 2113 + }, + { + "epoch": 0.43457703772227363, + "grad_norm": 0.21447579562664032, + "learning_rate": 8.69220766755584e-05, + "loss": 0.7126, + "step": 2114 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.18616484105587006, + "learning_rate": 8.691844515810422e-05, + "loss": 0.5893, + "step": 2115 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 0.2412138730287552, + "learning_rate": 8.691481157552418e-05, + "loss": 0.6838, + "step": 2116 + }, + { + "epoch": 0.4351937506424093, + "grad_norm": 0.2211569845676422, + "learning_rate": 8.691117592799726e-05, + "loss": 0.7146, + "step": 2117 + }, + { + "epoch": 0.43539932161578787, + "grad_norm": 0.22833772003650665, + "learning_rate": 8.690753821570261e-05, + "loss": 0.6909, + "step": 2118 + }, + { + "epoch": 0.4356048925891664, + "grad_norm": 0.22425860166549683, + "learning_rate": 8.690389843881944e-05, + "loss": 0.7387, + "step": 2119 + }, + { + "epoch": 0.435810463562545, + "grad_norm": 0.20990809798240662, + "learning_rate": 8.690025659752702e-05, + "loss": 0.7058, + "step": 2120 + }, + { + "epoch": 0.4360160345359235, + "grad_norm": 0.21391835808753967, + "learning_rate": 8.689661269200483e-05, + "loss": 0.706, + "step": 2121 + }, + { + "epoch": 0.4362216055093021, + "grad_norm": 0.21198540925979614, + "learning_rate": 8.689296672243234e-05, + "loss": 0.6776, + "step": 2122 + }, + { + "epoch": 0.43642717648268065, + "grad_norm": 0.22344285249710083, + "learning_rate": 8.68893186889892e-05, + "loss": 0.6062, + "step": 2123 + }, + { + "epoch": 0.4366327474560592, + "grad_norm": 0.23118963837623596, + "learning_rate": 8.68856685918551e-05, + "loss": 0.7088, + "step": 2124 + }, + { + "epoch": 0.43683831842943777, + "grad_norm": 0.14518238604068756, + "learning_rate": 8.68820164312099e-05, + "loss": 0.5962, + "step": 2125 + }, + { + "epoch": 0.4370438894028163, + "grad_norm": 0.22062361240386963, + "learning_rate": 8.68783622072335e-05, + "loss": 0.7169, + "step": 2126 + }, + { + "epoch": 0.4372494603761949, + "grad_norm": 0.21670423448085785, + "learning_rate": 8.687470592010593e-05, + "loss": 0.6916, + "step": 2127 + }, + { + "epoch": 0.4374550313495734, + "grad_norm": 0.21488401293754578, + "learning_rate": 8.687104757000733e-05, + "loss": 0.7139, + "step": 2128 + }, + { + "epoch": 0.437660602322952, + "grad_norm": 0.22047607600688934, + "learning_rate": 8.686738715711791e-05, + "loss": 0.6969, + "step": 2129 + }, + { + "epoch": 0.43786617329633054, + "grad_norm": 0.21157632768154144, + "learning_rate": 8.686372468161802e-05, + "loss": 0.7293, + "step": 2130 + }, + { + "epoch": 0.43807174426970913, + "grad_norm": 0.2109154462814331, + "learning_rate": 8.686006014368806e-05, + "loss": 0.7178, + "step": 2131 + }, + { + "epoch": 0.43827731524308766, + "grad_norm": 0.2221369594335556, + "learning_rate": 8.685639354350862e-05, + "loss": 0.7315, + "step": 2132 + }, + { + "epoch": 0.43848288621646625, + "grad_norm": 0.2168595790863037, + "learning_rate": 8.68527248812603e-05, + "loss": 0.7079, + "step": 2133 + }, + { + "epoch": 0.4386884571898448, + "grad_norm": 0.2099953144788742, + "learning_rate": 8.684905415712383e-05, + "loss": 0.7007, + "step": 2134 + }, + { + "epoch": 0.43889402816322337, + "grad_norm": 0.21563635766506195, + "learning_rate": 8.684538137128008e-05, + "loss": 0.716, + "step": 2135 + }, + { + "epoch": 0.4390995991366019, + "grad_norm": 0.2030235230922699, + "learning_rate": 8.684170652390996e-05, + "loss": 0.7029, + "step": 2136 + }, + { + "epoch": 0.4393051701099805, + "grad_norm": 0.21220625936985016, + "learning_rate": 8.683802961519454e-05, + "loss": 0.7057, + "step": 2137 + }, + { + "epoch": 0.439510741083359, + "grad_norm": 0.2082281857728958, + "learning_rate": 8.683435064531496e-05, + "loss": 0.6924, + "step": 2138 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 0.2149658501148224, + "learning_rate": 8.683066961445245e-05, + "loss": 0.7082, + "step": 2139 + }, + { + "epoch": 0.43992188303011615, + "grad_norm": 0.21991075575351715, + "learning_rate": 8.682698652278836e-05, + "loss": 0.7101, + "step": 2140 + }, + { + "epoch": 0.44012745400349473, + "grad_norm": 0.21779777109622955, + "learning_rate": 8.682330137050415e-05, + "loss": 0.6922, + "step": 2141 + }, + { + "epoch": 0.44033302497687327, + "grad_norm": 0.21721771359443665, + "learning_rate": 8.681961415778134e-05, + "loss": 0.7198, + "step": 2142 + }, + { + "epoch": 0.4405385959502518, + "grad_norm": 0.21693062782287598, + "learning_rate": 8.681592488480163e-05, + "loss": 0.74, + "step": 2143 + }, + { + "epoch": 0.4407441669236304, + "grad_norm": 0.21777969598770142, + "learning_rate": 8.681223355174673e-05, + "loss": 0.6871, + "step": 2144 + }, + { + "epoch": 0.4409497378970089, + "grad_norm": 0.2129591703414917, + "learning_rate": 8.680854015879852e-05, + "loss": 0.6949, + "step": 2145 + }, + { + "epoch": 0.4411553088703875, + "grad_norm": 0.20881325006484985, + "learning_rate": 8.680484470613896e-05, + "loss": 0.6919, + "step": 2146 + }, + { + "epoch": 0.44136087984376604, + "grad_norm": 0.21094316244125366, + "learning_rate": 8.680114719395007e-05, + "loss": 0.7102, + "step": 2147 + }, + { + "epoch": 0.44156645081714463, + "grad_norm": 0.2205977588891983, + "learning_rate": 8.679744762241407e-05, + "loss": 0.6933, + "step": 2148 + }, + { + "epoch": 0.44177202179052316, + "grad_norm": 0.2161235362291336, + "learning_rate": 8.679374599171317e-05, + "loss": 0.7472, + "step": 2149 + }, + { + "epoch": 0.44197759276390175, + "grad_norm": 0.2870723009109497, + "learning_rate": 8.679004230202973e-05, + "loss": 0.5985, + "step": 2150 + }, + { + "epoch": 0.4421831637372803, + "grad_norm": 0.22053900361061096, + "learning_rate": 8.678633655354627e-05, + "loss": 0.7013, + "step": 2151 + }, + { + "epoch": 0.44238873471065887, + "grad_norm": 0.22010482847690582, + "learning_rate": 8.67826287464453e-05, + "loss": 0.7361, + "step": 2152 + }, + { + "epoch": 0.4425943056840374, + "grad_norm": 0.2220645248889923, + "learning_rate": 8.677891888090949e-05, + "loss": 0.7354, + "step": 2153 + }, + { + "epoch": 0.442799876657416, + "grad_norm": 0.22568100690841675, + "learning_rate": 8.677520695712164e-05, + "loss": 0.6069, + "step": 2154 + }, + { + "epoch": 0.4430054476307945, + "grad_norm": 0.21187719702720642, + "learning_rate": 8.677149297526459e-05, + "loss": 0.6829, + "step": 2155 + }, + { + "epoch": 0.4432110186041731, + "grad_norm": 0.22478394210338593, + "learning_rate": 8.676777693552132e-05, + "loss": 0.6992, + "step": 2156 + }, + { + "epoch": 0.44341658957755165, + "grad_norm": 0.2064889669418335, + "learning_rate": 8.67640588380749e-05, + "loss": 0.6845, + "step": 2157 + }, + { + "epoch": 0.44362216055093023, + "grad_norm": 0.21473796665668488, + "learning_rate": 8.67603386831085e-05, + "loss": 0.706, + "step": 2158 + }, + { + "epoch": 0.44382773152430877, + "grad_norm": 0.22386027872562408, + "learning_rate": 8.675661647080541e-05, + "loss": 0.7064, + "step": 2159 + }, + { + "epoch": 0.4440333024976873, + "grad_norm": 0.21549421548843384, + "learning_rate": 8.675289220134901e-05, + "loss": 0.6826, + "step": 2160 + }, + { + "epoch": 0.4442388734710659, + "grad_norm": 0.1654203236103058, + "learning_rate": 8.674916587492274e-05, + "loss": 0.5987, + "step": 2161 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.23500193655490875, + "learning_rate": 8.674543749171023e-05, + "loss": 0.7202, + "step": 2162 + }, + { + "epoch": 0.444650015417823, + "grad_norm": 0.22905461490154266, + "learning_rate": 8.67417070518951e-05, + "loss": 0.7066, + "step": 2163 + }, + { + "epoch": 0.44485558639120154, + "grad_norm": 0.1377820372581482, + "learning_rate": 8.673797455566118e-05, + "loss": 0.5963, + "step": 2164 + }, + { + "epoch": 0.44506115736458013, + "grad_norm": 0.21596823632717133, + "learning_rate": 8.673424000319233e-05, + "loss": 0.6887, + "step": 2165 + }, + { + "epoch": 0.44526672833795866, + "grad_norm": 0.13856928050518036, + "learning_rate": 8.673050339467255e-05, + "loss": 0.5903, + "step": 2166 + }, + { + "epoch": 0.44547229931133725, + "grad_norm": 0.22425222396850586, + "learning_rate": 8.672676473028591e-05, + "loss": 0.696, + "step": 2167 + }, + { + "epoch": 0.4456778702847158, + "grad_norm": 0.20974132418632507, + "learning_rate": 8.672302401021662e-05, + "loss": 0.6882, + "step": 2168 + }, + { + "epoch": 0.44588344125809437, + "grad_norm": 0.20939786732196808, + "learning_rate": 8.671928123464893e-05, + "loss": 0.6787, + "step": 2169 + }, + { + "epoch": 0.4460890122314729, + "grad_norm": 0.21304769814014435, + "learning_rate": 8.671553640376724e-05, + "loss": 0.6775, + "step": 2170 + }, + { + "epoch": 0.4462945832048515, + "grad_norm": 0.21474890410900116, + "learning_rate": 8.671178951775607e-05, + "loss": 0.6984, + "step": 2171 + }, + { + "epoch": 0.44650015417823, + "grad_norm": 0.2142523229122162, + "learning_rate": 8.670804057679999e-05, + "loss": 0.6975, + "step": 2172 + }, + { + "epoch": 0.4467057251516086, + "grad_norm": 0.21635667979717255, + "learning_rate": 8.670428958108367e-05, + "loss": 0.6998, + "step": 2173 + }, + { + "epoch": 0.44691129612498715, + "grad_norm": 0.18972234427928925, + "learning_rate": 8.670053653079194e-05, + "loss": 0.5905, + "step": 2174 + }, + { + "epoch": 0.44711686709836573, + "grad_norm": 0.22437618672847748, + "learning_rate": 8.669678142610969e-05, + "loss": 0.7078, + "step": 2175 + }, + { + "epoch": 0.44732243807174427, + "grad_norm": 0.22813966870307922, + "learning_rate": 8.669302426722192e-05, + "loss": 0.6999, + "step": 2176 + }, + { + "epoch": 0.44752800904512285, + "grad_norm": 0.14738696813583374, + "learning_rate": 8.66892650543137e-05, + "loss": 0.5654, + "step": 2177 + }, + { + "epoch": 0.4477335800185014, + "grad_norm": 0.2084706872701645, + "learning_rate": 8.668550378757024e-05, + "loss": 0.7261, + "step": 2178 + }, + { + "epoch": 0.4479391509918799, + "grad_norm": 0.22098992764949799, + "learning_rate": 8.668174046717686e-05, + "loss": 0.7273, + "step": 2179 + }, + { + "epoch": 0.4481447219652585, + "grad_norm": 0.20854520797729492, + "learning_rate": 8.667797509331895e-05, + "loss": 0.7197, + "step": 2180 + }, + { + "epoch": 0.44835029293863704, + "grad_norm": 0.2072971910238266, + "learning_rate": 8.667420766618198e-05, + "loss": 0.6683, + "step": 2181 + }, + { + "epoch": 0.44855586391201563, + "grad_norm": 0.20528066158294678, + "learning_rate": 8.667043818595162e-05, + "loss": 0.7181, + "step": 2182 + }, + { + "epoch": 0.44876143488539416, + "grad_norm": 0.21476523578166962, + "learning_rate": 8.666666665281352e-05, + "loss": 0.72, + "step": 2183 + }, + { + "epoch": 0.44896700585877275, + "grad_norm": 0.20512348413467407, + "learning_rate": 8.666289306695351e-05, + "loss": 0.6984, + "step": 2184 + }, + { + "epoch": 0.4491725768321513, + "grad_norm": 0.21752099692821503, + "learning_rate": 8.665911742855748e-05, + "loss": 0.6836, + "step": 2185 + }, + { + "epoch": 0.44937814780552987, + "grad_norm": 0.21713502705097198, + "learning_rate": 8.665533973781145e-05, + "loss": 0.6965, + "step": 2186 + }, + { + "epoch": 0.4495837187789084, + "grad_norm": 0.22159411013126373, + "learning_rate": 8.665155999490153e-05, + "loss": 0.7348, + "step": 2187 + }, + { + "epoch": 0.449789289752287, + "grad_norm": 0.20660369098186493, + "learning_rate": 8.664777820001394e-05, + "loss": 0.6958, + "step": 2188 + }, + { + "epoch": 0.4499948607256655, + "grad_norm": 0.1848221719264984, + "learning_rate": 8.664399435333497e-05, + "loss": 0.5917, + "step": 2189 + }, + { + "epoch": 0.4502004316990441, + "grad_norm": 0.15177948772907257, + "learning_rate": 8.664020845505104e-05, + "loss": 0.5976, + "step": 2190 + }, + { + "epoch": 0.45040600267242265, + "grad_norm": 0.23266561329364777, + "learning_rate": 8.663642050534867e-05, + "loss": 0.7185, + "step": 2191 + }, + { + "epoch": 0.45061157364580123, + "grad_norm": 0.2253771871328354, + "learning_rate": 8.663263050441446e-05, + "loss": 0.6928, + "step": 2192 + }, + { + "epoch": 0.45081714461917977, + "grad_norm": 0.20975717902183533, + "learning_rate": 8.662883845243515e-05, + "loss": 0.7157, + "step": 2193 + }, + { + "epoch": 0.45102271559255835, + "grad_norm": 0.23472397029399872, + "learning_rate": 8.662504434959753e-05, + "loss": 0.7103, + "step": 2194 + }, + { + "epoch": 0.4512282865659369, + "grad_norm": 0.22584107518196106, + "learning_rate": 8.662124819608853e-05, + "loss": 0.7278, + "step": 2195 + }, + { + "epoch": 0.4514338575393155, + "grad_norm": 0.22365206480026245, + "learning_rate": 8.661744999209518e-05, + "loss": 0.599, + "step": 2196 + }, + { + "epoch": 0.451639428512694, + "grad_norm": 0.24951714277267456, + "learning_rate": 8.661364973780458e-05, + "loss": 0.7315, + "step": 2197 + }, + { + "epoch": 0.45184499948607254, + "grad_norm": 0.22680872678756714, + "learning_rate": 8.660984743340396e-05, + "loss": 0.7005, + "step": 2198 + }, + { + "epoch": 0.45205057045945113, + "grad_norm": 0.22146962583065033, + "learning_rate": 8.660604307908063e-05, + "loss": 0.6956, + "step": 2199 + }, + { + "epoch": 0.45225614143282966, + "grad_norm": 0.16175302863121033, + "learning_rate": 8.660223667502205e-05, + "loss": 0.5844, + "step": 2200 + }, + { + "epoch": 0.45246171240620825, + "grad_norm": 0.24984121322631836, + "learning_rate": 8.65984282214157e-05, + "loss": 0.7104, + "step": 2201 + }, + { + "epoch": 0.4526672833795868, + "grad_norm": 0.23822738230228424, + "learning_rate": 8.659461771844923e-05, + "loss": 0.7287, + "step": 2202 + }, + { + "epoch": 0.45287285435296537, + "grad_norm": 0.21192102134227753, + "learning_rate": 8.659080516631036e-05, + "loss": 0.714, + "step": 2203 + }, + { + "epoch": 0.4530784253263439, + "grad_norm": 0.23573461174964905, + "learning_rate": 8.65869905651869e-05, + "loss": 0.7125, + "step": 2204 + }, + { + "epoch": 0.4532839962997225, + "grad_norm": 0.22849269211292267, + "learning_rate": 8.658317391526678e-05, + "loss": 0.7213, + "step": 2205 + }, + { + "epoch": 0.453489567273101, + "grad_norm": 0.2162596434354782, + "learning_rate": 8.657935521673808e-05, + "loss": 0.7036, + "step": 2206 + }, + { + "epoch": 0.4536951382464796, + "grad_norm": 0.22291293740272522, + "learning_rate": 8.657553446978885e-05, + "loss": 0.7055, + "step": 2207 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 0.23885302245616913, + "learning_rate": 8.657171167460738e-05, + "loss": 0.7177, + "step": 2208 + }, + { + "epoch": 0.45410628019323673, + "grad_norm": 0.1670546680688858, + "learning_rate": 8.656788683138198e-05, + "loss": 0.5963, + "step": 2209 + }, + { + "epoch": 0.45431185116661527, + "grad_norm": 0.26193171739578247, + "learning_rate": 8.656405994030109e-05, + "loss": 0.6881, + "step": 2210 + }, + { + "epoch": 0.45451742213999385, + "grad_norm": 0.2238868772983551, + "learning_rate": 8.656023100155324e-05, + "loss": 0.6955, + "step": 2211 + }, + { + "epoch": 0.4547229931133724, + "grad_norm": 0.22464968264102936, + "learning_rate": 8.655640001532704e-05, + "loss": 0.6937, + "step": 2212 + }, + { + "epoch": 0.454928564086751, + "grad_norm": 0.2210894376039505, + "learning_rate": 8.655256698181125e-05, + "loss": 0.7033, + "step": 2213 + }, + { + "epoch": 0.4551341350601295, + "grad_norm": 0.2309311479330063, + "learning_rate": 8.654873190119472e-05, + "loss": 0.6877, + "step": 2214 + }, + { + "epoch": 0.4553397060335081, + "grad_norm": 0.15510539710521698, + "learning_rate": 8.654489477366635e-05, + "loss": 0.6074, + "step": 2215 + }, + { + "epoch": 0.45554527700688663, + "grad_norm": 0.1340515911579132, + "learning_rate": 8.654105559941519e-05, + "loss": 0.5916, + "step": 2216 + }, + { + "epoch": 0.45575084798026516, + "grad_norm": 0.3258119225502014, + "learning_rate": 8.653721437863041e-05, + "loss": 0.6729, + "step": 2217 + }, + { + "epoch": 0.45595641895364375, + "grad_norm": 0.24723531305789948, + "learning_rate": 8.653337111150121e-05, + "loss": 0.6963, + "step": 2218 + }, + { + "epoch": 0.4561619899270223, + "grad_norm": 0.16881807148456573, + "learning_rate": 8.652952579821693e-05, + "loss": 0.5994, + "step": 2219 + }, + { + "epoch": 0.45636756090040087, + "grad_norm": 0.16700582206249237, + "learning_rate": 8.652567843896702e-05, + "loss": 0.5822, + "step": 2220 + }, + { + "epoch": 0.4565731318737794, + "grad_norm": 0.1435755044221878, + "learning_rate": 8.652182903394105e-05, + "loss": 0.5809, + "step": 2221 + }, + { + "epoch": 0.456778702847158, + "grad_norm": 0.14672505855560303, + "learning_rate": 8.651797758332862e-05, + "loss": 0.5943, + "step": 2222 + }, + { + "epoch": 0.4569842738205365, + "grad_norm": 0.3784264922142029, + "learning_rate": 8.651412408731949e-05, + "loss": 0.7184, + "step": 2223 + }, + { + "epoch": 0.4571898447939151, + "grad_norm": 0.24264433979988098, + "learning_rate": 8.651026854610348e-05, + "loss": 0.6976, + "step": 2224 + }, + { + "epoch": 0.45739541576729364, + "grad_norm": 0.26151180267333984, + "learning_rate": 8.650641095987059e-05, + "loss": 0.6998, + "step": 2225 + }, + { + "epoch": 0.45760098674067223, + "grad_norm": 0.33650773763656616, + "learning_rate": 8.650255132881082e-05, + "loss": 0.7366, + "step": 2226 + }, + { + "epoch": 0.45780655771405077, + "grad_norm": 0.27262553572654724, + "learning_rate": 8.649868965311432e-05, + "loss": 0.7319, + "step": 2227 + }, + { + "epoch": 0.45801212868742935, + "grad_norm": 0.2205299288034439, + "learning_rate": 8.649482593297135e-05, + "loss": 0.6905, + "step": 2228 + }, + { + "epoch": 0.4582176996608079, + "grad_norm": 0.2557431757450104, + "learning_rate": 8.649096016857226e-05, + "loss": 0.6974, + "step": 2229 + }, + { + "epoch": 0.4584232706341865, + "grad_norm": 0.27587607502937317, + "learning_rate": 8.648709236010749e-05, + "loss": 0.7024, + "step": 2230 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 0.32615306973457336, + "learning_rate": 8.64832225077676e-05, + "loss": 0.6211, + "step": 2231 + }, + { + "epoch": 0.4588344125809436, + "grad_norm": 0.24620257318019867, + "learning_rate": 8.647935061174321e-05, + "loss": 0.7277, + "step": 2232 + }, + { + "epoch": 0.45903998355432213, + "grad_norm": 0.2339821755886078, + "learning_rate": 8.647547667222509e-05, + "loss": 0.7122, + "step": 2233 + }, + { + "epoch": 0.45924555452770066, + "grad_norm": 0.21899057924747467, + "learning_rate": 8.647160068940411e-05, + "loss": 0.7294, + "step": 2234 + }, + { + "epoch": 0.45945112550107925, + "grad_norm": 0.21356239914894104, + "learning_rate": 8.646772266347119e-05, + "loss": 0.7077, + "step": 2235 + }, + { + "epoch": 0.4596566964744578, + "grad_norm": 0.21990163624286652, + "learning_rate": 8.646384259461737e-05, + "loss": 0.6991, + "step": 2236 + }, + { + "epoch": 0.45986226744783637, + "grad_norm": 0.2190622240304947, + "learning_rate": 8.645996048303385e-05, + "loss": 0.7178, + "step": 2237 + }, + { + "epoch": 0.4600678384212149, + "grad_norm": 0.20803511142730713, + "learning_rate": 8.645607632891187e-05, + "loss": 0.6785, + "step": 2238 + }, + { + "epoch": 0.4602734093945935, + "grad_norm": 0.20758850872516632, + "learning_rate": 8.645219013244277e-05, + "loss": 0.6661, + "step": 2239 + }, + { + "epoch": 0.460478980367972, + "grad_norm": 0.21537218987941742, + "learning_rate": 8.6448301893818e-05, + "loss": 0.7075, + "step": 2240 + }, + { + "epoch": 0.4606845513413506, + "grad_norm": 0.2241329848766327, + "learning_rate": 8.644441161322912e-05, + "loss": 0.7014, + "step": 2241 + }, + { + "epoch": 0.46089012231472914, + "grad_norm": 0.20497076213359833, + "learning_rate": 8.64405192908678e-05, + "loss": 0.6964, + "step": 2242 + }, + { + "epoch": 0.46109569328810773, + "grad_norm": 0.20961910486221313, + "learning_rate": 8.643662492692578e-05, + "loss": 0.6976, + "step": 2243 + }, + { + "epoch": 0.46130126426148627, + "grad_norm": 0.2163321077823639, + "learning_rate": 8.643272852159493e-05, + "loss": 0.7253, + "step": 2244 + }, + { + "epoch": 0.46150683523486485, + "grad_norm": 0.21539649367332458, + "learning_rate": 8.642883007506721e-05, + "loss": 0.6848, + "step": 2245 + }, + { + "epoch": 0.4617124062082434, + "grad_norm": 0.2067098766565323, + "learning_rate": 8.642492958753465e-05, + "loss": 0.7156, + "step": 2246 + }, + { + "epoch": 0.461917977181622, + "grad_norm": 0.21964769065380096, + "learning_rate": 8.642102705918945e-05, + "loss": 0.6989, + "step": 2247 + }, + { + "epoch": 0.4621235481550005, + "grad_norm": 0.2275928258895874, + "learning_rate": 8.641712249022384e-05, + "loss": 0.6847, + "step": 2248 + }, + { + "epoch": 0.4623291191283791, + "grad_norm": 0.2040269672870636, + "learning_rate": 8.641321588083018e-05, + "loss": 0.6973, + "step": 2249 + }, + { + "epoch": 0.46253469010175763, + "grad_norm": 0.23092588782310486, + "learning_rate": 8.640930723120093e-05, + "loss": 0.7266, + "step": 2250 + }, + { + "epoch": 0.4627402610751362, + "grad_norm": 0.2156527191400528, + "learning_rate": 8.640539654152868e-05, + "loss": 0.7062, + "step": 2251 + }, + { + "epoch": 0.46294583204851475, + "grad_norm": 0.2142401933670044, + "learning_rate": 8.640148381200607e-05, + "loss": 0.7047, + "step": 2252 + }, + { + "epoch": 0.4631514030218933, + "grad_norm": 0.31457456946372986, + "learning_rate": 8.639756904282586e-05, + "loss": 0.6032, + "step": 2253 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 0.23436057567596436, + "learning_rate": 8.639365223418091e-05, + "loss": 0.7436, + "step": 2254 + }, + { + "epoch": 0.4635625449686504, + "grad_norm": 0.14833630621433258, + "learning_rate": 8.638973338626418e-05, + "loss": 0.588, + "step": 2255 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.24190352857112885, + "learning_rate": 8.638581249926876e-05, + "loss": 0.7079, + "step": 2256 + }, + { + "epoch": 0.4639736869154075, + "grad_norm": 0.2287464588880539, + "learning_rate": 8.638188957338778e-05, + "loss": 0.6983, + "step": 2257 + }, + { + "epoch": 0.4641792578887861, + "grad_norm": 0.24814251065254211, + "learning_rate": 8.637796460881454e-05, + "loss": 0.707, + "step": 2258 + }, + { + "epoch": 0.46438482886216464, + "grad_norm": 0.22504420578479767, + "learning_rate": 8.637403760574236e-05, + "loss": 0.7045, + "step": 2259 + }, + { + "epoch": 0.46459039983554323, + "grad_norm": 0.21358801424503326, + "learning_rate": 8.637010856436475e-05, + "loss": 0.7027, + "step": 2260 + }, + { + "epoch": 0.46479597080892177, + "grad_norm": 0.21219758689403534, + "learning_rate": 8.636617748487523e-05, + "loss": 0.689, + "step": 2261 + }, + { + "epoch": 0.46500154178230035, + "grad_norm": 0.21138092875480652, + "learning_rate": 8.63622443674675e-05, + "loss": 0.7208, + "step": 2262 + }, + { + "epoch": 0.4652071127556789, + "grad_norm": 0.27241116762161255, + "learning_rate": 8.635830921233532e-05, + "loss": 0.5964, + "step": 2263 + }, + { + "epoch": 0.4654126837290575, + "grad_norm": 0.2141522914171219, + "learning_rate": 8.635437201967255e-05, + "loss": 0.7362, + "step": 2264 + }, + { + "epoch": 0.465618254702436, + "grad_norm": 0.2085803896188736, + "learning_rate": 8.635043278967317e-05, + "loss": 0.6859, + "step": 2265 + }, + { + "epoch": 0.4658238256758146, + "grad_norm": 0.21698498725891113, + "learning_rate": 8.634649152253123e-05, + "loss": 0.7078, + "step": 2266 + }, + { + "epoch": 0.46602939664919313, + "grad_norm": 0.19954286515712738, + "learning_rate": 8.63425482184409e-05, + "loss": 0.6877, + "step": 2267 + }, + { + "epoch": 0.4662349676225717, + "grad_norm": 0.18924130499362946, + "learning_rate": 8.633860287759646e-05, + "loss": 0.6001, + "step": 2268 + }, + { + "epoch": 0.46644053859595025, + "grad_norm": 0.15498289465904236, + "learning_rate": 8.633465550019227e-05, + "loss": 0.5894, + "step": 2269 + }, + { + "epoch": 0.46664610956932884, + "grad_norm": 0.2448817938566208, + "learning_rate": 8.633070608642282e-05, + "loss": 0.6883, + "step": 2270 + }, + { + "epoch": 0.46685168054270737, + "grad_norm": 0.24218863248825073, + "learning_rate": 8.632675463648264e-05, + "loss": 0.7305, + "step": 2271 + }, + { + "epoch": 0.4670572515160859, + "grad_norm": 0.21386098861694336, + "learning_rate": 8.632280115056642e-05, + "loss": 0.703, + "step": 2272 + }, + { + "epoch": 0.4672628224894645, + "grad_norm": 0.20794478058815002, + "learning_rate": 8.631884562886894e-05, + "loss": 0.7054, + "step": 2273 + }, + { + "epoch": 0.467468393462843, + "grad_norm": 0.22331750392913818, + "learning_rate": 8.631488807158505e-05, + "loss": 0.7116, + "step": 2274 + }, + { + "epoch": 0.4676739644362216, + "grad_norm": 0.22476287186145782, + "learning_rate": 8.631092847890973e-05, + "loss": 0.7001, + "step": 2275 + }, + { + "epoch": 0.46787953540960014, + "grad_norm": 0.23165211081504822, + "learning_rate": 8.630696685103806e-05, + "loss": 0.5924, + "step": 2276 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 0.17003892362117767, + "learning_rate": 8.63030031881652e-05, + "loss": 0.5951, + "step": 2277 + }, + { + "epoch": 0.46829067735635727, + "grad_norm": 0.14959658682346344, + "learning_rate": 8.629903749048642e-05, + "loss": 0.5875, + "step": 2278 + }, + { + "epoch": 0.46849624832973585, + "grad_norm": 0.28558462858200073, + "learning_rate": 8.629506975819709e-05, + "loss": 0.7339, + "step": 2279 + }, + { + "epoch": 0.4687018193031144, + "grad_norm": 0.2474449872970581, + "learning_rate": 8.629109999149268e-05, + "loss": 0.7125, + "step": 2280 + }, + { + "epoch": 0.468907390276493, + "grad_norm": 0.22551508247852325, + "learning_rate": 8.628712819056878e-05, + "loss": 0.7266, + "step": 2281 + }, + { + "epoch": 0.4691129612498715, + "grad_norm": 0.23484089970588684, + "learning_rate": 8.628315435562105e-05, + "loss": 0.686, + "step": 2282 + }, + { + "epoch": 0.4693185322232501, + "grad_norm": 0.2324771285057068, + "learning_rate": 8.627917848684525e-05, + "loss": 0.7387, + "step": 2283 + }, + { + "epoch": 0.46952410319662863, + "grad_norm": 0.28548941016197205, + "learning_rate": 8.627520058443727e-05, + "loss": 0.6007, + "step": 2284 + }, + { + "epoch": 0.4697296741700072, + "grad_norm": 0.1830257922410965, + "learning_rate": 8.627122064859307e-05, + "loss": 0.5817, + "step": 2285 + }, + { + "epoch": 0.46993524514338575, + "grad_norm": 0.2828942835330963, + "learning_rate": 8.626723867950875e-05, + "loss": 0.6864, + "step": 2286 + }, + { + "epoch": 0.47014081611676434, + "grad_norm": 0.20021386444568634, + "learning_rate": 8.626325467738045e-05, + "loss": 0.5965, + "step": 2287 + }, + { + "epoch": 0.47034638709014287, + "grad_norm": 0.2412208914756775, + "learning_rate": 8.625926864240445e-05, + "loss": 0.7398, + "step": 2288 + }, + { + "epoch": 0.47055195806352146, + "grad_norm": 0.2284758985042572, + "learning_rate": 8.625528057477714e-05, + "loss": 0.7037, + "step": 2289 + }, + { + "epoch": 0.4707575290369, + "grad_norm": 0.22256653010845184, + "learning_rate": 8.625129047469498e-05, + "loss": 0.6852, + "step": 2290 + }, + { + "epoch": 0.4709631000102785, + "grad_norm": 0.21506358683109283, + "learning_rate": 8.624729834235455e-05, + "loss": 0.6848, + "step": 2291 + }, + { + "epoch": 0.4711686709836571, + "grad_norm": 0.2219688594341278, + "learning_rate": 8.624330417795251e-05, + "loss": 0.7025, + "step": 2292 + }, + { + "epoch": 0.47137424195703564, + "grad_norm": 0.22017613053321838, + "learning_rate": 8.623930798168564e-05, + "loss": 0.6911, + "step": 2293 + }, + { + "epoch": 0.47157981293041423, + "grad_norm": 0.2322702705860138, + "learning_rate": 8.623530975375084e-05, + "loss": 0.6266, + "step": 2294 + }, + { + "epoch": 0.47178538390379277, + "grad_norm": 0.25697195529937744, + "learning_rate": 8.623130949434505e-05, + "loss": 0.7211, + "step": 2295 + }, + { + "epoch": 0.47199095487717135, + "grad_norm": 0.16440944373607635, + "learning_rate": 8.622730720366535e-05, + "loss": 0.6019, + "step": 2296 + }, + { + "epoch": 0.4721965258505499, + "grad_norm": 0.2459285408258438, + "learning_rate": 8.622330288190893e-05, + "loss": 0.6854, + "step": 2297 + }, + { + "epoch": 0.4724020968239285, + "grad_norm": 0.25851428508758545, + "learning_rate": 8.621929652927306e-05, + "loss": 0.6919, + "step": 2298 + }, + { + "epoch": 0.472607667797307, + "grad_norm": 0.17177143692970276, + "learning_rate": 8.621528814595508e-05, + "loss": 0.5922, + "step": 2299 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 0.22151097655296326, + "learning_rate": 8.621127773215252e-05, + "loss": 0.6958, + "step": 2300 + }, + { + "epoch": 0.47301880974406413, + "grad_norm": 0.2228916585445404, + "learning_rate": 8.620726528806292e-05, + "loss": 0.7062, + "step": 2301 + }, + { + "epoch": 0.4732243807174427, + "grad_norm": 0.17388984560966492, + "learning_rate": 8.620325081388396e-05, + "loss": 0.5868, + "step": 2302 + }, + { + "epoch": 0.47342995169082125, + "grad_norm": 0.22164839506149292, + "learning_rate": 8.61992343098134e-05, + "loss": 0.6753, + "step": 2303 + }, + { + "epoch": 0.47363552266419984, + "grad_norm": 0.2175762802362442, + "learning_rate": 8.619521577604915e-05, + "loss": 0.7057, + "step": 2304 + }, + { + "epoch": 0.47384109363757837, + "grad_norm": 0.21533454954624176, + "learning_rate": 8.619119521278916e-05, + "loss": 0.6798, + "step": 2305 + }, + { + "epoch": 0.47404666461095696, + "grad_norm": 0.23147819936275482, + "learning_rate": 8.618717262023151e-05, + "loss": 0.7162, + "step": 2306 + }, + { + "epoch": 0.4742522355843355, + "grad_norm": 0.21729323267936707, + "learning_rate": 8.618314799857437e-05, + "loss": 0.7169, + "step": 2307 + }, + { + "epoch": 0.474457806557714, + "grad_norm": 0.19784866273403168, + "learning_rate": 8.617912134801603e-05, + "loss": 0.6863, + "step": 2308 + }, + { + "epoch": 0.4746633775310926, + "grad_norm": 0.20950141549110413, + "learning_rate": 8.617509266875484e-05, + "loss": 0.6784, + "step": 2309 + }, + { + "epoch": 0.47486894850447114, + "grad_norm": 0.2207701951265335, + "learning_rate": 8.617106196098928e-05, + "loss": 0.7182, + "step": 2310 + }, + { + "epoch": 0.47507451947784973, + "grad_norm": 0.21060660481452942, + "learning_rate": 8.616702922491794e-05, + "loss": 0.7051, + "step": 2311 + }, + { + "epoch": 0.47528009045122827, + "grad_norm": 0.21560098230838776, + "learning_rate": 8.616299446073948e-05, + "loss": 0.7186, + "step": 2312 + }, + { + "epoch": 0.47548566142460685, + "grad_norm": 0.20710930228233337, + "learning_rate": 8.615895766865268e-05, + "loss": 0.6939, + "step": 2313 + }, + { + "epoch": 0.4756912323979854, + "grad_norm": 0.20942838490009308, + "learning_rate": 8.615491884885642e-05, + "loss": 0.6854, + "step": 2314 + }, + { + "epoch": 0.475896803371364, + "grad_norm": 0.21396920084953308, + "learning_rate": 8.615087800154966e-05, + "loss": 0.6919, + "step": 2315 + }, + { + "epoch": 0.4761023743447425, + "grad_norm": 0.20860084891319275, + "learning_rate": 8.614683512693147e-05, + "loss": 0.715, + "step": 2316 + }, + { + "epoch": 0.4763079453181211, + "grad_norm": 0.19696597754955292, + "learning_rate": 8.614279022520105e-05, + "loss": 0.7004, + "step": 2317 + }, + { + "epoch": 0.47651351629149963, + "grad_norm": 0.214441180229187, + "learning_rate": 8.613874329655765e-05, + "loss": 0.695, + "step": 2318 + }, + { + "epoch": 0.4767190872648782, + "grad_norm": 0.20082063972949982, + "learning_rate": 8.613469434120065e-05, + "loss": 0.69, + "step": 2319 + }, + { + "epoch": 0.47692465823825675, + "grad_norm": 0.20159681141376495, + "learning_rate": 8.613064335932952e-05, + "loss": 0.6772, + "step": 2320 + }, + { + "epoch": 0.47713022921163534, + "grad_norm": 0.20627199113368988, + "learning_rate": 8.612659035114383e-05, + "loss": 0.6884, + "step": 2321 + }, + { + "epoch": 0.47733580018501387, + "grad_norm": 0.19715279340744019, + "learning_rate": 8.612253531684328e-05, + "loss": 0.5856, + "step": 2322 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 0.21673934161663055, + "learning_rate": 8.61184782566276e-05, + "loss": 0.7141, + "step": 2323 + }, + { + "epoch": 0.477746942131771, + "grad_norm": 0.21236567199230194, + "learning_rate": 8.611441917069668e-05, + "loss": 0.7081, + "step": 2324 + }, + { + "epoch": 0.4779525131051496, + "grad_norm": 0.22194881737232208, + "learning_rate": 8.61103580592505e-05, + "loss": 0.725, + "step": 2325 + }, + { + "epoch": 0.4781580840785281, + "grad_norm": 0.20836539566516876, + "learning_rate": 8.610629492248915e-05, + "loss": 0.6872, + "step": 2326 + }, + { + "epoch": 0.47836365505190664, + "grad_norm": 0.20728257298469543, + "learning_rate": 8.610222976061275e-05, + "loss": 0.6898, + "step": 2327 + }, + { + "epoch": 0.47856922602528523, + "grad_norm": 0.2103557288646698, + "learning_rate": 8.609816257382162e-05, + "loss": 0.6939, + "step": 2328 + }, + { + "epoch": 0.47877479699866377, + "grad_norm": 0.18069760501384735, + "learning_rate": 8.609409336231611e-05, + "loss": 0.5892, + "step": 2329 + }, + { + "epoch": 0.47898036797204235, + "grad_norm": 0.21599088609218597, + "learning_rate": 8.609002212629668e-05, + "loss": 0.7186, + "step": 2330 + }, + { + "epoch": 0.4791859389454209, + "grad_norm": 0.22007983922958374, + "learning_rate": 8.608594886596392e-05, + "loss": 0.6984, + "step": 2331 + }, + { + "epoch": 0.4793915099187995, + "grad_norm": 0.13403122127056122, + "learning_rate": 8.608187358151852e-05, + "loss": 0.5937, + "step": 2332 + }, + { + "epoch": 0.479597080892178, + "grad_norm": 0.21932478249073029, + "learning_rate": 8.607779627316119e-05, + "loss": 0.6969, + "step": 2333 + }, + { + "epoch": 0.4798026518655566, + "grad_norm": 0.22216017544269562, + "learning_rate": 8.607371694109285e-05, + "loss": 0.7011, + "step": 2334 + }, + { + "epoch": 0.48000822283893513, + "grad_norm": 0.20484335720539093, + "learning_rate": 8.606963558551445e-05, + "loss": 0.6637, + "step": 2335 + }, + { + "epoch": 0.4802137938123137, + "grad_norm": 0.22132568061351776, + "learning_rate": 8.606555220662707e-05, + "loss": 0.7098, + "step": 2336 + }, + { + "epoch": 0.48041936478569225, + "grad_norm": 0.15403473377227783, + "learning_rate": 8.606146680463187e-05, + "loss": 0.5913, + "step": 2337 + }, + { + "epoch": 0.48062493575907084, + "grad_norm": 0.21559444069862366, + "learning_rate": 8.605737937973011e-05, + "loss": 0.7038, + "step": 2338 + }, + { + "epoch": 0.48083050673244937, + "grad_norm": 0.13026109337806702, + "learning_rate": 8.605328993212317e-05, + "loss": 0.5778, + "step": 2339 + }, + { + "epoch": 0.48103607770582796, + "grad_norm": 0.2200099676847458, + "learning_rate": 8.604919846201255e-05, + "loss": 0.7091, + "step": 2340 + }, + { + "epoch": 0.4812416486792065, + "grad_norm": 0.21221928298473358, + "learning_rate": 8.604510496959975e-05, + "loss": 0.7062, + "step": 2341 + }, + { + "epoch": 0.4814472196525851, + "grad_norm": 0.20801213383674622, + "learning_rate": 8.604100945508648e-05, + "loss": 0.6884, + "step": 2342 + }, + { + "epoch": 0.4816527906259636, + "grad_norm": 0.23321124911308289, + "learning_rate": 8.603691191867451e-05, + "loss": 0.6849, + "step": 2343 + }, + { + "epoch": 0.4818583615993422, + "grad_norm": 0.1625455915927887, + "learning_rate": 8.603281236056569e-05, + "loss": 0.5854, + "step": 2344 + }, + { + "epoch": 0.48206393257272073, + "grad_norm": 0.14913566410541534, + "learning_rate": 8.602871078096198e-05, + "loss": 0.5857, + "step": 2345 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 0.23094283044338226, + "learning_rate": 8.602460718006548e-05, + "loss": 0.6814, + "step": 2346 + }, + { + "epoch": 0.48247507451947785, + "grad_norm": 0.21578393876552582, + "learning_rate": 8.602050155807832e-05, + "loss": 0.6983, + "step": 2347 + }, + { + "epoch": 0.4826806454928564, + "grad_norm": 0.21311207115650177, + "learning_rate": 8.601639391520278e-05, + "loss": 0.714, + "step": 2348 + }, + { + "epoch": 0.482886216466235, + "grad_norm": 0.20807845890522003, + "learning_rate": 8.601228425164123e-05, + "loss": 0.6955, + "step": 2349 + }, + { + "epoch": 0.4830917874396135, + "grad_norm": 0.2071390300989151, + "learning_rate": 8.600817256759611e-05, + "loss": 0.6911, + "step": 2350 + }, + { + "epoch": 0.4832973584129921, + "grad_norm": 0.20365330576896667, + "learning_rate": 8.600405886327001e-05, + "loss": 0.5981, + "step": 2351 + }, + { + "epoch": 0.48350292938637063, + "grad_norm": 0.21439498662948608, + "learning_rate": 8.599994313886558e-05, + "loss": 0.7061, + "step": 2352 + }, + { + "epoch": 0.4837085003597492, + "grad_norm": 0.22116196155548096, + "learning_rate": 8.599582539458558e-05, + "loss": 0.719, + "step": 2353 + }, + { + "epoch": 0.48391407133312775, + "grad_norm": 0.14612843096256256, + "learning_rate": 8.599170563063289e-05, + "loss": 0.5788, + "step": 2354 + }, + { + "epoch": 0.48411964230650634, + "grad_norm": 0.20347650349140167, + "learning_rate": 8.598758384721045e-05, + "loss": 0.6891, + "step": 2355 + }, + { + "epoch": 0.48432521327988487, + "grad_norm": 0.13734294474124908, + "learning_rate": 8.598346004452132e-05, + "loss": 0.5705, + "step": 2356 + }, + { + "epoch": 0.48453078425326346, + "grad_norm": 0.21844719350337982, + "learning_rate": 8.597933422276868e-05, + "loss": 0.7261, + "step": 2357 + }, + { + "epoch": 0.484736355226642, + "grad_norm": 0.20626910030841827, + "learning_rate": 8.597520638215578e-05, + "loss": 0.6712, + "step": 2358 + }, + { + "epoch": 0.4849419262000206, + "grad_norm": 0.2096855491399765, + "learning_rate": 8.597107652288598e-05, + "loss": 0.6777, + "step": 2359 + }, + { + "epoch": 0.4851474971733991, + "grad_norm": 0.20726048946380615, + "learning_rate": 8.596694464516273e-05, + "loss": 0.7194, + "step": 2360 + }, + { + "epoch": 0.4853530681467777, + "grad_norm": 0.2092740535736084, + "learning_rate": 8.59628107491896e-05, + "loss": 0.6859, + "step": 2361 + }, + { + "epoch": 0.48555863912015623, + "grad_norm": 0.20741955935955048, + "learning_rate": 8.595867483517025e-05, + "loss": 0.7095, + "step": 2362 + }, + { + "epoch": 0.4857642100935348, + "grad_norm": 0.1959150731563568, + "learning_rate": 8.595453690330843e-05, + "loss": 0.7032, + "step": 2363 + }, + { + "epoch": 0.48596978106691335, + "grad_norm": 0.20496924221515656, + "learning_rate": 8.5950396953808e-05, + "loss": 0.714, + "step": 2364 + }, + { + "epoch": 0.4861753520402919, + "grad_norm": 0.1742028295993805, + "learning_rate": 8.59462549868729e-05, + "loss": 0.5882, + "step": 2365 + }, + { + "epoch": 0.4863809230136705, + "grad_norm": 0.14946137368679047, + "learning_rate": 8.59421110027072e-05, + "loss": 0.5834, + "step": 2366 + }, + { + "epoch": 0.486586493987049, + "grad_norm": 0.22946619987487793, + "learning_rate": 8.593796500151507e-05, + "loss": 0.6916, + "step": 2367 + }, + { + "epoch": 0.4867920649604276, + "grad_norm": 0.2186809778213501, + "learning_rate": 8.593381698350074e-05, + "loss": 0.695, + "step": 2368 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 0.21201607584953308, + "learning_rate": 8.592966694886857e-05, + "loss": 0.6895, + "step": 2369 + }, + { + "epoch": 0.4872032069071847, + "grad_norm": 0.20772308111190796, + "learning_rate": 8.592551489782302e-05, + "loss": 0.6752, + "step": 2370 + }, + { + "epoch": 0.48740877788056325, + "grad_norm": 0.2207845002412796, + "learning_rate": 8.592136083056862e-05, + "loss": 0.7037, + "step": 2371 + }, + { + "epoch": 0.48761434885394184, + "grad_norm": 0.20530985295772552, + "learning_rate": 8.591720474731006e-05, + "loss": 0.6922, + "step": 2372 + }, + { + "epoch": 0.48781991982732037, + "grad_norm": 0.2157611846923828, + "learning_rate": 8.591304664825205e-05, + "loss": 0.7053, + "step": 2373 + }, + { + "epoch": 0.48802549080069896, + "grad_norm": 0.2080930769443512, + "learning_rate": 8.590888653359947e-05, + "loss": 0.6036, + "step": 2374 + }, + { + "epoch": 0.4882310617740775, + "grad_norm": 0.22034066915512085, + "learning_rate": 8.590472440355725e-05, + "loss": 0.6732, + "step": 2375 + }, + { + "epoch": 0.4884366327474561, + "grad_norm": 0.21666774153709412, + "learning_rate": 8.590056025833045e-05, + "loss": 0.6879, + "step": 2376 + }, + { + "epoch": 0.4886422037208346, + "grad_norm": 0.21656173467636108, + "learning_rate": 8.589639409812422e-05, + "loss": 0.7001, + "step": 2377 + }, + { + "epoch": 0.4888477746942132, + "grad_norm": 0.2207968384027481, + "learning_rate": 8.589222592314381e-05, + "loss": 0.6988, + "step": 2378 + }, + { + "epoch": 0.48905334566759173, + "grad_norm": 0.21282252669334412, + "learning_rate": 8.588805573359454e-05, + "loss": 0.6686, + "step": 2379 + }, + { + "epoch": 0.4892589166409703, + "grad_norm": 0.21024645864963531, + "learning_rate": 8.588388352968188e-05, + "loss": 0.6777, + "step": 2380 + }, + { + "epoch": 0.48946448761434885, + "grad_norm": 0.21151992678642273, + "learning_rate": 8.587970931161137e-05, + "loss": 0.6922, + "step": 2381 + }, + { + "epoch": 0.4896700585877274, + "grad_norm": 0.2125832885503769, + "learning_rate": 8.587553307958865e-05, + "loss": 0.6968, + "step": 2382 + }, + { + "epoch": 0.489875629561106, + "grad_norm": 0.22030989825725555, + "learning_rate": 8.587135483381948e-05, + "loss": 0.6913, + "step": 2383 + }, + { + "epoch": 0.4900812005344845, + "grad_norm": 0.2217807024717331, + "learning_rate": 8.586717457450967e-05, + "loss": 0.7198, + "step": 2384 + }, + { + "epoch": 0.4902867715078631, + "grad_norm": 0.20852632820606232, + "learning_rate": 8.586299230186519e-05, + "loss": 0.6752, + "step": 2385 + }, + { + "epoch": 0.4904923424812416, + "grad_norm": 0.20621474087238312, + "learning_rate": 8.585880801609208e-05, + "loss": 0.6783, + "step": 2386 + }, + { + "epoch": 0.4906979134546202, + "grad_norm": 0.21134278178215027, + "learning_rate": 8.585462171739647e-05, + "loss": 0.5887, + "step": 2387 + }, + { + "epoch": 0.49090348442799875, + "grad_norm": 0.2228272408246994, + "learning_rate": 8.58504334059846e-05, + "loss": 0.6875, + "step": 2388 + }, + { + "epoch": 0.49110905540137734, + "grad_norm": 0.2240232229232788, + "learning_rate": 8.584624308206281e-05, + "loss": 0.6768, + "step": 2389 + }, + { + "epoch": 0.49131462637475587, + "grad_norm": 0.21626600623130798, + "learning_rate": 8.584205074583754e-05, + "loss": 0.7107, + "step": 2390 + }, + { + "epoch": 0.49152019734813446, + "grad_norm": 0.21161963045597076, + "learning_rate": 8.583785639751532e-05, + "loss": 0.6794, + "step": 2391 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 0.21978048980236053, + "learning_rate": 8.583366003730278e-05, + "loss": 0.6772, + "step": 2392 + }, + { + "epoch": 0.4919313392948916, + "grad_norm": 0.20937666296958923, + "learning_rate": 8.582946166540668e-05, + "loss": 0.6825, + "step": 2393 + }, + { + "epoch": 0.4921369102682701, + "grad_norm": 0.21978282928466797, + "learning_rate": 8.582526128203385e-05, + "loss": 0.7231, + "step": 2394 + }, + { + "epoch": 0.4923424812416487, + "grad_norm": 0.21103829145431519, + "learning_rate": 8.582105888739121e-05, + "loss": 0.6941, + "step": 2395 + }, + { + "epoch": 0.49254805221502723, + "grad_norm": 0.20812061429023743, + "learning_rate": 8.581685448168579e-05, + "loss": 0.6734, + "step": 2396 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 0.2180771827697754, + "learning_rate": 8.581264806512471e-05, + "loss": 0.6817, + "step": 2397 + }, + { + "epoch": 0.49295919416178435, + "grad_norm": 0.20335964858531952, + "learning_rate": 8.580843963791524e-05, + "loss": 0.7109, + "step": 2398 + }, + { + "epoch": 0.49316476513516294, + "grad_norm": 0.22317105531692505, + "learning_rate": 8.580422920026468e-05, + "loss": 0.6899, + "step": 2399 + }, + { + "epoch": 0.4933703361085415, + "grad_norm": 0.2043156623840332, + "learning_rate": 8.580001675238047e-05, + "loss": 0.7072, + "step": 2400 + }, + { + "epoch": 0.49357590708192, + "grad_norm": 0.22758691012859344, + "learning_rate": 8.579580229447013e-05, + "loss": 0.5851, + "step": 2401 + }, + { + "epoch": 0.4937814780552986, + "grad_norm": 0.21011817455291748, + "learning_rate": 8.579158582674129e-05, + "loss": 0.6755, + "step": 2402 + }, + { + "epoch": 0.4939870490286771, + "grad_norm": 0.14406029880046844, + "learning_rate": 8.578736734940168e-05, + "loss": 0.5801, + "step": 2403 + }, + { + "epoch": 0.4941926200020557, + "grad_norm": 0.21777774393558502, + "learning_rate": 8.578314686265911e-05, + "loss": 0.6707, + "step": 2404 + }, + { + "epoch": 0.49439819097543425, + "grad_norm": 0.21820279955863953, + "learning_rate": 8.577892436672152e-05, + "loss": 0.6942, + "step": 2405 + }, + { + "epoch": 0.49460376194881284, + "grad_norm": 0.2069522887468338, + "learning_rate": 8.577469986179693e-05, + "loss": 0.6923, + "step": 2406 + }, + { + "epoch": 0.49480933292219137, + "grad_norm": 0.202153280377388, + "learning_rate": 8.577047334809346e-05, + "loss": 0.7045, + "step": 2407 + }, + { + "epoch": 0.49501490389556996, + "grad_norm": 0.22939299046993256, + "learning_rate": 8.576624482581932e-05, + "loss": 0.6958, + "step": 2408 + }, + { + "epoch": 0.4952204748689485, + "grad_norm": 0.19599145650863647, + "learning_rate": 8.576201429518283e-05, + "loss": 0.6101, + "step": 2409 + }, + { + "epoch": 0.4954260458423271, + "grad_norm": 0.2155923992395401, + "learning_rate": 8.575778175639245e-05, + "loss": 0.7045, + "step": 2410 + }, + { + "epoch": 0.4956316168157056, + "grad_norm": 0.13790921866893768, + "learning_rate": 8.575354720965663e-05, + "loss": 0.5729, + "step": 2411 + }, + { + "epoch": 0.4958371877890842, + "grad_norm": 0.23278020322322845, + "learning_rate": 8.574931065518403e-05, + "loss": 0.7441, + "step": 2412 + }, + { + "epoch": 0.49604275876246273, + "grad_norm": 0.15767961740493774, + "learning_rate": 8.574507209318337e-05, + "loss": 0.617, + "step": 2413 + }, + { + "epoch": 0.4962483297358413, + "grad_norm": 0.21228386461734772, + "learning_rate": 8.574083152386344e-05, + "loss": 0.6849, + "step": 2414 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 0.20901069045066833, + "learning_rate": 8.573658894743316e-05, + "loss": 0.6881, + "step": 2415 + }, + { + "epoch": 0.49665947168259844, + "grad_norm": 0.20342102646827698, + "learning_rate": 8.573234436410155e-05, + "loss": 0.7173, + "step": 2416 + }, + { + "epoch": 0.496865042655977, + "grad_norm": 0.22326229512691498, + "learning_rate": 8.572809777407771e-05, + "loss": 0.7265, + "step": 2417 + }, + { + "epoch": 0.49707061362935556, + "grad_norm": 0.2064063847064972, + "learning_rate": 8.572384917757086e-05, + "loss": 0.6939, + "step": 2418 + }, + { + "epoch": 0.4972761846027341, + "grad_norm": 0.2083250731229782, + "learning_rate": 8.57195985747903e-05, + "loss": 0.7009, + "step": 2419 + }, + { + "epoch": 0.4974817555761126, + "grad_norm": 0.20397667586803436, + "learning_rate": 8.571534596594544e-05, + "loss": 0.6835, + "step": 2420 + }, + { + "epoch": 0.4976873265494912, + "grad_norm": 0.2096882462501526, + "learning_rate": 8.571109135124579e-05, + "loss": 0.714, + "step": 2421 + }, + { + "epoch": 0.49789289752286975, + "grad_norm": 0.2030659317970276, + "learning_rate": 8.570683473090095e-05, + "loss": 0.6971, + "step": 2422 + }, + { + "epoch": 0.49809846849624834, + "grad_norm": 0.202758327126503, + "learning_rate": 8.570257610512064e-05, + "loss": 0.6856, + "step": 2423 + }, + { + "epoch": 0.49830403946962687, + "grad_norm": 0.20229479670524597, + "learning_rate": 8.569831547411464e-05, + "loss": 0.7063, + "step": 2424 + }, + { + "epoch": 0.49850961044300546, + "grad_norm": 0.2144801914691925, + "learning_rate": 8.569405283809285e-05, + "loss": 0.7056, + "step": 2425 + }, + { + "epoch": 0.498715181416384, + "grad_norm": 0.19797521829605103, + "learning_rate": 8.56897881972653e-05, + "loss": 0.6035, + "step": 2426 + }, + { + "epoch": 0.4989207523897626, + "grad_norm": 0.21914798021316528, + "learning_rate": 8.568552155184204e-05, + "loss": 0.6789, + "step": 2427 + }, + { + "epoch": 0.4991263233631411, + "grad_norm": 0.2153196483850479, + "learning_rate": 8.568125290203332e-05, + "loss": 0.7026, + "step": 2428 + }, + { + "epoch": 0.4993318943365197, + "grad_norm": 0.1549125760793686, + "learning_rate": 8.567698224804941e-05, + "loss": 0.5727, + "step": 2429 + }, + { + "epoch": 0.49953746530989823, + "grad_norm": 0.2103041261434555, + "learning_rate": 8.567270959010071e-05, + "loss": 0.7001, + "step": 2430 + }, + { + "epoch": 0.4997430362832768, + "grad_norm": 0.20346547663211823, + "learning_rate": 8.566843492839769e-05, + "loss": 0.6998, + "step": 2431 + }, + { + "epoch": 0.49994860725665535, + "grad_norm": 0.16657423973083496, + "learning_rate": 8.5664158263151e-05, + "loss": 0.5893, + "step": 2432 + }, + { + "epoch": 0.5001541782300339, + "grad_norm": 0.2198108732700348, + "learning_rate": 8.565987959457128e-05, + "loss": 0.692, + "step": 2433 + }, + { + "epoch": 0.5003597492034125, + "grad_norm": 0.21006634831428528, + "learning_rate": 8.565559892286934e-05, + "loss": 0.7012, + "step": 2434 + }, + { + "epoch": 0.500565320176791, + "grad_norm": 0.20093873143196106, + "learning_rate": 8.565131624825605e-05, + "loss": 0.6853, + "step": 2435 + }, + { + "epoch": 0.5007708911501696, + "grad_norm": 0.21130932867527008, + "learning_rate": 8.564703157094242e-05, + "loss": 0.7092, + "step": 2436 + }, + { + "epoch": 0.5009764621235482, + "grad_norm": 0.21420711278915405, + "learning_rate": 8.564274489113954e-05, + "loss": 0.7132, + "step": 2437 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 0.2129506766796112, + "learning_rate": 8.563845620905856e-05, + "loss": 0.6958, + "step": 2438 + }, + { + "epoch": 0.5013876040703052, + "grad_norm": 0.20229041576385498, + "learning_rate": 8.563416552491081e-05, + "loss": 0.6567, + "step": 2439 + }, + { + "epoch": 0.5015931750436838, + "grad_norm": 0.21202024817466736, + "learning_rate": 8.562987283890764e-05, + "loss": 0.7095, + "step": 2440 + }, + { + "epoch": 0.5017987460170624, + "grad_norm": 0.20876267552375793, + "learning_rate": 8.562557815126053e-05, + "loss": 0.6786, + "step": 2441 + }, + { + "epoch": 0.5020043169904409, + "grad_norm": 0.20050349831581116, + "learning_rate": 8.562128146218108e-05, + "loss": 0.6929, + "step": 2442 + }, + { + "epoch": 0.5022098879638195, + "grad_norm": 0.2047853022813797, + "learning_rate": 8.561698277188095e-05, + "loss": 0.6934, + "step": 2443 + }, + { + "epoch": 0.5024154589371981, + "grad_norm": 0.18259146809577942, + "learning_rate": 8.561268208057192e-05, + "loss": 0.6199, + "step": 2444 + }, + { + "epoch": 0.5026210299105767, + "grad_norm": 0.1506025195121765, + "learning_rate": 8.560837938846587e-05, + "loss": 0.6148, + "step": 2445 + }, + { + "epoch": 0.5028266008839551, + "grad_norm": 0.22317710518836975, + "learning_rate": 8.560407469577477e-05, + "loss": 0.7029, + "step": 2446 + }, + { + "epoch": 0.5030321718573337, + "grad_norm": 0.21875528991222382, + "learning_rate": 8.55997680027107e-05, + "loss": 0.7086, + "step": 2447 + }, + { + "epoch": 0.5032377428307123, + "grad_norm": 0.2068042755126953, + "learning_rate": 8.559545930948581e-05, + "loss": 0.6979, + "step": 2448 + }, + { + "epoch": 0.5034433138040909, + "grad_norm": 0.20604568719863892, + "learning_rate": 8.559114861631239e-05, + "loss": 0.6828, + "step": 2449 + }, + { + "epoch": 0.5036488847774694, + "grad_norm": 0.20887784659862518, + "learning_rate": 8.55868359234028e-05, + "loss": 0.7186, + "step": 2450 + }, + { + "epoch": 0.503854455750848, + "grad_norm": 0.23300114274024963, + "learning_rate": 8.55825212309695e-05, + "loss": 0.6772, + "step": 2451 + }, + { + "epoch": 0.5040600267242266, + "grad_norm": 0.2133777141571045, + "learning_rate": 8.557820453922507e-05, + "loss": 0.5952, + "step": 2452 + }, + { + "epoch": 0.5042655976976052, + "grad_norm": 0.23336206376552582, + "learning_rate": 8.557388584838216e-05, + "loss": 0.6794, + "step": 2453 + }, + { + "epoch": 0.5044711686709836, + "grad_norm": 0.22460931539535522, + "learning_rate": 8.556956515865353e-05, + "loss": 0.6914, + "step": 2454 + }, + { + "epoch": 0.5046767396443622, + "grad_norm": 0.21478697657585144, + "learning_rate": 8.556524247025206e-05, + "loss": 0.7215, + "step": 2455 + }, + { + "epoch": 0.5048823106177408, + "grad_norm": 0.22004112601280212, + "learning_rate": 8.556091778339068e-05, + "loss": 0.6831, + "step": 2456 + }, + { + "epoch": 0.5050878815911193, + "grad_norm": 0.21334481239318848, + "learning_rate": 8.555659109828247e-05, + "loss": 0.6868, + "step": 2457 + }, + { + "epoch": 0.5052934525644979, + "grad_norm": 0.20527870953083038, + "learning_rate": 8.555226241514059e-05, + "loss": 0.7008, + "step": 2458 + }, + { + "epoch": 0.5054990235378765, + "grad_norm": 0.2052440643310547, + "learning_rate": 8.554793173417825e-05, + "loss": 0.6851, + "step": 2459 + }, + { + "epoch": 0.505704594511255, + "grad_norm": 0.20601294934749603, + "learning_rate": 8.554359905560886e-05, + "loss": 0.7074, + "step": 2460 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 0.20732106268405914, + "learning_rate": 8.553926437964584e-05, + "loss": 0.7022, + "step": 2461 + }, + { + "epoch": 0.5061157364580121, + "grad_norm": 0.20242151618003845, + "learning_rate": 8.553492770650275e-05, + "loss": 0.7151, + "step": 2462 + }, + { + "epoch": 0.5063213074313907, + "grad_norm": 0.2136530876159668, + "learning_rate": 8.553058903639322e-05, + "loss": 0.6944, + "step": 2463 + }, + { + "epoch": 0.5065268784047693, + "grad_norm": 0.20471519231796265, + "learning_rate": 8.552624836953102e-05, + "loss": 0.7044, + "step": 2464 + }, + { + "epoch": 0.5067324493781478, + "grad_norm": 0.2073119431734085, + "learning_rate": 8.552190570612998e-05, + "loss": 0.7084, + "step": 2465 + }, + { + "epoch": 0.5069380203515264, + "grad_norm": 0.20517416298389435, + "learning_rate": 8.551756104640403e-05, + "loss": 0.7044, + "step": 2466 + }, + { + "epoch": 0.5071435913249049, + "grad_norm": 0.20278342068195343, + "learning_rate": 8.551321439056722e-05, + "loss": 0.724, + "step": 2467 + }, + { + "epoch": 0.5073491622982835, + "grad_norm": 0.20847640931606293, + "learning_rate": 8.550886573883371e-05, + "loss": 0.6805, + "step": 2468 + }, + { + "epoch": 0.507554733271662, + "grad_norm": 0.21068242192268372, + "learning_rate": 8.550451509141772e-05, + "loss": 0.6878, + "step": 2469 + }, + { + "epoch": 0.5077603042450406, + "grad_norm": 0.19965562224388123, + "learning_rate": 8.55001624485336e-05, + "loss": 0.6728, + "step": 2470 + }, + { + "epoch": 0.5079658752184192, + "grad_norm": 0.28934335708618164, + "learning_rate": 8.549580781039576e-05, + "loss": 0.6096, + "step": 2471 + }, + { + "epoch": 0.5081714461917978, + "grad_norm": 0.21150463819503784, + "learning_rate": 8.549145117721875e-05, + "loss": 0.7202, + "step": 2472 + }, + { + "epoch": 0.5083770171651762, + "grad_norm": 0.17131322622299194, + "learning_rate": 8.548709254921721e-05, + "loss": 0.5992, + "step": 2473 + }, + { + "epoch": 0.5085825881385548, + "grad_norm": 0.1621021330356598, + "learning_rate": 8.548273192660585e-05, + "loss": 0.5971, + "step": 2474 + }, + { + "epoch": 0.5087881591119334, + "grad_norm": 0.22314049303531647, + "learning_rate": 8.547836930959949e-05, + "loss": 0.7129, + "step": 2475 + }, + { + "epoch": 0.5089937300853119, + "grad_norm": 0.21151074767112732, + "learning_rate": 8.547400469841307e-05, + "loss": 0.6885, + "step": 2476 + }, + { + "epoch": 0.5091993010586905, + "grad_norm": 0.20470760762691498, + "learning_rate": 8.546963809326162e-05, + "loss": 0.7107, + "step": 2477 + }, + { + "epoch": 0.5094048720320691, + "grad_norm": 0.20865213871002197, + "learning_rate": 8.546526949436025e-05, + "loss": 0.7328, + "step": 2478 + }, + { + "epoch": 0.5096104430054477, + "grad_norm": 0.24143381416797638, + "learning_rate": 8.546089890192422e-05, + "loss": 0.5784, + "step": 2479 + }, + { + "epoch": 0.5098160139788261, + "grad_norm": 0.21726645529270172, + "learning_rate": 8.545652631616878e-05, + "loss": 0.7009, + "step": 2480 + }, + { + "epoch": 0.5100215849522047, + "grad_norm": 0.24358177185058594, + "learning_rate": 8.545215173730938e-05, + "loss": 0.7017, + "step": 2481 + }, + { + "epoch": 0.5102271559255833, + "grad_norm": 0.21474173665046692, + "learning_rate": 8.544777516556155e-05, + "loss": 0.6889, + "step": 2482 + }, + { + "epoch": 0.5104327268989619, + "grad_norm": 0.2038557231426239, + "learning_rate": 8.54433966011409e-05, + "loss": 0.7172, + "step": 2483 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.22823157906532288, + "learning_rate": 8.54390160442631e-05, + "loss": 0.7173, + "step": 2484 + }, + { + "epoch": 0.510843868845719, + "grad_norm": 0.20391489565372467, + "learning_rate": 8.5434633495144e-05, + "loss": 0.7198, + "step": 2485 + }, + { + "epoch": 0.5110494398190976, + "grad_norm": 0.1981978565454483, + "learning_rate": 8.543024895399953e-05, + "loss": 0.6856, + "step": 2486 + }, + { + "epoch": 0.5112550107924761, + "grad_norm": 0.2035714089870453, + "learning_rate": 8.542586242104563e-05, + "loss": 0.6885, + "step": 2487 + }, + { + "epoch": 0.5114605817658546, + "grad_norm": 0.20313310623168945, + "learning_rate": 8.542147389649847e-05, + "loss": 0.7015, + "step": 2488 + }, + { + "epoch": 0.5116661527392332, + "grad_norm": 0.20469297468662262, + "learning_rate": 8.541708338057419e-05, + "loss": 0.7098, + "step": 2489 + }, + { + "epoch": 0.5118717237126118, + "grad_norm": 0.2113511860370636, + "learning_rate": 8.541269087348913e-05, + "loss": 0.7239, + "step": 2490 + }, + { + "epoch": 0.5120772946859904, + "grad_norm": 0.20842553675174713, + "learning_rate": 8.540829637545969e-05, + "loss": 0.7047, + "step": 2491 + }, + { + "epoch": 0.5122828656593689, + "grad_norm": 0.2060026377439499, + "learning_rate": 8.540389988670234e-05, + "loss": 0.6655, + "step": 2492 + }, + { + "epoch": 0.5124884366327475, + "grad_norm": 0.21950404345989227, + "learning_rate": 8.53995014074337e-05, + "loss": 0.6143, + "step": 2493 + }, + { + "epoch": 0.512694007606126, + "grad_norm": 0.21250604093074799, + "learning_rate": 8.539510093787044e-05, + "loss": 0.6995, + "step": 2494 + }, + { + "epoch": 0.5128995785795045, + "grad_norm": 0.21519462764263153, + "learning_rate": 8.539069847822938e-05, + "loss": 0.6877, + "step": 2495 + }, + { + "epoch": 0.5131051495528831, + "grad_norm": 0.21637707948684692, + "learning_rate": 8.538629402872738e-05, + "loss": 0.7088, + "step": 2496 + }, + { + "epoch": 0.5133107205262617, + "grad_norm": 0.2197788506746292, + "learning_rate": 8.538188758958144e-05, + "loss": 0.6753, + "step": 2497 + }, + { + "epoch": 0.5135162914996403, + "grad_norm": 0.22371014952659607, + "learning_rate": 8.537747916100865e-05, + "loss": 0.7074, + "step": 2498 + }, + { + "epoch": 0.5137218624730188, + "grad_norm": 0.16387100517749786, + "learning_rate": 8.537306874322618e-05, + "loss": 0.5846, + "step": 2499 + }, + { + "epoch": 0.5139274334463974, + "grad_norm": 0.24268200993537903, + "learning_rate": 8.536865633645132e-05, + "loss": 0.6932, + "step": 2500 + }, + { + "epoch": 0.5141330044197759, + "grad_norm": 0.23605839908123016, + "learning_rate": 8.536424194090144e-05, + "loss": 0.6874, + "step": 2501 + }, + { + "epoch": 0.5143385753931545, + "grad_norm": 0.20614401996135712, + "learning_rate": 8.535982555679402e-05, + "loss": 0.6704, + "step": 2502 + }, + { + "epoch": 0.514544146366533, + "grad_norm": 0.20825539529323578, + "learning_rate": 8.535540718434665e-05, + "loss": 0.7012, + "step": 2503 + }, + { + "epoch": 0.5147497173399116, + "grad_norm": 0.2111969292163849, + "learning_rate": 8.535098682377698e-05, + "loss": 0.6834, + "step": 2504 + }, + { + "epoch": 0.5149552883132902, + "grad_norm": 0.21059072017669678, + "learning_rate": 8.534656447530278e-05, + "loss": 0.7163, + "step": 2505 + }, + { + "epoch": 0.5151608592866688, + "grad_norm": 0.20956206321716309, + "learning_rate": 8.534214013914193e-05, + "loss": 0.6897, + "step": 2506 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 0.16276654601097107, + "learning_rate": 8.53377138155124e-05, + "loss": 0.5806, + "step": 2507 + }, + { + "epoch": 0.5155720012334258, + "grad_norm": 0.14373748004436493, + "learning_rate": 8.533328550463226e-05, + "loss": 0.5802, + "step": 2508 + }, + { + "epoch": 0.5157775722068044, + "grad_norm": 0.14410528540611267, + "learning_rate": 8.532885520671963e-05, + "loss": 0.5905, + "step": 2509 + }, + { + "epoch": 0.515983143180183, + "grad_norm": 0.25100046396255493, + "learning_rate": 8.532442292199283e-05, + "loss": 0.7222, + "step": 2510 + }, + { + "epoch": 0.5161887141535615, + "grad_norm": 0.1554838865995407, + "learning_rate": 8.531998865067017e-05, + "loss": 0.5799, + "step": 2511 + }, + { + "epoch": 0.5163942851269401, + "grad_norm": 0.21566714346408844, + "learning_rate": 8.531555239297013e-05, + "loss": 0.7103, + "step": 2512 + }, + { + "epoch": 0.5165998561003187, + "grad_norm": 0.1622397005558014, + "learning_rate": 8.531111414911126e-05, + "loss": 0.5907, + "step": 2513 + }, + { + "epoch": 0.5168054270736971, + "grad_norm": 0.2527947723865509, + "learning_rate": 8.530667391931221e-05, + "loss": 0.6972, + "step": 2514 + }, + { + "epoch": 0.5170109980470757, + "grad_norm": 0.14436852931976318, + "learning_rate": 8.530223170379174e-05, + "loss": 0.5834, + "step": 2515 + }, + { + "epoch": 0.5172165690204543, + "grad_norm": 0.22850194573402405, + "learning_rate": 8.529778750276866e-05, + "loss": 0.7095, + "step": 2516 + }, + { + "epoch": 0.5174221399938329, + "grad_norm": 0.21069450676441193, + "learning_rate": 8.529334131646196e-05, + "loss": 0.6754, + "step": 2517 + }, + { + "epoch": 0.5176277109672114, + "grad_norm": 0.16173620522022247, + "learning_rate": 8.528889314509066e-05, + "loss": 0.6033, + "step": 2518 + }, + { + "epoch": 0.51783328194059, + "grad_norm": 0.23078560829162598, + "learning_rate": 8.528444298887391e-05, + "loss": 0.6971, + "step": 2519 + }, + { + "epoch": 0.5180388529139686, + "grad_norm": 0.21634352207183838, + "learning_rate": 8.527999084803092e-05, + "loss": 0.6821, + "step": 2520 + }, + { + "epoch": 0.5182444238873471, + "grad_norm": 0.20838621258735657, + "learning_rate": 8.527553672278107e-05, + "loss": 0.7123, + "step": 2521 + }, + { + "epoch": 0.5184499948607256, + "grad_norm": 0.20532085001468658, + "learning_rate": 8.527108061334378e-05, + "loss": 0.7199, + "step": 2522 + }, + { + "epoch": 0.5186555658341042, + "grad_norm": 0.20181244611740112, + "learning_rate": 8.526662251993856e-05, + "loss": 0.6995, + "step": 2523 + }, + { + "epoch": 0.5188611368074828, + "grad_norm": 0.1562027484178543, + "learning_rate": 8.526216244278505e-05, + "loss": 0.5845, + "step": 2524 + }, + { + "epoch": 0.5190667077808614, + "grad_norm": 0.22398139536380768, + "learning_rate": 8.5257700382103e-05, + "loss": 0.7083, + "step": 2525 + }, + { + "epoch": 0.5192722787542399, + "grad_norm": 0.206566721200943, + "learning_rate": 8.52532363381122e-05, + "loss": 0.7012, + "step": 2526 + }, + { + "epoch": 0.5194778497276185, + "grad_norm": 0.20333848893642426, + "learning_rate": 8.524877031103259e-05, + "loss": 0.7052, + "step": 2527 + }, + { + "epoch": 0.519683420700997, + "grad_norm": 0.1408892273902893, + "learning_rate": 8.524430230108419e-05, + "loss": 0.5717, + "step": 2528 + }, + { + "epoch": 0.5198889916743756, + "grad_norm": 0.21199721097946167, + "learning_rate": 8.523983230848712e-05, + "loss": 0.6796, + "step": 2529 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 0.21294069290161133, + "learning_rate": 8.523536033346159e-05, + "loss": 0.6961, + "step": 2530 + }, + { + "epoch": 0.5203001336211327, + "grad_norm": 0.2040695995092392, + "learning_rate": 8.523088637622793e-05, + "loss": 0.7192, + "step": 2531 + }, + { + "epoch": 0.5205057045945113, + "grad_norm": 0.13950461149215698, + "learning_rate": 8.522641043700653e-05, + "loss": 0.5966, + "step": 2532 + }, + { + "epoch": 0.5207112755678898, + "grad_norm": 0.22141605615615845, + "learning_rate": 8.52219325160179e-05, + "loss": 0.7104, + "step": 2533 + }, + { + "epoch": 0.5209168465412684, + "grad_norm": 0.13655850291252136, + "learning_rate": 8.521745261348264e-05, + "loss": 0.5766, + "step": 2534 + }, + { + "epoch": 0.5211224175146469, + "grad_norm": 0.21564966440200806, + "learning_rate": 8.521297072962148e-05, + "loss": 0.7378, + "step": 2535 + }, + { + "epoch": 0.5213279884880255, + "grad_norm": 0.13964693248271942, + "learning_rate": 8.520848686465521e-05, + "loss": 0.5763, + "step": 2536 + }, + { + "epoch": 0.521533559461404, + "grad_norm": 0.20813791453838348, + "learning_rate": 8.520400101880472e-05, + "loss": 0.6768, + "step": 2537 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.20774829387664795, + "learning_rate": 8.519951319229101e-05, + "loss": 0.7078, + "step": 2538 + }, + { + "epoch": 0.5219447014081612, + "grad_norm": 0.14507782459259033, + "learning_rate": 8.519502338533519e-05, + "loss": 0.6009, + "step": 2539 + }, + { + "epoch": 0.5221502723815398, + "grad_norm": 0.21281610429286957, + "learning_rate": 8.519053159815843e-05, + "loss": 0.6951, + "step": 2540 + }, + { + "epoch": 0.5223558433549182, + "grad_norm": 0.21360744535923004, + "learning_rate": 8.518603783098203e-05, + "loss": 0.7098, + "step": 2541 + }, + { + "epoch": 0.5225614143282968, + "grad_norm": 0.20327754318714142, + "learning_rate": 8.518154208402736e-05, + "loss": 0.7009, + "step": 2542 + }, + { + "epoch": 0.5227669853016754, + "grad_norm": 0.200285404920578, + "learning_rate": 8.517704435751594e-05, + "loss": 0.6858, + "step": 2543 + }, + { + "epoch": 0.522972556275054, + "grad_norm": 0.13732387125492096, + "learning_rate": 8.517254465166932e-05, + "loss": 0.5735, + "step": 2544 + }, + { + "epoch": 0.5231781272484325, + "grad_norm": 0.21144580841064453, + "learning_rate": 8.516804296670919e-05, + "loss": 0.7217, + "step": 2545 + }, + { + "epoch": 0.5233836982218111, + "grad_norm": 0.20281550288200378, + "learning_rate": 8.516353930285735e-05, + "loss": 0.7018, + "step": 2546 + }, + { + "epoch": 0.5235892691951897, + "grad_norm": 0.1997842639684677, + "learning_rate": 8.515903366033563e-05, + "loss": 0.6991, + "step": 2547 + }, + { + "epoch": 0.5237948401685681, + "grad_norm": 0.13998793065547943, + "learning_rate": 8.515452603936603e-05, + "loss": 0.5788, + "step": 2548 + }, + { + "epoch": 0.5240004111419467, + "grad_norm": 0.2052655965089798, + "learning_rate": 8.51500164401706e-05, + "loss": 0.7221, + "step": 2549 + }, + { + "epoch": 0.5242059821153253, + "grad_norm": 0.21158649027347565, + "learning_rate": 8.514550486297155e-05, + "loss": 0.7077, + "step": 2550 + }, + { + "epoch": 0.5244115530887039, + "grad_norm": 0.2046501189470291, + "learning_rate": 8.51409913079911e-05, + "loss": 0.6898, + "step": 2551 + }, + { + "epoch": 0.5246171240620824, + "grad_norm": 0.13471710681915283, + "learning_rate": 8.513647577545163e-05, + "loss": 0.5809, + "step": 2552 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 0.21416522562503815, + "learning_rate": 8.51319582655756e-05, + "loss": 0.6954, + "step": 2553 + }, + { + "epoch": 0.5250282660088396, + "grad_norm": 0.21434451639652252, + "learning_rate": 8.512743877858554e-05, + "loss": 0.6864, + "step": 2554 + }, + { + "epoch": 0.5252338369822181, + "grad_norm": 0.2164076715707779, + "learning_rate": 8.512291731470415e-05, + "loss": 0.7236, + "step": 2555 + }, + { + "epoch": 0.5254394079555966, + "grad_norm": 0.2215905487537384, + "learning_rate": 8.511839387415415e-05, + "loss": 0.6808, + "step": 2556 + }, + { + "epoch": 0.5256449789289752, + "grad_norm": 0.212999165058136, + "learning_rate": 8.51138684571584e-05, + "loss": 0.6986, + "step": 2557 + }, + { + "epoch": 0.5258505499023538, + "grad_norm": 0.20863129198551178, + "learning_rate": 8.510934106393983e-05, + "loss": 0.708, + "step": 2558 + }, + { + "epoch": 0.5260561208757324, + "grad_norm": 0.14516817033290863, + "learning_rate": 8.51048116947215e-05, + "loss": 0.574, + "step": 2559 + }, + { + "epoch": 0.5262616918491109, + "grad_norm": 0.2149210274219513, + "learning_rate": 8.510028034972656e-05, + "loss": 0.6872, + "step": 2560 + }, + { + "epoch": 0.5264672628224895, + "grad_norm": 0.21908272802829742, + "learning_rate": 8.509574702917823e-05, + "loss": 0.6847, + "step": 2561 + }, + { + "epoch": 0.526672833795868, + "grad_norm": 0.1989137828350067, + "learning_rate": 8.509121173329985e-05, + "loss": 0.6807, + "step": 2562 + }, + { + "epoch": 0.5268784047692466, + "grad_norm": 0.14854271709918976, + "learning_rate": 8.508667446231486e-05, + "loss": 0.5931, + "step": 2563 + }, + { + "epoch": 0.5270839757426251, + "grad_norm": 0.21540796756744385, + "learning_rate": 8.508213521644677e-05, + "loss": 0.6948, + "step": 2564 + }, + { + "epoch": 0.5272895467160037, + "grad_norm": 0.21465127170085907, + "learning_rate": 8.507759399591922e-05, + "loss": 0.7256, + "step": 2565 + }, + { + "epoch": 0.5274951176893823, + "grad_norm": 0.2020212709903717, + "learning_rate": 8.507305080095595e-05, + "loss": 0.6946, + "step": 2566 + }, + { + "epoch": 0.5277006886627608, + "grad_norm": 0.21125240623950958, + "learning_rate": 8.506850563178077e-05, + "loss": 0.6756, + "step": 2567 + }, + { + "epoch": 0.5279062596361394, + "grad_norm": 0.17571476101875305, + "learning_rate": 8.506395848861759e-05, + "loss": 0.5914, + "step": 2568 + }, + { + "epoch": 0.5281118306095179, + "grad_norm": 0.22128242254257202, + "learning_rate": 8.505940937169044e-05, + "loss": 0.6772, + "step": 2569 + }, + { + "epoch": 0.5283174015828965, + "grad_norm": 0.13210316002368927, + "learning_rate": 8.505485828122341e-05, + "loss": 0.5798, + "step": 2570 + }, + { + "epoch": 0.528522972556275, + "grad_norm": 0.22432683408260345, + "learning_rate": 8.505030521744074e-05, + "loss": 0.693, + "step": 2571 + }, + { + "epoch": 0.5287285435296536, + "grad_norm": 0.15919888019561768, + "learning_rate": 8.504575018056672e-05, + "loss": 0.5888, + "step": 2572 + }, + { + "epoch": 0.5289341145030322, + "grad_norm": 0.21992851793766022, + "learning_rate": 8.504119317082577e-05, + "loss": 0.6978, + "step": 2573 + }, + { + "epoch": 0.5291396854764108, + "grad_norm": 0.2072344422340393, + "learning_rate": 8.503663418844238e-05, + "loss": 0.7253, + "step": 2574 + }, + { + "epoch": 0.5293452564497892, + "grad_norm": 0.14406660199165344, + "learning_rate": 8.503207323364117e-05, + "loss": 0.5729, + "step": 2575 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 0.21171186864376068, + "learning_rate": 8.50275103066468e-05, + "loss": 0.7078, + "step": 2576 + }, + { + "epoch": 0.5297563983965464, + "grad_norm": 0.22379416227340698, + "learning_rate": 8.502294540768409e-05, + "loss": 0.6871, + "step": 2577 + }, + { + "epoch": 0.529961969369925, + "grad_norm": 0.2064572423696518, + "learning_rate": 8.501837853697792e-05, + "loss": 0.7041, + "step": 2578 + }, + { + "epoch": 0.5301675403433035, + "grad_norm": 0.20695674419403076, + "learning_rate": 8.501380969475331e-05, + "loss": 0.7138, + "step": 2579 + }, + { + "epoch": 0.5303731113166821, + "grad_norm": 0.21721471846103668, + "learning_rate": 8.50092388812353e-05, + "loss": 0.7119, + "step": 2580 + }, + { + "epoch": 0.5305786822900607, + "grad_norm": 0.20023848116397858, + "learning_rate": 8.50046660966491e-05, + "loss": 0.6828, + "step": 2581 + }, + { + "epoch": 0.5307842532634393, + "grad_norm": 0.22572509944438934, + "learning_rate": 8.500009134121998e-05, + "loss": 0.7025, + "step": 2582 + }, + { + "epoch": 0.5309898242368177, + "grad_norm": 0.20377467572689056, + "learning_rate": 8.499551461517332e-05, + "loss": 0.6907, + "step": 2583 + }, + { + "epoch": 0.5311953952101963, + "grad_norm": 0.2061266154050827, + "learning_rate": 8.499093591873459e-05, + "loss": 0.7025, + "step": 2584 + }, + { + "epoch": 0.5314009661835749, + "grad_norm": 0.20886844396591187, + "learning_rate": 8.498635525212937e-05, + "loss": 0.689, + "step": 2585 + }, + { + "epoch": 0.5316065371569534, + "grad_norm": 0.21331052482128143, + "learning_rate": 8.498177261558332e-05, + "loss": 0.7088, + "step": 2586 + }, + { + "epoch": 0.531812108130332, + "grad_norm": 0.2123933583498001, + "learning_rate": 8.49771880093222e-05, + "loss": 0.6907, + "step": 2587 + }, + { + "epoch": 0.5320176791037106, + "grad_norm": 0.20878660678863525, + "learning_rate": 8.49726014335719e-05, + "loss": 0.724, + "step": 2588 + }, + { + "epoch": 0.5322232500770891, + "grad_norm": 0.1978175789117813, + "learning_rate": 8.496801288855835e-05, + "loss": 0.6824, + "step": 2589 + }, + { + "epoch": 0.5324288210504676, + "grad_norm": 0.21396887302398682, + "learning_rate": 8.496342237450761e-05, + "loss": 0.712, + "step": 2590 + }, + { + "epoch": 0.5326343920238462, + "grad_norm": 0.21784614026546478, + "learning_rate": 8.495882989164584e-05, + "loss": 0.6793, + "step": 2591 + }, + { + "epoch": 0.5328399629972248, + "grad_norm": 0.20604658126831055, + "learning_rate": 8.495423544019928e-05, + "loss": 0.7158, + "step": 2592 + }, + { + "epoch": 0.5330455339706034, + "grad_norm": 0.21813294291496277, + "learning_rate": 8.49496390203943e-05, + "loss": 0.6887, + "step": 2593 + }, + { + "epoch": 0.5332511049439819, + "grad_norm": 0.1722048819065094, + "learning_rate": 8.494504063245733e-05, + "loss": 0.6013, + "step": 2594 + }, + { + "epoch": 0.5334566759173605, + "grad_norm": 0.2043728232383728, + "learning_rate": 8.49404402766149e-05, + "loss": 0.684, + "step": 2595 + }, + { + "epoch": 0.533662246890739, + "grad_norm": 0.20982548594474792, + "learning_rate": 8.493583795309364e-05, + "loss": 0.6776, + "step": 2596 + }, + { + "epoch": 0.5338678178641176, + "grad_norm": 0.20805718004703522, + "learning_rate": 8.493123366212034e-05, + "loss": 0.7061, + "step": 2597 + }, + { + "epoch": 0.5340733888374961, + "grad_norm": 0.1766945868730545, + "learning_rate": 8.492662740392178e-05, + "loss": 0.595, + "step": 2598 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 0.22322477400302887, + "learning_rate": 8.49220191787249e-05, + "loss": 0.665, + "step": 2599 + }, + { + "epoch": 0.5344845307842533, + "grad_norm": 0.22785376012325287, + "learning_rate": 8.491740898675675e-05, + "loss": 0.7141, + "step": 2600 + }, + { + "epoch": 0.5346901017576319, + "grad_norm": 0.2232331484556198, + "learning_rate": 8.491279682824441e-05, + "loss": 0.7175, + "step": 2601 + }, + { + "epoch": 0.5348956727310104, + "grad_norm": 0.2167566865682602, + "learning_rate": 8.490818270341514e-05, + "loss": 0.6922, + "step": 2602 + }, + { + "epoch": 0.5351012437043889, + "grad_norm": 0.20170411467552185, + "learning_rate": 8.490356661249623e-05, + "loss": 0.6809, + "step": 2603 + }, + { + "epoch": 0.5353068146777675, + "grad_norm": 0.21896955370903015, + "learning_rate": 8.48989485557151e-05, + "loss": 0.6952, + "step": 2604 + }, + { + "epoch": 0.535512385651146, + "grad_norm": 0.17013712227344513, + "learning_rate": 8.489432853329927e-05, + "loss": 0.5891, + "step": 2605 + }, + { + "epoch": 0.5357179566245246, + "grad_norm": 0.23494184017181396, + "learning_rate": 8.488970654547632e-05, + "loss": 0.6739, + "step": 2606 + }, + { + "epoch": 0.5359235275979032, + "grad_norm": 0.21912021934986115, + "learning_rate": 8.4885082592474e-05, + "loss": 0.7035, + "step": 2607 + }, + { + "epoch": 0.5361290985712818, + "grad_norm": 0.14512377977371216, + "learning_rate": 8.488045667452006e-05, + "loss": 0.569, + "step": 2608 + }, + { + "epoch": 0.5363346695446602, + "grad_norm": 0.14050711691379547, + "learning_rate": 8.487582879184242e-05, + "loss": 0.5772, + "step": 2609 + }, + { + "epoch": 0.5365402405180388, + "grad_norm": 0.25031203031539917, + "learning_rate": 8.48711989446691e-05, + "loss": 0.6868, + "step": 2610 + }, + { + "epoch": 0.5367458114914174, + "grad_norm": 0.2108568251132965, + "learning_rate": 8.486656713322814e-05, + "loss": 0.6894, + "step": 2611 + }, + { + "epoch": 0.536951382464796, + "grad_norm": 0.22467973828315735, + "learning_rate": 8.486193335774777e-05, + "loss": 0.692, + "step": 2612 + }, + { + "epoch": 0.5371569534381745, + "grad_norm": 0.2571062743663788, + "learning_rate": 8.485729761845625e-05, + "loss": 0.705, + "step": 2613 + }, + { + "epoch": 0.5373625244115531, + "grad_norm": 0.21951597929000854, + "learning_rate": 8.485265991558196e-05, + "loss": 0.6824, + "step": 2614 + }, + { + "epoch": 0.5375680953849317, + "grad_norm": 0.22675755620002747, + "learning_rate": 8.48480202493534e-05, + "loss": 0.7114, + "step": 2615 + }, + { + "epoch": 0.5377736663583103, + "grad_norm": 0.2269049733877182, + "learning_rate": 8.484337861999912e-05, + "loss": 0.6641, + "step": 2616 + }, + { + "epoch": 0.5379792373316887, + "grad_norm": 0.21990883350372314, + "learning_rate": 8.48387350277478e-05, + "loss": 0.7275, + "step": 2617 + }, + { + "epoch": 0.5381848083050673, + "grad_norm": 0.21468190848827362, + "learning_rate": 8.483408947282823e-05, + "loss": 0.7202, + "step": 2618 + }, + { + "epoch": 0.5383903792784459, + "grad_norm": 0.21018457412719727, + "learning_rate": 8.482944195546925e-05, + "loss": 0.6831, + "step": 2619 + }, + { + "epoch": 0.5385959502518245, + "grad_norm": 0.2128850817680359, + "learning_rate": 8.482479247589982e-05, + "loss": 0.6809, + "step": 2620 + }, + { + "epoch": 0.538801521225203, + "grad_norm": 0.23084747791290283, + "learning_rate": 8.4820141034349e-05, + "loss": 0.6099, + "step": 2621 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 0.22527490556240082, + "learning_rate": 8.481548763104597e-05, + "loss": 0.7123, + "step": 2622 + }, + { + "epoch": 0.5392126631719601, + "grad_norm": 0.22562628984451294, + "learning_rate": 8.481083226621994e-05, + "loss": 0.707, + "step": 2623 + }, + { + "epoch": 0.5394182341453386, + "grad_norm": 0.21400360763072968, + "learning_rate": 8.48061749401003e-05, + "loss": 0.7019, + "step": 2624 + }, + { + "epoch": 0.5396238051187172, + "grad_norm": 0.20809048414230347, + "learning_rate": 8.480151565291646e-05, + "loss": 0.7188, + "step": 2625 + }, + { + "epoch": 0.5398293760920958, + "grad_norm": 0.21414582431316376, + "learning_rate": 8.479685440489798e-05, + "loss": 0.6698, + "step": 2626 + }, + { + "epoch": 0.5400349470654744, + "grad_norm": 0.19604355096817017, + "learning_rate": 8.47921911962745e-05, + "loss": 0.6728, + "step": 2627 + }, + { + "epoch": 0.5402405180388529, + "grad_norm": 0.2081209272146225, + "learning_rate": 8.478752602727573e-05, + "loss": 0.6839, + "step": 2628 + }, + { + "epoch": 0.5404460890122315, + "grad_norm": 0.21594710648059845, + "learning_rate": 8.478285889813153e-05, + "loss": 0.6845, + "step": 2629 + }, + { + "epoch": 0.54065165998561, + "grad_norm": 0.21320217847824097, + "learning_rate": 8.477818980907183e-05, + "loss": 0.7046, + "step": 2630 + }, + { + "epoch": 0.5408572309589886, + "grad_norm": 0.20672303438186646, + "learning_rate": 8.477351876032662e-05, + "loss": 0.7343, + "step": 2631 + }, + { + "epoch": 0.5410628019323671, + "grad_norm": 0.1888507753610611, + "learning_rate": 8.476884575212606e-05, + "loss": 0.6666, + "step": 2632 + }, + { + "epoch": 0.5412683729057457, + "grad_norm": 0.19607265293598175, + "learning_rate": 8.476417078470032e-05, + "loss": 0.6881, + "step": 2633 + }, + { + "epoch": 0.5414739438791243, + "grad_norm": 0.20374587178230286, + "learning_rate": 8.475949385827977e-05, + "loss": 0.6748, + "step": 2634 + }, + { + "epoch": 0.5416795148525029, + "grad_norm": 0.2075163573026657, + "learning_rate": 8.475481497309478e-05, + "loss": 0.7178, + "step": 2635 + }, + { + "epoch": 0.5418850858258814, + "grad_norm": 0.20457369089126587, + "learning_rate": 8.475013412937587e-05, + "loss": 0.6713, + "step": 2636 + }, + { + "epoch": 0.5420906567992599, + "grad_norm": 0.22288042306900024, + "learning_rate": 8.474545132735365e-05, + "loss": 0.593, + "step": 2637 + }, + { + "epoch": 0.5422962277726385, + "grad_norm": 0.2154739946126938, + "learning_rate": 8.474076656725881e-05, + "loss": 0.6944, + "step": 2638 + }, + { + "epoch": 0.5425017987460171, + "grad_norm": 0.21423187851905823, + "learning_rate": 8.473607984932215e-05, + "loss": 0.6635, + "step": 2639 + }, + { + "epoch": 0.5427073697193956, + "grad_norm": 0.24016740918159485, + "learning_rate": 8.473139117377456e-05, + "loss": 0.7088, + "step": 2640 + }, + { + "epoch": 0.5429129406927742, + "grad_norm": 0.2100851833820343, + "learning_rate": 8.472670054084704e-05, + "loss": 0.6737, + "step": 2641 + }, + { + "epoch": 0.5431185116661528, + "grad_norm": 0.20590589940547943, + "learning_rate": 8.472200795077065e-05, + "loss": 0.7015, + "step": 2642 + }, + { + "epoch": 0.5433240826395312, + "grad_norm": 0.20215122401714325, + "learning_rate": 8.47173134037766e-05, + "loss": 0.6834, + "step": 2643 + }, + { + "epoch": 0.5435296536129098, + "grad_norm": 0.17897242307662964, + "learning_rate": 8.471261690009615e-05, + "loss": 0.5736, + "step": 2644 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 0.1412929892539978, + "learning_rate": 8.470791843996068e-05, + "loss": 0.5684, + "step": 2645 + }, + { + "epoch": 0.543940795559667, + "grad_norm": 0.23520296812057495, + "learning_rate": 8.470321802360167e-05, + "loss": 0.6979, + "step": 2646 + }, + { + "epoch": 0.5441463665330455, + "grad_norm": 0.22806185483932495, + "learning_rate": 8.469851565125068e-05, + "loss": 0.6768, + "step": 2647 + }, + { + "epoch": 0.5443519375064241, + "grad_norm": 0.20918670296669006, + "learning_rate": 8.469381132313938e-05, + "loss": 0.669, + "step": 2648 + }, + { + "epoch": 0.5445575084798027, + "grad_norm": 0.21143250167369843, + "learning_rate": 8.468910503949951e-05, + "loss": 0.7044, + "step": 2649 + }, + { + "epoch": 0.5447630794531813, + "grad_norm": 0.21474787592887878, + "learning_rate": 8.468439680056295e-05, + "loss": 0.7171, + "step": 2650 + }, + { + "epoch": 0.5449686504265597, + "grad_norm": 0.20778292417526245, + "learning_rate": 8.467968660656164e-05, + "loss": 0.6719, + "step": 2651 + }, + { + "epoch": 0.5451742213999383, + "grad_norm": 0.20223721861839294, + "learning_rate": 8.467497445772764e-05, + "loss": 0.5761, + "step": 2652 + }, + { + "epoch": 0.5453797923733169, + "grad_norm": 0.16389262676239014, + "learning_rate": 8.467026035429308e-05, + "loss": 0.6203, + "step": 2653 + }, + { + "epoch": 0.5455853633466955, + "grad_norm": 0.23996488749980927, + "learning_rate": 8.466554429649022e-05, + "loss": 0.7091, + "step": 2654 + }, + { + "epoch": 0.545790934320074, + "grad_norm": 0.22990204393863678, + "learning_rate": 8.466082628455138e-05, + "loss": 0.6889, + "step": 2655 + }, + { + "epoch": 0.5459965052934526, + "grad_norm": 0.20042270421981812, + "learning_rate": 8.4656106318709e-05, + "loss": 0.6864, + "step": 2656 + }, + { + "epoch": 0.5462020762668311, + "grad_norm": 0.2556054890155792, + "learning_rate": 8.465138439919563e-05, + "loss": 0.6858, + "step": 2657 + }, + { + "epoch": 0.5464076472402097, + "grad_norm": 0.20988969504833221, + "learning_rate": 8.464666052624386e-05, + "loss": 0.6907, + "step": 2658 + }, + { + "epoch": 0.5466132182135882, + "grad_norm": 0.21028688549995422, + "learning_rate": 8.464193470008646e-05, + "loss": 0.7199, + "step": 2659 + }, + { + "epoch": 0.5468187891869668, + "grad_norm": 0.20908872783184052, + "learning_rate": 8.463720692095621e-05, + "loss": 0.6965, + "step": 2660 + }, + { + "epoch": 0.5470243601603454, + "grad_norm": 0.20974692702293396, + "learning_rate": 8.463247718908604e-05, + "loss": 0.6913, + "step": 2661 + }, + { + "epoch": 0.5472299311337239, + "grad_norm": 0.3178030550479889, + "learning_rate": 8.462774550470894e-05, + "loss": 0.5966, + "step": 2662 + }, + { + "epoch": 0.5474355021071025, + "grad_norm": 0.23371629416942596, + "learning_rate": 8.462301186805807e-05, + "loss": 0.6999, + "step": 2663 + }, + { + "epoch": 0.547641073080481, + "grad_norm": 0.2393561601638794, + "learning_rate": 8.461827627936658e-05, + "loss": 0.6981, + "step": 2664 + }, + { + "epoch": 0.5478466440538596, + "grad_norm": 0.21029163897037506, + "learning_rate": 8.46135387388678e-05, + "loss": 0.6925, + "step": 2665 + }, + { + "epoch": 0.5480522150272381, + "grad_norm": 0.20427922904491425, + "learning_rate": 8.460879924679513e-05, + "loss": 0.648, + "step": 2666 + }, + { + "epoch": 0.5482577860006167, + "grad_norm": 0.20650714635849, + "learning_rate": 8.460405780338205e-05, + "loss": 0.5918, + "step": 2667 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 0.24088306725025177, + "learning_rate": 8.459931440886214e-05, + "loss": 0.7039, + "step": 2668 + }, + { + "epoch": 0.5486689279473739, + "grad_norm": 0.22175416350364685, + "learning_rate": 8.45945690634691e-05, + "loss": 0.7038, + "step": 2669 + }, + { + "epoch": 0.5488744989207524, + "grad_norm": 0.21606440842151642, + "learning_rate": 8.45898217674367e-05, + "loss": 0.6745, + "step": 2670 + }, + { + "epoch": 0.5490800698941309, + "grad_norm": 0.22006148099899292, + "learning_rate": 8.458507252099884e-05, + "loss": 0.7169, + "step": 2671 + }, + { + "epoch": 0.5492856408675095, + "grad_norm": 0.2132798433303833, + "learning_rate": 8.458032132438947e-05, + "loss": 0.6769, + "step": 2672 + }, + { + "epoch": 0.5494912118408881, + "grad_norm": 0.2083420604467392, + "learning_rate": 8.457556817784266e-05, + "loss": 0.6845, + "step": 2673 + }, + { + "epoch": 0.5496967828142666, + "grad_norm": 0.16094450652599335, + "learning_rate": 8.457081308159259e-05, + "loss": 0.573, + "step": 2674 + }, + { + "epoch": 0.5499023537876452, + "grad_norm": 0.23418548703193665, + "learning_rate": 8.456605603587351e-05, + "loss": 0.6743, + "step": 2675 + }, + { + "epoch": 0.5501079247610238, + "grad_norm": 0.2129811942577362, + "learning_rate": 8.456129704091978e-05, + "loss": 0.6956, + "step": 2676 + }, + { + "epoch": 0.5503134957344022, + "grad_norm": 0.14898192882537842, + "learning_rate": 8.455653609696585e-05, + "loss": 0.5923, + "step": 2677 + }, + { + "epoch": 0.5505190667077808, + "grad_norm": 0.22483858466148376, + "learning_rate": 8.455177320424627e-05, + "loss": 0.6918, + "step": 2678 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 0.22401611506938934, + "learning_rate": 8.454700836299571e-05, + "loss": 0.6985, + "step": 2679 + }, + { + "epoch": 0.550930208654538, + "grad_norm": 0.19923460483551025, + "learning_rate": 8.454224157344887e-05, + "loss": 0.729, + "step": 2680 + }, + { + "epoch": 0.5511357796279165, + "grad_norm": 0.21183621883392334, + "learning_rate": 8.453747283584061e-05, + "loss": 0.677, + "step": 2681 + }, + { + "epoch": 0.5513413506012951, + "grad_norm": 0.16109618544578552, + "learning_rate": 8.453270215040588e-05, + "loss": 0.5949, + "step": 2682 + }, + { + "epoch": 0.5515469215746737, + "grad_norm": 0.21456550061702728, + "learning_rate": 8.452792951737966e-05, + "loss": 0.7069, + "step": 2683 + }, + { + "epoch": 0.5517524925480523, + "grad_norm": 0.19927652180194855, + "learning_rate": 8.452315493699713e-05, + "loss": 0.6762, + "step": 2684 + }, + { + "epoch": 0.5519580635214307, + "grad_norm": 0.19462721049785614, + "learning_rate": 8.451837840949347e-05, + "loss": 0.701, + "step": 2685 + }, + { + "epoch": 0.5521636344948093, + "grad_norm": 0.22193773090839386, + "learning_rate": 8.451359993510403e-05, + "loss": 0.6949, + "step": 2686 + }, + { + "epoch": 0.5523692054681879, + "grad_norm": 0.22146186232566833, + "learning_rate": 8.450881951406419e-05, + "loss": 0.7208, + "step": 2687 + }, + { + "epoch": 0.5525747764415665, + "grad_norm": 0.19484825432300568, + "learning_rate": 8.45040371466095e-05, + "loss": 0.6823, + "step": 2688 + }, + { + "epoch": 0.552780347414945, + "grad_norm": 0.20109498500823975, + "learning_rate": 8.449925283297551e-05, + "loss": 0.7008, + "step": 2689 + }, + { + "epoch": 0.5529859183883236, + "grad_norm": 0.1965745985507965, + "learning_rate": 8.449446657339798e-05, + "loss": 0.7047, + "step": 2690 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 0.19609376788139343, + "learning_rate": 8.448967836811266e-05, + "loss": 0.6856, + "step": 2691 + }, + { + "epoch": 0.5533970603350807, + "grad_norm": 0.19566380977630615, + "learning_rate": 8.448488821735546e-05, + "loss": 0.6883, + "step": 2692 + }, + { + "epoch": 0.5536026313084592, + "grad_norm": 0.18993543088436127, + "learning_rate": 8.448009612136238e-05, + "loss": 0.5882, + "step": 2693 + }, + { + "epoch": 0.5538082022818378, + "grad_norm": 0.22677689790725708, + "learning_rate": 8.44753020803695e-05, + "loss": 0.695, + "step": 2694 + }, + { + "epoch": 0.5540137732552164, + "grad_norm": 0.21654780209064484, + "learning_rate": 8.447050609461299e-05, + "loss": 0.7006, + "step": 2695 + }, + { + "epoch": 0.5542193442285949, + "grad_norm": 0.1987585723400116, + "learning_rate": 8.446570816432911e-05, + "loss": 0.6786, + "step": 2696 + }, + { + "epoch": 0.5544249152019735, + "grad_norm": 0.21320489048957825, + "learning_rate": 8.446090828975427e-05, + "loss": 0.7029, + "step": 2697 + }, + { + "epoch": 0.554630486175352, + "grad_norm": 0.16352033615112305, + "learning_rate": 8.445610647112492e-05, + "loss": 0.5938, + "step": 2698 + }, + { + "epoch": 0.5548360571487306, + "grad_norm": 0.21454685926437378, + "learning_rate": 8.44513027086776e-05, + "loss": 0.6759, + "step": 2699 + }, + { + "epoch": 0.5550416281221091, + "grad_norm": 0.20842206478118896, + "learning_rate": 8.444649700264902e-05, + "loss": 0.6922, + "step": 2700 + }, + { + "epoch": 0.5552471990954877, + "grad_norm": 0.1389513611793518, + "learning_rate": 8.444168935327589e-05, + "loss": 0.5826, + "step": 2701 + }, + { + "epoch": 0.5554527700688663, + "grad_norm": 0.20907482504844666, + "learning_rate": 8.443687976079507e-05, + "loss": 0.6838, + "step": 2702 + }, + { + "epoch": 0.5556583410422449, + "grad_norm": 0.21713374555110931, + "learning_rate": 8.443206822544352e-05, + "loss": 0.7058, + "step": 2703 + }, + { + "epoch": 0.5558639120156234, + "grad_norm": 0.1558568924665451, + "learning_rate": 8.442725474745827e-05, + "loss": 0.5847, + "step": 2704 + }, + { + "epoch": 0.5560694829890019, + "grad_norm": 0.20640867948532104, + "learning_rate": 8.442243932707647e-05, + "loss": 0.7049, + "step": 2705 + }, + { + "epoch": 0.5562750539623805, + "grad_norm": 0.12573988735675812, + "learning_rate": 8.441762196453534e-05, + "loss": 0.5863, + "step": 2706 + }, + { + "epoch": 0.5564806249357591, + "grad_norm": 0.21294710040092468, + "learning_rate": 8.441280266007221e-05, + "loss": 0.6913, + "step": 2707 + }, + { + "epoch": 0.5566861959091376, + "grad_norm": 0.2014019787311554, + "learning_rate": 8.44079814139245e-05, + "loss": 0.6954, + "step": 2708 + }, + { + "epoch": 0.5568917668825162, + "grad_norm": 0.2047373652458191, + "learning_rate": 8.440315822632974e-05, + "loss": 0.6976, + "step": 2709 + }, + { + "epoch": 0.5570973378558948, + "grad_norm": 0.21064162254333496, + "learning_rate": 8.439833309752556e-05, + "loss": 0.6994, + "step": 2710 + }, + { + "epoch": 0.5573029088292734, + "grad_norm": 0.21300119161605835, + "learning_rate": 8.439350602774964e-05, + "loss": 0.6748, + "step": 2711 + }, + { + "epoch": 0.5575084798026518, + "grad_norm": 0.17572659254074097, + "learning_rate": 8.438867701723982e-05, + "loss": 0.5906, + "step": 2712 + }, + { + "epoch": 0.5577140507760304, + "grad_norm": 0.13898785412311554, + "learning_rate": 8.438384606623397e-05, + "loss": 0.5679, + "step": 2713 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 0.24983015656471252, + "learning_rate": 8.437901317497011e-05, + "loss": 0.6696, + "step": 2714 + }, + { + "epoch": 0.5581251927227875, + "grad_norm": 0.21426579356193542, + "learning_rate": 8.437417834368632e-05, + "loss": 0.6824, + "step": 2715 + }, + { + "epoch": 0.5583307636961661, + "grad_norm": 0.20514623820781708, + "learning_rate": 8.436934157262082e-05, + "loss": 0.708, + "step": 2716 + }, + { + "epoch": 0.5585363346695447, + "grad_norm": 0.21398288011550903, + "learning_rate": 8.436450286201184e-05, + "loss": 0.7051, + "step": 2717 + }, + { + "epoch": 0.5587419056429233, + "grad_norm": 0.2091488540172577, + "learning_rate": 8.435966221209782e-05, + "loss": 0.6671, + "step": 2718 + }, + { + "epoch": 0.5589474766163017, + "grad_norm": 0.21767988801002502, + "learning_rate": 8.43548196231172e-05, + "loss": 0.724, + "step": 2719 + }, + { + "epoch": 0.5591530475896803, + "grad_norm": 0.2218277007341385, + "learning_rate": 8.434997509530855e-05, + "loss": 0.6924, + "step": 2720 + }, + { + "epoch": 0.5593586185630589, + "grad_norm": 0.2099279761314392, + "learning_rate": 8.434512862891058e-05, + "loss": 0.6847, + "step": 2721 + }, + { + "epoch": 0.5595641895364375, + "grad_norm": 0.2063916176557541, + "learning_rate": 8.434028022416199e-05, + "loss": 0.669, + "step": 2722 + }, + { + "epoch": 0.559769760509816, + "grad_norm": 0.2331087738275528, + "learning_rate": 8.433542988130168e-05, + "loss": 0.6039, + "step": 2723 + }, + { + "epoch": 0.5599753314831946, + "grad_norm": 0.22927048802375793, + "learning_rate": 8.433057760056858e-05, + "loss": 0.6982, + "step": 2724 + }, + { + "epoch": 0.5601809024565731, + "grad_norm": 0.22356915473937988, + "learning_rate": 8.432572338220177e-05, + "loss": 0.6676, + "step": 2725 + }, + { + "epoch": 0.5603864734299517, + "grad_norm": 0.21038733422756195, + "learning_rate": 8.432086722644038e-05, + "loss": 0.6922, + "step": 2726 + }, + { + "epoch": 0.5605920444033302, + "grad_norm": 0.21845050156116486, + "learning_rate": 8.431600913352363e-05, + "loss": 0.6809, + "step": 2727 + }, + { + "epoch": 0.5607976153767088, + "grad_norm": 0.20335665345191956, + "learning_rate": 8.431114910369087e-05, + "loss": 0.6561, + "step": 2728 + }, + { + "epoch": 0.5610031863500874, + "grad_norm": 0.20789889991283417, + "learning_rate": 8.430628713718156e-05, + "loss": 0.7282, + "step": 2729 + }, + { + "epoch": 0.561208757323466, + "grad_norm": 0.21542754769325256, + "learning_rate": 8.430142323423518e-05, + "loss": 0.6794, + "step": 2730 + }, + { + "epoch": 0.5614143282968445, + "grad_norm": 0.19883479177951813, + "learning_rate": 8.429655739509137e-05, + "loss": 0.7022, + "step": 2731 + }, + { + "epoch": 0.561619899270223, + "grad_norm": 0.2027217149734497, + "learning_rate": 8.429168961998987e-05, + "loss": 0.7122, + "step": 2732 + }, + { + "epoch": 0.5618254702436016, + "grad_norm": 0.20962925255298615, + "learning_rate": 8.428681990917045e-05, + "loss": 0.702, + "step": 2733 + }, + { + "epoch": 0.5620310412169801, + "grad_norm": 0.2032438963651657, + "learning_rate": 8.428194826287304e-05, + "loss": 0.6828, + "step": 2734 + }, + { + "epoch": 0.5622366121903587, + "grad_norm": 0.19384074211120605, + "learning_rate": 8.427707468133766e-05, + "loss": 0.6693, + "step": 2735 + }, + { + "epoch": 0.5624421831637373, + "grad_norm": 0.20118926465511322, + "learning_rate": 8.427219916480437e-05, + "loss": 0.7003, + "step": 2736 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 0.21019205451011658, + "learning_rate": 8.426732171351338e-05, + "loss": 0.7088, + "step": 2737 + }, + { + "epoch": 0.5628533251104944, + "grad_norm": 0.19624383747577667, + "learning_rate": 8.426244232770501e-05, + "loss": 0.6929, + "step": 2738 + }, + { + "epoch": 0.5630588960838729, + "grad_norm": 0.20001311600208282, + "learning_rate": 8.425756100761961e-05, + "loss": 0.6641, + "step": 2739 + }, + { + "epoch": 0.5632644670572515, + "grad_norm": 0.20031724870204926, + "learning_rate": 8.425267775349766e-05, + "loss": 0.7202, + "step": 2740 + }, + { + "epoch": 0.5634700380306301, + "grad_norm": 0.20123572647571564, + "learning_rate": 8.424779256557976e-05, + "loss": 0.6924, + "step": 2741 + }, + { + "epoch": 0.5636756090040086, + "grad_norm": 0.20444491505622864, + "learning_rate": 8.424290544410654e-05, + "loss": 0.6893, + "step": 2742 + }, + { + "epoch": 0.5638811799773872, + "grad_norm": 0.1976771205663681, + "learning_rate": 8.42380163893188e-05, + "loss": 0.6709, + "step": 2743 + }, + { + "epoch": 0.5640867509507658, + "grad_norm": 0.222488135099411, + "learning_rate": 8.42331254014574e-05, + "loss": 0.5918, + "step": 2744 + }, + { + "epoch": 0.5642923219241444, + "grad_norm": 0.21417805552482605, + "learning_rate": 8.422823248076329e-05, + "loss": 0.6833, + "step": 2745 + }, + { + "epoch": 0.5644978928975228, + "grad_norm": 0.21681103110313416, + "learning_rate": 8.42233376274775e-05, + "loss": 0.7288, + "step": 2746 + }, + { + "epoch": 0.5647034638709014, + "grad_norm": 0.20778658986091614, + "learning_rate": 8.42184408418412e-05, + "loss": 0.6749, + "step": 2747 + }, + { + "epoch": 0.56490903484428, + "grad_norm": 0.20677468180656433, + "learning_rate": 8.421354212409563e-05, + "loss": 0.7008, + "step": 2748 + }, + { + "epoch": 0.5651146058176586, + "grad_norm": 0.15667958557605743, + "learning_rate": 8.420864147448213e-05, + "loss": 0.5793, + "step": 2749 + }, + { + "epoch": 0.5653201767910371, + "grad_norm": 0.22153092920780182, + "learning_rate": 8.42037388932421e-05, + "loss": 0.6865, + "step": 2750 + }, + { + "epoch": 0.5655257477644157, + "grad_norm": 0.22236353158950806, + "learning_rate": 8.419883438061711e-05, + "loss": 0.6672, + "step": 2751 + }, + { + "epoch": 0.5657313187377943, + "grad_norm": 0.2081800103187561, + "learning_rate": 8.419392793684878e-05, + "loss": 0.7169, + "step": 2752 + }, + { + "epoch": 0.5659368897111727, + "grad_norm": 0.16220282018184662, + "learning_rate": 8.418901956217878e-05, + "loss": 0.5878, + "step": 2753 + }, + { + "epoch": 0.5661424606845513, + "grad_norm": 0.21759817004203796, + "learning_rate": 8.418410925684898e-05, + "loss": 0.7273, + "step": 2754 + }, + { + "epoch": 0.5663480316579299, + "grad_norm": 0.22539561986923218, + "learning_rate": 8.417919702110125e-05, + "loss": 0.7, + "step": 2755 + }, + { + "epoch": 0.5665536026313085, + "grad_norm": 0.196711003780365, + "learning_rate": 8.41742828551776e-05, + "loss": 0.7179, + "step": 2756 + }, + { + "epoch": 0.566759173604687, + "grad_norm": 0.210893914103508, + "learning_rate": 8.416936675932015e-05, + "loss": 0.708, + "step": 2757 + }, + { + "epoch": 0.5669647445780656, + "grad_norm": 0.19233620166778564, + "learning_rate": 8.416444873377108e-05, + "loss": 0.5911, + "step": 2758 + }, + { + "epoch": 0.5671703155514441, + "grad_norm": 0.21840979158878326, + "learning_rate": 8.415952877877266e-05, + "loss": 0.6871, + "step": 2759 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 0.216123566031456, + "learning_rate": 8.41546068945673e-05, + "loss": 0.7381, + "step": 2760 + }, + { + "epoch": 0.5675814574982012, + "grad_norm": 0.14728981256484985, + "learning_rate": 8.414968308139747e-05, + "loss": 0.5818, + "step": 2761 + }, + { + "epoch": 0.5677870284715798, + "grad_norm": 0.16224178671836853, + "learning_rate": 8.414475733950572e-05, + "loss": 0.5819, + "step": 2762 + }, + { + "epoch": 0.5679925994449584, + "grad_norm": 0.23816072940826416, + "learning_rate": 8.413982966913475e-05, + "loss": 0.7021, + "step": 2763 + }, + { + "epoch": 0.568198170418337, + "grad_norm": 0.2145988643169403, + "learning_rate": 8.413490007052731e-05, + "loss": 0.712, + "step": 2764 + }, + { + "epoch": 0.5684037413917155, + "grad_norm": 0.1928829550743103, + "learning_rate": 8.412996854392625e-05, + "loss": 0.6792, + "step": 2765 + }, + { + "epoch": 0.568609312365094, + "grad_norm": 0.22511503100395203, + "learning_rate": 8.412503508957455e-05, + "loss": 0.6914, + "step": 2766 + }, + { + "epoch": 0.5688148833384726, + "grad_norm": 0.23448607325553894, + "learning_rate": 8.412009970771524e-05, + "loss": 0.7113, + "step": 2767 + }, + { + "epoch": 0.5690204543118512, + "grad_norm": 0.21442458033561707, + "learning_rate": 8.411516239859146e-05, + "loss": 0.7, + "step": 2768 + }, + { + "epoch": 0.5692260252852297, + "grad_norm": 0.18232490122318268, + "learning_rate": 8.411022316244645e-05, + "loss": 0.5882, + "step": 2769 + }, + { + "epoch": 0.5694315962586083, + "grad_norm": 0.1396799087524414, + "learning_rate": 8.410528199952354e-05, + "loss": 0.5754, + "step": 2770 + }, + { + "epoch": 0.5696371672319869, + "grad_norm": 0.2816780209541321, + "learning_rate": 8.410033891006617e-05, + "loss": 0.6885, + "step": 2771 + }, + { + "epoch": 0.5698427382053654, + "grad_norm": 0.26476380228996277, + "learning_rate": 8.409539389431785e-05, + "loss": 0.6791, + "step": 2772 + }, + { + "epoch": 0.5700483091787439, + "grad_norm": 0.2113625705242157, + "learning_rate": 8.409044695252221e-05, + "loss": 0.7115, + "step": 2773 + }, + { + "epoch": 0.5702538801521225, + "grad_norm": 0.21605044603347778, + "learning_rate": 8.408549808492296e-05, + "loss": 0.7098, + "step": 2774 + }, + { + "epoch": 0.5704594511255011, + "grad_norm": 0.23488545417785645, + "learning_rate": 8.40805472917639e-05, + "loss": 0.6791, + "step": 2775 + }, + { + "epoch": 0.5706650220988796, + "grad_norm": 0.23377586901187897, + "learning_rate": 8.407559457328894e-05, + "loss": 0.7159, + "step": 2776 + }, + { + "epoch": 0.5708705930722582, + "grad_norm": 0.2001940906047821, + "learning_rate": 8.407063992974208e-05, + "loss": 0.6831, + "step": 2777 + }, + { + "epoch": 0.5710761640456368, + "grad_norm": 0.20575560629367828, + "learning_rate": 8.40656833613674e-05, + "loss": 0.6893, + "step": 2778 + }, + { + "epoch": 0.5712817350190154, + "grad_norm": 0.21755361557006836, + "learning_rate": 8.406072486840909e-05, + "loss": 0.6912, + "step": 2779 + }, + { + "epoch": 0.5714873059923938, + "grad_norm": 0.21302054822444916, + "learning_rate": 8.405576445111144e-05, + "loss": 0.5823, + "step": 2780 + }, + { + "epoch": 0.5716928769657724, + "grad_norm": 0.2074202299118042, + "learning_rate": 8.405080210971882e-05, + "loss": 0.6948, + "step": 2781 + }, + { + "epoch": 0.571898447939151, + "grad_norm": 0.2045622020959854, + "learning_rate": 8.40458378444757e-05, + "loss": 0.6982, + "step": 2782 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 0.20877763628959656, + "learning_rate": 8.404087165562664e-05, + "loss": 0.696, + "step": 2783 + }, + { + "epoch": 0.5723095898859081, + "grad_norm": 0.21138116717338562, + "learning_rate": 8.403590354341632e-05, + "loss": 0.6767, + "step": 2784 + }, + { + "epoch": 0.5725151608592867, + "grad_norm": 0.20857292413711548, + "learning_rate": 8.40309335080895e-05, + "loss": 0.6847, + "step": 2785 + }, + { + "epoch": 0.5727207318326653, + "grad_norm": 0.20251955091953278, + "learning_rate": 8.4025961549891e-05, + "loss": 0.7044, + "step": 2786 + }, + { + "epoch": 0.5729263028060438, + "grad_norm": 0.23925918340682983, + "learning_rate": 8.40209876690658e-05, + "loss": 0.6971, + "step": 2787 + }, + { + "epoch": 0.5731318737794223, + "grad_norm": 0.19959931075572968, + "learning_rate": 8.401601186585888e-05, + "loss": 0.5827, + "step": 2788 + }, + { + "epoch": 0.5733374447528009, + "grad_norm": 0.22731555998325348, + "learning_rate": 8.401103414051545e-05, + "loss": 0.6834, + "step": 2789 + }, + { + "epoch": 0.5735430157261795, + "grad_norm": 0.13042806088924408, + "learning_rate": 8.400605449328069e-05, + "loss": 0.584, + "step": 2790 + }, + { + "epoch": 0.573748586699558, + "grad_norm": 0.22589558362960815, + "learning_rate": 8.400107292439996e-05, + "loss": 0.6953, + "step": 2791 + }, + { + "epoch": 0.5739541576729366, + "grad_norm": 0.2052125185728073, + "learning_rate": 8.399608943411864e-05, + "loss": 0.6918, + "step": 2792 + }, + { + "epoch": 0.5741597286463151, + "grad_norm": 0.2042934000492096, + "learning_rate": 8.399110402268226e-05, + "loss": 0.7068, + "step": 2793 + }, + { + "epoch": 0.5743652996196937, + "grad_norm": 0.20587709546089172, + "learning_rate": 8.398611669033642e-05, + "loss": 0.6933, + "step": 2794 + }, + { + "epoch": 0.5745708705930722, + "grad_norm": 0.1982177048921585, + "learning_rate": 8.398112743732685e-05, + "loss": 0.6884, + "step": 2795 + }, + { + "epoch": 0.5747764415664508, + "grad_norm": 0.19220708310604095, + "learning_rate": 8.397613626389933e-05, + "loss": 0.5803, + "step": 2796 + }, + { + "epoch": 0.5749820125398294, + "grad_norm": 0.20522017776966095, + "learning_rate": 8.397114317029975e-05, + "loss": 0.6739, + "step": 2797 + }, + { + "epoch": 0.575187583513208, + "grad_norm": 0.20296591520309448, + "learning_rate": 8.396614815677408e-05, + "loss": 0.6968, + "step": 2798 + }, + { + "epoch": 0.5753931544865865, + "grad_norm": 0.21436072885990143, + "learning_rate": 8.396115122356844e-05, + "loss": 0.7124, + "step": 2799 + }, + { + "epoch": 0.575598725459965, + "grad_norm": 0.1649683117866516, + "learning_rate": 8.395615237092896e-05, + "loss": 0.5981, + "step": 2800 + }, + { + "epoch": 0.5758042964333436, + "grad_norm": 0.20267538726329803, + "learning_rate": 8.395115159910193e-05, + "loss": 0.6791, + "step": 2801 + }, + { + "epoch": 0.5760098674067222, + "grad_norm": 0.2140885293483734, + "learning_rate": 8.394614890833374e-05, + "loss": 0.7054, + "step": 2802 + }, + { + "epoch": 0.5762154383801007, + "grad_norm": 0.20777259767055511, + "learning_rate": 8.394114429887083e-05, + "loss": 0.68, + "step": 2803 + }, + { + "epoch": 0.5764210093534793, + "grad_norm": 0.2137485295534134, + "learning_rate": 8.393613777095974e-05, + "loss": 0.7086, + "step": 2804 + }, + { + "epoch": 0.5766265803268579, + "grad_norm": 0.20304176211357117, + "learning_rate": 8.393112932484713e-05, + "loss": 0.6617, + "step": 2805 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 0.21544432640075684, + "learning_rate": 8.392611896077973e-05, + "loss": 0.7053, + "step": 2806 + }, + { + "epoch": 0.5770377222736149, + "grad_norm": 0.21482408046722412, + "learning_rate": 8.39211066790044e-05, + "loss": 0.6994, + "step": 2807 + }, + { + "epoch": 0.5772432932469935, + "grad_norm": 0.15521647036075592, + "learning_rate": 8.391609247976805e-05, + "loss": 0.5946, + "step": 2808 + }, + { + "epoch": 0.5774488642203721, + "grad_norm": 0.19584627449512482, + "learning_rate": 8.391107636331775e-05, + "loss": 0.6638, + "step": 2809 + }, + { + "epoch": 0.5776544351937506, + "grad_norm": 0.2126510590314865, + "learning_rate": 8.390605832990055e-05, + "loss": 0.7362, + "step": 2810 + }, + { + "epoch": 0.5778600061671292, + "grad_norm": 0.1384701430797577, + "learning_rate": 8.390103837976373e-05, + "loss": 0.5919, + "step": 2811 + }, + { + "epoch": 0.5780655771405078, + "grad_norm": 0.20149080455303192, + "learning_rate": 8.389601651315454e-05, + "loss": 0.6609, + "step": 2812 + }, + { + "epoch": 0.5782711481138864, + "grad_norm": 0.13342009484767914, + "learning_rate": 8.389099273032045e-05, + "loss": 0.5691, + "step": 2813 + }, + { + "epoch": 0.5784767190872648, + "grad_norm": 0.20240166783332825, + "learning_rate": 8.38859670315089e-05, + "loss": 0.6667, + "step": 2814 + }, + { + "epoch": 0.5786822900606434, + "grad_norm": 0.14066733419895172, + "learning_rate": 8.388093941696752e-05, + "loss": 0.5841, + "step": 2815 + }, + { + "epoch": 0.578887861034022, + "grad_norm": 0.20561981201171875, + "learning_rate": 8.387590988694398e-05, + "loss": 0.6808, + "step": 2816 + }, + { + "epoch": 0.5790934320074006, + "grad_norm": 0.19909094274044037, + "learning_rate": 8.387087844168607e-05, + "loss": 0.6827, + "step": 2817 + }, + { + "epoch": 0.5792990029807791, + "grad_norm": 0.19748428463935852, + "learning_rate": 8.386584508144166e-05, + "loss": 0.6952, + "step": 2818 + }, + { + "epoch": 0.5795045739541577, + "grad_norm": 0.203225240111351, + "learning_rate": 8.386080980645872e-05, + "loss": 0.711, + "step": 2819 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.20350880920886993, + "learning_rate": 8.385577261698531e-05, + "loss": 0.6672, + "step": 2820 + }, + { + "epoch": 0.5799157159009148, + "grad_norm": 0.19929729402065277, + "learning_rate": 8.385073351326959e-05, + "loss": 0.6749, + "step": 2821 + }, + { + "epoch": 0.5801212868742933, + "grad_norm": 0.20175184309482574, + "learning_rate": 8.384569249555983e-05, + "loss": 0.6931, + "step": 2822 + }, + { + "epoch": 0.5803268578476719, + "grad_norm": 0.18173432350158691, + "learning_rate": 8.384064956410437e-05, + "loss": 0.5901, + "step": 2823 + }, + { + "epoch": 0.5805324288210505, + "grad_norm": 0.21010646224021912, + "learning_rate": 8.383560471915162e-05, + "loss": 0.6967, + "step": 2824 + }, + { + "epoch": 0.580737999794429, + "grad_norm": 0.2225627601146698, + "learning_rate": 8.383055796095018e-05, + "loss": 0.7137, + "step": 2825 + }, + { + "epoch": 0.5809435707678076, + "grad_norm": 0.19758129119873047, + "learning_rate": 8.382550928974862e-05, + "loss": 0.6991, + "step": 2826 + }, + { + "epoch": 0.5811491417411861, + "grad_norm": 0.19794224202632904, + "learning_rate": 8.382045870579569e-05, + "loss": 0.6759, + "step": 2827 + }, + { + "epoch": 0.5813547127145647, + "grad_norm": 0.20339448750019073, + "learning_rate": 8.38154062093402e-05, + "loss": 0.6621, + "step": 2828 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 0.19173693656921387, + "learning_rate": 8.381035180063107e-05, + "loss": 0.6821, + "step": 2829 + }, + { + "epoch": 0.5817658546613218, + "grad_norm": 0.1988253891468048, + "learning_rate": 8.380529547991732e-05, + "loss": 0.6803, + "step": 2830 + }, + { + "epoch": 0.5819714256347004, + "grad_norm": 0.2126402109861374, + "learning_rate": 8.380023724744802e-05, + "loss": 0.6765, + "step": 2831 + }, + { + "epoch": 0.582176996608079, + "grad_norm": 0.20873717963695526, + "learning_rate": 8.379517710347238e-05, + "loss": 0.6801, + "step": 2832 + }, + { + "epoch": 0.5823825675814575, + "grad_norm": 0.1995771825313568, + "learning_rate": 8.379011504823973e-05, + "loss": 0.6837, + "step": 2833 + }, + { + "epoch": 0.582588138554836, + "grad_norm": 0.9173756241798401, + "learning_rate": 8.378505108199937e-05, + "loss": 0.7294, + "step": 2834 + }, + { + "epoch": 0.5827937095282146, + "grad_norm": 0.20103541016578674, + "learning_rate": 8.377998520500086e-05, + "loss": 0.6703, + "step": 2835 + }, + { + "epoch": 0.5829992805015932, + "grad_norm": 0.20115043222904205, + "learning_rate": 8.377491741749371e-05, + "loss": 0.6794, + "step": 2836 + }, + { + "epoch": 0.5832048514749717, + "grad_norm": 0.2085791677236557, + "learning_rate": 8.376984771972763e-05, + "loss": 0.6799, + "step": 2837 + }, + { + "epoch": 0.5834104224483503, + "grad_norm": 0.2213800698518753, + "learning_rate": 8.376477611195234e-05, + "loss": 0.7313, + "step": 2838 + }, + { + "epoch": 0.5836159934217289, + "grad_norm": 0.2140512466430664, + "learning_rate": 8.375970259441773e-05, + "loss": 0.693, + "step": 2839 + }, + { + "epoch": 0.5838215643951075, + "grad_norm": 0.20790469646453857, + "learning_rate": 8.375462716737375e-05, + "loss": 0.6993, + "step": 2840 + }, + { + "epoch": 0.5840271353684859, + "grad_norm": 0.2115468680858612, + "learning_rate": 8.374954983107042e-05, + "loss": 0.687, + "step": 2841 + }, + { + "epoch": 0.5842327063418645, + "grad_norm": 0.21003267168998718, + "learning_rate": 8.374447058575786e-05, + "loss": 0.7148, + "step": 2842 + }, + { + "epoch": 0.5844382773152431, + "grad_norm": 0.21963387727737427, + "learning_rate": 8.373938943168635e-05, + "loss": 0.6821, + "step": 2843 + }, + { + "epoch": 0.5846438482886216, + "grad_norm": 0.20493534207344055, + "learning_rate": 8.373430636910619e-05, + "loss": 0.6842, + "step": 2844 + }, + { + "epoch": 0.5848494192620002, + "grad_norm": 0.20353847742080688, + "learning_rate": 8.37292213982678e-05, + "loss": 0.6853, + "step": 2845 + }, + { + "epoch": 0.5850549902353788, + "grad_norm": 0.17759917676448822, + "learning_rate": 8.372413451942168e-05, + "loss": 0.581, + "step": 2846 + }, + { + "epoch": 0.5852605612087574, + "grad_norm": 0.14481404423713684, + "learning_rate": 8.371904573281845e-05, + "loss": 0.5929, + "step": 2847 + }, + { + "epoch": 0.5854661321821358, + "grad_norm": 0.1454802304506302, + "learning_rate": 8.371395503870882e-05, + "loss": 0.5616, + "step": 2848 + }, + { + "epoch": 0.5856717031555144, + "grad_norm": 0.24941618740558624, + "learning_rate": 8.370886243734358e-05, + "loss": 0.6982, + "step": 2849 + }, + { + "epoch": 0.585877274128893, + "grad_norm": 0.21928314864635468, + "learning_rate": 8.370376792897359e-05, + "loss": 0.6931, + "step": 2850 + }, + { + "epoch": 0.5860828451022716, + "grad_norm": 0.20207005739212036, + "learning_rate": 8.369867151384987e-05, + "loss": 0.6671, + "step": 2851 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 0.22684946656227112, + "learning_rate": 8.369357319222348e-05, + "loss": 0.6684, + "step": 2852 + }, + { + "epoch": 0.5864939870490287, + "grad_norm": 0.21584348380565643, + "learning_rate": 8.368847296434557e-05, + "loss": 0.7032, + "step": 2853 + }, + { + "epoch": 0.5866995580224073, + "grad_norm": 0.209476038813591, + "learning_rate": 8.368337083046747e-05, + "loss": 0.6804, + "step": 2854 + }, + { + "epoch": 0.5869051289957858, + "grad_norm": 0.22032958269119263, + "learning_rate": 8.367826679084046e-05, + "loss": 0.6868, + "step": 2855 + }, + { + "epoch": 0.5871106999691643, + "grad_norm": 0.21995702385902405, + "learning_rate": 8.367316084571603e-05, + "loss": 0.6975, + "step": 2856 + }, + { + "epoch": 0.5873162709425429, + "grad_norm": 0.20626819133758545, + "learning_rate": 8.366805299534574e-05, + "loss": 0.7272, + "step": 2857 + }, + { + "epoch": 0.5875218419159215, + "grad_norm": 0.2072131335735321, + "learning_rate": 8.36629432399812e-05, + "loss": 0.6957, + "step": 2858 + }, + { + "epoch": 0.5877274128893001, + "grad_norm": 0.21286934614181519, + "learning_rate": 8.365783157987416e-05, + "loss": 0.7193, + "step": 2859 + }, + { + "epoch": 0.5879329838626786, + "grad_norm": 0.20594240725040436, + "learning_rate": 8.365271801527644e-05, + "loss": 0.6996, + "step": 2860 + }, + { + "epoch": 0.5881385548360571, + "grad_norm": 0.20829501748085022, + "learning_rate": 8.364760254643997e-05, + "loss": 0.6832, + "step": 2861 + }, + { + "epoch": 0.5883441258094357, + "grad_norm": 0.2092822641134262, + "learning_rate": 8.364248517361676e-05, + "loss": 0.7114, + "step": 2862 + }, + { + "epoch": 0.5885496967828142, + "grad_norm": 0.19926267862319946, + "learning_rate": 8.363736589705892e-05, + "loss": 0.6744, + "step": 2863 + }, + { + "epoch": 0.5887552677561928, + "grad_norm": 0.20233862102031708, + "learning_rate": 8.363224471701866e-05, + "loss": 0.69, + "step": 2864 + }, + { + "epoch": 0.5889608387295714, + "grad_norm": 0.2081189900636673, + "learning_rate": 8.362712163374826e-05, + "loss": 0.7025, + "step": 2865 + }, + { + "epoch": 0.58916640970295, + "grad_norm": 0.19669431447982788, + "learning_rate": 8.362199664750012e-05, + "loss": 0.6796, + "step": 2866 + }, + { + "epoch": 0.5893719806763285, + "grad_norm": 0.20693160593509674, + "learning_rate": 8.361686975852672e-05, + "loss": 0.6996, + "step": 2867 + }, + { + "epoch": 0.589577551649707, + "grad_norm": 0.20690032839775085, + "learning_rate": 8.361174096708066e-05, + "loss": 0.6977, + "step": 2868 + }, + { + "epoch": 0.5897831226230856, + "grad_norm": 0.19090650975704193, + "learning_rate": 8.360661027341459e-05, + "loss": 0.6905, + "step": 2869 + }, + { + "epoch": 0.5899886935964642, + "grad_norm": 0.1915200799703598, + "learning_rate": 8.360147767778126e-05, + "loss": 0.6921, + "step": 2870 + }, + { + "epoch": 0.5901942645698427, + "grad_norm": 0.20431163907051086, + "learning_rate": 8.359634318043356e-05, + "loss": 0.6816, + "step": 2871 + }, + { + "epoch": 0.5903998355432213, + "grad_norm": 0.20922903716564178, + "learning_rate": 8.359120678162442e-05, + "loss": 0.7141, + "step": 2872 + }, + { + "epoch": 0.5906054065165999, + "grad_norm": 0.20200544595718384, + "learning_rate": 8.358606848160692e-05, + "loss": 0.6883, + "step": 2873 + }, + { + "epoch": 0.5908109774899785, + "grad_norm": 0.22084182500839233, + "learning_rate": 8.358092828063416e-05, + "loss": 0.5962, + "step": 2874 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 0.19920572638511658, + "learning_rate": 8.357578617895939e-05, + "loss": 0.6921, + "step": 2875 + }, + { + "epoch": 0.5912221194367355, + "grad_norm": 0.21406704187393188, + "learning_rate": 8.357064217683593e-05, + "loss": 0.6809, + "step": 2876 + }, + { + "epoch": 0.5914276904101141, + "grad_norm": 0.20186960697174072, + "learning_rate": 8.356549627451723e-05, + "loss": 0.7273, + "step": 2877 + }, + { + "epoch": 0.5916332613834927, + "grad_norm": 0.20613306760787964, + "learning_rate": 8.356034847225677e-05, + "loss": 0.6998, + "step": 2878 + }, + { + "epoch": 0.5918388323568712, + "grad_norm": 0.19980058073997498, + "learning_rate": 8.355519877030818e-05, + "loss": 0.6707, + "step": 2879 + }, + { + "epoch": 0.5920444033302498, + "grad_norm": 0.17572249472141266, + "learning_rate": 8.355004716892514e-05, + "loss": 0.5905, + "step": 2880 + }, + { + "epoch": 0.5922499743036284, + "grad_norm": 0.14615419507026672, + "learning_rate": 8.354489366836147e-05, + "loss": 0.5936, + "step": 2881 + }, + { + "epoch": 0.5924555452770068, + "grad_norm": 0.265011191368103, + "learning_rate": 8.353973826887105e-05, + "loss": 0.7195, + "step": 2882 + }, + { + "epoch": 0.5926611162503854, + "grad_norm": 0.22780616581439972, + "learning_rate": 8.353458097070784e-05, + "loss": 0.7003, + "step": 2883 + }, + { + "epoch": 0.592866687223764, + "grad_norm": 0.2108001857995987, + "learning_rate": 8.352942177412594e-05, + "loss": 0.6791, + "step": 2884 + }, + { + "epoch": 0.5930722581971426, + "grad_norm": 0.23062892258167267, + "learning_rate": 8.352426067937953e-05, + "loss": 0.7012, + "step": 2885 + }, + { + "epoch": 0.5932778291705211, + "grad_norm": 0.22096315026283264, + "learning_rate": 8.351909768672286e-05, + "loss": 0.6848, + "step": 2886 + }, + { + "epoch": 0.5934834001438997, + "grad_norm": 0.19417156279087067, + "learning_rate": 8.351393279641026e-05, + "loss": 0.6041, + "step": 2887 + }, + { + "epoch": 0.5936889711172783, + "grad_norm": 0.21793076395988464, + "learning_rate": 8.350876600869624e-05, + "loss": 0.6832, + "step": 2888 + }, + { + "epoch": 0.5938945420906568, + "grad_norm": 0.21608784794807434, + "learning_rate": 8.350359732383528e-05, + "loss": 0.693, + "step": 2889 + }, + { + "epoch": 0.5941001130640353, + "grad_norm": 0.1427665799856186, + "learning_rate": 8.349842674208205e-05, + "loss": 0.6014, + "step": 2890 + }, + { + "epoch": 0.5943056840374139, + "grad_norm": 0.21171724796295166, + "learning_rate": 8.349325426369129e-05, + "loss": 0.7155, + "step": 2891 + }, + { + "epoch": 0.5945112550107925, + "grad_norm": 0.20547601580619812, + "learning_rate": 8.348807988891778e-05, + "loss": 0.6879, + "step": 2892 + }, + { + "epoch": 0.5947168259841711, + "grad_norm": 0.20329566299915314, + "learning_rate": 8.34829036180165e-05, + "loss": 0.6956, + "step": 2893 + }, + { + "epoch": 0.5949223969575496, + "grad_norm": 0.19427530467510223, + "learning_rate": 8.347772545124241e-05, + "loss": 0.6853, + "step": 2894 + }, + { + "epoch": 0.5951279679309281, + "grad_norm": 0.19844532012939453, + "learning_rate": 8.347254538885063e-05, + "loss": 0.6805, + "step": 2895 + }, + { + "epoch": 0.5953335389043067, + "grad_norm": 0.20042115449905396, + "learning_rate": 8.346736343109637e-05, + "loss": 0.6648, + "step": 2896 + }, + { + "epoch": 0.5955391098776853, + "grad_norm": 0.1955205500125885, + "learning_rate": 8.34621795782349e-05, + "loss": 0.6676, + "step": 2897 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.19705745577812195, + "learning_rate": 8.345699383052162e-05, + "loss": 0.6857, + "step": 2898 + }, + { + "epoch": 0.5959502518244424, + "grad_norm": 0.19771529734134674, + "learning_rate": 8.3451806188212e-05, + "loss": 0.6992, + "step": 2899 + }, + { + "epoch": 0.596155822797821, + "grad_norm": 0.1999768763780594, + "learning_rate": 8.344661665156161e-05, + "loss": 0.7006, + "step": 2900 + }, + { + "epoch": 0.5963613937711995, + "grad_norm": 0.2035917341709137, + "learning_rate": 8.344142522082612e-05, + "loss": 0.7032, + "step": 2901 + }, + { + "epoch": 0.596566964744578, + "grad_norm": 0.20297078788280487, + "learning_rate": 8.343623189626129e-05, + "loss": 0.681, + "step": 2902 + }, + { + "epoch": 0.5967725357179566, + "grad_norm": 0.17843900620937347, + "learning_rate": 8.343103667812295e-05, + "loss": 0.5906, + "step": 2903 + }, + { + "epoch": 0.5969781066913352, + "grad_norm": 0.2069201022386551, + "learning_rate": 8.342583956666706e-05, + "loss": 0.7137, + "step": 2904 + }, + { + "epoch": 0.5971836776647137, + "grad_norm": 0.20919117331504822, + "learning_rate": 8.342064056214967e-05, + "loss": 0.6923, + "step": 2905 + }, + { + "epoch": 0.5973892486380923, + "grad_norm": 0.1899642050266266, + "learning_rate": 8.34154396648269e-05, + "loss": 0.668, + "step": 2906 + }, + { + "epoch": 0.5975948196114709, + "grad_norm": 0.1988193541765213, + "learning_rate": 8.341023687495494e-05, + "loss": 0.676, + "step": 2907 + }, + { + "epoch": 0.5978003905848495, + "grad_norm": 0.21733912825584412, + "learning_rate": 8.340503219279017e-05, + "loss": 0.6999, + "step": 2908 + }, + { + "epoch": 0.5980059615582279, + "grad_norm": 0.20647762715816498, + "learning_rate": 8.339982561858896e-05, + "loss": 0.694, + "step": 2909 + }, + { + "epoch": 0.5982115325316065, + "grad_norm": 0.19566026329994202, + "learning_rate": 8.339461715260781e-05, + "loss": 0.6716, + "step": 2910 + }, + { + "epoch": 0.5984171035049851, + "grad_norm": 0.2015964686870575, + "learning_rate": 8.338940679510334e-05, + "loss": 0.6869, + "step": 2911 + }, + { + "epoch": 0.5986226744783637, + "grad_norm": 0.1712951958179474, + "learning_rate": 8.338419454633224e-05, + "loss": 0.5902, + "step": 2912 + }, + { + "epoch": 0.5988282454517422, + "grad_norm": 0.13849389553070068, + "learning_rate": 8.337898040655126e-05, + "loss": 0.5992, + "step": 2913 + }, + { + "epoch": 0.5990338164251208, + "grad_norm": 0.2373506873846054, + "learning_rate": 8.33737643760173e-05, + "loss": 0.6881, + "step": 2914 + }, + { + "epoch": 0.5992393873984994, + "grad_norm": 0.2165384441614151, + "learning_rate": 8.336854645498734e-05, + "loss": 0.6805, + "step": 2915 + }, + { + "epoch": 0.599444958371878, + "grad_norm": 0.21156401932239532, + "learning_rate": 8.336332664371843e-05, + "loss": 0.6781, + "step": 2916 + }, + { + "epoch": 0.5996505293452564, + "grad_norm": 0.22182904183864594, + "learning_rate": 8.335810494246772e-05, + "loss": 0.7046, + "step": 2917 + }, + { + "epoch": 0.599856100318635, + "grad_norm": 0.21610800921916962, + "learning_rate": 8.335288135149246e-05, + "loss": 0.7223, + "step": 2918 + }, + { + "epoch": 0.6000616712920136, + "grad_norm": 0.21809829771518707, + "learning_rate": 8.334765587105002e-05, + "loss": 0.6088, + "step": 2919 + }, + { + "epoch": 0.6002672422653921, + "grad_norm": 0.22887369990348816, + "learning_rate": 8.334242850139779e-05, + "loss": 0.6901, + "step": 2920 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 0.22057749330997467, + "learning_rate": 8.333719924279332e-05, + "loss": 0.5969, + "step": 2921 + }, + { + "epoch": 0.6006783842121493, + "grad_norm": 0.2292318344116211, + "learning_rate": 8.333196809549422e-05, + "loss": 0.6893, + "step": 2922 + }, + { + "epoch": 0.6008839551855278, + "grad_norm": 0.15525048971176147, + "learning_rate": 8.332673505975825e-05, + "loss": 0.5925, + "step": 2923 + }, + { + "epoch": 0.6010895261589063, + "grad_norm": 0.21504151821136475, + "learning_rate": 8.332150013584315e-05, + "loss": 0.678, + "step": 2924 + }, + { + "epoch": 0.6012950971322849, + "grad_norm": 0.21480882167816162, + "learning_rate": 8.331626332400689e-05, + "loss": 0.6897, + "step": 2925 + }, + { + "epoch": 0.6015006681056635, + "grad_norm": 0.14146551489830017, + "learning_rate": 8.331102462450738e-05, + "loss": 0.5684, + "step": 2926 + }, + { + "epoch": 0.6017062390790421, + "grad_norm": 0.23041875660419464, + "learning_rate": 8.330578403760277e-05, + "loss": 0.6994, + "step": 2927 + }, + { + "epoch": 0.6019118100524206, + "grad_norm": 0.20731528103351593, + "learning_rate": 8.330054156355124e-05, + "loss": 0.6792, + "step": 2928 + }, + { + "epoch": 0.6021173810257991, + "grad_norm": 0.19797998666763306, + "learning_rate": 8.329529720261103e-05, + "loss": 0.6951, + "step": 2929 + }, + { + "epoch": 0.6023229519991777, + "grad_norm": 0.2016698569059372, + "learning_rate": 8.32900509550405e-05, + "loss": 0.6833, + "step": 2930 + }, + { + "epoch": 0.6025285229725563, + "grad_norm": 0.20054802298545837, + "learning_rate": 8.328480282109816e-05, + "loss": 0.6842, + "step": 2931 + }, + { + "epoch": 0.6027340939459348, + "grad_norm": 0.19949203729629517, + "learning_rate": 8.32795528010425e-05, + "loss": 0.691, + "step": 2932 + }, + { + "epoch": 0.6029396649193134, + "grad_norm": 0.17907802760601044, + "learning_rate": 8.32743008951322e-05, + "loss": 0.5825, + "step": 2933 + }, + { + "epoch": 0.603145235892692, + "grad_norm": 0.2004586011171341, + "learning_rate": 8.326904710362599e-05, + "loss": 0.6639, + "step": 2934 + }, + { + "epoch": 0.6033508068660706, + "grad_norm": 0.21539311110973358, + "learning_rate": 8.32637914267827e-05, + "loss": 0.6948, + "step": 2935 + }, + { + "epoch": 0.603556377839449, + "grad_norm": 0.20301540195941925, + "learning_rate": 8.325853386486126e-05, + "loss": 0.7028, + "step": 2936 + }, + { + "epoch": 0.6037619488128276, + "grad_norm": 0.19219626486301422, + "learning_rate": 8.325327441812067e-05, + "loss": 0.6727, + "step": 2937 + }, + { + "epoch": 0.6039675197862062, + "grad_norm": 0.20149052143096924, + "learning_rate": 8.324801308682004e-05, + "loss": 0.6887, + "step": 2938 + }, + { + "epoch": 0.6041730907595847, + "grad_norm": 0.20644250512123108, + "learning_rate": 8.324274987121857e-05, + "loss": 0.6764, + "step": 2939 + }, + { + "epoch": 0.6043786617329633, + "grad_norm": 0.20564045011997223, + "learning_rate": 8.323748477157557e-05, + "loss": 0.6912, + "step": 2940 + }, + { + "epoch": 0.6045842327063419, + "grad_norm": 0.18564823269844055, + "learning_rate": 8.323221778815042e-05, + "loss": 0.564, + "step": 2941 + }, + { + "epoch": 0.6047898036797205, + "grad_norm": 0.2087641954421997, + "learning_rate": 8.32269489212026e-05, + "loss": 0.6865, + "step": 2942 + }, + { + "epoch": 0.6049953746530989, + "grad_norm": 0.13221989572048187, + "learning_rate": 8.322167817099166e-05, + "loss": 0.5906, + "step": 2943 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 0.13168349862098694, + "learning_rate": 8.32164055377773e-05, + "loss": 0.6099, + "step": 2944 + }, + { + "epoch": 0.6054065165998561, + "grad_norm": 0.21939873695373535, + "learning_rate": 8.321113102181925e-05, + "loss": 0.6936, + "step": 2945 + }, + { + "epoch": 0.6056120875732347, + "grad_norm": 0.21064333617687225, + "learning_rate": 8.320585462337738e-05, + "loss": 0.6805, + "step": 2946 + }, + { + "epoch": 0.6058176585466132, + "grad_norm": 0.21517851948738098, + "learning_rate": 8.320057634271162e-05, + "loss": 0.6941, + "step": 2947 + }, + { + "epoch": 0.6060232295199918, + "grad_norm": 0.19427655637264252, + "learning_rate": 8.319529618008203e-05, + "loss": 0.6989, + "step": 2948 + }, + { + "epoch": 0.6062288004933704, + "grad_norm": 0.20321017503738403, + "learning_rate": 8.31900141357487e-05, + "loss": 0.6775, + "step": 2949 + }, + { + "epoch": 0.606434371466749, + "grad_norm": 0.2060307115316391, + "learning_rate": 8.318473020997188e-05, + "loss": 0.712, + "step": 2950 + }, + { + "epoch": 0.6066399424401274, + "grad_norm": 0.16920985281467438, + "learning_rate": 8.317944440301188e-05, + "loss": 0.5975, + "step": 2951 + }, + { + "epoch": 0.606845513413506, + "grad_norm": 0.2233453392982483, + "learning_rate": 8.31741567151291e-05, + "loss": 0.6985, + "step": 2952 + }, + { + "epoch": 0.6070510843868846, + "grad_norm": 0.21463671326637268, + "learning_rate": 8.316886714658406e-05, + "loss": 0.6661, + "step": 2953 + }, + { + "epoch": 0.6072566553602631, + "grad_norm": 0.1969480812549591, + "learning_rate": 8.316357569763732e-05, + "loss": 0.7273, + "step": 2954 + }, + { + "epoch": 0.6074622263336417, + "grad_norm": 0.17153163254261017, + "learning_rate": 8.315828236854958e-05, + "loss": 0.6041, + "step": 2955 + }, + { + "epoch": 0.6076677973070203, + "grad_norm": 0.21503044664859772, + "learning_rate": 8.315298715958165e-05, + "loss": 0.6841, + "step": 2956 + }, + { + "epoch": 0.6078733682803988, + "grad_norm": 0.2050783485174179, + "learning_rate": 8.314769007099433e-05, + "loss": 0.6952, + "step": 2957 + }, + { + "epoch": 0.6080789392537773, + "grad_norm": 0.20447179675102234, + "learning_rate": 8.314239110304864e-05, + "loss": 0.7027, + "step": 2958 + }, + { + "epoch": 0.6082845102271559, + "grad_norm": 0.20713284611701965, + "learning_rate": 8.313709025600562e-05, + "loss": 0.7172, + "step": 2959 + }, + { + "epoch": 0.6084900812005345, + "grad_norm": 0.20058241486549377, + "learning_rate": 8.31317875301264e-05, + "loss": 0.6904, + "step": 2960 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.19999080896377563, + "learning_rate": 8.312648292567226e-05, + "loss": 0.7054, + "step": 2961 + }, + { + "epoch": 0.6089012231472916, + "grad_norm": 0.20129017531871796, + "learning_rate": 8.31211764429045e-05, + "loss": 0.6781, + "step": 2962 + }, + { + "epoch": 0.6091067941206701, + "grad_norm": 0.2048570066690445, + "learning_rate": 8.311586808208453e-05, + "loss": 0.6995, + "step": 2963 + }, + { + "epoch": 0.6093123650940487, + "grad_norm": 0.20518624782562256, + "learning_rate": 8.311055784347392e-05, + "loss": 0.6856, + "step": 2964 + }, + { + "epoch": 0.6095179360674273, + "grad_norm": 0.14647917449474335, + "learning_rate": 8.310524572733424e-05, + "loss": 0.6034, + "step": 2965 + }, + { + "epoch": 0.6097235070408058, + "grad_norm": 0.2090081423521042, + "learning_rate": 8.309993173392722e-05, + "loss": 0.6738, + "step": 2966 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 0.13404381275177002, + "learning_rate": 8.309461586351463e-05, + "loss": 0.59, + "step": 2967 + }, + { + "epoch": 0.610134648987563, + "grad_norm": 0.20760053396224976, + "learning_rate": 8.308929811635837e-05, + "loss": 0.7076, + "step": 2968 + }, + { + "epoch": 0.6103402199609416, + "grad_norm": 0.2022329717874527, + "learning_rate": 8.308397849272043e-05, + "loss": 0.6992, + "step": 2969 + }, + { + "epoch": 0.61054579093432, + "grad_norm": 0.20392966270446777, + "learning_rate": 8.307865699286287e-05, + "loss": 0.7017, + "step": 2970 + }, + { + "epoch": 0.6107513619076986, + "grad_norm": 0.14375483989715576, + "learning_rate": 8.307333361704786e-05, + "loss": 0.5902, + "step": 2971 + }, + { + "epoch": 0.6109569328810772, + "grad_norm": 0.20196297764778137, + "learning_rate": 8.306800836553766e-05, + "loss": 0.686, + "step": 2972 + }, + { + "epoch": 0.6111625038544557, + "grad_norm": 0.23178908228874207, + "learning_rate": 8.306268123859461e-05, + "loss": 0.7128, + "step": 2973 + }, + { + "epoch": 0.6113680748278343, + "grad_norm": 0.14498086273670197, + "learning_rate": 8.305735223648117e-05, + "loss": 0.5783, + "step": 2974 + }, + { + "epoch": 0.6115736458012129, + "grad_norm": 0.21291960775852203, + "learning_rate": 8.305202135945985e-05, + "loss": 0.6836, + "step": 2975 + }, + { + "epoch": 0.6117792167745915, + "grad_norm": 0.20154601335525513, + "learning_rate": 8.30466886077933e-05, + "loss": 0.6775, + "step": 2976 + }, + { + "epoch": 0.6119847877479699, + "grad_norm": 0.1371108442544937, + "learning_rate": 8.304135398174423e-05, + "loss": 0.6029, + "step": 2977 + }, + { + "epoch": 0.6121903587213485, + "grad_norm": 0.20939522981643677, + "learning_rate": 8.303601748157545e-05, + "loss": 0.7016, + "step": 2978 + }, + { + "epoch": 0.6123959296947271, + "grad_norm": 0.1982061266899109, + "learning_rate": 8.303067910754988e-05, + "loss": 0.6724, + "step": 2979 + }, + { + "epoch": 0.6126015006681057, + "grad_norm": 0.19184644520282745, + "learning_rate": 8.302533885993051e-05, + "loss": 0.6766, + "step": 2980 + }, + { + "epoch": 0.6128070716414842, + "grad_norm": 0.1973457783460617, + "learning_rate": 8.30199967389804e-05, + "loss": 0.701, + "step": 2981 + }, + { + "epoch": 0.6130126426148628, + "grad_norm": 0.23462116718292236, + "learning_rate": 8.301465274496278e-05, + "loss": 0.7119, + "step": 2982 + }, + { + "epoch": 0.6132182135882414, + "grad_norm": 0.1940578669309616, + "learning_rate": 8.300930687814089e-05, + "loss": 0.6935, + "step": 2983 + }, + { + "epoch": 0.61342378456162, + "grad_norm": 0.20462383329868317, + "learning_rate": 8.30039591387781e-05, + "loss": 0.7066, + "step": 2984 + }, + { + "epoch": 0.6136293555349984, + "grad_norm": 0.1943095475435257, + "learning_rate": 8.299860952713788e-05, + "loss": 0.6764, + "step": 2985 + }, + { + "epoch": 0.613834926508377, + "grad_norm": 0.18959608674049377, + "learning_rate": 8.299325804348377e-05, + "loss": 0.6501, + "step": 2986 + }, + { + "epoch": 0.6140404974817556, + "grad_norm": 0.2010001540184021, + "learning_rate": 8.298790468807941e-05, + "loss": 0.6819, + "step": 2987 + }, + { + "epoch": 0.6142460684551342, + "grad_norm": 0.20373772084712982, + "learning_rate": 8.298254946118856e-05, + "loss": 0.6776, + "step": 2988 + }, + { + "epoch": 0.6144516394285127, + "grad_norm": 0.19308720529079437, + "learning_rate": 8.2977192363075e-05, + "loss": 0.6825, + "step": 2989 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 0.19244827330112457, + "learning_rate": 8.297183339400271e-05, + "loss": 0.6819, + "step": 2990 + }, + { + "epoch": 0.6148627813752698, + "grad_norm": 0.19886994361877441, + "learning_rate": 8.296647255423566e-05, + "loss": 0.6907, + "step": 2991 + }, + { + "epoch": 0.6150683523486483, + "grad_norm": 0.194062277674675, + "learning_rate": 8.296110984403794e-05, + "loss": 0.6725, + "step": 2992 + }, + { + "epoch": 0.6152739233220269, + "grad_norm": 0.19105246663093567, + "learning_rate": 8.295574526367379e-05, + "loss": 0.6895, + "step": 2993 + }, + { + "epoch": 0.6154794942954055, + "grad_norm": 0.20439203083515167, + "learning_rate": 8.295037881340746e-05, + "loss": 0.6997, + "step": 2994 + }, + { + "epoch": 0.6156850652687841, + "grad_norm": 0.2035692036151886, + "learning_rate": 8.294501049350335e-05, + "loss": 0.6797, + "step": 2995 + }, + { + "epoch": 0.6158906362421626, + "grad_norm": 0.2011076956987381, + "learning_rate": 8.293964030422593e-05, + "loss": 0.6948, + "step": 2996 + }, + { + "epoch": 0.6160962072155411, + "grad_norm": 0.1979755461215973, + "learning_rate": 8.293426824583977e-05, + "loss": 0.6984, + "step": 2997 + }, + { + "epoch": 0.6163017781889197, + "grad_norm": 0.20361703634262085, + "learning_rate": 8.29288943186095e-05, + "loss": 0.6804, + "step": 2998 + }, + { + "epoch": 0.6165073491622983, + "grad_norm": 0.19313938915729523, + "learning_rate": 8.29235185227999e-05, + "loss": 0.7105, + "step": 2999 + }, + { + "epoch": 0.6167129201356768, + "grad_norm": 0.19516946375370026, + "learning_rate": 8.291814085867579e-05, + "loss": 0.7015, + "step": 3000 + }, + { + "epoch": 0.6169184911090554, + "grad_norm": 0.19444262981414795, + "learning_rate": 8.291276132650212e-05, + "loss": 0.7028, + "step": 3001 + }, + { + "epoch": 0.617124062082434, + "grad_norm": 0.19477610290050507, + "learning_rate": 8.290737992654389e-05, + "loss": 0.683, + "step": 3002 + }, + { + "epoch": 0.6173296330558126, + "grad_norm": 0.20169807970523834, + "learning_rate": 8.290199665906624e-05, + "loss": 0.6816, + "step": 3003 + }, + { + "epoch": 0.617535204029191, + "grad_norm": 0.1933300644159317, + "learning_rate": 8.289661152433436e-05, + "loss": 0.7073, + "step": 3004 + }, + { + "epoch": 0.6177407750025696, + "grad_norm": 0.16266535222530365, + "learning_rate": 8.289122452261356e-05, + "loss": 0.5968, + "step": 3005 + }, + { + "epoch": 0.6179463459759482, + "grad_norm": 0.19945891201496124, + "learning_rate": 8.288583565416924e-05, + "loss": 0.6826, + "step": 3006 + }, + { + "epoch": 0.6181519169493268, + "grad_norm": 0.1400868445634842, + "learning_rate": 8.288044491926687e-05, + "loss": 0.6002, + "step": 3007 + }, + { + "epoch": 0.6183574879227053, + "grad_norm": 0.12712964415550232, + "learning_rate": 8.287505231817202e-05, + "loss": 0.5836, + "step": 3008 + }, + { + "epoch": 0.6185630588960839, + "grad_norm": 0.20722496509552002, + "learning_rate": 8.286965785115038e-05, + "loss": 0.6821, + "step": 3009 + }, + { + "epoch": 0.6187686298694625, + "grad_norm": 0.1368006467819214, + "learning_rate": 8.28642615184677e-05, + "loss": 0.5909, + "step": 3010 + }, + { + "epoch": 0.6189742008428409, + "grad_norm": 0.1366155594587326, + "learning_rate": 8.285886332038983e-05, + "loss": 0.5806, + "step": 3011 + }, + { + "epoch": 0.6191797718162195, + "grad_norm": 0.20801199972629547, + "learning_rate": 8.285346325718272e-05, + "loss": 0.7111, + "step": 3012 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 0.19898487627506256, + "learning_rate": 8.28480613291124e-05, + "loss": 0.6832, + "step": 3013 + }, + { + "epoch": 0.6195909137629767, + "grad_norm": 0.19258826971054077, + "learning_rate": 8.284265753644499e-05, + "loss": 0.6962, + "step": 3014 + }, + { + "epoch": 0.6197964847363552, + "grad_norm": 0.18354789912700653, + "learning_rate": 8.283725187944674e-05, + "loss": 0.6807, + "step": 3015 + }, + { + "epoch": 0.6200020557097338, + "grad_norm": 0.15917901694774628, + "learning_rate": 8.283184435838392e-05, + "loss": 0.5927, + "step": 3016 + }, + { + "epoch": 0.6202076266831124, + "grad_norm": 0.1983378827571869, + "learning_rate": 8.282643497352296e-05, + "loss": 0.6791, + "step": 3017 + }, + { + "epoch": 0.620413197656491, + "grad_norm": 0.20160548388957977, + "learning_rate": 8.282102372513035e-05, + "loss": 0.6951, + "step": 3018 + }, + { + "epoch": 0.6206187686298694, + "grad_norm": 0.19742833077907562, + "learning_rate": 8.281561061347268e-05, + "loss": 0.6848, + "step": 3019 + }, + { + "epoch": 0.620824339603248, + "grad_norm": 0.19700521230697632, + "learning_rate": 8.281019563881663e-05, + "loss": 0.6975, + "step": 3020 + }, + { + "epoch": 0.6210299105766266, + "grad_norm": 0.20055337250232697, + "learning_rate": 8.280477880142895e-05, + "loss": 0.6769, + "step": 3021 + }, + { + "epoch": 0.6212354815500052, + "grad_norm": 0.23085735738277435, + "learning_rate": 8.279936010157653e-05, + "loss": 0.67, + "step": 3022 + }, + { + "epoch": 0.6214410525233837, + "grad_norm": 0.20529572665691376, + "learning_rate": 8.279393953952632e-05, + "loss": 0.6962, + "step": 3023 + }, + { + "epoch": 0.6216466234967623, + "grad_norm": 0.19554628431797028, + "learning_rate": 8.278851711554532e-05, + "loss": 0.6853, + "step": 3024 + }, + { + "epoch": 0.6218521944701408, + "grad_norm": 0.1940753012895584, + "learning_rate": 8.278309282990073e-05, + "loss": 0.6549, + "step": 3025 + }, + { + "epoch": 0.6220577654435194, + "grad_norm": 0.19746670126914978, + "learning_rate": 8.277766668285977e-05, + "loss": 0.6544, + "step": 3026 + }, + { + "epoch": 0.6222633364168979, + "grad_norm": 0.19035373628139496, + "learning_rate": 8.277223867468971e-05, + "loss": 0.6773, + "step": 3027 + }, + { + "epoch": 0.6224689073902765, + "grad_norm": 0.19404295086860657, + "learning_rate": 8.276680880565803e-05, + "loss": 0.6931, + "step": 3028 + }, + { + "epoch": 0.6226744783636551, + "grad_norm": 0.1988229602575302, + "learning_rate": 8.276137707603219e-05, + "loss": 0.6812, + "step": 3029 + }, + { + "epoch": 0.6228800493370336, + "grad_norm": 0.19786033034324646, + "learning_rate": 8.27559434860798e-05, + "loss": 0.6733, + "step": 3030 + }, + { + "epoch": 0.6230856203104121, + "grad_norm": 0.19254696369171143, + "learning_rate": 8.275050803606853e-05, + "loss": 0.7066, + "step": 3031 + }, + { + "epoch": 0.6232911912837907, + "grad_norm": 0.19956709444522858, + "learning_rate": 8.274507072626619e-05, + "loss": 0.681, + "step": 3032 + }, + { + "epoch": 0.6234967622571693, + "grad_norm": 0.19668106734752655, + "learning_rate": 8.273963155694062e-05, + "loss": 0.676, + "step": 3033 + }, + { + "epoch": 0.6237023332305478, + "grad_norm": 0.21287435293197632, + "learning_rate": 8.273419052835981e-05, + "loss": 0.704, + "step": 3034 + }, + { + "epoch": 0.6239079042039264, + "grad_norm": 2.4127197265625, + "learning_rate": 8.27287476407918e-05, + "loss": 0.7001, + "step": 3035 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 0.20844995975494385, + "learning_rate": 8.272330289450473e-05, + "loss": 0.6808, + "step": 3036 + }, + { + "epoch": 0.6243190461506836, + "grad_norm": 0.19834044575691223, + "learning_rate": 8.271785628976686e-05, + "loss": 0.5957, + "step": 3037 + }, + { + "epoch": 0.624524617124062, + "grad_norm": 0.25713658332824707, + "learning_rate": 8.271240782684649e-05, + "loss": 0.6067, + "step": 3038 + }, + { + "epoch": 0.6247301880974406, + "grad_norm": 0.755788266658783, + "learning_rate": 8.270695750601206e-05, + "loss": 0.7165, + "step": 3039 + }, + { + "epoch": 0.6249357590708192, + "grad_norm": 0.23070074617862701, + "learning_rate": 8.270150532753208e-05, + "loss": 0.7086, + "step": 3040 + }, + { + "epoch": 0.6251413300441978, + "grad_norm": 0.20264309644699097, + "learning_rate": 8.269605129167514e-05, + "loss": 0.5804, + "step": 3041 + }, + { + "epoch": 0.6253469010175763, + "grad_norm": 0.25147226452827454, + "learning_rate": 8.269059539870996e-05, + "loss": 0.6841, + "step": 3042 + }, + { + "epoch": 0.6255524719909549, + "grad_norm": 0.23628079891204834, + "learning_rate": 8.268513764890528e-05, + "loss": 0.7055, + "step": 3043 + }, + { + "epoch": 0.6257580429643335, + "grad_norm": 0.2399078607559204, + "learning_rate": 8.267967804253003e-05, + "loss": 0.7238, + "step": 3044 + }, + { + "epoch": 0.625963613937712, + "grad_norm": 0.2208731472492218, + "learning_rate": 8.267421657985316e-05, + "loss": 0.6938, + "step": 3045 + }, + { + "epoch": 0.6261691849110905, + "grad_norm": 0.21366935968399048, + "learning_rate": 8.266875326114372e-05, + "loss": 0.5907, + "step": 3046 + }, + { + "epoch": 0.6263747558844691, + "grad_norm": 0.22604869306087494, + "learning_rate": 8.266328808667086e-05, + "loss": 0.6977, + "step": 3047 + }, + { + "epoch": 0.6265803268578477, + "grad_norm": 0.20610669255256653, + "learning_rate": 8.265782105670385e-05, + "loss": 0.6953, + "step": 3048 + }, + { + "epoch": 0.6267858978312262, + "grad_norm": 0.2094089388847351, + "learning_rate": 8.2652352171512e-05, + "loss": 0.7114, + "step": 3049 + }, + { + "epoch": 0.6269914688046048, + "grad_norm": 0.20464326441287994, + "learning_rate": 8.264688143136474e-05, + "loss": 0.6828, + "step": 3050 + }, + { + "epoch": 0.6271970397779834, + "grad_norm": 0.20458531379699707, + "learning_rate": 8.26414088365316e-05, + "loss": 0.7172, + "step": 3051 + }, + { + "epoch": 0.6274026107513619, + "grad_norm": 0.20255166292190552, + "learning_rate": 8.26359343872822e-05, + "loss": 0.7034, + "step": 3052 + }, + { + "epoch": 0.6276081817247404, + "grad_norm": 0.20339445769786835, + "learning_rate": 8.26304580838862e-05, + "loss": 0.7053, + "step": 3053 + }, + { + "epoch": 0.627813752698119, + "grad_norm": 0.20055994391441345, + "learning_rate": 8.262497992661342e-05, + "loss": 0.6917, + "step": 3054 + }, + { + "epoch": 0.6280193236714976, + "grad_norm": 0.17087921500205994, + "learning_rate": 8.261949991573374e-05, + "loss": 0.6037, + "step": 3055 + }, + { + "epoch": 0.6282248946448762, + "grad_norm": 0.2011025846004486, + "learning_rate": 8.261401805151711e-05, + "loss": 0.6748, + "step": 3056 + }, + { + "epoch": 0.6284304656182547, + "grad_norm": 0.21176697313785553, + "learning_rate": 8.260853433423366e-05, + "loss": 0.6784, + "step": 3057 + }, + { + "epoch": 0.6286360365916333, + "grad_norm": 0.2133779078722, + "learning_rate": 8.260304876415348e-05, + "loss": 0.7074, + "step": 3058 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 0.21225228905677795, + "learning_rate": 8.259756134154685e-05, + "loss": 0.7336, + "step": 3059 + }, + { + "epoch": 0.6290471785383904, + "grad_norm": 0.16129277646541595, + "learning_rate": 8.25920720666841e-05, + "loss": 0.5877, + "step": 3060 + }, + { + "epoch": 0.6292527495117689, + "grad_norm": 0.2276839166879654, + "learning_rate": 8.258658093983566e-05, + "loss": 0.6943, + "step": 3061 + }, + { + "epoch": 0.6294583204851475, + "grad_norm": 0.20884232223033905, + "learning_rate": 8.258108796127206e-05, + "loss": 0.6802, + "step": 3062 + }, + { + "epoch": 0.6296638914585261, + "grad_norm": 0.21469639241695404, + "learning_rate": 8.257559313126391e-05, + "loss": 0.7264, + "step": 3063 + }, + { + "epoch": 0.6298694624319047, + "grad_norm": 0.20983977615833282, + "learning_rate": 8.257009645008191e-05, + "loss": 0.7146, + "step": 3064 + }, + { + "epoch": 0.6300750334052831, + "grad_norm": 0.20303663611412048, + "learning_rate": 8.256459791799687e-05, + "loss": 0.6593, + "step": 3065 + }, + { + "epoch": 0.6302806043786617, + "grad_norm": 0.20967082679271698, + "learning_rate": 8.255909753527968e-05, + "loss": 0.6983, + "step": 3066 + }, + { + "epoch": 0.6304861753520403, + "grad_norm": 0.15247072279453278, + "learning_rate": 8.255359530220127e-05, + "loss": 0.6055, + "step": 3067 + }, + { + "epoch": 0.6306917463254188, + "grad_norm": 0.2263472080230713, + "learning_rate": 8.254809121903276e-05, + "loss": 0.6934, + "step": 3068 + }, + { + "epoch": 0.6308973172987974, + "grad_norm": 0.22391130030155182, + "learning_rate": 8.25425852860453e-05, + "loss": 0.6984, + "step": 3069 + }, + { + "epoch": 0.631102888272176, + "grad_norm": 0.19726432859897614, + "learning_rate": 8.253707750351013e-05, + "loss": 0.6938, + "step": 3070 + }, + { + "epoch": 0.6313084592455546, + "grad_norm": 0.2162100374698639, + "learning_rate": 8.25315678716986e-05, + "loss": 0.675, + "step": 3071 + }, + { + "epoch": 0.631514030218933, + "grad_norm": 0.2201918661594391, + "learning_rate": 8.252605639088215e-05, + "loss": 0.6931, + "step": 3072 + }, + { + "epoch": 0.6317196011923116, + "grad_norm": 0.20799918472766876, + "learning_rate": 8.25205430613323e-05, + "loss": 0.6911, + "step": 3073 + }, + { + "epoch": 0.6319251721656902, + "grad_norm": 0.19582496583461761, + "learning_rate": 8.251502788332066e-05, + "loss": 0.6763, + "step": 3074 + }, + { + "epoch": 0.6321307431390688, + "grad_norm": 0.2054242044687271, + "learning_rate": 8.250951085711894e-05, + "loss": 0.6907, + "step": 3075 + }, + { + "epoch": 0.6323363141124473, + "grad_norm": 0.15331074595451355, + "learning_rate": 8.250399198299894e-05, + "loss": 0.5903, + "step": 3076 + }, + { + "epoch": 0.6325418850858259, + "grad_norm": 0.22686253488063812, + "learning_rate": 8.249847126123253e-05, + "loss": 0.6944, + "step": 3077 + }, + { + "epoch": 0.6327474560592045, + "grad_norm": 0.2104145586490631, + "learning_rate": 8.249294869209172e-05, + "loss": 0.678, + "step": 3078 + }, + { + "epoch": 0.632953027032583, + "grad_norm": 0.14177118241786957, + "learning_rate": 8.248742427584858e-05, + "loss": 0.5831, + "step": 3079 + }, + { + "epoch": 0.6331585980059615, + "grad_norm": 0.2042471021413803, + "learning_rate": 8.248189801277526e-05, + "loss": 0.6831, + "step": 3080 + }, + { + "epoch": 0.6333641689793401, + "grad_norm": 0.13382332026958466, + "learning_rate": 8.2476369903144e-05, + "loss": 0.5932, + "step": 3081 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 0.21314536035060883, + "learning_rate": 8.247083994722717e-05, + "loss": 0.7024, + "step": 3082 + }, + { + "epoch": 0.6337753109260973, + "grad_norm": 0.2022118717432022, + "learning_rate": 8.24653081452972e-05, + "loss": 0.6778, + "step": 3083 + }, + { + "epoch": 0.6339808818994758, + "grad_norm": 0.1986151486635208, + "learning_rate": 8.24597744976266e-05, + "loss": 0.6955, + "step": 3084 + }, + { + "epoch": 0.6341864528728544, + "grad_norm": 0.1944025456905365, + "learning_rate": 8.245423900448802e-05, + "loss": 0.6761, + "step": 3085 + }, + { + "epoch": 0.6343920238462329, + "grad_norm": 0.1960417479276657, + "learning_rate": 8.244870166615411e-05, + "loss": 0.6694, + "step": 3086 + }, + { + "epoch": 0.6345975948196114, + "grad_norm": 0.19537580013275146, + "learning_rate": 8.244316248289771e-05, + "loss": 0.7057, + "step": 3087 + }, + { + "epoch": 0.63480316579299, + "grad_norm": 0.25191953778266907, + "learning_rate": 8.243762145499173e-05, + "loss": 0.7093, + "step": 3088 + }, + { + "epoch": 0.6350087367663686, + "grad_norm": 0.21354857087135315, + "learning_rate": 8.24320785827091e-05, + "loss": 0.6912, + "step": 3089 + }, + { + "epoch": 0.6352143077397472, + "grad_norm": 0.2095470279455185, + "learning_rate": 8.242653386632292e-05, + "loss": 0.6966, + "step": 3090 + }, + { + "epoch": 0.6354198787131257, + "grad_norm": 0.19135965406894684, + "learning_rate": 8.242098730610636e-05, + "loss": 0.6868, + "step": 3091 + }, + { + "epoch": 0.6356254496865043, + "grad_norm": 0.19568754732608795, + "learning_rate": 8.241543890233263e-05, + "loss": 0.6741, + "step": 3092 + }, + { + "epoch": 0.6358310206598828, + "grad_norm": 0.19776469469070435, + "learning_rate": 8.240988865527513e-05, + "loss": 0.7092, + "step": 3093 + }, + { + "epoch": 0.6360365916332614, + "grad_norm": 0.18224585056304932, + "learning_rate": 8.240433656520727e-05, + "loss": 0.6031, + "step": 3094 + }, + { + "epoch": 0.6362421626066399, + "grad_norm": 0.203841432929039, + "learning_rate": 8.239878263240256e-05, + "loss": 0.6995, + "step": 3095 + }, + { + "epoch": 0.6364477335800185, + "grad_norm": 0.13863101601600647, + "learning_rate": 8.239322685713465e-05, + "loss": 0.5863, + "step": 3096 + }, + { + "epoch": 0.6366533045533971, + "grad_norm": 0.21603704988956451, + "learning_rate": 8.238766923967722e-05, + "loss": 0.7092, + "step": 3097 + }, + { + "epoch": 0.6368588755267757, + "grad_norm": 0.20999345183372498, + "learning_rate": 8.238210978030407e-05, + "loss": 0.6738, + "step": 3098 + }, + { + "epoch": 0.6370644465001541, + "grad_norm": 0.1540490984916687, + "learning_rate": 8.23765484792891e-05, + "loss": 0.589, + "step": 3099 + }, + { + "epoch": 0.6372700174735327, + "grad_norm": 0.21293634176254272, + "learning_rate": 8.237098533690628e-05, + "loss": 0.6747, + "step": 3100 + }, + { + "epoch": 0.6374755884469113, + "grad_norm": 0.23176319897174835, + "learning_rate": 8.236542035342969e-05, + "loss": 0.679, + "step": 3101 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 0.19695045053958893, + "learning_rate": 8.235985352913348e-05, + "loss": 0.6856, + "step": 3102 + }, + { + "epoch": 0.6378867303936684, + "grad_norm": 0.19714051485061646, + "learning_rate": 8.235428486429191e-05, + "loss": 0.697, + "step": 3103 + }, + { + "epoch": 0.638092301367047, + "grad_norm": 0.21369072794914246, + "learning_rate": 8.23487143591793e-05, + "loss": 0.6986, + "step": 3104 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 0.19707739353179932, + "learning_rate": 8.234314201407012e-05, + "loss": 0.7098, + "step": 3105 + }, + { + "epoch": 0.638503443313804, + "grad_norm": 0.1957058161497116, + "learning_rate": 8.233756782923888e-05, + "loss": 0.6754, + "step": 3106 + }, + { + "epoch": 0.6387090142871826, + "grad_norm": 0.19346770644187927, + "learning_rate": 8.233199180496019e-05, + "loss": 0.6703, + "step": 3107 + }, + { + "epoch": 0.6389145852605612, + "grad_norm": 0.2065419703722, + "learning_rate": 8.232641394150873e-05, + "loss": 0.6961, + "step": 3108 + }, + { + "epoch": 0.6391201562339398, + "grad_norm": 0.20303097367286682, + "learning_rate": 8.232083423915932e-05, + "loss": 0.6764, + "step": 3109 + }, + { + "epoch": 0.6393257272073183, + "grad_norm": 0.19711004197597504, + "learning_rate": 8.231525269818688e-05, + "loss": 0.6965, + "step": 3110 + }, + { + "epoch": 0.6395312981806969, + "grad_norm": 0.19637802243232727, + "learning_rate": 8.230966931886631e-05, + "loss": 0.7109, + "step": 3111 + }, + { + "epoch": 0.6397368691540755, + "grad_norm": 0.20301949977874756, + "learning_rate": 8.230408410147274e-05, + "loss": 0.6824, + "step": 3112 + }, + { + "epoch": 0.639942440127454, + "grad_norm": 1.2079687118530273, + "learning_rate": 8.229849704628131e-05, + "loss": 0.6643, + "step": 3113 + }, + { + "epoch": 0.6401480111008325, + "grad_norm": 0.17537331581115723, + "learning_rate": 8.229290815356723e-05, + "loss": 0.5969, + "step": 3114 + }, + { + "epoch": 0.6403535820742111, + "grad_norm": 0.2206054925918579, + "learning_rate": 8.22873174236059e-05, + "loss": 0.6856, + "step": 3115 + }, + { + "epoch": 0.6405591530475897, + "grad_norm": 0.20161283016204834, + "learning_rate": 8.228172485667273e-05, + "loss": 0.6803, + "step": 3116 + }, + { + "epoch": 0.6407647240209683, + "grad_norm": 0.5840950012207031, + "learning_rate": 8.227613045304321e-05, + "loss": 0.688, + "step": 3117 + }, + { + "epoch": 0.6409702949943468, + "grad_norm": 0.19631561636924744, + "learning_rate": 8.227053421299297e-05, + "loss": 0.5931, + "step": 3118 + }, + { + "epoch": 0.6411758659677254, + "grad_norm": 0.23822426795959473, + "learning_rate": 8.226493613679772e-05, + "loss": 0.5962, + "step": 3119 + }, + { + "epoch": 0.6413814369411039, + "grad_norm": 0.15889045596122742, + "learning_rate": 8.225933622473322e-05, + "loss": 0.5809, + "step": 3120 + }, + { + "epoch": 0.6415870079144824, + "grad_norm": 0.24698416888713837, + "learning_rate": 8.22537344770754e-05, + "loss": 0.6965, + "step": 3121 + }, + { + "epoch": 0.641792578887861, + "grad_norm": 0.2314760684967041, + "learning_rate": 8.224813089410021e-05, + "loss": 0.6989, + "step": 3122 + }, + { + "epoch": 0.6419981498612396, + "grad_norm": 0.20642580091953278, + "learning_rate": 8.22425254760837e-05, + "loss": 0.7141, + "step": 3123 + }, + { + "epoch": 0.6422037208346182, + "grad_norm": 0.209413081407547, + "learning_rate": 8.223691822330203e-05, + "loss": 0.7117, + "step": 3124 + }, + { + "epoch": 0.6424092918079967, + "grad_norm": 0.21780717372894287, + "learning_rate": 8.223130913603144e-05, + "loss": 0.6902, + "step": 3125 + }, + { + "epoch": 0.6426148627813753, + "grad_norm": 0.21011175215244293, + "learning_rate": 8.222569821454826e-05, + "loss": 0.6963, + "step": 3126 + }, + { + "epoch": 0.6428204337547538, + "grad_norm": 0.2518548369407654, + "learning_rate": 8.222008545912895e-05, + "loss": 0.6005, + "step": 3127 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 0.21928563714027405, + "learning_rate": 8.221447087004996e-05, + "loss": 0.6957, + "step": 3128 + }, + { + "epoch": 0.6432315757015109, + "grad_norm": 0.21237944066524506, + "learning_rate": 8.220885444758796e-05, + "loss": 0.6559, + "step": 3129 + }, + { + "epoch": 0.6434371466748895, + "grad_norm": 0.22411003708839417, + "learning_rate": 8.220323619201958e-05, + "loss": 0.7081, + "step": 3130 + }, + { + "epoch": 0.6436427176482681, + "grad_norm": 0.19972927868366241, + "learning_rate": 8.219761610362168e-05, + "loss": 0.6792, + "step": 3131 + }, + { + "epoch": 0.6438482886216467, + "grad_norm": 0.24267856776714325, + "learning_rate": 8.219199418267107e-05, + "loss": 0.7113, + "step": 3132 + }, + { + "epoch": 0.6440538595950251, + "grad_norm": 0.20243190228939056, + "learning_rate": 8.218637042944476e-05, + "loss": 0.6826, + "step": 3133 + }, + { + "epoch": 0.6442594305684037, + "grad_norm": 0.19848772883415222, + "learning_rate": 8.218074484421978e-05, + "loss": 0.6965, + "step": 3134 + }, + { + "epoch": 0.6444650015417823, + "grad_norm": 0.20293201506137848, + "learning_rate": 8.217511742727327e-05, + "loss": 0.6646, + "step": 3135 + }, + { + "epoch": 0.6446705725151609, + "grad_norm": 0.20322081446647644, + "learning_rate": 8.21694881788825e-05, + "loss": 0.699, + "step": 3136 + }, + { + "epoch": 0.6448761434885394, + "grad_norm": 0.20811443030834198, + "learning_rate": 8.216385709932476e-05, + "loss": 0.6561, + "step": 3137 + }, + { + "epoch": 0.645081714461918, + "grad_norm": 0.21710549294948578, + "learning_rate": 8.21582241888775e-05, + "loss": 0.6903, + "step": 3138 + }, + { + "epoch": 0.6452872854352966, + "grad_norm": 0.2017020285129547, + "learning_rate": 8.21525894478182e-05, + "loss": 0.6837, + "step": 3139 + }, + { + "epoch": 0.645492856408675, + "grad_norm": 0.21228978037834167, + "learning_rate": 8.214695287642448e-05, + "loss": 0.7046, + "step": 3140 + }, + { + "epoch": 0.6456984273820536, + "grad_norm": 0.19248290359973907, + "learning_rate": 8.214131447497401e-05, + "loss": 0.6838, + "step": 3141 + }, + { + "epoch": 0.6459039983554322, + "grad_norm": 0.20567071437835693, + "learning_rate": 8.213567424374458e-05, + "loss": 0.6728, + "step": 3142 + }, + { + "epoch": 0.6461095693288108, + "grad_norm": 0.19881267845630646, + "learning_rate": 8.213003218301404e-05, + "loss": 0.6937, + "step": 3143 + }, + { + "epoch": 0.6463151403021893, + "grad_norm": 0.20884251594543457, + "learning_rate": 8.212438829306037e-05, + "loss": 0.6889, + "step": 3144 + }, + { + "epoch": 0.6465207112755679, + "grad_norm": 0.196677565574646, + "learning_rate": 8.21187425741616e-05, + "loss": 0.6586, + "step": 3145 + }, + { + "epoch": 0.6467262822489465, + "grad_norm": 0.19286644458770752, + "learning_rate": 8.211309502659588e-05, + "loss": 0.6643, + "step": 3146 + }, + { + "epoch": 0.646931853222325, + "grad_norm": 0.19453571736812592, + "learning_rate": 8.210744565064142e-05, + "loss": 0.6898, + "step": 3147 + }, + { + "epoch": 0.6471374241957035, + "grad_norm": 0.22043997049331665, + "learning_rate": 8.210179444657658e-05, + "loss": 0.5958, + "step": 3148 + }, + { + "epoch": 0.6473429951690821, + "grad_norm": 0.2146371752023697, + "learning_rate": 8.209614141467972e-05, + "loss": 0.7184, + "step": 3149 + }, + { + "epoch": 0.6475485661424607, + "grad_norm": 0.2086339145898819, + "learning_rate": 8.209048655522937e-05, + "loss": 0.6878, + "step": 3150 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 0.19689536094665527, + "learning_rate": 8.20848298685041e-05, + "loss": 0.6693, + "step": 3151 + }, + { + "epoch": 0.6479597080892178, + "grad_norm": 0.19254978001117706, + "learning_rate": 8.207917135478259e-05, + "loss": 0.6931, + "step": 3152 + }, + { + "epoch": 0.6481652790625964, + "grad_norm": 0.19382552802562714, + "learning_rate": 8.207351101434363e-05, + "loss": 0.6691, + "step": 3153 + }, + { + "epoch": 0.6483708500359749, + "grad_norm": 0.20275139808654785, + "learning_rate": 8.206784884746607e-05, + "loss": 0.7085, + "step": 3154 + }, + { + "epoch": 0.6485764210093535, + "grad_norm": 0.19114693999290466, + "learning_rate": 8.206218485442883e-05, + "loss": 0.6732, + "step": 3155 + }, + { + "epoch": 0.648781991982732, + "grad_norm": 0.19770143926143646, + "learning_rate": 8.2056519035511e-05, + "loss": 0.6691, + "step": 3156 + }, + { + "epoch": 0.6489875629561106, + "grad_norm": 0.2007279098033905, + "learning_rate": 8.205085139099165e-05, + "loss": 0.6647, + "step": 3157 + }, + { + "epoch": 0.6491931339294892, + "grad_norm": 0.19302336871623993, + "learning_rate": 8.204518192115004e-05, + "loss": 0.663, + "step": 3158 + }, + { + "epoch": 0.6493987049028677, + "grad_norm": 0.19728437066078186, + "learning_rate": 8.203951062626546e-05, + "loss": 0.674, + "step": 3159 + }, + { + "epoch": 0.6496042758762463, + "grad_norm": 0.20836929976940155, + "learning_rate": 8.203383750661731e-05, + "loss": 0.6827, + "step": 3160 + }, + { + "epoch": 0.6498098468496248, + "grad_norm": 0.226349338889122, + "learning_rate": 8.202816256248509e-05, + "loss": 0.579, + "step": 3161 + }, + { + "epoch": 0.6500154178230034, + "grad_norm": 0.203635573387146, + "learning_rate": 8.202248579414837e-05, + "loss": 0.6959, + "step": 3162 + }, + { + "epoch": 0.6502209887963819, + "grad_norm": 0.14256790280342102, + "learning_rate": 8.201680720188682e-05, + "loss": 0.589, + "step": 3163 + }, + { + "epoch": 0.6504265597697605, + "grad_norm": 0.214716836810112, + "learning_rate": 8.201112678598018e-05, + "loss": 0.6951, + "step": 3164 + }, + { + "epoch": 0.6506321307431391, + "grad_norm": 0.20737797021865845, + "learning_rate": 8.200544454670834e-05, + "loss": 0.6921, + "step": 3165 + }, + { + "epoch": 0.6508377017165177, + "grad_norm": 0.2059832364320755, + "learning_rate": 8.199976048435118e-05, + "loss": 0.6845, + "step": 3166 + }, + { + "epoch": 0.6510432726898961, + "grad_norm": 0.20531848073005676, + "learning_rate": 8.199407459918877e-05, + "loss": 0.696, + "step": 3167 + }, + { + "epoch": 0.6512488436632747, + "grad_norm": 0.20587943494319916, + "learning_rate": 8.19883868915012e-05, + "loss": 0.6877, + "step": 3168 + }, + { + "epoch": 0.6514544146366533, + "grad_norm": 0.19502076506614685, + "learning_rate": 8.198269736156872e-05, + "loss": 0.6735, + "step": 3169 + }, + { + "epoch": 0.6516599856100319, + "grad_norm": 0.1964626908302307, + "learning_rate": 8.197700600967158e-05, + "loss": 0.6702, + "step": 3170 + }, + { + "epoch": 0.6518655565834104, + "grad_norm": 0.19854065775871277, + "learning_rate": 8.19713128360902e-05, + "loss": 0.6639, + "step": 3171 + }, + { + "epoch": 0.652071127556789, + "grad_norm": 0.2041742503643036, + "learning_rate": 8.196561784110502e-05, + "loss": 0.6813, + "step": 3172 + }, + { + "epoch": 0.6522766985301676, + "grad_norm": 0.19994084537029266, + "learning_rate": 8.195992102499663e-05, + "loss": 0.668, + "step": 3173 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 0.1984533816576004, + "learning_rate": 8.195422238804569e-05, + "loss": 0.6839, + "step": 3174 + }, + { + "epoch": 0.6526878404769246, + "grad_norm": 0.2585853338241577, + "learning_rate": 8.194852193053293e-05, + "loss": 0.5857, + "step": 3175 + }, + { + "epoch": 0.6528934114503032, + "grad_norm": 0.21707791090011597, + "learning_rate": 8.194281965273919e-05, + "loss": 0.7002, + "step": 3176 + }, + { + "epoch": 0.6530989824236818, + "grad_norm": 0.21522431075572968, + "learning_rate": 8.193711555494541e-05, + "loss": 0.6681, + "step": 3177 + }, + { + "epoch": 0.6533045533970603, + "grad_norm": 0.20251545310020447, + "learning_rate": 8.193140963743258e-05, + "loss": 0.7119, + "step": 3178 + }, + { + "epoch": 0.6535101243704389, + "grad_norm": 0.20081111788749695, + "learning_rate": 8.192570190048181e-05, + "loss": 0.7013, + "step": 3179 + }, + { + "epoch": 0.6537156953438175, + "grad_norm": 0.20084579288959503, + "learning_rate": 8.19199923443743e-05, + "loss": 0.6996, + "step": 3180 + }, + { + "epoch": 0.653921266317196, + "grad_norm": 0.2081523984670639, + "learning_rate": 8.191428096939134e-05, + "loss": 0.6774, + "step": 3181 + }, + { + "epoch": 0.6541268372905745, + "grad_norm": 0.19181185960769653, + "learning_rate": 8.190856777581427e-05, + "loss": 0.5909, + "step": 3182 + }, + { + "epoch": 0.6543324082639531, + "grad_norm": 0.21452546119689941, + "learning_rate": 8.190285276392461e-05, + "loss": 0.6737, + "step": 3183 + }, + { + "epoch": 0.6545379792373317, + "grad_norm": 0.20853358507156372, + "learning_rate": 8.189713593400385e-05, + "loss": 0.6823, + "step": 3184 + }, + { + "epoch": 0.6547435502107103, + "grad_norm": 0.20873308181762695, + "learning_rate": 8.189141728633367e-05, + "loss": 0.7007, + "step": 3185 + }, + { + "epoch": 0.6549491211840888, + "grad_norm": 0.19929181039333344, + "learning_rate": 8.188569682119579e-05, + "loss": 0.6567, + "step": 3186 + }, + { + "epoch": 0.6551546921574674, + "grad_norm": 0.19836626946926117, + "learning_rate": 8.187997453887202e-05, + "loss": 0.6607, + "step": 3187 + }, + { + "epoch": 0.6553602631308459, + "grad_norm": 0.18740180134773254, + "learning_rate": 8.187425043964429e-05, + "loss": 0.6858, + "step": 3188 + }, + { + "epoch": 0.6555658341042245, + "grad_norm": 0.20412470400333405, + "learning_rate": 8.18685245237946e-05, + "loss": 0.6895, + "step": 3189 + }, + { + "epoch": 0.655771405077603, + "grad_norm": 0.15742400288581848, + "learning_rate": 8.186279679160502e-05, + "loss": 0.5842, + "step": 3190 + }, + { + "epoch": 0.6559769760509816, + "grad_norm": 0.20259132981300354, + "learning_rate": 8.185706724335773e-05, + "loss": 0.6967, + "step": 3191 + }, + { + "epoch": 0.6561825470243602, + "grad_norm": 1.9348865747451782, + "learning_rate": 8.185133587933502e-05, + "loss": 0.7117, + "step": 3192 + }, + { + "epoch": 0.6563881179977388, + "grad_norm": 0.2033887505531311, + "learning_rate": 8.184560269981922e-05, + "loss": 0.6728, + "step": 3193 + }, + { + "epoch": 0.6565936889711173, + "grad_norm": 0.15772481262683868, + "learning_rate": 8.183986770509281e-05, + "loss": 0.5949, + "step": 3194 + }, + { + "epoch": 0.6567992599444958, + "grad_norm": 0.21117869019508362, + "learning_rate": 8.18341308954383e-05, + "loss": 0.7154, + "step": 3195 + }, + { + "epoch": 0.6570048309178744, + "grad_norm": 0.21583619713783264, + "learning_rate": 8.182839227113833e-05, + "loss": 0.7056, + "step": 3196 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 0.21002855896949768, + "learning_rate": 8.18226518324756e-05, + "loss": 0.7106, + "step": 3197 + }, + { + "epoch": 0.6574159728646315, + "grad_norm": 0.20425178110599518, + "learning_rate": 8.181690957973292e-05, + "loss": 0.6785, + "step": 3198 + }, + { + "epoch": 0.6576215438380101, + "grad_norm": 0.2083713412284851, + "learning_rate": 8.181116551319319e-05, + "loss": 0.707, + "step": 3199 + }, + { + "epoch": 0.6578271148113887, + "grad_norm": 0.1998489499092102, + "learning_rate": 8.180541963313939e-05, + "loss": 0.6886, + "step": 3200 + }, + { + "epoch": 0.6580326857847671, + "grad_norm": 0.20870743691921234, + "learning_rate": 8.17996719398546e-05, + "loss": 0.6931, + "step": 3201 + }, + { + "epoch": 0.6582382567581457, + "grad_norm": 0.20594879984855652, + "learning_rate": 8.179392243362195e-05, + "loss": 0.6897, + "step": 3202 + }, + { + "epoch": 0.6584438277315243, + "grad_norm": 0.19401825964450836, + "learning_rate": 8.178817111472474e-05, + "loss": 0.6719, + "step": 3203 + }, + { + "epoch": 0.6586493987049029, + "grad_norm": 0.20549017190933228, + "learning_rate": 8.178241798344627e-05, + "loss": 0.666, + "step": 3204 + }, + { + "epoch": 0.6588549696782814, + "grad_norm": 0.1869438886642456, + "learning_rate": 8.177666304007e-05, + "loss": 0.6728, + "step": 3205 + }, + { + "epoch": 0.65906054065166, + "grad_norm": 0.19876159727573395, + "learning_rate": 8.177090628487943e-05, + "loss": 0.6646, + "step": 3206 + }, + { + "epoch": 0.6592661116250386, + "grad_norm": 0.1998775601387024, + "learning_rate": 8.176514771815818e-05, + "loss": 0.7035, + "step": 3207 + }, + { + "epoch": 0.6594716825984172, + "grad_norm": 0.19949300587177277, + "learning_rate": 8.175938734018994e-05, + "loss": 0.7035, + "step": 3208 + }, + { + "epoch": 0.6596772535717956, + "grad_norm": 0.1943056583404541, + "learning_rate": 8.175362515125849e-05, + "loss": 0.702, + "step": 3209 + }, + { + "epoch": 0.6598828245451742, + "grad_norm": 0.20226384699344635, + "learning_rate": 8.174786115164773e-05, + "loss": 0.6887, + "step": 3210 + }, + { + "epoch": 0.6600883955185528, + "grad_norm": 0.19821226596832275, + "learning_rate": 8.174209534164161e-05, + "loss": 0.7097, + "step": 3211 + }, + { + "epoch": 0.6602939664919314, + "grad_norm": 0.19110795855522156, + "learning_rate": 8.173632772152416e-05, + "loss": 0.6737, + "step": 3212 + }, + { + "epoch": 0.6604995374653099, + "grad_norm": 0.19855926930904388, + "learning_rate": 8.173055829157957e-05, + "loss": 0.6818, + "step": 3213 + }, + { + "epoch": 0.6607051084386885, + "grad_norm": 0.19995853304862976, + "learning_rate": 8.172478705209204e-05, + "loss": 0.6811, + "step": 3214 + }, + { + "epoch": 0.660910679412067, + "grad_norm": 0.22749421000480652, + "learning_rate": 8.171901400334591e-05, + "loss": 0.6004, + "step": 3215 + }, + { + "epoch": 0.6611162503854455, + "grad_norm": 0.2062731236219406, + "learning_rate": 8.171323914562559e-05, + "loss": 0.7145, + "step": 3216 + }, + { + "epoch": 0.6613218213588241, + "grad_norm": 0.20264078676700592, + "learning_rate": 8.170746247921555e-05, + "loss": 0.6664, + "step": 3217 + }, + { + "epoch": 0.6615273923322027, + "grad_norm": 0.20601505041122437, + "learning_rate": 8.170168400440044e-05, + "loss": 0.6727, + "step": 3218 + }, + { + "epoch": 0.6617329633055813, + "grad_norm": 0.22924602031707764, + "learning_rate": 8.169590372146487e-05, + "loss": 0.6836, + "step": 3219 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 0.19378581643104553, + "learning_rate": 8.169012163069366e-05, + "loss": 0.6851, + "step": 3220 + }, + { + "epoch": 0.6621441052523384, + "grad_norm": 0.20838582515716553, + "learning_rate": 8.168433773237164e-05, + "loss": 0.6856, + "step": 3221 + }, + { + "epoch": 0.6623496762257169, + "grad_norm": 0.21452072262763977, + "learning_rate": 8.167855202678377e-05, + "loss": 0.7068, + "step": 3222 + }, + { + "epoch": 0.6625552471990955, + "grad_norm": 0.2000737488269806, + "learning_rate": 8.167276451421506e-05, + "loss": 0.6874, + "step": 3223 + }, + { + "epoch": 0.662760818172474, + "grad_norm": 0.23498542606830597, + "learning_rate": 8.166697519495066e-05, + "loss": 0.5939, + "step": 3224 + }, + { + "epoch": 0.6629663891458526, + "grad_norm": 0.2128230184316635, + "learning_rate": 8.166118406927578e-05, + "loss": 0.7094, + "step": 3225 + }, + { + "epoch": 0.6631719601192312, + "grad_norm": 0.1330750733613968, + "learning_rate": 8.16553911374757e-05, + "loss": 0.6022, + "step": 3226 + }, + { + "epoch": 0.6633775310926098, + "grad_norm": 0.21321649849414825, + "learning_rate": 8.164959639983583e-05, + "loss": 0.6905, + "step": 3227 + }, + { + "epoch": 0.6635831020659883, + "grad_norm": 0.2014767974615097, + "learning_rate": 8.164379985664166e-05, + "loss": 0.685, + "step": 3228 + }, + { + "epoch": 0.6637886730393668, + "grad_norm": 0.17292124032974243, + "learning_rate": 8.163800150817872e-05, + "loss": 0.5932, + "step": 3229 + }, + { + "epoch": 0.6639942440127454, + "grad_norm": 0.20624692738056183, + "learning_rate": 8.163220135473271e-05, + "loss": 0.6831, + "step": 3230 + }, + { + "epoch": 0.6641998149861239, + "grad_norm": 0.2030026912689209, + "learning_rate": 8.162639939658935e-05, + "loss": 0.7166, + "step": 3231 + }, + { + "epoch": 0.6644053859595025, + "grad_norm": 0.19677379727363586, + "learning_rate": 8.162059563403448e-05, + "loss": 0.6646, + "step": 3232 + }, + { + "epoch": 0.6646109569328811, + "grad_norm": 0.1929975152015686, + "learning_rate": 8.161479006735404e-05, + "loss": 0.671, + "step": 3233 + }, + { + "epoch": 0.6648165279062597, + "grad_norm": 0.196861132979393, + "learning_rate": 8.1608982696834e-05, + "loss": 0.6899, + "step": 3234 + }, + { + "epoch": 0.6650220988796381, + "grad_norm": 0.19990988075733185, + "learning_rate": 8.160317352276053e-05, + "loss": 0.6889, + "step": 3235 + }, + { + "epoch": 0.6652276698530167, + "grad_norm": 0.1800822615623474, + "learning_rate": 8.159736254541976e-05, + "loss": 0.6149, + "step": 3236 + }, + { + "epoch": 0.6654332408263953, + "grad_norm": 0.1930818259716034, + "learning_rate": 8.159154976509801e-05, + "loss": 0.6756, + "step": 3237 + }, + { + "epoch": 0.6656388117997739, + "grad_norm": 0.18298830091953278, + "learning_rate": 8.158573518208162e-05, + "loss": 0.5984, + "step": 3238 + }, + { + "epoch": 0.6658443827731524, + "grad_norm": 0.19836896657943726, + "learning_rate": 8.157991879665706e-05, + "loss": 0.6869, + "step": 3239 + }, + { + "epoch": 0.666049953746531, + "grad_norm": 0.20596401393413544, + "learning_rate": 8.157410060911087e-05, + "loss": 0.6882, + "step": 3240 + }, + { + "epoch": 0.6662555247199096, + "grad_norm": 0.1683359146118164, + "learning_rate": 8.15682806197297e-05, + "loss": 0.5799, + "step": 3241 + }, + { + "epoch": 0.6664610956932882, + "grad_norm": 0.19776779413223267, + "learning_rate": 8.156245882880026e-05, + "loss": 0.6528, + "step": 3242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.1920391172170639, + "learning_rate": 8.155663523660936e-05, + "loss": 0.6982, + "step": 3243 + }, + { + "epoch": 0.6668722376400452, + "grad_norm": 0.1352914422750473, + "learning_rate": 8.155080984344391e-05, + "loss": 0.5837, + "step": 3244 + }, + { + "epoch": 0.6670778086134238, + "grad_norm": 0.2184402048587799, + "learning_rate": 8.15449826495909e-05, + "loss": 0.6784, + "step": 3245 + }, + { + "epoch": 0.6672833795868024, + "grad_norm": 0.19601434469223022, + "learning_rate": 8.15391536553374e-05, + "loss": 0.6778, + "step": 3246 + }, + { + "epoch": 0.6674889505601809, + "grad_norm": 0.19717663526535034, + "learning_rate": 8.15333228609706e-05, + "loss": 0.7024, + "step": 3247 + }, + { + "epoch": 0.6676945215335595, + "grad_norm": 0.19221165776252747, + "learning_rate": 8.152749026677773e-05, + "loss": 0.6951, + "step": 3248 + }, + { + "epoch": 0.667900092506938, + "grad_norm": 0.15361624956130981, + "learning_rate": 8.152165587304613e-05, + "loss": 0.5739, + "step": 3249 + }, + { + "epoch": 0.6681056634803165, + "grad_norm": 0.13391469419002533, + "learning_rate": 8.151581968006325e-05, + "loss": 0.5979, + "step": 3250 + }, + { + "epoch": 0.6683112344536951, + "grad_norm": 0.21153193712234497, + "learning_rate": 8.150998168811663e-05, + "loss": 0.6651, + "step": 3251 + }, + { + "epoch": 0.6685168054270737, + "grad_norm": 0.13939164578914642, + "learning_rate": 8.150414189749385e-05, + "loss": 0.5664, + "step": 3252 + }, + { + "epoch": 0.6687223764004523, + "grad_norm": 0.21254399418830872, + "learning_rate": 8.149830030848261e-05, + "loss": 0.6856, + "step": 3253 + }, + { + "epoch": 0.6689279473738308, + "grad_norm": 0.19342190027236938, + "learning_rate": 8.14924569213707e-05, + "loss": 0.6828, + "step": 3254 + }, + { + "epoch": 0.6691335183472094, + "grad_norm": 0.19527758657932281, + "learning_rate": 8.148661173644602e-05, + "loss": 0.7009, + "step": 3255 + }, + { + "epoch": 0.6693390893205879, + "grad_norm": 0.1978977620601654, + "learning_rate": 8.148076475399651e-05, + "loss": 0.7137, + "step": 3256 + }, + { + "epoch": 0.6695446602939665, + "grad_norm": 0.20413827896118164, + "learning_rate": 8.147491597431025e-05, + "loss": 0.672, + "step": 3257 + }, + { + "epoch": 0.669750231267345, + "grad_norm": 0.19834209978580475, + "learning_rate": 8.146906539767534e-05, + "loss": 0.6726, + "step": 3258 + }, + { + "epoch": 0.6699558022407236, + "grad_norm": 0.1580744832754135, + "learning_rate": 8.146321302438004e-05, + "loss": 0.5621, + "step": 3259 + }, + { + "epoch": 0.6701613732141022, + "grad_norm": 0.20448711514472961, + "learning_rate": 8.145735885471266e-05, + "loss": 0.6633, + "step": 3260 + }, + { + "epoch": 0.6703669441874808, + "grad_norm": 0.12794892489910126, + "learning_rate": 8.145150288896161e-05, + "loss": 0.5989, + "step": 3261 + }, + { + "epoch": 0.6705725151608593, + "grad_norm": 0.20495088398456573, + "learning_rate": 8.144564512741539e-05, + "loss": 0.6778, + "step": 3262 + }, + { + "epoch": 0.6707780861342378, + "grad_norm": 0.13609834015369415, + "learning_rate": 8.143978557036259e-05, + "loss": 0.5879, + "step": 3263 + }, + { + "epoch": 0.6709836571076164, + "grad_norm": 0.19716021418571472, + "learning_rate": 8.143392421809186e-05, + "loss": 0.6998, + "step": 3264 + }, + { + "epoch": 0.671189228080995, + "grad_norm": 0.19806286692619324, + "learning_rate": 8.142806107089198e-05, + "loss": 0.6884, + "step": 3265 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 0.14359678328037262, + "learning_rate": 8.14221961290518e-05, + "loss": 0.5788, + "step": 3266 + }, + { + "epoch": 0.6716003700277521, + "grad_norm": 0.19541367888450623, + "learning_rate": 8.141632939286026e-05, + "loss": 0.704, + "step": 3267 + }, + { + "epoch": 0.6718059410011307, + "grad_norm": 0.19442065060138702, + "learning_rate": 8.141046086260636e-05, + "loss": 0.6666, + "step": 3268 + }, + { + "epoch": 0.6720115119745091, + "grad_norm": 0.1996643990278244, + "learning_rate": 8.140459053857924e-05, + "loss": 0.6888, + "step": 3269 + }, + { + "epoch": 0.6722170829478877, + "grad_norm": 0.19437336921691895, + "learning_rate": 8.13987184210681e-05, + "loss": 0.7176, + "step": 3270 + }, + { + "epoch": 0.6724226539212663, + "grad_norm": 0.14562220871448517, + "learning_rate": 8.139284451036223e-05, + "loss": 0.5886, + "step": 3271 + }, + { + "epoch": 0.6726282248946449, + "grad_norm": 0.2078685313463211, + "learning_rate": 8.138696880675102e-05, + "loss": 0.6867, + "step": 3272 + }, + { + "epoch": 0.6728337958680234, + "grad_norm": 0.20113688707351685, + "learning_rate": 8.138109131052393e-05, + "loss": 0.7112, + "step": 3273 + }, + { + "epoch": 0.673039366841402, + "grad_norm": 0.19516409933567047, + "learning_rate": 8.137521202197052e-05, + "loss": 0.6735, + "step": 3274 + }, + { + "epoch": 0.6732449378147806, + "grad_norm": 0.18511922657489777, + "learning_rate": 8.136933094138042e-05, + "loss": 0.6696, + "step": 3275 + }, + { + "epoch": 0.6734505087881592, + "grad_norm": 0.18774795532226562, + "learning_rate": 8.136344806904336e-05, + "loss": 0.6739, + "step": 3276 + }, + { + "epoch": 0.6736560797615376, + "grad_norm": 0.19817449152469635, + "learning_rate": 8.135756340524919e-05, + "loss": 0.6896, + "step": 3277 + }, + { + "epoch": 0.6738616507349162, + "grad_norm": 0.19579534232616425, + "learning_rate": 8.135167695028782e-05, + "loss": 0.6669, + "step": 3278 + }, + { + "epoch": 0.6740672217082948, + "grad_norm": 0.1967802196741104, + "learning_rate": 8.13457887044492e-05, + "loss": 0.6763, + "step": 3279 + }, + { + "epoch": 0.6742727926816734, + "grad_norm": 0.1518080234527588, + "learning_rate": 8.133989866802349e-05, + "loss": 0.5755, + "step": 3280 + }, + { + "epoch": 0.6744783636550519, + "grad_norm": 0.1956729292869568, + "learning_rate": 8.13340068413008e-05, + "loss": 0.6695, + "step": 3281 + }, + { + "epoch": 0.6746839346284305, + "grad_norm": 0.20296379923820496, + "learning_rate": 8.132811322457142e-05, + "loss": 0.678, + "step": 3282 + }, + { + "epoch": 0.674889505601809, + "grad_norm": 0.19922013580799103, + "learning_rate": 8.132221781812571e-05, + "loss": 0.6898, + "step": 3283 + }, + { + "epoch": 0.6750950765751876, + "grad_norm": 0.1867515742778778, + "learning_rate": 8.13163206222541e-05, + "loss": 0.6911, + "step": 3284 + }, + { + "epoch": 0.6753006475485661, + "grad_norm": 0.20013710856437683, + "learning_rate": 8.13104216372471e-05, + "loss": 0.6878, + "step": 3285 + }, + { + "epoch": 0.6755062185219447, + "grad_norm": 0.19711051881313324, + "learning_rate": 8.130452086339535e-05, + "loss": 0.6755, + "step": 3286 + }, + { + "epoch": 0.6757117894953233, + "grad_norm": 0.22560589015483856, + "learning_rate": 8.129861830098953e-05, + "loss": 0.6961, + "step": 3287 + }, + { + "epoch": 0.6759173604687018, + "grad_norm": 0.1926925927400589, + "learning_rate": 8.129271395032046e-05, + "loss": 0.6887, + "step": 3288 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 0.19523480534553528, + "learning_rate": 8.1286807811679e-05, + "loss": 0.7129, + "step": 3289 + }, + { + "epoch": 0.6763285024154589, + "grad_norm": 0.19967713952064514, + "learning_rate": 8.128089988535613e-05, + "loss": 0.6985, + "step": 3290 + }, + { + "epoch": 0.6765340733888375, + "grad_norm": 0.1905701607465744, + "learning_rate": 8.127499017164289e-05, + "loss": 0.6839, + "step": 3291 + }, + { + "epoch": 0.676739644362216, + "grad_norm": 0.1880829632282257, + "learning_rate": 8.126907867083043e-05, + "loss": 0.6795, + "step": 3292 + }, + { + "epoch": 0.6769452153355946, + "grad_norm": 0.19849906861782074, + "learning_rate": 8.126316538320999e-05, + "loss": 0.7022, + "step": 3293 + }, + { + "epoch": 0.6771507863089732, + "grad_norm": 0.19704832136631012, + "learning_rate": 8.125725030907289e-05, + "loss": 0.6762, + "step": 3294 + }, + { + "epoch": 0.6773563572823518, + "grad_norm": 0.20323243737220764, + "learning_rate": 8.125133344871052e-05, + "loss": 0.7123, + "step": 3295 + }, + { + "epoch": 0.6775619282557303, + "grad_norm": 0.16344204545021057, + "learning_rate": 8.124541480241441e-05, + "loss": 0.5788, + "step": 3296 + }, + { + "epoch": 0.6777674992291088, + "grad_norm": 0.212424173951149, + "learning_rate": 8.123949437047611e-05, + "loss": 0.6874, + "step": 3297 + }, + { + "epoch": 0.6779730702024874, + "grad_norm": 0.2008782923221588, + "learning_rate": 8.123357215318731e-05, + "loss": 0.67, + "step": 3298 + }, + { + "epoch": 0.678178641175866, + "grad_norm": 0.20118223130702972, + "learning_rate": 8.122764815083976e-05, + "loss": 0.6802, + "step": 3299 + }, + { + "epoch": 0.6783842121492445, + "grad_norm": 0.1353181004524231, + "learning_rate": 8.122172236372533e-05, + "loss": 0.6006, + "step": 3300 + }, + { + "epoch": 0.6785897831226231, + "grad_norm": 0.19989068806171417, + "learning_rate": 8.121579479213591e-05, + "loss": 0.6934, + "step": 3301 + }, + { + "epoch": 0.6787953540960017, + "grad_norm": 0.20248281955718994, + "learning_rate": 8.120986543636357e-05, + "loss": 0.6721, + "step": 3302 + }, + { + "epoch": 0.6790009250693803, + "grad_norm": 0.19119137525558472, + "learning_rate": 8.12039342967004e-05, + "loss": 0.6735, + "step": 3303 + }, + { + "epoch": 0.6792064960427587, + "grad_norm": 0.19932256639003754, + "learning_rate": 8.119800137343861e-05, + "loss": 0.6672, + "step": 3304 + }, + { + "epoch": 0.6794120670161373, + "grad_norm": 0.19938862323760986, + "learning_rate": 8.119206666687047e-05, + "loss": 0.681, + "step": 3305 + }, + { + "epoch": 0.6796176379895159, + "grad_norm": 0.20113952457904816, + "learning_rate": 8.118613017728839e-05, + "loss": 0.6699, + "step": 3306 + }, + { + "epoch": 0.6798232089628944, + "grad_norm": 0.19112683832645416, + "learning_rate": 8.118019190498477e-05, + "loss": 0.7142, + "step": 3307 + }, + { + "epoch": 0.680028779936273, + "grad_norm": 0.19518610835075378, + "learning_rate": 8.117425185025225e-05, + "loss": 0.6599, + "step": 3308 + }, + { + "epoch": 0.6802343509096516, + "grad_norm": 0.20748484134674072, + "learning_rate": 8.116831001338338e-05, + "loss": 0.6737, + "step": 3309 + }, + { + "epoch": 0.6804399218830302, + "grad_norm": 0.19534945487976074, + "learning_rate": 8.116236639467094e-05, + "loss": 0.6724, + "step": 3310 + }, + { + "epoch": 0.6806454928564086, + "grad_norm": 0.1551889032125473, + "learning_rate": 8.115642099440773e-05, + "loss": 0.5907, + "step": 3311 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.223983034491539, + "learning_rate": 8.115047381288667e-05, + "loss": 0.6984, + "step": 3312 + }, + { + "epoch": 0.6810566348031658, + "grad_norm": 0.2107374668121338, + "learning_rate": 8.11445248504007e-05, + "loss": 0.6801, + "step": 3313 + }, + { + "epoch": 0.6812622057765444, + "grad_norm": 0.2035159170627594, + "learning_rate": 8.113857410724294e-05, + "loss": 0.6509, + "step": 3314 + }, + { + "epoch": 0.6814677767499229, + "grad_norm": 0.1422436386346817, + "learning_rate": 8.113262158370655e-05, + "loss": 0.6071, + "step": 3315 + }, + { + "epoch": 0.6816733477233015, + "grad_norm": 0.20899644494056702, + "learning_rate": 8.11266672800848e-05, + "loss": 0.6571, + "step": 3316 + }, + { + "epoch": 0.68187891869668, + "grad_norm": 0.19945669174194336, + "learning_rate": 8.112071119667098e-05, + "loss": 0.7201, + "step": 3317 + }, + { + "epoch": 0.6820844896700586, + "grad_norm": 0.21106722950935364, + "learning_rate": 8.111475333375854e-05, + "loss": 0.6759, + "step": 3318 + }, + { + "epoch": 0.6822900606434371, + "grad_norm": 0.2076927125453949, + "learning_rate": 8.110879369164101e-05, + "loss": 0.6832, + "step": 3319 + }, + { + "epoch": 0.6824956316168157, + "grad_norm": 0.20357108116149902, + "learning_rate": 8.1102832270612e-05, + "loss": 0.6636, + "step": 3320 + }, + { + "epoch": 0.6827012025901943, + "grad_norm": 0.1578240841627121, + "learning_rate": 8.109686907096517e-05, + "loss": 0.6158, + "step": 3321 + }, + { + "epoch": 0.6829067735635729, + "grad_norm": 0.20219643414020538, + "learning_rate": 8.109090409299434e-05, + "loss": 0.6839, + "step": 3322 + }, + { + "epoch": 0.6831123445369514, + "grad_norm": 0.2029838114976883, + "learning_rate": 8.108493733699335e-05, + "loss": 0.6963, + "step": 3323 + }, + { + "epoch": 0.6833179155103299, + "grad_norm": 0.19904999434947968, + "learning_rate": 8.107896880325615e-05, + "loss": 0.6648, + "step": 3324 + }, + { + "epoch": 0.6835234864837085, + "grad_norm": 0.2000379115343094, + "learning_rate": 8.10729984920768e-05, + "loss": 0.6706, + "step": 3325 + }, + { + "epoch": 0.683729057457087, + "grad_norm": 0.19663308560848236, + "learning_rate": 8.106702640374939e-05, + "loss": 0.6798, + "step": 3326 + }, + { + "epoch": 0.6839346284304656, + "grad_norm": 0.2028771936893463, + "learning_rate": 8.10610525385682e-05, + "loss": 0.6919, + "step": 3327 + }, + { + "epoch": 0.6841401994038442, + "grad_norm": 0.19258631765842438, + "learning_rate": 8.105507689682748e-05, + "loss": 0.653, + "step": 3328 + }, + { + "epoch": 0.6843457703772228, + "grad_norm": 0.14250509440898895, + "learning_rate": 8.104909947882165e-05, + "loss": 0.5786, + "step": 3329 + }, + { + "epoch": 0.6845513413506013, + "grad_norm": 0.2034870833158493, + "learning_rate": 8.104312028484517e-05, + "loss": 0.6705, + "step": 3330 + }, + { + "epoch": 0.6847569123239798, + "grad_norm": 0.19610241055488586, + "learning_rate": 8.103713931519263e-05, + "loss": 0.7, + "step": 3331 + }, + { + "epoch": 0.6849624832973584, + "grad_norm": 0.14964817464351654, + "learning_rate": 8.103115657015868e-05, + "loss": 0.5914, + "step": 3332 + }, + { + "epoch": 0.685168054270737, + "grad_norm": 0.20991382002830505, + "learning_rate": 8.102517205003804e-05, + "loss": 0.6841, + "step": 3333 + }, + { + "epoch": 0.6853736252441155, + "grad_norm": 0.20073123276233673, + "learning_rate": 8.101918575512556e-05, + "loss": 0.6919, + "step": 3334 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 0.21147504448890686, + "learning_rate": 8.101319768571616e-05, + "loss": 0.6585, + "step": 3335 + }, + { + "epoch": 0.6857847671908727, + "grad_norm": 0.20476599037647247, + "learning_rate": 8.100720784210482e-05, + "loss": 0.7009, + "step": 3336 + }, + { + "epoch": 0.6859903381642513, + "grad_norm": 0.20010556280612946, + "learning_rate": 8.100121622458666e-05, + "loss": 0.6734, + "step": 3337 + }, + { + "epoch": 0.6861959091376297, + "grad_norm": 0.1875293105840683, + "learning_rate": 8.099522283345683e-05, + "loss": 0.6779, + "step": 3338 + }, + { + "epoch": 0.6864014801110083, + "grad_norm": 0.20071950554847717, + "learning_rate": 8.098922766901063e-05, + "loss": 0.6709, + "step": 3339 + }, + { + "epoch": 0.6866070510843869, + "grad_norm": 0.19928574562072754, + "learning_rate": 8.098323073154338e-05, + "loss": 0.7085, + "step": 3340 + }, + { + "epoch": 0.6868126220577655, + "grad_norm": 0.19401361048221588, + "learning_rate": 8.097723202135054e-05, + "loss": 0.6872, + "step": 3341 + }, + { + "epoch": 0.687018193031144, + "grad_norm": 0.19485783576965332, + "learning_rate": 8.097123153872765e-05, + "loss": 0.6864, + "step": 3342 + }, + { + "epoch": 0.6872237640045226, + "grad_norm": 0.1916022002696991, + "learning_rate": 8.09652292839703e-05, + "loss": 0.7022, + "step": 3343 + }, + { + "epoch": 0.6874293349779012, + "grad_norm": 0.1911773532629013, + "learning_rate": 8.09592252573742e-05, + "loss": 0.708, + "step": 3344 + }, + { + "epoch": 0.6876349059512796, + "grad_norm": 0.19738483428955078, + "learning_rate": 8.095321945923515e-05, + "loss": 0.7014, + "step": 3345 + }, + { + "epoch": 0.6878404769246582, + "grad_norm": 0.16668002307415009, + "learning_rate": 8.094721188984903e-05, + "loss": 0.6045, + "step": 3346 + }, + { + "epoch": 0.6880460478980368, + "grad_norm": 0.20171229541301727, + "learning_rate": 8.094120254951179e-05, + "loss": 0.6919, + "step": 3347 + }, + { + "epoch": 0.6882516188714154, + "grad_norm": 0.19809181988239288, + "learning_rate": 8.093519143851949e-05, + "loss": 0.6767, + "step": 3348 + }, + { + "epoch": 0.6884571898447939, + "grad_norm": 0.19745509326457977, + "learning_rate": 8.092917855716826e-05, + "loss": 0.6738, + "step": 3349 + }, + { + "epoch": 0.6886627608181725, + "grad_norm": 0.19986550509929657, + "learning_rate": 8.092316390575435e-05, + "loss": 0.7112, + "step": 3350 + }, + { + "epoch": 0.688868331791551, + "grad_norm": 0.19324201345443726, + "learning_rate": 8.091714748457404e-05, + "loss": 0.6906, + "step": 3351 + }, + { + "epoch": 0.6890739027649296, + "grad_norm": 0.20095904171466827, + "learning_rate": 8.091112929392376e-05, + "loss": 0.6486, + "step": 3352 + }, + { + "epoch": 0.6892794737383081, + "grad_norm": 0.1877359300851822, + "learning_rate": 8.09051093341e-05, + "loss": 0.6844, + "step": 3353 + }, + { + "epoch": 0.6894850447116867, + "grad_norm": 0.19812311232089996, + "learning_rate": 8.08990876053993e-05, + "loss": 0.6795, + "step": 3354 + }, + { + "epoch": 0.6896906156850653, + "grad_norm": 0.19134752452373505, + "learning_rate": 8.089306410811836e-05, + "loss": 0.703, + "step": 3355 + }, + { + "epoch": 0.6898961866584439, + "grad_norm": 0.1890835165977478, + "learning_rate": 8.088703884255393e-05, + "loss": 0.6585, + "step": 3356 + }, + { + "epoch": 0.6901017576318224, + "grad_norm": 0.18926945328712463, + "learning_rate": 8.088101180900282e-05, + "loss": 0.6694, + "step": 3357 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 0.18181371688842773, + "learning_rate": 8.087498300776194e-05, + "loss": 0.5831, + "step": 3358 + }, + { + "epoch": 0.6905128995785795, + "grad_norm": 0.1939140260219574, + "learning_rate": 8.086895243912835e-05, + "loss": 0.6658, + "step": 3359 + }, + { + "epoch": 0.690718470551958, + "grad_norm": 0.13031508028507233, + "learning_rate": 8.086292010339912e-05, + "loss": 0.6073, + "step": 3360 + }, + { + "epoch": 0.6909240415253366, + "grad_norm": 0.1984340101480484, + "learning_rate": 8.085688600087144e-05, + "loss": 0.6565, + "step": 3361 + }, + { + "epoch": 0.6911296124987152, + "grad_norm": 0.20224301517009735, + "learning_rate": 8.08508501318426e-05, + "loss": 0.7191, + "step": 3362 + }, + { + "epoch": 0.6913351834720938, + "grad_norm": 0.18884535133838654, + "learning_rate": 8.084481249660991e-05, + "loss": 0.7012, + "step": 3363 + }, + { + "epoch": 0.6915407544454722, + "grad_norm": 0.1905461698770523, + "learning_rate": 8.083877309547086e-05, + "loss": 0.6861, + "step": 3364 + }, + { + "epoch": 0.6917463254188508, + "grad_norm": 0.19112585484981537, + "learning_rate": 8.083273192872297e-05, + "loss": 0.6698, + "step": 3365 + }, + { + "epoch": 0.6919518963922294, + "grad_norm": 0.19276300072669983, + "learning_rate": 8.082668899666386e-05, + "loss": 0.6939, + "step": 3366 + }, + { + "epoch": 0.692157467365608, + "grad_norm": 0.1849944144487381, + "learning_rate": 8.082064429959123e-05, + "loss": 0.6653, + "step": 3367 + }, + { + "epoch": 0.6923630383389865, + "grad_norm": 0.197621151804924, + "learning_rate": 8.081459783780288e-05, + "loss": 0.69, + "step": 3368 + }, + { + "epoch": 0.6925686093123651, + "grad_norm": 0.20411409437656403, + "learning_rate": 8.08085496115967e-05, + "loss": 0.6928, + "step": 3369 + }, + { + "epoch": 0.6927741802857437, + "grad_norm": 0.19879065454006195, + "learning_rate": 8.080249962127064e-05, + "loss": 0.6855, + "step": 3370 + }, + { + "epoch": 0.6929797512591223, + "grad_norm": 0.19563095271587372, + "learning_rate": 8.079644786712277e-05, + "loss": 0.6692, + "step": 3371 + }, + { + "epoch": 0.6931853222325007, + "grad_norm": 0.1997094601392746, + "learning_rate": 8.079039434945124e-05, + "loss": 0.6851, + "step": 3372 + }, + { + "epoch": 0.6933908932058793, + "grad_norm": 0.19280613958835602, + "learning_rate": 8.078433906855424e-05, + "loss": 0.6731, + "step": 3373 + }, + { + "epoch": 0.6935964641792579, + "grad_norm": 0.18386954069137573, + "learning_rate": 8.077828202473013e-05, + "loss": 0.6934, + "step": 3374 + }, + { + "epoch": 0.6938020351526365, + "grad_norm": 0.20323842763900757, + "learning_rate": 8.077222321827727e-05, + "loss": 0.6856, + "step": 3375 + }, + { + "epoch": 0.694007606126015, + "grad_norm": 0.1947094351053238, + "learning_rate": 8.076616264949418e-05, + "loss": 0.6884, + "step": 3376 + }, + { + "epoch": 0.6942131770993936, + "grad_norm": 0.19289527833461761, + "learning_rate": 8.076010031867944e-05, + "loss": 0.589, + "step": 3377 + }, + { + "epoch": 0.6944187480727722, + "grad_norm": 0.19861692190170288, + "learning_rate": 8.075403622613168e-05, + "loss": 0.7024, + "step": 3378 + }, + { + "epoch": 0.6946243190461506, + "grad_norm": 0.21449032425880432, + "learning_rate": 8.074797037214968e-05, + "loss": 0.7021, + "step": 3379 + }, + { + "epoch": 0.6948298900195292, + "grad_norm": 0.1875978410243988, + "learning_rate": 8.074190275703227e-05, + "loss": 0.6898, + "step": 3380 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 0.15483641624450684, + "learning_rate": 8.073583338107837e-05, + "loss": 0.5851, + "step": 3381 + }, + { + "epoch": 0.6952410319662864, + "grad_norm": 0.19564680755138397, + "learning_rate": 8.072976224458697e-05, + "loss": 0.6792, + "step": 3382 + }, + { + "epoch": 0.6954466029396649, + "grad_norm": 0.20344282686710358, + "learning_rate": 8.072368934785719e-05, + "loss": 0.6869, + "step": 3383 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.19657017290592194, + "learning_rate": 8.071761469118822e-05, + "loss": 0.6595, + "step": 3384 + }, + { + "epoch": 0.695857744886422, + "grad_norm": 0.19356437027454376, + "learning_rate": 8.071153827487931e-05, + "loss": 0.6804, + "step": 3385 + }, + { + "epoch": 0.6960633158598006, + "grad_norm": 0.19667509198188782, + "learning_rate": 8.070546009922981e-05, + "loss": 0.7075, + "step": 3386 + }, + { + "epoch": 0.6962688868331791, + "grad_norm": 0.18919992446899414, + "learning_rate": 8.06993801645392e-05, + "loss": 0.6778, + "step": 3387 + }, + { + "epoch": 0.6964744578065577, + "grad_norm": 0.15784306824207306, + "learning_rate": 8.0693298471107e-05, + "loss": 0.5685, + "step": 3388 + }, + { + "epoch": 0.6966800287799363, + "grad_norm": 0.20536069571971893, + "learning_rate": 8.068721501923279e-05, + "loss": 0.6465, + "step": 3389 + }, + { + "epoch": 0.6968855997533149, + "grad_norm": 0.1936463564634323, + "learning_rate": 8.06811298092163e-05, + "loss": 0.6918, + "step": 3390 + }, + { + "epoch": 0.6970911707266934, + "grad_norm": 0.19561581313610077, + "learning_rate": 8.067504284135732e-05, + "loss": 0.673, + "step": 3391 + }, + { + "epoch": 0.6972967417000719, + "grad_norm": 0.198947474360466, + "learning_rate": 8.066895411595572e-05, + "loss": 0.6773, + "step": 3392 + }, + { + "epoch": 0.6975023126734505, + "grad_norm": 0.19654102623462677, + "learning_rate": 8.066286363331147e-05, + "loss": 0.6467, + "step": 3393 + }, + { + "epoch": 0.6977078836468291, + "grad_norm": 0.1938384771347046, + "learning_rate": 8.065677139372462e-05, + "loss": 0.6993, + "step": 3394 + }, + { + "epoch": 0.6979134546202076, + "grad_norm": 0.1924823522567749, + "learning_rate": 8.06506773974953e-05, + "loss": 0.6672, + "step": 3395 + }, + { + "epoch": 0.6981190255935862, + "grad_norm": 0.19648601114749908, + "learning_rate": 8.064458164492372e-05, + "loss": 0.6478, + "step": 3396 + }, + { + "epoch": 0.6983245965669648, + "grad_norm": 0.1876935362815857, + "learning_rate": 8.063848413631023e-05, + "loss": 0.6704, + "step": 3397 + }, + { + "epoch": 0.6985301675403432, + "grad_norm": 0.19049161672592163, + "learning_rate": 8.06323848719552e-05, + "loss": 0.6582, + "step": 3398 + }, + { + "epoch": 0.6987357385137218, + "grad_norm": 0.19286733865737915, + "learning_rate": 8.06262838521591e-05, + "loss": 0.7147, + "step": 3399 + }, + { + "epoch": 0.6989413094871004, + "grad_norm": 0.19397635757923126, + "learning_rate": 8.062018107722252e-05, + "loss": 0.6801, + "step": 3400 + }, + { + "epoch": 0.699146880460479, + "grad_norm": 0.20421355962753296, + "learning_rate": 8.06140765474461e-05, + "loss": 0.6723, + "step": 3401 + }, + { + "epoch": 0.6993524514338575, + "grad_norm": 0.1797918975353241, + "learning_rate": 8.060797026313059e-05, + "loss": 0.5854, + "step": 3402 + }, + { + "epoch": 0.6995580224072361, + "grad_norm": 0.19936294853687286, + "learning_rate": 8.060186222457682e-05, + "loss": 0.6819, + "step": 3403 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 0.19907638430595398, + "learning_rate": 8.05957524320857e-05, + "loss": 0.6739, + "step": 3404 + }, + { + "epoch": 0.6999691643539933, + "grad_norm": 0.20160700380802155, + "learning_rate": 8.058964088595822e-05, + "loss": 0.6694, + "step": 3405 + }, + { + "epoch": 0.7001747353273717, + "grad_norm": 0.19310222566127777, + "learning_rate": 8.05835275864955e-05, + "loss": 0.6806, + "step": 3406 + }, + { + "epoch": 0.7003803063007503, + "grad_norm": 0.1963704526424408, + "learning_rate": 8.057741253399866e-05, + "loss": 0.6816, + "step": 3407 + }, + { + "epoch": 0.7005858772741289, + "grad_norm": 0.5723682641983032, + "learning_rate": 8.057129572876903e-05, + "loss": 0.6971, + "step": 3408 + }, + { + "epoch": 0.7007914482475075, + "grad_norm": 0.1899087131023407, + "learning_rate": 8.05651771711079e-05, + "loss": 0.6834, + "step": 3409 + }, + { + "epoch": 0.700997019220886, + "grad_norm": 0.1957729011774063, + "learning_rate": 8.055905686131672e-05, + "loss": 0.7188, + "step": 3410 + }, + { + "epoch": 0.7012025901942646, + "grad_norm": 0.19298696517944336, + "learning_rate": 8.055293479969702e-05, + "loss": 0.6694, + "step": 3411 + }, + { + "epoch": 0.7014081611676432, + "grad_norm": 0.1891012340784073, + "learning_rate": 8.05468109865504e-05, + "loss": 0.6817, + "step": 3412 + }, + { + "epoch": 0.7016137321410217, + "grad_norm": 0.19800642132759094, + "learning_rate": 8.054068542217854e-05, + "loss": 0.6592, + "step": 3413 + }, + { + "epoch": 0.7018193031144002, + "grad_norm": 0.18479777872562408, + "learning_rate": 8.053455810688322e-05, + "loss": 0.6702, + "step": 3414 + }, + { + "epoch": 0.7020248740877788, + "grad_norm": 0.20111770927906036, + "learning_rate": 8.052842904096631e-05, + "loss": 0.7025, + "step": 3415 + }, + { + "epoch": 0.7022304450611574, + "grad_norm": 0.19288669526576996, + "learning_rate": 8.052229822472977e-05, + "loss": 0.6858, + "step": 3416 + }, + { + "epoch": 0.7024360160345359, + "grad_norm": 0.2072620391845703, + "learning_rate": 8.051616565847562e-05, + "loss": 0.6998, + "step": 3417 + }, + { + "epoch": 0.7026415870079145, + "grad_norm": 0.1882101595401764, + "learning_rate": 8.051003134250601e-05, + "loss": 0.6669, + "step": 3418 + }, + { + "epoch": 0.702847157981293, + "grad_norm": 0.2227669060230255, + "learning_rate": 8.050389527712312e-05, + "loss": 0.6115, + "step": 3419 + }, + { + "epoch": 0.7030527289546716, + "grad_norm": 0.1958729773759842, + "learning_rate": 8.049775746262924e-05, + "loss": 0.7012, + "step": 3420 + }, + { + "epoch": 0.7032582999280501, + "grad_norm": 0.14937171339988708, + "learning_rate": 8.049161789932677e-05, + "loss": 0.6124, + "step": 3421 + }, + { + "epoch": 0.7034638709014287, + "grad_norm": 0.16276027262210846, + "learning_rate": 8.048547658751817e-05, + "loss": 0.5928, + "step": 3422 + }, + { + "epoch": 0.7036694418748073, + "grad_norm": 0.15098173916339874, + "learning_rate": 8.047933352750601e-05, + "loss": 0.6122, + "step": 3423 + }, + { + "epoch": 0.7038750128481859, + "grad_norm": 0.20423725247383118, + "learning_rate": 8.047318871959292e-05, + "loss": 0.6988, + "step": 3424 + }, + { + "epoch": 0.7040805838215644, + "grad_norm": 0.19810713827610016, + "learning_rate": 8.046704216408161e-05, + "loss": 0.6585, + "step": 3425 + }, + { + "epoch": 0.7042861547949429, + "grad_norm": 0.21174119412899017, + "learning_rate": 8.046089386127491e-05, + "loss": 0.5926, + "step": 3426 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 0.18921788036823273, + "learning_rate": 8.045474381147572e-05, + "loss": 0.663, + "step": 3427 + }, + { + "epoch": 0.7046972967417001, + "grad_norm": 0.21867318451404572, + "learning_rate": 8.044859201498701e-05, + "loss": 0.6619, + "step": 3428 + }, + { + "epoch": 0.7049028677150786, + "grad_norm": 0.18937045335769653, + "learning_rate": 8.044243847211186e-05, + "loss": 0.6972, + "step": 3429 + }, + { + "epoch": 0.7051084386884572, + "grad_norm": 0.20421583950519562, + "learning_rate": 8.043628318315343e-05, + "loss": 0.6855, + "step": 3430 + }, + { + "epoch": 0.7053140096618358, + "grad_norm": 0.20946352183818817, + "learning_rate": 8.043012614841493e-05, + "loss": 0.5986, + "step": 3431 + }, + { + "epoch": 0.7055195806352144, + "grad_norm": 0.21439684927463531, + "learning_rate": 8.042396736819974e-05, + "loss": 0.6642, + "step": 3432 + }, + { + "epoch": 0.7057251516085928, + "grad_norm": 0.1428326517343521, + "learning_rate": 8.041780684281124e-05, + "loss": 0.5734, + "step": 3433 + }, + { + "epoch": 0.7059307225819714, + "grad_norm": 0.21994005143642426, + "learning_rate": 8.041164457255295e-05, + "loss": 0.6916, + "step": 3434 + }, + { + "epoch": 0.70613629355535, + "grad_norm": 0.19378912448883057, + "learning_rate": 8.040548055772843e-05, + "loss": 0.6845, + "step": 3435 + }, + { + "epoch": 0.7063418645287285, + "grad_norm": 0.14617706835269928, + "learning_rate": 8.039931479864138e-05, + "loss": 0.5823, + "step": 3436 + }, + { + "epoch": 0.7065474355021071, + "grad_norm": 0.2063405066728592, + "learning_rate": 8.039314729559553e-05, + "loss": 0.7163, + "step": 3437 + }, + { + "epoch": 0.7067530064754857, + "grad_norm": 0.20391802489757538, + "learning_rate": 8.038697804889476e-05, + "loss": 0.6825, + "step": 3438 + }, + { + "epoch": 0.7069585774488643, + "grad_norm": 0.1884995549917221, + "learning_rate": 8.038080705884297e-05, + "loss": 0.7005, + "step": 3439 + }, + { + "epoch": 0.7071641484222427, + "grad_norm": 0.15203148126602173, + "learning_rate": 8.03746343257442e-05, + "loss": 0.5766, + "step": 3440 + }, + { + "epoch": 0.7073697193956213, + "grad_norm": 0.1965416520833969, + "learning_rate": 8.036845984990251e-05, + "loss": 0.6746, + "step": 3441 + }, + { + "epoch": 0.7075752903689999, + "grad_norm": 0.19438838958740234, + "learning_rate": 8.036228363162214e-05, + "loss": 0.68, + "step": 3442 + }, + { + "epoch": 0.7077808613423785, + "grad_norm": 0.19313882291316986, + "learning_rate": 8.035610567120731e-05, + "loss": 0.6638, + "step": 3443 + }, + { + "epoch": 0.707986432315757, + "grad_norm": 0.19299215078353882, + "learning_rate": 8.034992596896244e-05, + "loss": 0.6862, + "step": 3444 + }, + { + "epoch": 0.7081920032891356, + "grad_norm": 0.20329324901103973, + "learning_rate": 8.034374452519193e-05, + "loss": 0.6824, + "step": 3445 + }, + { + "epoch": 0.7083975742625142, + "grad_norm": 0.18780893087387085, + "learning_rate": 8.033756134020032e-05, + "loss": 0.662, + "step": 3446 + }, + { + "epoch": 0.7086031452358927, + "grad_norm": 0.19197134673595428, + "learning_rate": 8.033137641429223e-05, + "loss": 0.6791, + "step": 3447 + }, + { + "epoch": 0.7088087162092712, + "grad_norm": 0.19330036640167236, + "learning_rate": 8.032518974777236e-05, + "loss": 0.6907, + "step": 3448 + }, + { + "epoch": 0.7090142871826498, + "grad_norm": 0.19305558502674103, + "learning_rate": 8.03190013409455e-05, + "loss": 0.6755, + "step": 3449 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 0.17885883152484894, + "learning_rate": 8.031281119411653e-05, + "loss": 0.6032, + "step": 3450 + }, + { + "epoch": 0.709425429129407, + "grad_norm": 0.19554337859153748, + "learning_rate": 8.030661930759041e-05, + "loss": 0.6943, + "step": 3451 + }, + { + "epoch": 0.7096310001027855, + "grad_norm": 0.19464746117591858, + "learning_rate": 8.030042568167216e-05, + "loss": 0.6655, + "step": 3452 + }, + { + "epoch": 0.709836571076164, + "grad_norm": 0.19761775434017181, + "learning_rate": 8.029423031666694e-05, + "loss": 0.6915, + "step": 3453 + }, + { + "epoch": 0.7100421420495426, + "grad_norm": 0.20174358785152435, + "learning_rate": 8.028803321287997e-05, + "loss": 0.6715, + "step": 3454 + }, + { + "epoch": 0.7102477130229211, + "grad_norm": 0.19728273153305054, + "learning_rate": 8.028183437061653e-05, + "loss": 0.7062, + "step": 3455 + }, + { + "epoch": 0.7104532839962997, + "grad_norm": 0.1927875429391861, + "learning_rate": 8.027563379018202e-05, + "loss": 0.6685, + "step": 3456 + }, + { + "epoch": 0.7106588549696783, + "grad_norm": 0.16123135387897491, + "learning_rate": 8.02694314718819e-05, + "loss": 0.5778, + "step": 3457 + }, + { + "epoch": 0.7108644259430569, + "grad_norm": 0.1330617517232895, + "learning_rate": 8.026322741602176e-05, + "loss": 0.5941, + "step": 3458 + }, + { + "epoch": 0.7110699969164354, + "grad_norm": 0.24413903057575226, + "learning_rate": 8.025702162290721e-05, + "loss": 0.6845, + "step": 3459 + }, + { + "epoch": 0.7112755678898139, + "grad_norm": 0.21330687403678894, + "learning_rate": 8.0250814092844e-05, + "loss": 0.6724, + "step": 3460 + }, + { + "epoch": 0.7114811388631925, + "grad_norm": 0.21365886926651, + "learning_rate": 8.024460482613793e-05, + "loss": 0.6668, + "step": 3461 + }, + { + "epoch": 0.7116867098365711, + "grad_norm": 0.2229931354522705, + "learning_rate": 8.023839382309493e-05, + "loss": 0.6628, + "step": 3462 + }, + { + "epoch": 0.7118922808099496, + "grad_norm": 0.21787157654762268, + "learning_rate": 8.023218108402096e-05, + "loss": 0.6776, + "step": 3463 + }, + { + "epoch": 0.7120978517833282, + "grad_norm": 0.19112589955329895, + "learning_rate": 8.022596660922212e-05, + "loss": 0.5856, + "step": 3464 + }, + { + "epoch": 0.7123034227567068, + "grad_norm": 0.20584847033023834, + "learning_rate": 8.021975039900453e-05, + "loss": 0.6659, + "step": 3465 + }, + { + "epoch": 0.7125089937300854, + "grad_norm": 0.13937044143676758, + "learning_rate": 8.021353245367445e-05, + "loss": 0.581, + "step": 3466 + }, + { + "epoch": 0.7127145647034638, + "grad_norm": 0.21949850022792816, + "learning_rate": 8.020731277353824e-05, + "loss": 0.6818, + "step": 3467 + }, + { + "epoch": 0.7129201356768424, + "grad_norm": 0.19672751426696777, + "learning_rate": 8.020109135890227e-05, + "loss": 0.6788, + "step": 3468 + }, + { + "epoch": 0.713125706650221, + "grad_norm": 0.18057693541049957, + "learning_rate": 8.019486821007307e-05, + "loss": 0.5962, + "step": 3469 + }, + { + "epoch": 0.7133312776235996, + "grad_norm": 0.20432183146476746, + "learning_rate": 8.01886433273572e-05, + "loss": 0.6854, + "step": 3470 + }, + { + "epoch": 0.7135368485969781, + "grad_norm": 0.20442970097064972, + "learning_rate": 8.018241671106135e-05, + "loss": 0.6755, + "step": 3471 + }, + { + "epoch": 0.7137424195703567, + "grad_norm": 0.1377362608909607, + "learning_rate": 8.017618836149227e-05, + "loss": 0.5924, + "step": 3472 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 0.20388440787792206, + "learning_rate": 8.01699582789568e-05, + "loss": 0.6946, + "step": 3473 + }, + { + "epoch": 0.7141535615171137, + "grad_norm": 0.2007599174976349, + "learning_rate": 8.016372646376188e-05, + "loss": 0.6916, + "step": 3474 + }, + { + "epoch": 0.7143591324904923, + "grad_norm": 0.1868349313735962, + "learning_rate": 8.015749291621449e-05, + "loss": 0.6758, + "step": 3475 + }, + { + "epoch": 0.7145647034638709, + "grad_norm": 0.20039929449558258, + "learning_rate": 8.015125763662177e-05, + "loss": 0.6769, + "step": 3476 + }, + { + "epoch": 0.7147702744372495, + "grad_norm": 0.1937168687582016, + "learning_rate": 8.014502062529089e-05, + "loss": 0.6572, + "step": 3477 + }, + { + "epoch": 0.714975845410628, + "grad_norm": 0.16396324336528778, + "learning_rate": 8.013878188252908e-05, + "loss": 0.5781, + "step": 3478 + }, + { + "epoch": 0.7151814163840066, + "grad_norm": 0.19520901143550873, + "learning_rate": 8.013254140864376e-05, + "loss": 0.7001, + "step": 3479 + }, + { + "epoch": 0.7153869873573852, + "grad_norm": 0.1290317177772522, + "learning_rate": 8.012629920394231e-05, + "loss": 0.5826, + "step": 3480 + }, + { + "epoch": 0.7155925583307637, + "grad_norm": 0.20711787045001984, + "learning_rate": 8.012005526873228e-05, + "loss": 0.7025, + "step": 3481 + }, + { + "epoch": 0.7157981293041422, + "grad_norm": 0.20414526760578156, + "learning_rate": 8.011380960332128e-05, + "loss": 0.6697, + "step": 3482 + }, + { + "epoch": 0.7160037002775208, + "grad_norm": 0.19431988894939423, + "learning_rate": 8.010756220801702e-05, + "loss": 0.6705, + "step": 3483 + }, + { + "epoch": 0.7162092712508994, + "grad_norm": 0.1636938601732254, + "learning_rate": 8.010131308312725e-05, + "loss": 0.5727, + "step": 3484 + }, + { + "epoch": 0.716414842224278, + "grad_norm": 0.19284431636333466, + "learning_rate": 8.009506222895984e-05, + "loss": 0.6772, + "step": 3485 + }, + { + "epoch": 0.7166204131976565, + "grad_norm": 0.19347639381885529, + "learning_rate": 8.008880964582275e-05, + "loss": 0.6934, + "step": 3486 + }, + { + "epoch": 0.716825984171035, + "grad_norm": 0.12324893474578857, + "learning_rate": 8.008255533402403e-05, + "loss": 0.5841, + "step": 3487 + }, + { + "epoch": 0.7170315551444136, + "grad_norm": 0.12979742884635925, + "learning_rate": 8.007629929387176e-05, + "loss": 0.5726, + "step": 3488 + }, + { + "epoch": 0.7172371261177922, + "grad_norm": 0.19342902302742004, + "learning_rate": 8.007004152567417e-05, + "loss": 0.6887, + "step": 3489 + }, + { + "epoch": 0.7174426970911707, + "grad_norm": 0.13253627717494965, + "learning_rate": 8.006378202973959e-05, + "loss": 0.5835, + "step": 3490 + }, + { + "epoch": 0.7176482680645493, + "grad_norm": 0.2006087452173233, + "learning_rate": 8.005752080637632e-05, + "loss": 0.6998, + "step": 3491 + }, + { + "epoch": 0.7178538390379279, + "grad_norm": 0.12888813018798828, + "learning_rate": 8.005125785589286e-05, + "loss": 0.595, + "step": 3492 + }, + { + "epoch": 0.7180594100113064, + "grad_norm": 0.1942748874425888, + "learning_rate": 8.004499317859776e-05, + "loss": 0.683, + "step": 3493 + }, + { + "epoch": 0.7182649809846849, + "grad_norm": 0.18737460672855377, + "learning_rate": 8.003872677479965e-05, + "loss": 0.6861, + "step": 3494 + }, + { + "epoch": 0.7184705519580635, + "grad_norm": 0.24117667973041534, + "learning_rate": 8.003245864480724e-05, + "loss": 0.6826, + "step": 3495 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 0.19393832981586456, + "learning_rate": 8.002618878892934e-05, + "loss": 0.6682, + "step": 3496 + }, + { + "epoch": 0.7188816939048206, + "grad_norm": 0.19202245771884918, + "learning_rate": 8.001991720747481e-05, + "loss": 0.683, + "step": 3497 + }, + { + "epoch": 0.7190872648781992, + "grad_norm": 0.18830347061157227, + "learning_rate": 8.001364390075266e-05, + "loss": 0.6762, + "step": 3498 + }, + { + "epoch": 0.7192928358515778, + "grad_norm": 0.18478117883205414, + "learning_rate": 8.000736886907193e-05, + "loss": 0.673, + "step": 3499 + }, + { + "epoch": 0.7194984068249564, + "grad_norm": 0.19119176268577576, + "learning_rate": 8.000109211274176e-05, + "loss": 0.6683, + "step": 3500 + }, + { + "epoch": 0.7197039777983348, + "grad_norm": 0.18504808843135834, + "learning_rate": 7.999481363207136e-05, + "loss": 0.6671, + "step": 3501 + }, + { + "epoch": 0.7199095487717134, + "grad_norm": 0.18554535508155823, + "learning_rate": 7.998853342737007e-05, + "loss": 0.6531, + "step": 3502 + }, + { + "epoch": 0.720115119745092, + "grad_norm": 0.20063155889511108, + "learning_rate": 7.998225149894729e-05, + "loss": 0.6826, + "step": 3503 + }, + { + "epoch": 0.7203206907184706, + "grad_norm": 0.18054603040218353, + "learning_rate": 7.997596784711245e-05, + "loss": 0.6657, + "step": 3504 + }, + { + "epoch": 0.7205262616918491, + "grad_norm": 0.19543704390525818, + "learning_rate": 7.996968247217517e-05, + "loss": 0.7077, + "step": 3505 + }, + { + "epoch": 0.7207318326652277, + "grad_norm": 0.196107417345047, + "learning_rate": 7.996339537444508e-05, + "loss": 0.6607, + "step": 3506 + }, + { + "epoch": 0.7209374036386063, + "grad_norm": 0.1699989140033722, + "learning_rate": 7.995710655423193e-05, + "loss": 0.5965, + "step": 3507 + }, + { + "epoch": 0.7211429746119847, + "grad_norm": 0.13372716307640076, + "learning_rate": 7.995081601184552e-05, + "loss": 0.5885, + "step": 3508 + }, + { + "epoch": 0.7213485455853633, + "grad_norm": 0.2239861637353897, + "learning_rate": 7.994452374759577e-05, + "loss": 0.6822, + "step": 3509 + }, + { + "epoch": 0.7215541165587419, + "grad_norm": 0.20403791964054108, + "learning_rate": 7.993822976179265e-05, + "loss": 0.6794, + "step": 3510 + }, + { + "epoch": 0.7217596875321205, + "grad_norm": 0.18789462745189667, + "learning_rate": 7.993193405474626e-05, + "loss": 0.6642, + "step": 3511 + }, + { + "epoch": 0.721965258505499, + "grad_norm": 0.1892167031764984, + "learning_rate": 7.992563662676676e-05, + "loss": 0.6768, + "step": 3512 + }, + { + "epoch": 0.7221708294788776, + "grad_norm": 0.19989047944545746, + "learning_rate": 7.991933747816437e-05, + "loss": 0.7015, + "step": 3513 + }, + { + "epoch": 0.7223764004522562, + "grad_norm": 0.19818507134914398, + "learning_rate": 7.991303660924944e-05, + "loss": 0.6459, + "step": 3514 + }, + { + "epoch": 0.7225819714256347, + "grad_norm": 0.20084840059280396, + "learning_rate": 7.990673402033238e-05, + "loss": 0.6967, + "step": 3515 + }, + { + "epoch": 0.7227875423990132, + "grad_norm": 0.19589127600193024, + "learning_rate": 7.990042971172369e-05, + "loss": 0.6819, + "step": 3516 + }, + { + "epoch": 0.7229931133723918, + "grad_norm": 0.2054595798254013, + "learning_rate": 7.989412368373395e-05, + "loss": 0.5563, + "step": 3517 + }, + { + "epoch": 0.7231986843457704, + "grad_norm": 0.16840699315071106, + "learning_rate": 7.988781593667382e-05, + "loss": 0.5998, + "step": 3518 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 0.20174477994441986, + "learning_rate": 7.988150647085408e-05, + "loss": 0.6767, + "step": 3519 + }, + { + "epoch": 0.7236098262925275, + "grad_norm": 0.2114832103252411, + "learning_rate": 7.987519528658556e-05, + "loss": 0.674, + "step": 3520 + }, + { + "epoch": 0.723815397265906, + "grad_norm": 0.20603235065937042, + "learning_rate": 7.986888238417915e-05, + "loss": 0.6922, + "step": 3521 + }, + { + "epoch": 0.7240209682392846, + "grad_norm": 0.19396202266216278, + "learning_rate": 7.98625677639459e-05, + "loss": 0.6542, + "step": 3522 + }, + { + "epoch": 0.7242265392126632, + "grad_norm": 0.19188427925109863, + "learning_rate": 7.985625142619688e-05, + "loss": 0.6423, + "step": 3523 + }, + { + "epoch": 0.7244321101860417, + "grad_norm": 0.24525907635688782, + "learning_rate": 7.984993337124326e-05, + "loss": 0.5969, + "step": 3524 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.22921410202980042, + "learning_rate": 7.984361359939632e-05, + "loss": 0.6787, + "step": 3525 + }, + { + "epoch": 0.7248432521327989, + "grad_norm": 0.23027624189853668, + "learning_rate": 7.98372921109674e-05, + "loss": 0.6958, + "step": 3526 + }, + { + "epoch": 0.7250488231061774, + "grad_norm": 0.21798734366893768, + "learning_rate": 7.983096890626792e-05, + "loss": 0.7058, + "step": 3527 + }, + { + "epoch": 0.7252543940795559, + "grad_norm": 0.1834592968225479, + "learning_rate": 7.98246439856094e-05, + "loss": 0.5576, + "step": 3528 + }, + { + "epoch": 0.7254599650529345, + "grad_norm": 0.20253108441829681, + "learning_rate": 7.981831734930344e-05, + "loss": 0.6919, + "step": 3529 + }, + { + "epoch": 0.7256655360263131, + "grad_norm": 0.2038789689540863, + "learning_rate": 7.981198899766173e-05, + "loss": 0.7226, + "step": 3530 + }, + { + "epoch": 0.7258711069996916, + "grad_norm": 0.19789783656597137, + "learning_rate": 7.980565893099604e-05, + "loss": 0.6876, + "step": 3531 + }, + { + "epoch": 0.7260766779730702, + "grad_norm": 0.14825506508350372, + "learning_rate": 7.97993271496182e-05, + "loss": 0.5838, + "step": 3532 + }, + { + "epoch": 0.7262822489464488, + "grad_norm": 0.19643041491508484, + "learning_rate": 7.979299365384017e-05, + "loss": 0.6868, + "step": 3533 + }, + { + "epoch": 0.7264878199198274, + "grad_norm": 0.20128373801708221, + "learning_rate": 7.978665844397397e-05, + "loss": 0.683, + "step": 3534 + }, + { + "epoch": 0.7266933908932058, + "grad_norm": 0.2025127112865448, + "learning_rate": 7.978032152033169e-05, + "loss": 0.6801, + "step": 3535 + }, + { + "epoch": 0.7268989618665844, + "grad_norm": 0.19767989218235016, + "learning_rate": 7.977398288322554e-05, + "loss": 0.6735, + "step": 3536 + }, + { + "epoch": 0.727104532839963, + "grad_norm": 0.1649659425020218, + "learning_rate": 7.976764253296779e-05, + "loss": 0.5818, + "step": 3537 + }, + { + "epoch": 0.7273101038133416, + "grad_norm": 0.20704413950443268, + "learning_rate": 7.976130046987078e-05, + "loss": 0.7032, + "step": 3538 + }, + { + "epoch": 0.7275156747867201, + "grad_norm": 0.20047134160995483, + "learning_rate": 7.975495669424698e-05, + "loss": 0.6851, + "step": 3539 + }, + { + "epoch": 0.7277212457600987, + "grad_norm": 0.14262793958187103, + "learning_rate": 7.974861120640891e-05, + "loss": 0.5911, + "step": 3540 + }, + { + "epoch": 0.7279268167334773, + "grad_norm": 0.19910430908203125, + "learning_rate": 7.974226400666918e-05, + "loss": 0.6729, + "step": 3541 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 0.1975426971912384, + "learning_rate": 7.973591509534048e-05, + "loss": 0.6614, + "step": 3542 + }, + { + "epoch": 0.7283379586802343, + "grad_norm": 0.18741396069526672, + "learning_rate": 7.972956447273561e-05, + "loss": 0.6808, + "step": 3543 + }, + { + "epoch": 0.7285435296536129, + "grad_norm": 0.19174180924892426, + "learning_rate": 7.972321213916742e-05, + "loss": 0.6732, + "step": 3544 + }, + { + "epoch": 0.7287491006269915, + "grad_norm": 0.18941205739974976, + "learning_rate": 7.971685809494886e-05, + "loss": 0.6854, + "step": 3545 + }, + { + "epoch": 0.72895467160037, + "grad_norm": 0.18745878338813782, + "learning_rate": 7.971050234039298e-05, + "loss": 0.6653, + "step": 3546 + }, + { + "epoch": 0.7291602425737486, + "grad_norm": 0.2130347341299057, + "learning_rate": 7.970414487581287e-05, + "loss": 0.6932, + "step": 3547 + }, + { + "epoch": 0.7293658135471272, + "grad_norm": 0.18765027821063995, + "learning_rate": 7.969778570152175e-05, + "loss": 0.6639, + "step": 3548 + }, + { + "epoch": 0.7295713845205057, + "grad_norm": 0.1892290711402893, + "learning_rate": 7.969142481783291e-05, + "loss": 0.6788, + "step": 3549 + }, + { + "epoch": 0.7297769554938842, + "grad_norm": 0.19938233494758606, + "learning_rate": 7.968506222505972e-05, + "loss": 0.6736, + "step": 3550 + }, + { + "epoch": 0.7299825264672628, + "grad_norm": 0.19479283690452576, + "learning_rate": 7.967869792351563e-05, + "loss": 0.671, + "step": 3551 + }, + { + "epoch": 0.7301880974406414, + "grad_norm": 0.18895529210567474, + "learning_rate": 7.967233191351418e-05, + "loss": 0.6559, + "step": 3552 + }, + { + "epoch": 0.73039366841402, + "grad_norm": 0.19964531064033508, + "learning_rate": 7.966596419536899e-05, + "loss": 0.6772, + "step": 3553 + }, + { + "epoch": 0.7305992393873985, + "grad_norm": 0.1866195648908615, + "learning_rate": 7.965959476939377e-05, + "loss": 0.642, + "step": 3554 + }, + { + "epoch": 0.730804810360777, + "grad_norm": 0.15533728897571564, + "learning_rate": 7.965322363590232e-05, + "loss": 0.5754, + "step": 3555 + }, + { + "epoch": 0.7310103813341556, + "grad_norm": 0.19216640293598175, + "learning_rate": 7.964685079520851e-05, + "loss": 0.6827, + "step": 3556 + }, + { + "epoch": 0.7312159523075342, + "grad_norm": 0.1994984894990921, + "learning_rate": 7.96404762476263e-05, + "loss": 0.6814, + "step": 3557 + }, + { + "epoch": 0.7314215232809127, + "grad_norm": 0.34993866086006165, + "learning_rate": 7.963409999346974e-05, + "loss": 0.7039, + "step": 3558 + }, + { + "epoch": 0.7316270942542913, + "grad_norm": 0.13572952151298523, + "learning_rate": 7.962772203305295e-05, + "loss": 0.5847, + "step": 3559 + }, + { + "epoch": 0.7318326652276699, + "grad_norm": 0.21044890582561493, + "learning_rate": 7.962134236669015e-05, + "loss": 0.6852, + "step": 3560 + }, + { + "epoch": 0.7320382362010485, + "grad_norm": 0.13309255242347717, + "learning_rate": 7.961496099469562e-05, + "loss": 0.5953, + "step": 3561 + }, + { + "epoch": 0.7322438071744269, + "grad_norm": 0.19451969861984253, + "learning_rate": 7.960857791738376e-05, + "loss": 0.6785, + "step": 3562 + }, + { + "epoch": 0.7324493781478055, + "grad_norm": 0.12751372158527374, + "learning_rate": 7.960219313506901e-05, + "loss": 0.6012, + "step": 3563 + }, + { + "epoch": 0.7326549491211841, + "grad_norm": 0.19144867360591888, + "learning_rate": 7.959580664806594e-05, + "loss": 0.6883, + "step": 3564 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 0.18746548891067505, + "learning_rate": 7.958941845668921e-05, + "loss": 0.6731, + "step": 3565 + }, + { + "epoch": 0.7330660910679412, + "grad_norm": 0.7065462470054626, + "learning_rate": 7.958302856125347e-05, + "loss": 0.595, + "step": 3566 + }, + { + "epoch": 0.7332716620413198, + "grad_norm": 0.1951018124818802, + "learning_rate": 7.957663696207355e-05, + "loss": 0.6601, + "step": 3567 + }, + { + "epoch": 0.7334772330146984, + "grad_norm": 0.13065175712108612, + "learning_rate": 7.957024365946436e-05, + "loss": 0.5551, + "step": 3568 + }, + { + "epoch": 0.7336828039880768, + "grad_norm": 0.21272675693035126, + "learning_rate": 7.956384865374082e-05, + "loss": 0.6846, + "step": 3569 + }, + { + "epoch": 0.7338883749614554, + "grad_norm": 0.19540101289749146, + "learning_rate": 7.955745194521802e-05, + "loss": 0.6747, + "step": 3570 + }, + { + "epoch": 0.734093945934834, + "grad_norm": 0.19584521651268005, + "learning_rate": 7.95510535342111e-05, + "loss": 0.6877, + "step": 3571 + }, + { + "epoch": 0.7342995169082126, + "grad_norm": 0.19038638472557068, + "learning_rate": 7.954465342103525e-05, + "loss": 0.6776, + "step": 3572 + }, + { + "epoch": 0.7345050878815911, + "grad_norm": 0.1913788616657257, + "learning_rate": 7.953825160600579e-05, + "loss": 0.6754, + "step": 3573 + }, + { + "epoch": 0.7347106588549697, + "grad_norm": 0.19518351554870605, + "learning_rate": 7.953184808943808e-05, + "loss": 0.675, + "step": 3574 + }, + { + "epoch": 0.7349162298283483, + "grad_norm": 0.19314491748809814, + "learning_rate": 7.952544287164763e-05, + "loss": 0.6771, + "step": 3575 + }, + { + "epoch": 0.7351218008017268, + "grad_norm": 0.2056049257516861, + "learning_rate": 7.951903595295e-05, + "loss": 0.6825, + "step": 3576 + }, + { + "epoch": 0.7353273717751053, + "grad_norm": 0.19159257411956787, + "learning_rate": 7.95126273336608e-05, + "loss": 0.6783, + "step": 3577 + }, + { + "epoch": 0.7355329427484839, + "grad_norm": 0.1686679869890213, + "learning_rate": 7.950621701409577e-05, + "loss": 0.581, + "step": 3578 + }, + { + "epoch": 0.7357385137218625, + "grad_norm": 0.14951810240745544, + "learning_rate": 7.94998049945707e-05, + "loss": 0.5694, + "step": 3579 + }, + { + "epoch": 0.7359440846952411, + "grad_norm": 0.2037050724029541, + "learning_rate": 7.949339127540149e-05, + "loss": 0.6722, + "step": 3580 + }, + { + "epoch": 0.7361496556686196, + "grad_norm": 0.15541227161884308, + "learning_rate": 7.948697585690412e-05, + "loss": 0.6053, + "step": 3581 + }, + { + "epoch": 0.7363552266419982, + "grad_norm": 0.20057538151741028, + "learning_rate": 7.948055873939463e-05, + "loss": 0.6745, + "step": 3582 + }, + { + "epoch": 0.7365607976153767, + "grad_norm": 0.19490864872932434, + "learning_rate": 7.947413992318918e-05, + "loss": 0.6963, + "step": 3583 + }, + { + "epoch": 0.7367663685887552, + "grad_norm": 0.19570674002170563, + "learning_rate": 7.946771940860398e-05, + "loss": 0.6913, + "step": 3584 + }, + { + "epoch": 0.7369719395621338, + "grad_norm": 0.18625394999980927, + "learning_rate": 7.946129719595535e-05, + "loss": 0.6699, + "step": 3585 + }, + { + "epoch": 0.7371775105355124, + "grad_norm": 0.6736593246459961, + "learning_rate": 7.945487328555969e-05, + "loss": 0.5934, + "step": 3586 + }, + { + "epoch": 0.737383081508891, + "grad_norm": 0.1934710294008255, + "learning_rate": 7.944844767773344e-05, + "loss": 0.672, + "step": 3587 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 0.20478187501430511, + "learning_rate": 7.944202037279322e-05, + "loss": 0.6703, + "step": 3588 + }, + { + "epoch": 0.737794223455648, + "grad_norm": 0.1952143758535385, + "learning_rate": 7.94355913710556e-05, + "loss": 0.665, + "step": 3589 + }, + { + "epoch": 0.7379997944290266, + "grad_norm": 0.2044733166694641, + "learning_rate": 7.942916067283737e-05, + "loss": 0.6705, + "step": 3590 + }, + { + "epoch": 0.7382053654024052, + "grad_norm": 0.1511656492948532, + "learning_rate": 7.942272827845531e-05, + "loss": 0.5709, + "step": 3591 + }, + { + "epoch": 0.7384109363757837, + "grad_norm": 0.20712168514728546, + "learning_rate": 7.941629418822631e-05, + "loss": 0.6822, + "step": 3592 + }, + { + "epoch": 0.7386165073491623, + "grad_norm": 0.18875378370285034, + "learning_rate": 7.940985840246738e-05, + "loss": 0.6657, + "step": 3593 + }, + { + "epoch": 0.7388220783225409, + "grad_norm": 0.20335470139980316, + "learning_rate": 7.940342092149552e-05, + "loss": 0.6803, + "step": 3594 + }, + { + "epoch": 0.7390276492959195, + "grad_norm": 0.19990339875221252, + "learning_rate": 7.939698174562795e-05, + "loss": 0.6633, + "step": 3595 + }, + { + "epoch": 0.7392332202692979, + "grad_norm": 0.19923284649848938, + "learning_rate": 7.939054087518184e-05, + "loss": 0.6894, + "step": 3596 + }, + { + "epoch": 0.7394387912426765, + "grad_norm": 0.20602424442768097, + "learning_rate": 7.938409831047452e-05, + "loss": 0.7057, + "step": 3597 + }, + { + "epoch": 0.7396443622160551, + "grad_norm": 0.19284965097904205, + "learning_rate": 7.93776540518234e-05, + "loss": 0.6619, + "step": 3598 + }, + { + "epoch": 0.7398499331894337, + "grad_norm": 0.18483732640743256, + "learning_rate": 7.937120809954593e-05, + "loss": 0.664, + "step": 3599 + }, + { + "epoch": 0.7400555041628122, + "grad_norm": 0.19070151448249817, + "learning_rate": 7.93647604539597e-05, + "loss": 0.6934, + "step": 3600 + }, + { + "epoch": 0.7402610751361908, + "grad_norm": 0.1932380348443985, + "learning_rate": 7.935831111538234e-05, + "loss": 0.6692, + "step": 3601 + }, + { + "epoch": 0.7404666461095694, + "grad_norm": 0.1923176348209381, + "learning_rate": 7.935186008413158e-05, + "loss": 0.6813, + "step": 3602 + }, + { + "epoch": 0.7406722170829478, + "grad_norm": 0.19491972029209137, + "learning_rate": 7.934540736052524e-05, + "loss": 0.6571, + "step": 3603 + }, + { + "epoch": 0.7408777880563264, + "grad_norm": 0.19038790464401245, + "learning_rate": 7.93389529448812e-05, + "loss": 0.6627, + "step": 3604 + }, + { + "epoch": 0.741083359029705, + "grad_norm": 0.1902906596660614, + "learning_rate": 7.933249683751745e-05, + "loss": 0.6792, + "step": 3605 + }, + { + "epoch": 0.7412889300030836, + "grad_norm": 0.18056754767894745, + "learning_rate": 7.932603903875205e-05, + "loss": 0.6706, + "step": 3606 + }, + { + "epoch": 0.7414945009764621, + "grad_norm": 0.19401055574417114, + "learning_rate": 7.931957954890316e-05, + "loss": 0.6997, + "step": 3607 + }, + { + "epoch": 0.7417000719498407, + "grad_norm": 0.19308343529701233, + "learning_rate": 7.931311836828898e-05, + "loss": 0.6804, + "step": 3608 + }, + { + "epoch": 0.7419056429232193, + "grad_norm": 0.20034140348434448, + "learning_rate": 7.930665549722784e-05, + "loss": 0.6672, + "step": 3609 + }, + { + "epoch": 0.7421112138965978, + "grad_norm": 0.1429484337568283, + "learning_rate": 7.930019093603813e-05, + "loss": 0.5769, + "step": 3610 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 0.19549964368343353, + "learning_rate": 7.929372468503834e-05, + "loss": 0.68, + "step": 3611 + }, + { + "epoch": 0.7425223558433549, + "grad_norm": 0.1939014494419098, + "learning_rate": 7.928725674454702e-05, + "loss": 0.6436, + "step": 3612 + }, + { + "epoch": 0.7427279268167335, + "grad_norm": 0.1987033188343048, + "learning_rate": 7.928078711488281e-05, + "loss": 0.6975, + "step": 3613 + }, + { + "epoch": 0.7429334977901121, + "grad_norm": 0.19069653749465942, + "learning_rate": 7.927431579636445e-05, + "loss": 0.6744, + "step": 3614 + }, + { + "epoch": 0.7431390687634906, + "grad_norm": 0.14583733677864075, + "learning_rate": 7.926784278931075e-05, + "loss": 0.587, + "step": 3615 + }, + { + "epoch": 0.7433446397368692, + "grad_norm": 0.19307653605937958, + "learning_rate": 7.926136809404063e-05, + "loss": 0.6458, + "step": 3616 + }, + { + "epoch": 0.7435502107102477, + "grad_norm": 0.19686581194400787, + "learning_rate": 7.9254891710873e-05, + "loss": 0.6936, + "step": 3617 + }, + { + "epoch": 0.7437557816836263, + "grad_norm": 0.19272616505622864, + "learning_rate": 7.924841364012698e-05, + "loss": 0.6931, + "step": 3618 + }, + { + "epoch": 0.7439613526570048, + "grad_norm": 0.1832963228225708, + "learning_rate": 7.92419338821217e-05, + "loss": 0.6543, + "step": 3619 + }, + { + "epoch": 0.7441669236303834, + "grad_norm": 0.1948852688074112, + "learning_rate": 7.923545243717638e-05, + "loss": 0.6934, + "step": 3620 + }, + { + "epoch": 0.744372494603762, + "grad_norm": 0.19358238577842712, + "learning_rate": 7.922896930561034e-05, + "loss": 0.6901, + "step": 3621 + }, + { + "epoch": 0.7445780655771405, + "grad_norm": 0.18982093036174774, + "learning_rate": 7.922248448774296e-05, + "loss": 0.6832, + "step": 3622 + }, + { + "epoch": 0.744783636550519, + "grad_norm": 0.19411057233810425, + "learning_rate": 7.921599798389372e-05, + "loss": 0.6899, + "step": 3623 + }, + { + "epoch": 0.7449892075238976, + "grad_norm": 0.1885984092950821, + "learning_rate": 7.92095097943822e-05, + "loss": 0.6699, + "step": 3624 + }, + { + "epoch": 0.7451947784972762, + "grad_norm": 0.19820182025432587, + "learning_rate": 7.920301991952802e-05, + "loss": 0.6872, + "step": 3625 + }, + { + "epoch": 0.7454003494706547, + "grad_norm": 0.18656107783317566, + "learning_rate": 7.91965283596509e-05, + "loss": 0.6982, + "step": 3626 + }, + { + "epoch": 0.7456059204440333, + "grad_norm": 0.14508990943431854, + "learning_rate": 7.919003511507069e-05, + "loss": 0.5908, + "step": 3627 + }, + { + "epoch": 0.7458114914174119, + "grad_norm": 0.2058647722005844, + "learning_rate": 7.918354018610723e-05, + "loss": 0.6962, + "step": 3628 + }, + { + "epoch": 0.7460170623907905, + "grad_norm": 0.20024776458740234, + "learning_rate": 7.917704357308052e-05, + "loss": 0.6748, + "step": 3629 + }, + { + "epoch": 0.7462226333641689, + "grad_norm": 0.18803846836090088, + "learning_rate": 7.917054527631062e-05, + "loss": 0.6878, + "step": 3630 + }, + { + "epoch": 0.7464282043375475, + "grad_norm": 0.18676309287548065, + "learning_rate": 7.916404529611768e-05, + "loss": 0.6497, + "step": 3631 + }, + { + "epoch": 0.7466337753109261, + "grad_norm": 0.18984469771385193, + "learning_rate": 7.915754363282189e-05, + "loss": 0.667, + "step": 3632 + }, + { + "epoch": 0.7468393462843047, + "grad_norm": 0.1905134618282318, + "learning_rate": 7.915104028674359e-05, + "loss": 0.7037, + "step": 3633 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 0.19282597303390503, + "learning_rate": 7.914453525820314e-05, + "loss": 0.6825, + "step": 3634 + }, + { + "epoch": 0.7472504882310618, + "grad_norm": 0.191225066781044, + "learning_rate": 7.913802854752105e-05, + "loss": 0.6693, + "step": 3635 + }, + { + "epoch": 0.7474560592044404, + "grad_norm": 0.19597823917865753, + "learning_rate": 7.913152015501785e-05, + "loss": 0.6854, + "step": 3636 + }, + { + "epoch": 0.7476616301778188, + "grad_norm": 0.19076837599277496, + "learning_rate": 7.912501008101417e-05, + "loss": 0.6669, + "step": 3637 + }, + { + "epoch": 0.7478672011511974, + "grad_norm": 0.15839332342147827, + "learning_rate": 7.911849832583075e-05, + "loss": 0.5823, + "step": 3638 + }, + { + "epoch": 0.748072772124576, + "grad_norm": 0.19790640473365784, + "learning_rate": 7.91119848897884e-05, + "loss": 0.6758, + "step": 3639 + }, + { + "epoch": 0.7482783430979546, + "grad_norm": 0.20291505753993988, + "learning_rate": 7.910546977320799e-05, + "loss": 0.6858, + "step": 3640 + }, + { + "epoch": 0.7484839140713331, + "grad_norm": 0.19537273049354553, + "learning_rate": 7.909895297641047e-05, + "loss": 0.6818, + "step": 3641 + }, + { + "epoch": 0.7486894850447117, + "grad_norm": 0.14734981954097748, + "learning_rate": 7.909243449971693e-05, + "loss": 0.5743, + "step": 3642 + }, + { + "epoch": 0.7488950560180903, + "grad_norm": 0.15119509398937225, + "learning_rate": 7.90859143434485e-05, + "loss": 0.5797, + "step": 3643 + }, + { + "epoch": 0.7491006269914688, + "grad_norm": 0.23732592165470123, + "learning_rate": 7.907939250792638e-05, + "loss": 0.6841, + "step": 3644 + }, + { + "epoch": 0.7493061979648473, + "grad_norm": 0.2022113800048828, + "learning_rate": 7.907286899347187e-05, + "loss": 0.707, + "step": 3645 + }, + { + "epoch": 0.7495117689382259, + "grad_norm": 0.19698172807693481, + "learning_rate": 7.906634380040636e-05, + "loss": 0.6966, + "step": 3646 + }, + { + "epoch": 0.7497173399116045, + "grad_norm": 0.21839676797389984, + "learning_rate": 7.905981692905133e-05, + "loss": 0.6853, + "step": 3647 + }, + { + "epoch": 0.7499229108849831, + "grad_norm": 0.20229050517082214, + "learning_rate": 7.90532883797283e-05, + "loss": 0.659, + "step": 3648 + }, + { + "epoch": 0.7501284818583616, + "grad_norm": 0.18536463379859924, + "learning_rate": 7.904675815275894e-05, + "loss": 0.6534, + "step": 3649 + }, + { + "epoch": 0.7503340528317402, + "grad_norm": 0.20928248763084412, + "learning_rate": 7.904022624846491e-05, + "loss": 0.6913, + "step": 3650 + }, + { + "epoch": 0.7505396238051187, + "grad_norm": 0.20999811589717865, + "learning_rate": 7.903369266716806e-05, + "loss": 0.654, + "step": 3651 + }, + { + "epoch": 0.7507451947784973, + "grad_norm": 0.19690896570682526, + "learning_rate": 7.902715740919023e-05, + "loss": 0.5836, + "step": 3652 + }, + { + "epoch": 0.7509507657518758, + "grad_norm": 0.1489873230457306, + "learning_rate": 7.902062047485341e-05, + "loss": 0.5822, + "step": 3653 + }, + { + "epoch": 0.7511563367252544, + "grad_norm": 0.2375965416431427, + "learning_rate": 7.901408186447962e-05, + "loss": 0.6857, + "step": 3654 + }, + { + "epoch": 0.751361907698633, + "grad_norm": 0.2292969673871994, + "learning_rate": 7.9007541578391e-05, + "loss": 0.6998, + "step": 3655 + }, + { + "epoch": 0.7515674786720115, + "grad_norm": 0.1982121616601944, + "learning_rate": 7.900099961690976e-05, + "loss": 0.6853, + "step": 3656 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 0.21135136485099792, + "learning_rate": 7.899445598035819e-05, + "loss": 0.6663, + "step": 3657 + }, + { + "epoch": 0.7519786206187686, + "grad_norm": 0.2433331459760666, + "learning_rate": 7.898791066905866e-05, + "loss": 0.603, + "step": 3658 + }, + { + "epoch": 0.7521841915921472, + "grad_norm": 0.19841930270195007, + "learning_rate": 7.898136368333363e-05, + "loss": 0.6507, + "step": 3659 + }, + { + "epoch": 0.7523897625655257, + "grad_norm": 0.20042434334754944, + "learning_rate": 7.897481502350565e-05, + "loss": 0.6522, + "step": 3660 + }, + { + "epoch": 0.7525953335389043, + "grad_norm": 0.2082412987947464, + "learning_rate": 7.896826468989731e-05, + "loss": 0.682, + "step": 3661 + }, + { + "epoch": 0.7528009045122829, + "grad_norm": 0.2017931491136551, + "learning_rate": 7.896171268283136e-05, + "loss": 0.6729, + "step": 3662 + }, + { + "epoch": 0.7530064754856615, + "grad_norm": 0.1931910514831543, + "learning_rate": 7.895515900263055e-05, + "loss": 0.6525, + "step": 3663 + }, + { + "epoch": 0.7532120464590399, + "grad_norm": 0.21447621285915375, + "learning_rate": 7.894860364961778e-05, + "loss": 0.689, + "step": 3664 + }, + { + "epoch": 0.7534176174324185, + "grad_norm": 0.20270651578903198, + "learning_rate": 7.894204662411595e-05, + "loss": 0.6926, + "step": 3665 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 0.1878805160522461, + "learning_rate": 7.893548792644815e-05, + "loss": 0.6721, + "step": 3666 + }, + { + "epoch": 0.7538287593791757, + "grad_norm": 0.19181132316589355, + "learning_rate": 7.892892755693747e-05, + "loss": 0.6734, + "step": 3667 + }, + { + "epoch": 0.7540343303525542, + "grad_norm": 0.19380466639995575, + "learning_rate": 7.892236551590712e-05, + "loss": 0.6621, + "step": 3668 + }, + { + "epoch": 0.7542399013259328, + "grad_norm": 0.20492911338806152, + "learning_rate": 7.891580180368036e-05, + "loss": 0.6827, + "step": 3669 + }, + { + "epoch": 0.7544454722993114, + "grad_norm": 0.18449199199676514, + "learning_rate": 7.890923642058058e-05, + "loss": 0.6666, + "step": 3670 + }, + { + "epoch": 0.75465104327269, + "grad_norm": 0.18999159336090088, + "learning_rate": 7.890266936693121e-05, + "loss": 0.6498, + "step": 3671 + }, + { + "epoch": 0.7548566142460684, + "grad_norm": 0.19277434051036835, + "learning_rate": 7.889610064305578e-05, + "loss": 0.6759, + "step": 3672 + }, + { + "epoch": 0.755062185219447, + "grad_norm": 0.1884971410036087, + "learning_rate": 7.888953024927789e-05, + "loss": 0.6745, + "step": 3673 + }, + { + "epoch": 0.7552677561928256, + "grad_norm": 0.19598397612571716, + "learning_rate": 7.888295818592125e-05, + "loss": 0.6803, + "step": 3674 + }, + { + "epoch": 0.7554733271662041, + "grad_norm": 0.19982978701591492, + "learning_rate": 7.887638445330962e-05, + "loss": 0.6736, + "step": 3675 + }, + { + "epoch": 0.7556788981395827, + "grad_norm": 0.19140852987766266, + "learning_rate": 7.886980905176689e-05, + "loss": 0.6659, + "step": 3676 + }, + { + "epoch": 0.7558844691129613, + "grad_norm": 0.18775241076946259, + "learning_rate": 7.886323198161695e-05, + "loss": 0.67, + "step": 3677 + }, + { + "epoch": 0.7560900400863398, + "grad_norm": 0.1859831064939499, + "learning_rate": 7.885665324318386e-05, + "loss": 0.6554, + "step": 3678 + }, + { + "epoch": 0.7562956110597183, + "grad_norm": 0.19015206396579742, + "learning_rate": 7.885007283679173e-05, + "loss": 0.7039, + "step": 3679 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 0.19563472270965576, + "learning_rate": 7.884349076276469e-05, + "loss": 0.6769, + "step": 3680 + }, + { + "epoch": 0.7567067530064755, + "grad_norm": 0.2165932059288025, + "learning_rate": 7.883690702142706e-05, + "loss": 0.5897, + "step": 3681 + }, + { + "epoch": 0.7569123239798541, + "grad_norm": 0.19110572338104248, + "learning_rate": 7.883032161310318e-05, + "loss": 0.6666, + "step": 3682 + }, + { + "epoch": 0.7571178949532326, + "grad_norm": 0.2043447345495224, + "learning_rate": 7.882373453811745e-05, + "loss": 0.6633, + "step": 3683 + }, + { + "epoch": 0.7573234659266112, + "grad_norm": 0.19598691165447235, + "learning_rate": 7.881714579679444e-05, + "loss": 0.6601, + "step": 3684 + }, + { + "epoch": 0.7575290368999897, + "grad_norm": 0.16248776018619537, + "learning_rate": 7.88105553894587e-05, + "loss": 0.585, + "step": 3685 + }, + { + "epoch": 0.7577346078733683, + "grad_norm": 0.1903761625289917, + "learning_rate": 7.880396331643496e-05, + "loss": 0.6702, + "step": 3686 + }, + { + "epoch": 0.7579401788467468, + "grad_norm": 0.19729363918304443, + "learning_rate": 7.87973695780479e-05, + "loss": 0.6762, + "step": 3687 + }, + { + "epoch": 0.7581457498201254, + "grad_norm": 0.20168879628181458, + "learning_rate": 7.879077417462244e-05, + "loss": 0.7108, + "step": 3688 + }, + { + "epoch": 0.758351320793504, + "grad_norm": 0.18572981655597687, + "learning_rate": 7.878417710648346e-05, + "loss": 0.6516, + "step": 3689 + }, + { + "epoch": 0.7585568917668826, + "grad_norm": 0.18781378865242004, + "learning_rate": 7.8777578373956e-05, + "loss": 0.6767, + "step": 3690 + }, + { + "epoch": 0.758762462740261, + "grad_norm": 0.1998245269060135, + "learning_rate": 7.877097797736511e-05, + "loss": 0.6723, + "step": 3691 + }, + { + "epoch": 0.7589680337136396, + "grad_norm": 0.22822120785713196, + "learning_rate": 7.876437591703598e-05, + "loss": 0.668, + "step": 3692 + }, + { + "epoch": 0.7591736046870182, + "grad_norm": 0.19273287057876587, + "learning_rate": 7.875777219329386e-05, + "loss": 0.6699, + "step": 3693 + }, + { + "epoch": 0.7593791756603967, + "grad_norm": 0.2089652717113495, + "learning_rate": 7.875116680646411e-05, + "loss": 0.6664, + "step": 3694 + }, + { + "epoch": 0.7595847466337753, + "grad_norm": 0.1920463740825653, + "learning_rate": 7.87445597568721e-05, + "loss": 0.6731, + "step": 3695 + }, + { + "epoch": 0.7597903176071539, + "grad_norm": 0.19104163348674774, + "learning_rate": 7.873795104484337e-05, + "loss": 0.6813, + "step": 3696 + }, + { + "epoch": 0.7599958885805325, + "grad_norm": 0.15439750254154205, + "learning_rate": 7.873134067070347e-05, + "loss": 0.56, + "step": 3697 + }, + { + "epoch": 0.7602014595539109, + "grad_norm": 0.19592773914337158, + "learning_rate": 7.872472863477808e-05, + "loss": 0.6858, + "step": 3698 + }, + { + "epoch": 0.7604070305272895, + "grad_norm": 0.19534648954868317, + "learning_rate": 7.871811493739294e-05, + "loss": 0.681, + "step": 3699 + }, + { + "epoch": 0.7606126015006681, + "grad_norm": 0.13310682773590088, + "learning_rate": 7.871149957887387e-05, + "loss": 0.5885, + "step": 3700 + }, + { + "epoch": 0.7608181724740467, + "grad_norm": 0.19378095865249634, + "learning_rate": 7.870488255954679e-05, + "loss": 0.667, + "step": 3701 + }, + { + "epoch": 0.7610237434474252, + "grad_norm": 0.19437304139137268, + "learning_rate": 7.869826387973768e-05, + "loss": 0.6729, + "step": 3702 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 0.19552649557590485, + "learning_rate": 7.869164353977261e-05, + "loss": 0.668, + "step": 3703 + }, + { + "epoch": 0.7614348853941824, + "grad_norm": 0.15091755986213684, + "learning_rate": 7.868502153997774e-05, + "loss": 0.5726, + "step": 3704 + }, + { + "epoch": 0.761640456367561, + "grad_norm": 0.2120988517999649, + "learning_rate": 7.867839788067931e-05, + "loss": 0.69, + "step": 3705 + }, + { + "epoch": 0.7618460273409394, + "grad_norm": 0.1858333796262741, + "learning_rate": 7.867177256220362e-05, + "loss": 0.677, + "step": 3706 + }, + { + "epoch": 0.762051598314318, + "grad_norm": 0.1518946886062622, + "learning_rate": 7.866514558487709e-05, + "loss": 0.5866, + "step": 3707 + }, + { + "epoch": 0.7622571692876966, + "grad_norm": 0.20156964659690857, + "learning_rate": 7.865851694902617e-05, + "loss": 0.6694, + "step": 3708 + }, + { + "epoch": 0.7624627402610752, + "grad_norm": 0.19284150004386902, + "learning_rate": 7.865188665497744e-05, + "loss": 0.6577, + "step": 3709 + }, + { + "epoch": 0.7626683112344537, + "grad_norm": 0.13599884510040283, + "learning_rate": 7.864525470305756e-05, + "loss": 0.5647, + "step": 3710 + }, + { + "epoch": 0.7628738822078323, + "grad_norm": 0.20330367982387543, + "learning_rate": 7.863862109359322e-05, + "loss": 0.6663, + "step": 3711 + }, + { + "epoch": 0.7630794531812108, + "grad_norm": 0.1969096064567566, + "learning_rate": 7.863198582691125e-05, + "loss": 0.6966, + "step": 3712 + }, + { + "epoch": 0.7632850241545893, + "grad_norm": 0.20115163922309875, + "learning_rate": 7.862534890333854e-05, + "loss": 0.7011, + "step": 3713 + }, + { + "epoch": 0.7634905951279679, + "grad_norm": 0.20134492218494415, + "learning_rate": 7.861871032320206e-05, + "loss": 0.6588, + "step": 3714 + }, + { + "epoch": 0.7636961661013465, + "grad_norm": 0.18914572894573212, + "learning_rate": 7.861207008682884e-05, + "loss": 0.6581, + "step": 3715 + }, + { + "epoch": 0.7639017370747251, + "grad_norm": 0.430144339799881, + "learning_rate": 7.860542819454603e-05, + "loss": 0.6026, + "step": 3716 + }, + { + "epoch": 0.7641073080481036, + "grad_norm": 0.18655115365982056, + "learning_rate": 7.859878464668086e-05, + "loss": 0.6869, + "step": 3717 + }, + { + "epoch": 0.7643128790214821, + "grad_norm": 0.19397111237049103, + "learning_rate": 7.85921394435606e-05, + "loss": 0.6888, + "step": 3718 + }, + { + "epoch": 0.7645184499948607, + "grad_norm": 0.18396249413490295, + "learning_rate": 7.858549258551263e-05, + "loss": 0.6527, + "step": 3719 + }, + { + "epoch": 0.7647240209682393, + "grad_norm": 0.17971353232860565, + "learning_rate": 7.857884407286442e-05, + "loss": 0.6879, + "step": 3720 + }, + { + "epoch": 0.7649295919416178, + "grad_norm": 0.1879139393568039, + "learning_rate": 7.857219390594353e-05, + "loss": 0.6821, + "step": 3721 + }, + { + "epoch": 0.7651351629149964, + "grad_norm": 0.1858903020620346, + "learning_rate": 7.856554208507755e-05, + "loss": 0.6818, + "step": 3722 + }, + { + "epoch": 0.765340733888375, + "grad_norm": 0.1843085139989853, + "learning_rate": 7.85588886105942e-05, + "loss": 0.6661, + "step": 3723 + }, + { + "epoch": 0.7655463048617536, + "grad_norm": 0.18377020955085754, + "learning_rate": 7.855223348282126e-05, + "loss": 0.6742, + "step": 3724 + }, + { + "epoch": 0.765751875835132, + "grad_norm": 0.1833381950855255, + "learning_rate": 7.854557670208659e-05, + "loss": 0.6676, + "step": 3725 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.19020181894302368, + "learning_rate": 7.853891826871816e-05, + "loss": 0.6742, + "step": 3726 + }, + { + "epoch": 0.7661630177818892, + "grad_norm": 0.18213771283626556, + "learning_rate": 7.853225818304398e-05, + "loss": 0.5946, + "step": 3727 + }, + { + "epoch": 0.7663685887552678, + "grad_norm": 0.20896635949611664, + "learning_rate": 7.852559644539216e-05, + "loss": 0.6719, + "step": 3728 + }, + { + "epoch": 0.7665741597286463, + "grad_norm": 0.19129472970962524, + "learning_rate": 7.851893305609091e-05, + "loss": 0.6838, + "step": 3729 + }, + { + "epoch": 0.7667797307020249, + "grad_norm": 0.18608838319778442, + "learning_rate": 7.85122680154685e-05, + "loss": 0.6702, + "step": 3730 + }, + { + "epoch": 0.7669853016754035, + "grad_norm": 0.13603243231773376, + "learning_rate": 7.85056013238533e-05, + "loss": 0.5653, + "step": 3731 + }, + { + "epoch": 0.7671908726487819, + "grad_norm": 0.1969052106142044, + "learning_rate": 7.849893298157369e-05, + "loss": 0.6705, + "step": 3732 + }, + { + "epoch": 0.7673964436221605, + "grad_norm": 0.19232457876205444, + "learning_rate": 7.849226298895824e-05, + "loss": 0.6542, + "step": 3733 + }, + { + "epoch": 0.7676020145955391, + "grad_norm": 0.18796077370643616, + "learning_rate": 7.848559134633555e-05, + "loss": 0.6682, + "step": 3734 + }, + { + "epoch": 0.7678075855689177, + "grad_norm": 0.19674451649188995, + "learning_rate": 7.847891805403426e-05, + "loss": 0.6574, + "step": 3735 + }, + { + "epoch": 0.7680131565422962, + "grad_norm": 0.19735072553157806, + "learning_rate": 7.847224311238316e-05, + "loss": 0.6637, + "step": 3736 + }, + { + "epoch": 0.7682187275156748, + "grad_norm": 0.22023150324821472, + "learning_rate": 7.846556652171112e-05, + "loss": 0.6634, + "step": 3737 + }, + { + "epoch": 0.7684242984890534, + "grad_norm": 0.18101370334625244, + "learning_rate": 7.845888828234701e-05, + "loss": 0.6424, + "step": 3738 + }, + { + "epoch": 0.768629869462432, + "grad_norm": 0.18563824892044067, + "learning_rate": 7.845220839461987e-05, + "loss": 0.6618, + "step": 3739 + }, + { + "epoch": 0.7688354404358104, + "grad_norm": 0.18954195082187653, + "learning_rate": 7.844552685885877e-05, + "loss": 0.6885, + "step": 3740 + }, + { + "epoch": 0.769041011409189, + "grad_norm": 0.14499548077583313, + "learning_rate": 7.843884367539289e-05, + "loss": 0.6127, + "step": 3741 + }, + { + "epoch": 0.7692465823825676, + "grad_norm": 0.20436535775661469, + "learning_rate": 7.843215884455147e-05, + "loss": 0.6805, + "step": 3742 + }, + { + "epoch": 0.7694521533559462, + "grad_norm": 0.20969851315021515, + "learning_rate": 7.842547236666386e-05, + "loss": 0.6548, + "step": 3743 + }, + { + "epoch": 0.7696577243293247, + "grad_norm": 0.19497977197170258, + "learning_rate": 7.841878424205944e-05, + "loss": 0.7104, + "step": 3744 + }, + { + "epoch": 0.7698632953027033, + "grad_norm": 0.1905307173728943, + "learning_rate": 7.841209447106772e-05, + "loss": 0.6676, + "step": 3745 + }, + { + "epoch": 0.7700688662760818, + "grad_norm": 0.1859470009803772, + "learning_rate": 7.840540305401828e-05, + "loss": 0.6712, + "step": 3746 + }, + { + "epoch": 0.7702744372494604, + "grad_norm": 0.19429220259189606, + "learning_rate": 7.839870999124077e-05, + "loss": 0.6763, + "step": 3747 + }, + { + "epoch": 0.7704800082228389, + "grad_norm": 0.188473641872406, + "learning_rate": 7.839201528306492e-05, + "loss": 0.6856, + "step": 3748 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 0.19540703296661377, + "learning_rate": 7.838531892982057e-05, + "loss": 0.6616, + "step": 3749 + }, + { + "epoch": 0.7708911501695961, + "grad_norm": 0.1938808113336563, + "learning_rate": 7.837862093183758e-05, + "loss": 0.6553, + "step": 3750 + }, + { + "epoch": 0.7710967211429746, + "grad_norm": 0.1836869865655899, + "learning_rate": 7.837192128944594e-05, + "loss": 0.6768, + "step": 3751 + }, + { + "epoch": 0.7713022921163531, + "grad_norm": 0.1519763171672821, + "learning_rate": 7.836522000297572e-05, + "loss": 0.6059, + "step": 3752 + }, + { + "epoch": 0.7715078630897317, + "grad_norm": 0.19223132729530334, + "learning_rate": 7.835851707275707e-05, + "loss": 0.7093, + "step": 3753 + }, + { + "epoch": 0.7717134340631103, + "grad_norm": 0.19785994291305542, + "learning_rate": 7.83518124991202e-05, + "loss": 0.6557, + "step": 3754 + }, + { + "epoch": 0.7719190050364888, + "grad_norm": 0.18960314989089966, + "learning_rate": 7.834510628239541e-05, + "loss": 0.6495, + "step": 3755 + }, + { + "epoch": 0.7721245760098674, + "grad_norm": 0.1869727522134781, + "learning_rate": 7.833839842291309e-05, + "loss": 0.6561, + "step": 3756 + }, + { + "epoch": 0.772330146983246, + "grad_norm": 0.19522154331207275, + "learning_rate": 7.83316889210037e-05, + "loss": 0.6781, + "step": 3757 + }, + { + "epoch": 0.7725357179566246, + "grad_norm": 0.19209223985671997, + "learning_rate": 7.832497777699779e-05, + "loss": 0.6598, + "step": 3758 + }, + { + "epoch": 0.772741288930003, + "grad_norm": 0.19709967076778412, + "learning_rate": 7.831826499122599e-05, + "loss": 0.6977, + "step": 3759 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.19524455070495605, + "learning_rate": 7.8311550564019e-05, + "loss": 0.6701, + "step": 3760 + }, + { + "epoch": 0.7731524308767602, + "grad_norm": 0.19056567549705505, + "learning_rate": 7.830483449570762e-05, + "loss": 0.652, + "step": 3761 + }, + { + "epoch": 0.7733580018501388, + "grad_norm": 0.2009115368127823, + "learning_rate": 7.829811678662269e-05, + "loss": 0.6796, + "step": 3762 + }, + { + "epoch": 0.7735635728235173, + "grad_norm": 0.1854369342327118, + "learning_rate": 7.829139743709518e-05, + "loss": 0.6959, + "step": 3763 + }, + { + "epoch": 0.7737691437968959, + "grad_norm": 0.19334383308887482, + "learning_rate": 7.828467644745614e-05, + "loss": 0.6803, + "step": 3764 + }, + { + "epoch": 0.7739747147702745, + "grad_norm": 0.1896241158246994, + "learning_rate": 7.827795381803666e-05, + "loss": 0.6589, + "step": 3765 + }, + { + "epoch": 0.774180285743653, + "grad_norm": 0.19462954998016357, + "learning_rate": 7.827122954916793e-05, + "loss": 0.6884, + "step": 3766 + }, + { + "epoch": 0.7743858567170315, + "grad_norm": 0.15615877509117126, + "learning_rate": 7.826450364118124e-05, + "loss": 0.5868, + "step": 3767 + }, + { + "epoch": 0.7745914276904101, + "grad_norm": 0.21053725481033325, + "learning_rate": 7.825777609440793e-05, + "loss": 0.6619, + "step": 3768 + }, + { + "epoch": 0.7747969986637887, + "grad_norm": 0.1837691068649292, + "learning_rate": 7.825104690917943e-05, + "loss": 0.68, + "step": 3769 + }, + { + "epoch": 0.7750025696371672, + "grad_norm": 0.18419477343559265, + "learning_rate": 7.824431608582728e-05, + "loss": 0.6629, + "step": 3770 + }, + { + "epoch": 0.7752081406105458, + "grad_norm": 0.19641302525997162, + "learning_rate": 7.823758362468305e-05, + "loss": 0.6919, + "step": 3771 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 0.14012254774570465, + "learning_rate": 7.823084952607842e-05, + "loss": 0.5845, + "step": 3772 + }, + { + "epoch": 0.775619282557303, + "grad_norm": 0.13224144279956818, + "learning_rate": 7.822411379034516e-05, + "loss": 0.5851, + "step": 3773 + }, + { + "epoch": 0.7758248535306814, + "grad_norm": 0.20598402619361877, + "learning_rate": 7.82173764178151e-05, + "loss": 0.6987, + "step": 3774 + }, + { + "epoch": 0.77603042450406, + "grad_norm": 0.19516415894031525, + "learning_rate": 7.821063740882017e-05, + "loss": 0.681, + "step": 3775 + }, + { + "epoch": 0.7762359954774386, + "grad_norm": 0.192254900932312, + "learning_rate": 7.820389676369237e-05, + "loss": 0.6647, + "step": 3776 + }, + { + "epoch": 0.7764415664508172, + "grad_norm": 0.21489369869232178, + "learning_rate": 7.819715448276374e-05, + "loss": 0.6804, + "step": 3777 + }, + { + "epoch": 0.7766471374241957, + "grad_norm": 0.18683873116970062, + "learning_rate": 7.81904105663665e-05, + "loss": 0.6766, + "step": 3778 + }, + { + "epoch": 0.7768527083975743, + "grad_norm": 0.19451092183589935, + "learning_rate": 7.818366501483285e-05, + "loss": 0.6689, + "step": 3779 + }, + { + "epoch": 0.7770582793709528, + "grad_norm": 0.16607536375522614, + "learning_rate": 7.817691782849512e-05, + "loss": 0.6039, + "step": 3780 + }, + { + "epoch": 0.7772638503443314, + "grad_norm": 0.20235170423984528, + "learning_rate": 7.817016900768573e-05, + "loss": 0.6846, + "step": 3781 + }, + { + "epoch": 0.7774694213177099, + "grad_norm": 0.1997910737991333, + "learning_rate": 7.816341855273715e-05, + "loss": 0.665, + "step": 3782 + }, + { + "epoch": 0.7776749922910885, + "grad_norm": 0.19691520929336548, + "learning_rate": 7.815666646398193e-05, + "loss": 0.6791, + "step": 3783 + }, + { + "epoch": 0.7778805632644671, + "grad_norm": 0.14885997772216797, + "learning_rate": 7.814991274175273e-05, + "loss": 0.6101, + "step": 3784 + }, + { + "epoch": 0.7780861342378456, + "grad_norm": 0.19798895716667175, + "learning_rate": 7.814315738638227e-05, + "loss": 0.6652, + "step": 3785 + }, + { + "epoch": 0.7782917052112241, + "grad_norm": 0.13677549362182617, + "learning_rate": 7.813640039820337e-05, + "loss": 0.583, + "step": 3786 + }, + { + "epoch": 0.7784972761846027, + "grad_norm": 0.19505973160266876, + "learning_rate": 7.81296417775489e-05, + "loss": 0.7306, + "step": 3787 + }, + { + "epoch": 0.7787028471579813, + "grad_norm": 0.18989427387714386, + "learning_rate": 7.812288152475182e-05, + "loss": 0.6883, + "step": 3788 + }, + { + "epoch": 0.7789084181313598, + "grad_norm": 0.18871872127056122, + "learning_rate": 7.811611964014518e-05, + "loss": 0.6781, + "step": 3789 + }, + { + "epoch": 0.7791139891047384, + "grad_norm": 0.19525344669818878, + "learning_rate": 7.81093561240621e-05, + "loss": 0.657, + "step": 3790 + }, + { + "epoch": 0.779319560078117, + "grad_norm": 0.1633206307888031, + "learning_rate": 7.810259097683582e-05, + "loss": 0.5749, + "step": 3791 + }, + { + "epoch": 0.7795251310514956, + "grad_norm": 0.19155313074588776, + "learning_rate": 7.80958241987996e-05, + "loss": 0.6782, + "step": 3792 + }, + { + "epoch": 0.779730702024874, + "grad_norm": 0.18953226506710052, + "learning_rate": 7.80890557902868e-05, + "loss": 0.668, + "step": 3793 + }, + { + "epoch": 0.7799362729982526, + "grad_norm": 0.19336241483688354, + "learning_rate": 7.808228575163088e-05, + "loss": 0.6523, + "step": 3794 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 0.18969465792179108, + "learning_rate": 7.807551408316537e-05, + "loss": 0.6893, + "step": 3795 + }, + { + "epoch": 0.7803474149450098, + "grad_norm": 0.19042238593101501, + "learning_rate": 7.806874078522388e-05, + "loss": 0.64, + "step": 3796 + }, + { + "epoch": 0.7805529859183883, + "grad_norm": 0.1883266568183899, + "learning_rate": 7.80619658581401e-05, + "loss": 0.6471, + "step": 3797 + }, + { + "epoch": 0.7807585568917669, + "grad_norm": 0.1871403008699417, + "learning_rate": 7.805518930224777e-05, + "loss": 0.6642, + "step": 3798 + }, + { + "epoch": 0.7809641278651455, + "grad_norm": 0.1827799677848816, + "learning_rate": 7.804841111788078e-05, + "loss": 0.677, + "step": 3799 + }, + { + "epoch": 0.781169698838524, + "grad_norm": 0.18511800467967987, + "learning_rate": 7.804163130537304e-05, + "loss": 0.6586, + "step": 3800 + }, + { + "epoch": 0.7813752698119025, + "grad_norm": 0.1907230168581009, + "learning_rate": 7.803484986505855e-05, + "loss": 0.6573, + "step": 3801 + }, + { + "epoch": 0.7815808407852811, + "grad_norm": 0.18352137506008148, + "learning_rate": 7.802806679727144e-05, + "loss": 0.6952, + "step": 3802 + }, + { + "epoch": 0.7817864117586597, + "grad_norm": 0.18589456379413605, + "learning_rate": 7.802128210234583e-05, + "loss": 0.6877, + "step": 3803 + }, + { + "epoch": 0.7819919827320382, + "grad_norm": 0.19165122509002686, + "learning_rate": 7.8014495780616e-05, + "loss": 0.6721, + "step": 3804 + }, + { + "epoch": 0.7821975537054168, + "grad_norm": 0.18092942237854004, + "learning_rate": 7.800770783241627e-05, + "loss": 0.6472, + "step": 3805 + }, + { + "epoch": 0.7824031246787954, + "grad_norm": 0.1938347965478897, + "learning_rate": 7.800091825808104e-05, + "loss": 0.6875, + "step": 3806 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.18910136818885803, + "learning_rate": 7.799412705794484e-05, + "loss": 0.6634, + "step": 3807 + }, + { + "epoch": 0.7828142666255524, + "grad_norm": 0.18492446839809418, + "learning_rate": 7.798733423234219e-05, + "loss": 0.6772, + "step": 3808 + }, + { + "epoch": 0.783019837598931, + "grad_norm": 0.18603304028511047, + "learning_rate": 7.798053978160777e-05, + "loss": 0.6888, + "step": 3809 + }, + { + "epoch": 0.7832254085723096, + "grad_norm": 0.1817874163389206, + "learning_rate": 7.797374370607632e-05, + "loss": 0.6675, + "step": 3810 + }, + { + "epoch": 0.7834309795456882, + "grad_norm": 0.1888546198606491, + "learning_rate": 7.796694600608261e-05, + "loss": 0.6472, + "step": 3811 + }, + { + "epoch": 0.7836365505190667, + "grad_norm": 0.18347470462322235, + "learning_rate": 7.796014668196159e-05, + "loss": 0.6368, + "step": 3812 + }, + { + "epoch": 0.7838421214924453, + "grad_norm": 0.18692941963672638, + "learning_rate": 7.795334573404817e-05, + "loss": 0.6637, + "step": 3813 + }, + { + "epoch": 0.7840476924658238, + "grad_norm": 0.18573735654354095, + "learning_rate": 7.794654316267745e-05, + "loss": 0.6716, + "step": 3814 + }, + { + "epoch": 0.7842532634392024, + "grad_norm": 0.1885242462158203, + "learning_rate": 7.793973896818452e-05, + "loss": 0.6957, + "step": 3815 + }, + { + "epoch": 0.7844588344125809, + "grad_norm": 0.19421452283859253, + "learning_rate": 7.793293315090462e-05, + "loss": 0.6977, + "step": 3816 + }, + { + "epoch": 0.7846644053859595, + "grad_norm": 0.18501219153404236, + "learning_rate": 7.792612571117304e-05, + "loss": 0.676, + "step": 3817 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 0.18256261944770813, + "learning_rate": 7.791931664932514e-05, + "loss": 0.6637, + "step": 3818 + }, + { + "epoch": 0.7850755473327167, + "grad_norm": 0.16926661133766174, + "learning_rate": 7.791250596569636e-05, + "loss": 0.5883, + "step": 3819 + }, + { + "epoch": 0.7852811183060951, + "grad_norm": 0.19965988397598267, + "learning_rate": 7.790569366062226e-05, + "loss": 0.6873, + "step": 3820 + }, + { + "epoch": 0.7854866892794737, + "grad_norm": 0.19432468712329865, + "learning_rate": 7.789887973443842e-05, + "loss": 0.6727, + "step": 3821 + }, + { + "epoch": 0.7856922602528523, + "grad_norm": 1.5224770307540894, + "learning_rate": 7.789206418748055e-05, + "loss": 0.6645, + "step": 3822 + }, + { + "epoch": 0.7858978312262308, + "grad_norm": 0.25981712341308594, + "learning_rate": 7.788524702008442e-05, + "loss": 0.6693, + "step": 3823 + }, + { + "epoch": 0.7861034021996094, + "grad_norm": 0.17504632472991943, + "learning_rate": 7.787842823258587e-05, + "loss": 0.6081, + "step": 3824 + }, + { + "epoch": 0.786308973172988, + "grad_norm": 0.20936280488967896, + "learning_rate": 7.787160782532084e-05, + "loss": 0.6833, + "step": 3825 + }, + { + "epoch": 0.7865145441463666, + "grad_norm": 0.2347778081893921, + "learning_rate": 7.786478579862532e-05, + "loss": 0.6824, + "step": 3826 + }, + { + "epoch": 0.786720115119745, + "grad_norm": 0.19294393062591553, + "learning_rate": 7.785796215283543e-05, + "loss": 0.6811, + "step": 3827 + }, + { + "epoch": 0.7869256860931236, + "grad_norm": 0.253738671541214, + "learning_rate": 7.785113688828731e-05, + "loss": 0.6015, + "step": 3828 + }, + { + "epoch": 0.7871312570665022, + "grad_norm": 0.22543035447597504, + "learning_rate": 7.784431000531722e-05, + "loss": 0.6593, + "step": 3829 + }, + { + "epoch": 0.7873368280398808, + "grad_norm": 0.19480814039707184, + "learning_rate": 7.78374815042615e-05, + "loss": 0.6131, + "step": 3830 + }, + { + "epoch": 0.7875423990132593, + "grad_norm": 0.2131412923336029, + "learning_rate": 7.783065138545655e-05, + "loss": 0.6982, + "step": 3831 + }, + { + "epoch": 0.7877479699866379, + "grad_norm": 0.20891313254833221, + "learning_rate": 7.782381964923885e-05, + "loss": 0.6981, + "step": 3832 + }, + { + "epoch": 0.7879535409600165, + "grad_norm": 0.15176214277744293, + "learning_rate": 7.781698629594498e-05, + "loss": 0.5964, + "step": 3833 + }, + { + "epoch": 0.788159111933395, + "grad_norm": 0.19954368472099304, + "learning_rate": 7.781015132591156e-05, + "loss": 0.681, + "step": 3834 + }, + { + "epoch": 0.7883646829067735, + "grad_norm": 0.19388937950134277, + "learning_rate": 7.780331473947537e-05, + "loss": 0.6776, + "step": 3835 + }, + { + "epoch": 0.7885702538801521, + "grad_norm": 0.19515137374401093, + "learning_rate": 7.779647653697317e-05, + "loss": 0.7054, + "step": 3836 + }, + { + "epoch": 0.7887758248535307, + "grad_norm": 0.15485966205596924, + "learning_rate": 7.778963671874186e-05, + "loss": 0.5838, + "step": 3837 + }, + { + "epoch": 0.7889813958269093, + "grad_norm": 0.2033955603837967, + "learning_rate": 7.778279528511841e-05, + "loss": 0.6831, + "step": 3838 + }, + { + "epoch": 0.7891869668002878, + "grad_norm": 0.14127175509929657, + "learning_rate": 7.777595223643985e-05, + "loss": 0.5782, + "step": 3839 + }, + { + "epoch": 0.7893925377736664, + "grad_norm": 0.19278831779956818, + "learning_rate": 7.776910757304333e-05, + "loss": 0.6604, + "step": 3840 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 0.19700968265533447, + "learning_rate": 7.776226129526606e-05, + "loss": 0.6487, + "step": 3841 + }, + { + "epoch": 0.7898036797204234, + "grad_norm": 0.20007772743701935, + "learning_rate": 7.775541340344528e-05, + "loss": 0.7053, + "step": 3842 + }, + { + "epoch": 0.790009250693802, + "grad_norm": 0.1945502907037735, + "learning_rate": 7.774856389791838e-05, + "loss": 0.6633, + "step": 3843 + }, + { + "epoch": 0.7902148216671806, + "grad_norm": 0.18347761034965515, + "learning_rate": 7.774171277902282e-05, + "loss": 0.6509, + "step": 3844 + }, + { + "epoch": 0.7904203926405592, + "grad_norm": 0.1927865594625473, + "learning_rate": 7.773486004709608e-05, + "loss": 0.6873, + "step": 3845 + }, + { + "epoch": 0.7906259636139377, + "grad_norm": 0.1933821141719818, + "learning_rate": 7.772800570247582e-05, + "loss": 0.6784, + "step": 3846 + }, + { + "epoch": 0.7908315345873163, + "grad_norm": 0.1437695473432541, + "learning_rate": 7.772114974549966e-05, + "loss": 0.5979, + "step": 3847 + }, + { + "epoch": 0.7910371055606948, + "grad_norm": 0.20420506596565247, + "learning_rate": 7.77142921765054e-05, + "loss": 0.7083, + "step": 3848 + }, + { + "epoch": 0.7912426765340734, + "grad_norm": 0.13508614897727966, + "learning_rate": 7.770743299583089e-05, + "loss": 0.5824, + "step": 3849 + }, + { + "epoch": 0.7914482475074519, + "grad_norm": 0.1953742653131485, + "learning_rate": 7.770057220381401e-05, + "loss": 0.6655, + "step": 3850 + }, + { + "epoch": 0.7916538184808305, + "grad_norm": 0.192901611328125, + "learning_rate": 7.769370980079277e-05, + "loss": 0.6922, + "step": 3851 + }, + { + "epoch": 0.7918593894542091, + "grad_norm": 0.19612765312194824, + "learning_rate": 7.768684578710528e-05, + "loss": 0.6687, + "step": 3852 + }, + { + "epoch": 0.7920649604275877, + "grad_norm": 0.19205497205257416, + "learning_rate": 7.767998016308968e-05, + "loss": 0.6837, + "step": 3853 + }, + { + "epoch": 0.7922705314009661, + "grad_norm": 0.15582695603370667, + "learning_rate": 7.767311292908419e-05, + "loss": 0.5945, + "step": 3854 + }, + { + "epoch": 0.7924761023743447, + "grad_norm": 0.18942193686962128, + "learning_rate": 7.766624408542713e-05, + "loss": 0.652, + "step": 3855 + }, + { + "epoch": 0.7926816733477233, + "grad_norm": 0.19103151559829712, + "learning_rate": 7.765937363245692e-05, + "loss": 0.6518, + "step": 3856 + }, + { + "epoch": 0.7928872443211019, + "grad_norm": 0.18634134531021118, + "learning_rate": 7.765250157051202e-05, + "loss": 0.6556, + "step": 3857 + }, + { + "epoch": 0.7930928152944804, + "grad_norm": 0.1883394718170166, + "learning_rate": 7.764562789993099e-05, + "loss": 0.6736, + "step": 3858 + }, + { + "epoch": 0.793298386267859, + "grad_norm": 0.18593887984752655, + "learning_rate": 7.763875262105245e-05, + "loss": 0.652, + "step": 3859 + }, + { + "epoch": 0.7935039572412376, + "grad_norm": 0.2020663321018219, + "learning_rate": 7.763187573421511e-05, + "loss": 0.6447, + "step": 3860 + }, + { + "epoch": 0.793709528214616, + "grad_norm": 0.18651576340198517, + "learning_rate": 7.76249972397578e-05, + "loss": 0.6746, + "step": 3861 + }, + { + "epoch": 0.7939150991879946, + "grad_norm": 0.19070084393024445, + "learning_rate": 7.761811713801935e-05, + "loss": 0.6866, + "step": 3862 + }, + { + "epoch": 0.7941206701613732, + "grad_norm": 0.18511120975017548, + "learning_rate": 7.761123542933872e-05, + "loss": 0.6491, + "step": 3863 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 0.18863095343112946, + "learning_rate": 7.760435211405495e-05, + "loss": 0.672, + "step": 3864 + }, + { + "epoch": 0.7945318121081303, + "grad_norm": 0.19631804525852203, + "learning_rate": 7.759746719250714e-05, + "loss": 0.6509, + "step": 3865 + }, + { + "epoch": 0.7947373830815089, + "grad_norm": 0.17893162369728088, + "learning_rate": 7.75905806650345e-05, + "loss": 0.6707, + "step": 3866 + }, + { + "epoch": 0.7949429540548875, + "grad_norm": 0.18233318626880646, + "learning_rate": 7.758369253197626e-05, + "loss": 0.657, + "step": 3867 + }, + { + "epoch": 0.795148525028266, + "grad_norm": 0.19054913520812988, + "learning_rate": 7.757680279367178e-05, + "loss": 0.6796, + "step": 3868 + }, + { + "epoch": 0.7953540960016445, + "grad_norm": 0.20700985193252563, + "learning_rate": 7.75699114504605e-05, + "loss": 0.6672, + "step": 3869 + }, + { + "epoch": 0.7955596669750231, + "grad_norm": 0.1838599294424057, + "learning_rate": 7.756301850268193e-05, + "loss": 0.6721, + "step": 3870 + }, + { + "epoch": 0.7957652379484017, + "grad_norm": 0.1944621503353119, + "learning_rate": 7.755612395067562e-05, + "loss": 0.6751, + "step": 3871 + }, + { + "epoch": 0.7959708089217803, + "grad_norm": 0.18728716671466827, + "learning_rate": 7.754922779478125e-05, + "loss": 0.6765, + "step": 3872 + }, + { + "epoch": 0.7961763798951588, + "grad_norm": 0.18458257615566254, + "learning_rate": 7.754233003533856e-05, + "loss": 0.6609, + "step": 3873 + }, + { + "epoch": 0.7963819508685374, + "grad_norm": 0.18987616896629333, + "learning_rate": 7.753543067268737e-05, + "loss": 0.647, + "step": 3874 + }, + { + "epoch": 0.796587521841916, + "grad_norm": 0.19032716751098633, + "learning_rate": 7.752852970716761e-05, + "loss": 0.6514, + "step": 3875 + }, + { + "epoch": 0.7967930928152945, + "grad_norm": 0.18918365240097046, + "learning_rate": 7.752162713911918e-05, + "loss": 0.6705, + "step": 3876 + }, + { + "epoch": 0.796998663788673, + "grad_norm": 0.18836969137191772, + "learning_rate": 7.751472296888222e-05, + "loss": 0.6651, + "step": 3877 + }, + { + "epoch": 0.7972042347620516, + "grad_norm": 0.18875330686569214, + "learning_rate": 7.750781719679683e-05, + "loss": 0.6864, + "step": 3878 + }, + { + "epoch": 0.7974098057354302, + "grad_norm": 0.18728755414485931, + "learning_rate": 7.750090982320321e-05, + "loss": 0.6629, + "step": 3879 + }, + { + "epoch": 0.7976153767088087, + "grad_norm": 0.1937887966632843, + "learning_rate": 7.749400084844169e-05, + "loss": 0.6673, + "step": 3880 + }, + { + "epoch": 0.7978209476821873, + "grad_norm": 0.16451017558574677, + "learning_rate": 7.748709027285261e-05, + "loss": 0.5989, + "step": 3881 + }, + { + "epoch": 0.7980265186555658, + "grad_norm": 0.1364785134792328, + "learning_rate": 7.748017809677646e-05, + "loss": 0.5949, + "step": 3882 + }, + { + "epoch": 0.7982320896289444, + "grad_norm": 0.14087210595607758, + "learning_rate": 7.747326432055372e-05, + "loss": 0.5753, + "step": 3883 + }, + { + "epoch": 0.7984376606023229, + "grad_norm": 0.20993009209632874, + "learning_rate": 7.746634894452504e-05, + "loss": 0.7021, + "step": 3884 + }, + { + "epoch": 0.7986432315757015, + "grad_norm": 0.1940746009349823, + "learning_rate": 7.74594319690311e-05, + "loss": 0.6743, + "step": 3885 + }, + { + "epoch": 0.7988488025490801, + "grad_norm": 0.1924261897802353, + "learning_rate": 7.745251339441265e-05, + "loss": 0.6795, + "step": 3886 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 0.1905447542667389, + "learning_rate": 7.744559322101056e-05, + "loss": 0.6862, + "step": 3887 + }, + { + "epoch": 0.7992599444958371, + "grad_norm": 0.18997174501419067, + "learning_rate": 7.743867144916573e-05, + "loss": 0.5848, + "step": 3888 + }, + { + "epoch": 0.7994655154692157, + "grad_norm": 0.1488848179578781, + "learning_rate": 7.743174807921919e-05, + "loss": 0.5842, + "step": 3889 + }, + { + "epoch": 0.7996710864425943, + "grad_norm": 0.14569362998008728, + "learning_rate": 7.7424823111512e-05, + "loss": 0.5866, + "step": 3890 + }, + { + "epoch": 0.7998766574159729, + "grad_norm": 0.22627940773963928, + "learning_rate": 7.741789654638532e-05, + "loss": 0.6954, + "step": 3891 + }, + { + "epoch": 0.8000822283893514, + "grad_norm": 0.18143914639949799, + "learning_rate": 7.74109683841804e-05, + "loss": 0.5874, + "step": 3892 + }, + { + "epoch": 0.80028779936273, + "grad_norm": 0.1479119211435318, + "learning_rate": 7.740403862523857e-05, + "loss": 0.5729, + "step": 3893 + }, + { + "epoch": 0.8004933703361086, + "grad_norm": 0.20130044221878052, + "learning_rate": 7.73971072699012e-05, + "loss": 0.6855, + "step": 3894 + }, + { + "epoch": 0.8006989413094872, + "grad_norm": 0.19785720109939575, + "learning_rate": 7.739017431850978e-05, + "loss": 0.687, + "step": 3895 + }, + { + "epoch": 0.8009045122828656, + "grad_norm": 0.20219095051288605, + "learning_rate": 7.738323977140587e-05, + "loss": 0.585, + "step": 3896 + }, + { + "epoch": 0.8011100832562442, + "grad_norm": 0.1963326632976532, + "learning_rate": 7.737630362893109e-05, + "loss": 0.6628, + "step": 3897 + }, + { + "epoch": 0.8013156542296228, + "grad_norm": 0.18930426239967346, + "learning_rate": 7.736936589142717e-05, + "loss": 0.6674, + "step": 3898 + }, + { + "epoch": 0.8015212252030013, + "grad_norm": 0.18726347386837006, + "learning_rate": 7.736242655923587e-05, + "loss": 0.6837, + "step": 3899 + }, + { + "epoch": 0.8017267961763799, + "grad_norm": 0.19241462647914886, + "learning_rate": 7.735548563269907e-05, + "loss": 0.6677, + "step": 3900 + }, + { + "epoch": 0.8019323671497585, + "grad_norm": 0.1922820508480072, + "learning_rate": 7.734854311215874e-05, + "loss": 0.6865, + "step": 3901 + }, + { + "epoch": 0.802137938123137, + "grad_norm": 0.19233377277851105, + "learning_rate": 7.734159899795688e-05, + "loss": 0.6813, + "step": 3902 + }, + { + "epoch": 0.8023435090965155, + "grad_norm": 0.18713760375976562, + "learning_rate": 7.73346532904356e-05, + "loss": 0.6537, + "step": 3903 + }, + { + "epoch": 0.8025490800698941, + "grad_norm": 0.19880633056163788, + "learning_rate": 7.732770598993708e-05, + "loss": 0.6728, + "step": 3904 + }, + { + "epoch": 0.8027546510432727, + "grad_norm": 0.19050458073616028, + "learning_rate": 7.73207570968036e-05, + "loss": 0.6749, + "step": 3905 + }, + { + "epoch": 0.8029602220166513, + "grad_norm": 0.1801813244819641, + "learning_rate": 7.731380661137747e-05, + "loss": 0.5939, + "step": 3906 + }, + { + "epoch": 0.8031657929900298, + "grad_norm": 0.19383971393108368, + "learning_rate": 7.730685453400113e-05, + "loss": 0.6826, + "step": 3907 + }, + { + "epoch": 0.8033713639634084, + "grad_norm": 0.20955929160118103, + "learning_rate": 7.729990086501707e-05, + "loss": 0.6954, + "step": 3908 + }, + { + "epoch": 0.803576934936787, + "grad_norm": 0.19068995118141174, + "learning_rate": 7.729294560476786e-05, + "loss": 0.6686, + "step": 3909 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 0.19245314598083496, + "learning_rate": 7.728598875359615e-05, + "loss": 0.6619, + "step": 3910 + }, + { + "epoch": 0.803988076883544, + "grad_norm": 0.1979014128446579, + "learning_rate": 7.727903031184469e-05, + "loss": 0.6614, + "step": 3911 + }, + { + "epoch": 0.8041936478569226, + "grad_norm": 0.1900876760482788, + "learning_rate": 7.727207027985626e-05, + "loss": 0.6486, + "step": 3912 + }, + { + "epoch": 0.8043992188303012, + "grad_norm": 0.17994777858257294, + "learning_rate": 7.726510865797379e-05, + "loss": 0.6729, + "step": 3913 + }, + { + "epoch": 0.8046047898036797, + "grad_norm": 0.18554867804050446, + "learning_rate": 7.725814544654021e-05, + "loss": 0.6541, + "step": 3914 + }, + { + "epoch": 0.8048103607770583, + "grad_norm": 0.24200813472270966, + "learning_rate": 7.725118064589859e-05, + "loss": 0.6514, + "step": 3915 + }, + { + "epoch": 0.8050159317504368, + "grad_norm": 0.18101008236408234, + "learning_rate": 7.724421425639201e-05, + "loss": 0.6382, + "step": 3916 + }, + { + "epoch": 0.8052215027238154, + "grad_norm": 0.18432863056659698, + "learning_rate": 7.723724627836374e-05, + "loss": 0.64, + "step": 3917 + }, + { + "epoch": 0.8054270736971939, + "grad_norm": 0.19102488458156586, + "learning_rate": 7.7230276712157e-05, + "loss": 0.7106, + "step": 3918 + }, + { + "epoch": 0.8056326446705725, + "grad_norm": 0.16466036438941956, + "learning_rate": 7.722330555811519e-05, + "loss": 0.5831, + "step": 3919 + }, + { + "epoch": 0.8058382156439511, + "grad_norm": 0.19325773417949677, + "learning_rate": 7.721633281658171e-05, + "loss": 0.6855, + "step": 3920 + }, + { + "epoch": 0.8060437866173297, + "grad_norm": 0.1921764314174652, + "learning_rate": 7.720935848790009e-05, + "loss": 0.6858, + "step": 3921 + }, + { + "epoch": 0.8062493575907081, + "grad_norm": 0.1909746527671814, + "learning_rate": 7.720238257241394e-05, + "loss": 0.6825, + "step": 3922 + }, + { + "epoch": 0.8064549285640867, + "grad_norm": 0.18359649181365967, + "learning_rate": 7.71954050704669e-05, + "loss": 0.6807, + "step": 3923 + }, + { + "epoch": 0.8066604995374653, + "grad_norm": 0.1895141303539276, + "learning_rate": 7.718842598240273e-05, + "loss": 0.7047, + "step": 3924 + }, + { + "epoch": 0.8068660705108439, + "grad_norm": 0.18683840334415436, + "learning_rate": 7.718144530856527e-05, + "loss": 0.6704, + "step": 3925 + }, + { + "epoch": 0.8070716414842224, + "grad_norm": 0.19502970576286316, + "learning_rate": 7.717446304929841e-05, + "loss": 0.6785, + "step": 3926 + }, + { + "epoch": 0.807277212457601, + "grad_norm": 0.1623646318912506, + "learning_rate": 7.716747920494615e-05, + "loss": 0.5998, + "step": 3927 + }, + { + "epoch": 0.8074827834309796, + "grad_norm": 0.13050900399684906, + "learning_rate": 7.716049377585252e-05, + "loss": 0.5749, + "step": 3928 + }, + { + "epoch": 0.8076883544043582, + "grad_norm": 0.2015300691127777, + "learning_rate": 7.715350676236169e-05, + "loss": 0.6902, + "step": 3929 + }, + { + "epoch": 0.8078939253777366, + "grad_norm": 0.19763372838497162, + "learning_rate": 7.714651816481788e-05, + "loss": 0.6666, + "step": 3930 + }, + { + "epoch": 0.8080994963511152, + "grad_norm": 0.19438831508159637, + "learning_rate": 7.713952798356535e-05, + "loss": 0.6901, + "step": 3931 + }, + { + "epoch": 0.8083050673244938, + "grad_norm": 0.1897808313369751, + "learning_rate": 7.71325362189485e-05, + "loss": 0.6652, + "step": 3932 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 0.2024880349636078, + "learning_rate": 7.712554287131179e-05, + "loss": 0.6983, + "step": 3933 + }, + { + "epoch": 0.8087162092712509, + "grad_norm": 0.21040861308574677, + "learning_rate": 7.711854794099973e-05, + "loss": 0.6676, + "step": 3934 + }, + { + "epoch": 0.8089217802446295, + "grad_norm": 0.19779765605926514, + "learning_rate": 7.711155142835693e-05, + "loss": 0.6699, + "step": 3935 + }, + { + "epoch": 0.809127351218008, + "grad_norm": 0.18733692169189453, + "learning_rate": 7.710455333372809e-05, + "loss": 0.6876, + "step": 3936 + }, + { + "epoch": 0.8093329221913865, + "grad_norm": 0.18417513370513916, + "learning_rate": 7.709755365745796e-05, + "loss": 0.6592, + "step": 3937 + }, + { + "epoch": 0.8095384931647651, + "grad_norm": 0.19497236609458923, + "learning_rate": 7.709055239989138e-05, + "loss": 0.6704, + "step": 3938 + }, + { + "epoch": 0.8097440641381437, + "grad_norm": 0.19937434792518616, + "learning_rate": 7.708354956137329e-05, + "loss": 0.6672, + "step": 3939 + }, + { + "epoch": 0.8099496351115223, + "grad_norm": 0.18484531342983246, + "learning_rate": 7.707654514224865e-05, + "loss": 0.639, + "step": 3940 + }, + { + "epoch": 0.8101552060849008, + "grad_norm": 0.21879440546035767, + "learning_rate": 7.706953914286256e-05, + "loss": 0.5811, + "step": 3941 + }, + { + "epoch": 0.8103607770582794, + "grad_norm": 0.19117337465286255, + "learning_rate": 7.706253156356018e-05, + "loss": 0.6602, + "step": 3942 + }, + { + "epoch": 0.810566348031658, + "grad_norm": 0.20928023755550385, + "learning_rate": 7.705552240468672e-05, + "loss": 0.6755, + "step": 3943 + }, + { + "epoch": 0.8107719190050365, + "grad_norm": 0.1899488866329193, + "learning_rate": 7.70485116665875e-05, + "loss": 0.6596, + "step": 3944 + }, + { + "epoch": 0.810977489978415, + "grad_norm": 0.1829700917005539, + "learning_rate": 7.70414993496079e-05, + "loss": 0.6536, + "step": 3945 + }, + { + "epoch": 0.8111830609517936, + "grad_norm": 0.2187718152999878, + "learning_rate": 7.70344854540934e-05, + "loss": 0.6712, + "step": 3946 + }, + { + "epoch": 0.8113886319251722, + "grad_norm": 0.1931912750005722, + "learning_rate": 7.702746998038952e-05, + "loss": 0.6848, + "step": 3947 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 0.1904575526714325, + "learning_rate": 7.70204529288419e-05, + "loss": 0.6688, + "step": 3948 + }, + { + "epoch": 0.8117997738719293, + "grad_norm": 0.18743041157722473, + "learning_rate": 7.701343429979622e-05, + "loss": 0.6804, + "step": 3949 + }, + { + "epoch": 0.8120053448453078, + "grad_norm": 0.1948167085647583, + "learning_rate": 7.700641409359827e-05, + "loss": 0.6985, + "step": 3950 + }, + { + "epoch": 0.8122109158186864, + "grad_norm": 0.19588027894496918, + "learning_rate": 7.69993923105939e-05, + "loss": 0.6802, + "step": 3951 + }, + { + "epoch": 0.8124164867920649, + "grad_norm": 0.18361736834049225, + "learning_rate": 7.699236895112903e-05, + "loss": 0.5713, + "step": 3952 + }, + { + "epoch": 0.8126220577654435, + "grad_norm": 0.1924244612455368, + "learning_rate": 7.698534401554966e-05, + "loss": 0.6732, + "step": 3953 + }, + { + "epoch": 0.8128276287388221, + "grad_norm": 0.19700728356838226, + "learning_rate": 7.697831750420189e-05, + "loss": 0.6635, + "step": 3954 + }, + { + "epoch": 0.8130331997122007, + "grad_norm": 0.20763562619686127, + "learning_rate": 7.69712894174319e-05, + "loss": 0.6926, + "step": 3955 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 0.19522826373577118, + "learning_rate": 7.69642597555859e-05, + "loss": 0.6651, + "step": 3956 + }, + { + "epoch": 0.8134443416589577, + "grad_norm": 0.18719004094600677, + "learning_rate": 7.695722851901024e-05, + "loss": 0.6871, + "step": 3957 + }, + { + "epoch": 0.8136499126323363, + "grad_norm": 0.18853691220283508, + "learning_rate": 7.695019570805129e-05, + "loss": 0.6951, + "step": 3958 + }, + { + "epoch": 0.8138554836057149, + "grad_norm": 0.191143199801445, + "learning_rate": 7.694316132305553e-05, + "loss": 0.6819, + "step": 3959 + }, + { + "epoch": 0.8140610545790934, + "grad_norm": 0.20034968852996826, + "learning_rate": 7.69361253643695e-05, + "loss": 0.6813, + "step": 3960 + }, + { + "epoch": 0.814266625552472, + "grad_norm": 0.1926213502883911, + "learning_rate": 7.692908783233987e-05, + "loss": 0.6766, + "step": 3961 + }, + { + "epoch": 0.8144721965258506, + "grad_norm": 0.17970655858516693, + "learning_rate": 7.692204872731329e-05, + "loss": 0.6708, + "step": 3962 + }, + { + "epoch": 0.8146777674992292, + "grad_norm": 0.18484726548194885, + "learning_rate": 7.691500804963659e-05, + "loss": 0.6606, + "step": 3963 + }, + { + "epoch": 0.8148833384726076, + "grad_norm": 0.19342055916786194, + "learning_rate": 7.690796579965661e-05, + "loss": 0.6878, + "step": 3964 + }, + { + "epoch": 0.8150889094459862, + "grad_norm": 0.17727455496788025, + "learning_rate": 7.69009219777203e-05, + "loss": 0.5893, + "step": 3965 + }, + { + "epoch": 0.8152944804193648, + "grad_norm": 0.14557015895843506, + "learning_rate": 7.689387658417466e-05, + "loss": 0.5706, + "step": 3966 + }, + { + "epoch": 0.8155000513927434, + "grad_norm": 0.20403575897216797, + "learning_rate": 7.688682961936678e-05, + "loss": 0.6717, + "step": 3967 + }, + { + "epoch": 0.8157056223661219, + "grad_norm": 0.1949741244316101, + "learning_rate": 7.687978108364386e-05, + "loss": 0.6679, + "step": 3968 + }, + { + "epoch": 0.8159111933395005, + "grad_norm": 0.18995149433612823, + "learning_rate": 7.687273097735314e-05, + "loss": 0.6625, + "step": 3969 + }, + { + "epoch": 0.816116764312879, + "grad_norm": 0.1978754699230194, + "learning_rate": 7.686567930084193e-05, + "loss": 0.6665, + "step": 3970 + }, + { + "epoch": 0.8163223352862575, + "grad_norm": 0.20074686408042908, + "learning_rate": 7.685862605445763e-05, + "loss": 0.585, + "step": 3971 + }, + { + "epoch": 0.8165279062596361, + "grad_norm": 0.2053072452545166, + "learning_rate": 7.685157123854774e-05, + "loss": 0.6753, + "step": 3972 + }, + { + "epoch": 0.8167334772330147, + "grad_norm": 0.19377997517585754, + "learning_rate": 7.68445148534598e-05, + "loss": 0.7029, + "step": 3973 + }, + { + "epoch": 0.8169390482063933, + "grad_norm": 0.19419549405574799, + "learning_rate": 7.683745689954146e-05, + "loss": 0.6722, + "step": 3974 + }, + { + "epoch": 0.8171446191797718, + "grad_norm": 0.1902785748243332, + "learning_rate": 7.683039737714042e-05, + "loss": 0.6982, + "step": 3975 + }, + { + "epoch": 0.8173501901531504, + "grad_norm": 0.19267836213111877, + "learning_rate": 7.68233362866045e-05, + "loss": 0.6485, + "step": 3976 + }, + { + "epoch": 0.817555761126529, + "grad_norm": 0.1380038857460022, + "learning_rate": 7.681627362828152e-05, + "loss": 0.583, + "step": 3977 + }, + { + "epoch": 0.8177613320999075, + "grad_norm": 0.20162338018417358, + "learning_rate": 7.680920940251947e-05, + "loss": 0.662, + "step": 3978 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 0.12970632314682007, + "learning_rate": 7.680214360966631e-05, + "loss": 0.5716, + "step": 3979 + }, + { + "epoch": 0.8181724740466646, + "grad_norm": 0.20082327723503113, + "learning_rate": 7.679507625007021e-05, + "loss": 0.681, + "step": 3980 + }, + { + "epoch": 0.8183780450200432, + "grad_norm": 0.18788529932498932, + "learning_rate": 7.67880073240793e-05, + "loss": 0.6779, + "step": 3981 + }, + { + "epoch": 0.8185836159934218, + "grad_norm": 0.1803288459777832, + "learning_rate": 7.678093683204185e-05, + "loss": 0.6553, + "step": 3982 + }, + { + "epoch": 0.8187891869668003, + "grad_norm": 0.17987079918384552, + "learning_rate": 7.677386477430619e-05, + "loss": 0.6784, + "step": 3983 + }, + { + "epoch": 0.8189947579401788, + "grad_norm": 0.14350593090057373, + "learning_rate": 7.676679115122071e-05, + "loss": 0.5904, + "step": 3984 + }, + { + "epoch": 0.8192003289135574, + "grad_norm": 0.18889760971069336, + "learning_rate": 7.675971596313391e-05, + "loss": 0.6551, + "step": 3985 + }, + { + "epoch": 0.819405899886936, + "grad_norm": 0.1940951943397522, + "learning_rate": 7.675263921039436e-05, + "loss": 0.6905, + "step": 3986 + }, + { + "epoch": 0.8196114708603145, + "grad_norm": 0.18888835608959198, + "learning_rate": 7.674556089335068e-05, + "loss": 0.6613, + "step": 3987 + }, + { + "epoch": 0.8198170418336931, + "grad_norm": 0.18659929931163788, + "learning_rate": 7.673848101235161e-05, + "loss": 0.6346, + "step": 3988 + }, + { + "epoch": 0.8200226128070717, + "grad_norm": 0.19220280647277832, + "learning_rate": 7.67313995677459e-05, + "loss": 0.6835, + "step": 3989 + }, + { + "epoch": 0.8202281837804501, + "grad_norm": 0.18803051114082336, + "learning_rate": 7.672431655988245e-05, + "loss": 0.6733, + "step": 3990 + }, + { + "epoch": 0.8204337547538287, + "grad_norm": 0.15034914016723633, + "learning_rate": 7.671723198911022e-05, + "loss": 0.5774, + "step": 3991 + }, + { + "epoch": 0.8206393257272073, + "grad_norm": 0.19378551840782166, + "learning_rate": 7.671014585577821e-05, + "loss": 0.6688, + "step": 3992 + }, + { + "epoch": 0.8208448967005859, + "grad_norm": 0.22061464190483093, + "learning_rate": 7.670305816023551e-05, + "loss": 0.6763, + "step": 3993 + }, + { + "epoch": 0.8210504676739644, + "grad_norm": 0.18267303705215454, + "learning_rate": 7.669596890283132e-05, + "loss": 0.6657, + "step": 3994 + }, + { + "epoch": 0.821256038647343, + "grad_norm": 0.1902119219303131, + "learning_rate": 7.66888780839149e-05, + "loss": 0.6827, + "step": 3995 + }, + { + "epoch": 0.8214616096207216, + "grad_norm": 0.1934443563222885, + "learning_rate": 7.668178570383558e-05, + "loss": 0.6979, + "step": 3996 + }, + { + "epoch": 0.8216671805941002, + "grad_norm": 0.19263286888599396, + "learning_rate": 7.667469176294272e-05, + "loss": 0.6665, + "step": 3997 + }, + { + "epoch": 0.8218727515674786, + "grad_norm": 0.13605189323425293, + "learning_rate": 7.666759626158587e-05, + "loss": 0.5615, + "step": 3998 + }, + { + "epoch": 0.8220783225408572, + "grad_norm": 0.19073757529258728, + "learning_rate": 7.666049920011457e-05, + "loss": 0.6676, + "step": 3999 + }, + { + "epoch": 0.8222838935142358, + "grad_norm": 0.193292036652565, + "learning_rate": 7.665340057887844e-05, + "loss": 0.6751, + "step": 4000 + }, + { + "epoch": 0.8224894644876144, + "grad_norm": 0.18150904774665833, + "learning_rate": 7.664630039822722e-05, + "loss": 0.6678, + "step": 4001 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 0.19092898070812225, + "learning_rate": 7.663919865851071e-05, + "loss": 0.6643, + "step": 4002 + }, + { + "epoch": 0.8229006064343715, + "grad_norm": 0.1463061273097992, + "learning_rate": 7.663209536007873e-05, + "loss": 0.6015, + "step": 4003 + }, + { + "epoch": 0.82310617740775, + "grad_norm": 0.13264085352420807, + "learning_rate": 7.662499050328129e-05, + "loss": 0.5761, + "step": 4004 + }, + { + "epoch": 0.8233117483811286, + "grad_norm": 0.19010482728481293, + "learning_rate": 7.661788408846837e-05, + "loss": 0.6417, + "step": 4005 + }, + { + "epoch": 0.8235173193545071, + "grad_norm": 0.1999100148677826, + "learning_rate": 7.661077611599007e-05, + "loss": 0.6863, + "step": 4006 + }, + { + "epoch": 0.8237228903278857, + "grad_norm": 0.19514624774456024, + "learning_rate": 7.660366658619658e-05, + "loss": 0.6738, + "step": 4007 + }, + { + "epoch": 0.8239284613012643, + "grad_norm": 0.18463024497032166, + "learning_rate": 7.659655549943817e-05, + "loss": 0.6723, + "step": 4008 + }, + { + "epoch": 0.8241340322746428, + "grad_norm": 0.19612738490104675, + "learning_rate": 7.658944285606515e-05, + "loss": 0.6856, + "step": 4009 + }, + { + "epoch": 0.8243396032480214, + "grad_norm": 0.18983608484268188, + "learning_rate": 7.658232865642793e-05, + "loss": 0.6705, + "step": 4010 + }, + { + "epoch": 0.8245451742214, + "grad_norm": 0.18740776181221008, + "learning_rate": 7.657521290087699e-05, + "loss": 0.6769, + "step": 4011 + }, + { + "epoch": 0.8247507451947785, + "grad_norm": 0.1823440045118332, + "learning_rate": 7.656809558976289e-05, + "loss": 0.663, + "step": 4012 + }, + { + "epoch": 0.824956316168157, + "grad_norm": 0.18513023853302002, + "learning_rate": 7.656097672343626e-05, + "loss": 0.6657, + "step": 4013 + }, + { + "epoch": 0.8251618871415356, + "grad_norm": 0.1865355670452118, + "learning_rate": 7.655385630224783e-05, + "loss": 0.649, + "step": 4014 + }, + { + "epoch": 0.8253674581149142, + "grad_norm": 0.18735235929489136, + "learning_rate": 7.654673432654839e-05, + "loss": 0.6717, + "step": 4015 + }, + { + "epoch": 0.8255730290882928, + "grad_norm": 0.25272443890571594, + "learning_rate": 7.65396107966888e-05, + "loss": 0.5985, + "step": 4016 + }, + { + "epoch": 0.8257786000616713, + "grad_norm": 0.19560717046260834, + "learning_rate": 7.653248571301998e-05, + "loss": 0.6861, + "step": 4017 + }, + { + "epoch": 0.8259841710350498, + "grad_norm": 0.2014644891023636, + "learning_rate": 7.652535907589299e-05, + "loss": 0.6849, + "step": 4018 + }, + { + "epoch": 0.8261897420084284, + "grad_norm": 0.15079200267791748, + "learning_rate": 7.65182308856589e-05, + "loss": 0.5943, + "step": 4019 + }, + { + "epoch": 0.826395312981807, + "grad_norm": 0.19071127474308014, + "learning_rate": 7.651110114266889e-05, + "loss": 0.672, + "step": 4020 + }, + { + "epoch": 0.8266008839551855, + "grad_norm": 0.1912720799446106, + "learning_rate": 7.650396984727422e-05, + "loss": 0.672, + "step": 4021 + }, + { + "epoch": 0.8268064549285641, + "grad_norm": 0.1873595118522644, + "learning_rate": 7.64968369998262e-05, + "loss": 0.6576, + "step": 4022 + }, + { + "epoch": 0.8270120259019427, + "grad_norm": 0.19510895013809204, + "learning_rate": 7.648970260067623e-05, + "loss": 0.6711, + "step": 4023 + }, + { + "epoch": 0.8272175968753213, + "grad_norm": 0.1938508152961731, + "learning_rate": 7.64825666501758e-05, + "loss": 0.6629, + "step": 4024 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 0.1958763152360916, + "learning_rate": 7.647542914867646e-05, + "loss": 0.6749, + "step": 4025 + }, + { + "epoch": 0.8276287388220783, + "grad_norm": 0.18302227556705475, + "learning_rate": 7.646829009652985e-05, + "loss": 0.6462, + "step": 4026 + }, + { + "epoch": 0.8278343097954569, + "grad_norm": 0.15973201394081116, + "learning_rate": 7.646114949408764e-05, + "loss": 0.5734, + "step": 4027 + }, + { + "epoch": 0.8280398807688354, + "grad_norm": 0.18773558735847473, + "learning_rate": 7.645400734170168e-05, + "loss": 0.6912, + "step": 4028 + }, + { + "epoch": 0.828245451742214, + "grad_norm": 0.12838105857372284, + "learning_rate": 7.644686363972378e-05, + "loss": 0.5789, + "step": 4029 + }, + { + "epoch": 0.8284510227155926, + "grad_norm": 0.19766302406787872, + "learning_rate": 7.643971838850589e-05, + "loss": 0.6654, + "step": 4030 + }, + { + "epoch": 0.8286565936889712, + "grad_norm": 0.1896764189004898, + "learning_rate": 7.643257158840001e-05, + "loss": 0.7013, + "step": 4031 + }, + { + "epoch": 0.8288621646623496, + "grad_norm": 0.14424748718738556, + "learning_rate": 7.642542323975826e-05, + "loss": 0.5759, + "step": 4032 + }, + { + "epoch": 0.8290677356357282, + "grad_norm": 0.192418172955513, + "learning_rate": 7.641827334293279e-05, + "loss": 0.697, + "step": 4033 + }, + { + "epoch": 0.8292733066091068, + "grad_norm": 0.19316205382347107, + "learning_rate": 7.641112189827583e-05, + "loss": 0.6466, + "step": 4034 + }, + { + "epoch": 0.8294788775824854, + "grad_norm": 0.17913931608200073, + "learning_rate": 7.640396890613972e-05, + "loss": 0.6539, + "step": 4035 + }, + { + "epoch": 0.8296844485558639, + "grad_norm": 0.1839427500963211, + "learning_rate": 7.639681436687685e-05, + "loss": 0.678, + "step": 4036 + }, + { + "epoch": 0.8298900195292425, + "grad_norm": 0.18442392349243164, + "learning_rate": 7.638965828083966e-05, + "loss": 0.6628, + "step": 4037 + }, + { + "epoch": 0.830095590502621, + "grad_norm": 0.1920039802789688, + "learning_rate": 7.638250064838073e-05, + "loss": 0.6813, + "step": 4038 + }, + { + "epoch": 0.8303011614759996, + "grad_norm": 0.14554156363010406, + "learning_rate": 7.637534146985269e-05, + "loss": 0.5533, + "step": 4039 + }, + { + "epoch": 0.8305067324493781, + "grad_norm": 0.13095219433307648, + "learning_rate": 7.63681807456082e-05, + "loss": 0.5738, + "step": 4040 + }, + { + "epoch": 0.8307123034227567, + "grad_norm": 0.2078784555196762, + "learning_rate": 7.636101847600008e-05, + "loss": 0.6674, + "step": 4041 + }, + { + "epoch": 0.8309178743961353, + "grad_norm": 0.21770761907100677, + "learning_rate": 7.635385466138116e-05, + "loss": 0.6671, + "step": 4042 + }, + { + "epoch": 0.8311234453695138, + "grad_norm": 0.18896861374378204, + "learning_rate": 7.634668930210436e-05, + "loss": 0.6855, + "step": 4043 + }, + { + "epoch": 0.8313290163428924, + "grad_norm": 0.14647965133190155, + "learning_rate": 7.633952239852269e-05, + "loss": 0.598, + "step": 4044 + }, + { + "epoch": 0.831534587316271, + "grad_norm": 0.19375310838222504, + "learning_rate": 7.633235395098923e-05, + "loss": 0.6639, + "step": 4045 + }, + { + "epoch": 0.8317401582896495, + "grad_norm": 0.19974082708358765, + "learning_rate": 7.632518395985715e-05, + "loss": 0.6907, + "step": 4046 + }, + { + "epoch": 0.831945729263028, + "grad_norm": 0.19184468686580658, + "learning_rate": 7.631801242547967e-05, + "loss": 0.6713, + "step": 4047 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 0.13093294203281403, + "learning_rate": 7.631083934821008e-05, + "loss": 0.5689, + "step": 4048 + }, + { + "epoch": 0.8323568712097852, + "grad_norm": 0.19299007952213287, + "learning_rate": 7.63036647284018e-05, + "loss": 0.6664, + "step": 4049 + }, + { + "epoch": 0.8325624421831638, + "grad_norm": 0.19684211909770966, + "learning_rate": 7.629648856640827e-05, + "loss": 0.6594, + "step": 4050 + }, + { + "epoch": 0.8327680131565423, + "grad_norm": 0.1866525262594223, + "learning_rate": 7.6289310862583e-05, + "loss": 0.6664, + "step": 4051 + }, + { + "epoch": 0.8329735841299208, + "grad_norm": 0.1905846893787384, + "learning_rate": 7.628213161727966e-05, + "loss": 0.6458, + "step": 4052 + }, + { + "epoch": 0.8331791551032994, + "grad_norm": 0.19215607643127441, + "learning_rate": 7.62749508308519e-05, + "loss": 0.6508, + "step": 4053 + }, + { + "epoch": 0.833384726076678, + "grad_norm": 0.18882425129413605, + "learning_rate": 7.62677685036535e-05, + "loss": 0.6679, + "step": 4054 + }, + { + "epoch": 0.8335902970500565, + "grad_norm": 0.1906069815158844, + "learning_rate": 7.626058463603828e-05, + "loss": 0.6619, + "step": 4055 + }, + { + "epoch": 0.8337958680234351, + "grad_norm": 0.18673735857009888, + "learning_rate": 7.625339922836016e-05, + "loss": 0.6658, + "step": 4056 + }, + { + "epoch": 0.8340014389968137, + "grad_norm": 0.19083453714847565, + "learning_rate": 7.624621228097316e-05, + "loss": 0.6631, + "step": 4057 + }, + { + "epoch": 0.8342070099701923, + "grad_norm": 0.18321901559829712, + "learning_rate": 7.62390237942313e-05, + "loss": 0.6579, + "step": 4058 + }, + { + "epoch": 0.8344125809435707, + "grad_norm": 0.14776909351348877, + "learning_rate": 7.623183376848878e-05, + "loss": 0.5934, + "step": 4059 + }, + { + "epoch": 0.8346181519169493, + "grad_norm": 0.20167462527751923, + "learning_rate": 7.622464220409975e-05, + "loss": 0.6709, + "step": 4060 + }, + { + "epoch": 0.8348237228903279, + "grad_norm": 0.19711320102214813, + "learning_rate": 7.621744910141858e-05, + "loss": 0.6672, + "step": 4061 + }, + { + "epoch": 0.8350292938637064, + "grad_norm": 0.18972383439540863, + "learning_rate": 7.621025446079956e-05, + "loss": 0.6677, + "step": 4062 + }, + { + "epoch": 0.835234864837085, + "grad_norm": 0.19243162870407104, + "learning_rate": 7.620305828259722e-05, + "loss": 0.6874, + "step": 4063 + }, + { + "epoch": 0.8354404358104636, + "grad_norm": 0.18802182376384735, + "learning_rate": 7.619586056716601e-05, + "loss": 0.6656, + "step": 4064 + }, + { + "epoch": 0.8356460067838422, + "grad_norm": 0.14523807168006897, + "learning_rate": 7.618866131486058e-05, + "loss": 0.6011, + "step": 4065 + }, + { + "epoch": 0.8358515777572206, + "grad_norm": 0.18922917544841766, + "learning_rate": 7.618146052603557e-05, + "loss": 0.6577, + "step": 4066 + }, + { + "epoch": 0.8360571487305992, + "grad_norm": 0.19187946617603302, + "learning_rate": 7.617425820104574e-05, + "loss": 0.6774, + "step": 4067 + }, + { + "epoch": 0.8362627197039778, + "grad_norm": 0.1862529069185257, + "learning_rate": 7.616705434024593e-05, + "loss": 0.6503, + "step": 4068 + }, + { + "epoch": 0.8364682906773564, + "grad_norm": 0.19143825769424438, + "learning_rate": 7.615984894399102e-05, + "loss": 0.6803, + "step": 4069 + }, + { + "epoch": 0.8366738616507349, + "grad_norm": 0.18703386187553406, + "learning_rate": 7.615264201263599e-05, + "loss": 0.6779, + "step": 4070 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 0.18577006459236145, + "learning_rate": 7.61454335465359e-05, + "loss": 0.6671, + "step": 4071 + }, + { + "epoch": 0.837085003597492, + "grad_norm": 0.18921016156673431, + "learning_rate": 7.613822354604587e-05, + "loss": 0.6955, + "step": 4072 + }, + { + "epoch": 0.8372905745708706, + "grad_norm": 0.1349778026342392, + "learning_rate": 7.613101201152111e-05, + "loss": 0.568, + "step": 4073 + }, + { + "epoch": 0.8374961455442491, + "grad_norm": 0.1813334822654724, + "learning_rate": 7.612379894331689e-05, + "loss": 0.6512, + "step": 4074 + }, + { + "epoch": 0.8377017165176277, + "grad_norm": 0.1277725249528885, + "learning_rate": 7.611658434178857e-05, + "loss": 0.5773, + "step": 4075 + }, + { + "epoch": 0.8379072874910063, + "grad_norm": 0.1959075778722763, + "learning_rate": 7.610936820729157e-05, + "loss": 0.6923, + "step": 4076 + }, + { + "epoch": 0.8381128584643849, + "grad_norm": 0.19275759160518646, + "learning_rate": 7.610215054018142e-05, + "loss": 0.6868, + "step": 4077 + }, + { + "epoch": 0.8383184294377634, + "grad_norm": 0.19022993743419647, + "learning_rate": 7.609493134081367e-05, + "loss": 0.636, + "step": 4078 + }, + { + "epoch": 0.838524000411142, + "grad_norm": 0.1396605670452118, + "learning_rate": 7.608771060954399e-05, + "loss": 0.5913, + "step": 4079 + }, + { + "epoch": 0.8387295713845205, + "grad_norm": 0.126824289560318, + "learning_rate": 7.608048834672812e-05, + "loss": 0.5857, + "step": 4080 + }, + { + "epoch": 0.838935142357899, + "grad_norm": 0.20024533569812775, + "learning_rate": 7.607326455272187e-05, + "loss": 0.6722, + "step": 4081 + }, + { + "epoch": 0.8391407133312776, + "grad_norm": 0.19841928780078888, + "learning_rate": 7.606603922788108e-05, + "loss": 0.6507, + "step": 4082 + }, + { + "epoch": 0.8393462843046562, + "grad_norm": 0.17838910222053528, + "learning_rate": 7.605881237256175e-05, + "loss": 0.6203, + "step": 4083 + }, + { + "epoch": 0.8395518552780348, + "grad_norm": 0.1466301828622818, + "learning_rate": 7.605158398711991e-05, + "loss": 0.5627, + "step": 4084 + }, + { + "epoch": 0.8397574262514133, + "grad_norm": 0.1911042481660843, + "learning_rate": 7.604435407191167e-05, + "loss": 0.656, + "step": 4085 + }, + { + "epoch": 0.8399629972247918, + "grad_norm": 0.1837422102689743, + "learning_rate": 7.60371226272932e-05, + "loss": 0.653, + "step": 4086 + }, + { + "epoch": 0.8401685681981704, + "grad_norm": 0.1889040619134903, + "learning_rate": 7.602988965362075e-05, + "loss": 0.6757, + "step": 4087 + }, + { + "epoch": 0.840374139171549, + "grad_norm": 0.18443772196769714, + "learning_rate": 7.602265515125069e-05, + "loss": 0.6627, + "step": 4088 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 0.19531475007534027, + "learning_rate": 7.601541912053939e-05, + "loss": 0.6678, + "step": 4089 + }, + { + "epoch": 0.8407852811183061, + "grad_norm": 0.18012624979019165, + "learning_rate": 7.600818156184338e-05, + "loss": 0.6605, + "step": 4090 + }, + { + "epoch": 0.8409908520916847, + "grad_norm": 0.16611045598983765, + "learning_rate": 7.600094247551918e-05, + "loss": 0.606, + "step": 4091 + }, + { + "epoch": 0.8411964230650633, + "grad_norm": 0.1904737800359726, + "learning_rate": 7.599370186192345e-05, + "loss": 0.6825, + "step": 4092 + }, + { + "epoch": 0.8414019940384417, + "grad_norm": 0.1872866153717041, + "learning_rate": 7.598645972141288e-05, + "loss": 0.6555, + "step": 4093 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 0.1912485808134079, + "learning_rate": 7.59792160543443e-05, + "loss": 0.667, + "step": 4094 + }, + { + "epoch": 0.8418131359851989, + "grad_norm": 0.18316781520843506, + "learning_rate": 7.597197086107451e-05, + "loss": 0.6583, + "step": 4095 + }, + { + "epoch": 0.8420187069585775, + "grad_norm": 0.18488352000713348, + "learning_rate": 7.596472414196049e-05, + "loss": 0.6619, + "step": 4096 + }, + { + "epoch": 0.842224277931956, + "grad_norm": 0.16305844485759735, + "learning_rate": 7.595747589735923e-05, + "loss": 0.5869, + "step": 4097 + }, + { + "epoch": 0.8424298489053346, + "grad_norm": 0.19764935970306396, + "learning_rate": 7.595022612762786e-05, + "loss": 0.6704, + "step": 4098 + }, + { + "epoch": 0.8426354198787132, + "grad_norm": 0.2008553147315979, + "learning_rate": 7.594297483312348e-05, + "loss": 0.6928, + "step": 4099 + }, + { + "epoch": 0.8428409908520916, + "grad_norm": 0.19005800783634186, + "learning_rate": 7.593572201420336e-05, + "loss": 0.68, + "step": 4100 + }, + { + "epoch": 0.8430465618254702, + "grad_norm": 0.18260590732097626, + "learning_rate": 7.592846767122481e-05, + "loss": 0.6452, + "step": 4101 + }, + { + "epoch": 0.8432521327988488, + "grad_norm": 0.24055607616901398, + "learning_rate": 7.592121180454522e-05, + "loss": 0.6555, + "step": 4102 + }, + { + "epoch": 0.8434577037722274, + "grad_norm": 0.18779988586902618, + "learning_rate": 7.591395441452205e-05, + "loss": 0.6558, + "step": 4103 + }, + { + "epoch": 0.8436632747456059, + "grad_norm": 0.19184498488903046, + "learning_rate": 7.590669550151284e-05, + "loss": 0.6737, + "step": 4104 + }, + { + "epoch": 0.8438688457189845, + "grad_norm": 0.17881546914577484, + "learning_rate": 7.58994350658752e-05, + "loss": 0.6482, + "step": 4105 + }, + { + "epoch": 0.844074416692363, + "grad_norm": 0.19403071701526642, + "learning_rate": 7.589217310796682e-05, + "loss": 0.6316, + "step": 4106 + }, + { + "epoch": 0.8442799876657416, + "grad_norm": 0.18991516530513763, + "learning_rate": 7.588490962814544e-05, + "loss": 0.6286, + "step": 4107 + }, + { + "epoch": 0.8444855586391201, + "grad_norm": 0.19792747497558594, + "learning_rate": 7.587764462676895e-05, + "loss": 0.6514, + "step": 4108 + }, + { + "epoch": 0.8446911296124987, + "grad_norm": 0.18424390256404877, + "learning_rate": 7.587037810419521e-05, + "loss": 0.6726, + "step": 4109 + }, + { + "epoch": 0.8448967005858773, + "grad_norm": 0.16541998088359833, + "learning_rate": 7.586311006078223e-05, + "loss": 0.5817, + "step": 4110 + }, + { + "epoch": 0.8451022715592559, + "grad_norm": 0.19858099520206451, + "learning_rate": 7.585584049688807e-05, + "loss": 0.6799, + "step": 4111 + }, + { + "epoch": 0.8453078425326344, + "grad_norm": 0.19580329954624176, + "learning_rate": 7.58485694128709e-05, + "loss": 0.6626, + "step": 4112 + }, + { + "epoch": 0.845513413506013, + "grad_norm": 0.18652157485485077, + "learning_rate": 7.584129680908886e-05, + "loss": 0.6406, + "step": 4113 + }, + { + "epoch": 0.8457189844793915, + "grad_norm": 0.1859186291694641, + "learning_rate": 7.58340226859003e-05, + "loss": 0.6477, + "step": 4114 + }, + { + "epoch": 0.8459245554527701, + "grad_norm": 0.1960713267326355, + "learning_rate": 7.582674704366354e-05, + "loss": 0.6685, + "step": 4115 + }, + { + "epoch": 0.8461301264261486, + "grad_norm": 0.19311878085136414, + "learning_rate": 7.581946988273706e-05, + "loss": 0.6976, + "step": 4116 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 0.18788793683052063, + "learning_rate": 7.581219120347933e-05, + "loss": 0.6545, + "step": 4117 + }, + { + "epoch": 0.8465412683729058, + "grad_norm": 0.1906074583530426, + "learning_rate": 7.580491100624896e-05, + "loss": 0.6772, + "step": 4118 + }, + { + "epoch": 0.8467468393462843, + "grad_norm": 0.18752005696296692, + "learning_rate": 7.579762929140462e-05, + "loss": 0.672, + "step": 4119 + }, + { + "epoch": 0.8469524103196628, + "grad_norm": 0.1863172948360443, + "learning_rate": 7.579034605930502e-05, + "loss": 0.6502, + "step": 4120 + }, + { + "epoch": 0.8471579812930414, + "grad_norm": 0.18836906552314758, + "learning_rate": 7.578306131030898e-05, + "loss": 0.6438, + "step": 4121 + }, + { + "epoch": 0.84736355226642, + "grad_norm": 0.1857694834470749, + "learning_rate": 7.577577504477541e-05, + "loss": 0.6595, + "step": 4122 + }, + { + "epoch": 0.8475691232397985, + "grad_norm": 0.18018977344036102, + "learning_rate": 7.576848726306323e-05, + "loss": 0.6315, + "step": 4123 + }, + { + "epoch": 0.8477746942131771, + "grad_norm": 0.18060006201267242, + "learning_rate": 7.57611979655315e-05, + "loss": 0.6764, + "step": 4124 + }, + { + "epoch": 0.8479802651865557, + "grad_norm": 0.18697619438171387, + "learning_rate": 7.575390715253932e-05, + "loss": 0.6397, + "step": 4125 + }, + { + "epoch": 0.8481858361599343, + "grad_norm": 0.19681645929813385, + "learning_rate": 7.574661482444589e-05, + "loss": 0.663, + "step": 4126 + }, + { + "epoch": 0.8483914071333127, + "grad_norm": 0.18985417485237122, + "learning_rate": 7.573932098161043e-05, + "loss": 0.6413, + "step": 4127 + }, + { + "epoch": 0.8485969781066913, + "grad_norm": 0.183248370885849, + "learning_rate": 7.573202562439232e-05, + "loss": 0.6521, + "step": 4128 + }, + { + "epoch": 0.8488025490800699, + "grad_norm": 0.17444172501564026, + "learning_rate": 7.572472875315095e-05, + "loss": 0.5904, + "step": 4129 + }, + { + "epoch": 0.8490081200534485, + "grad_norm": 0.21605822443962097, + "learning_rate": 7.57174303682458e-05, + "loss": 0.6615, + "step": 4130 + }, + { + "epoch": 0.849213691026827, + "grad_norm": 0.20160672068595886, + "learning_rate": 7.571013047003643e-05, + "loss": 0.7124, + "step": 4131 + }, + { + "epoch": 0.8494192620002056, + "grad_norm": 0.18523965775966644, + "learning_rate": 7.570282905888246e-05, + "loss": 0.6608, + "step": 4132 + }, + { + "epoch": 0.8496248329735842, + "grad_norm": 0.19887828826904297, + "learning_rate": 7.569552613514362e-05, + "loss": 0.6699, + "step": 4133 + }, + { + "epoch": 0.8498304039469627, + "grad_norm": 0.19583609700202942, + "learning_rate": 7.568822169917967e-05, + "loss": 0.6682, + "step": 4134 + }, + { + "epoch": 0.8500359749203412, + "grad_norm": 0.19429847598075867, + "learning_rate": 7.568091575135048e-05, + "loss": 0.6828, + "step": 4135 + }, + { + "epoch": 0.8502415458937198, + "grad_norm": 0.1865924745798111, + "learning_rate": 7.567360829201597e-05, + "loss": 0.674, + "step": 4136 + }, + { + "epoch": 0.8504471168670984, + "grad_norm": 0.17295409739017487, + "learning_rate": 7.566629932153615e-05, + "loss": 0.5802, + "step": 4137 + }, + { + "epoch": 0.8506526878404769, + "grad_norm": 0.1509198248386383, + "learning_rate": 7.565898884027107e-05, + "loss": 0.5835, + "step": 4138 + }, + { + "epoch": 0.8508582588138555, + "grad_norm": 0.2158360481262207, + "learning_rate": 7.565167684858095e-05, + "loss": 0.6711, + "step": 4139 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.17296075820922852, + "learning_rate": 7.564436334682594e-05, + "loss": 0.6029, + "step": 4140 + }, + { + "epoch": 0.8512694007606126, + "grad_norm": 0.21175174415111542, + "learning_rate": 7.56370483353664e-05, + "loss": 0.7072, + "step": 4141 + }, + { + "epoch": 0.8514749717339911, + "grad_norm": 0.1445254236459732, + "learning_rate": 7.562973181456269e-05, + "loss": 0.5766, + "step": 4142 + }, + { + "epoch": 0.8516805427073697, + "grad_norm": 0.19627566635608673, + "learning_rate": 7.562241378477526e-05, + "loss": 0.6652, + "step": 4143 + }, + { + "epoch": 0.8518861136807483, + "grad_norm": 0.22292684018611908, + "learning_rate": 7.561509424636462e-05, + "loss": 0.7013, + "step": 4144 + }, + { + "epoch": 0.8520916846541269, + "grad_norm": 0.1842968612909317, + "learning_rate": 7.560777319969138e-05, + "loss": 0.6621, + "step": 4145 + }, + { + "epoch": 0.8522972556275054, + "grad_norm": 0.19120851159095764, + "learning_rate": 7.560045064511622e-05, + "loss": 0.6508, + "step": 4146 + }, + { + "epoch": 0.852502826600884, + "grad_norm": 0.21807745099067688, + "learning_rate": 7.559312658299988e-05, + "loss": 0.6831, + "step": 4147 + }, + { + "epoch": 0.8527083975742625, + "grad_norm": 0.19106024503707886, + "learning_rate": 7.558580101370318e-05, + "loss": 0.6636, + "step": 4148 + }, + { + "epoch": 0.8529139685476411, + "grad_norm": 0.1850479245185852, + "learning_rate": 7.557847393758702e-05, + "loss": 0.589, + "step": 4149 + }, + { + "epoch": 0.8531195395210196, + "grad_norm": 0.1937406063079834, + "learning_rate": 7.55711453550124e-05, + "loss": 0.6401, + "step": 4150 + }, + { + "epoch": 0.8533251104943982, + "grad_norm": 0.12518863379955292, + "learning_rate": 7.556381526634031e-05, + "loss": 0.5776, + "step": 4151 + }, + { + "epoch": 0.8535306814677768, + "grad_norm": 0.6598914861679077, + "learning_rate": 7.555648367193191e-05, + "loss": 0.6637, + "step": 4152 + }, + { + "epoch": 0.8537362524411554, + "grad_norm": 0.19615043699741364, + "learning_rate": 7.554915057214837e-05, + "loss": 0.6883, + "step": 4153 + }, + { + "epoch": 0.8539418234145338, + "grad_norm": 0.18384511768817902, + "learning_rate": 7.554181596735097e-05, + "loss": 0.6749, + "step": 4154 + }, + { + "epoch": 0.8541473943879124, + "grad_norm": 0.198414608836174, + "learning_rate": 7.553447985790105e-05, + "loss": 0.6878, + "step": 4155 + }, + { + "epoch": 0.854352965361291, + "grad_norm": 0.19876956939697266, + "learning_rate": 7.552714224416002e-05, + "loss": 0.6398, + "step": 4156 + }, + { + "epoch": 0.8545585363346695, + "grad_norm": 0.18689413368701935, + "learning_rate": 7.551980312648939e-05, + "loss": 0.6765, + "step": 4157 + }, + { + "epoch": 0.8547641073080481, + "grad_norm": 0.1880849003791809, + "learning_rate": 7.55124625052507e-05, + "loss": 0.6596, + "step": 4158 + }, + { + "epoch": 0.8549696782814267, + "grad_norm": 0.18960778415203094, + "learning_rate": 7.550512038080559e-05, + "loss": 0.6677, + "step": 4159 + }, + { + "epoch": 0.8551752492548053, + "grad_norm": 0.20969745516777039, + "learning_rate": 7.549777675351581e-05, + "loss": 0.5811, + "step": 4160 + }, + { + "epoch": 0.8553808202281837, + "grad_norm": 0.1950722187757492, + "learning_rate": 7.549043162374308e-05, + "loss": 0.6807, + "step": 4161 + }, + { + "epoch": 0.8555863912015623, + "grad_norm": 0.20414437353610992, + "learning_rate": 7.54830849918493e-05, + "loss": 0.6937, + "step": 4162 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 0.3100520670413971, + "learning_rate": 7.547573685819643e-05, + "loss": 0.5698, + "step": 4163 + }, + { + "epoch": 0.8559975331483195, + "grad_norm": 0.186519056558609, + "learning_rate": 7.546838722314641e-05, + "loss": 0.6604, + "step": 4164 + }, + { + "epoch": 0.856203104121698, + "grad_norm": 0.19283641874790192, + "learning_rate": 7.546103608706137e-05, + "loss": 0.6484, + "step": 4165 + }, + { + "epoch": 0.8564086750950766, + "grad_norm": 0.1958523392677307, + "learning_rate": 7.545368345030348e-05, + "loss": 0.6814, + "step": 4166 + }, + { + "epoch": 0.8566142460684552, + "grad_norm": 0.19231447577476501, + "learning_rate": 7.544632931323492e-05, + "loss": 0.6768, + "step": 4167 + }, + { + "epoch": 0.8568198170418337, + "grad_norm": 0.18475113809108734, + "learning_rate": 7.543897367621804e-05, + "loss": 0.6781, + "step": 4168 + }, + { + "epoch": 0.8570253880152122, + "grad_norm": 0.1537688672542572, + "learning_rate": 7.543161653961518e-05, + "loss": 0.6122, + "step": 4169 + }, + { + "epoch": 0.8572309589885908, + "grad_norm": 0.20179788768291473, + "learning_rate": 7.542425790378882e-05, + "loss": 0.6563, + "step": 4170 + }, + { + "epoch": 0.8574365299619694, + "grad_norm": 0.1862722784280777, + "learning_rate": 7.541689776910149e-05, + "loss": 0.6752, + "step": 4171 + }, + { + "epoch": 0.857642100935348, + "grad_norm": 0.18401017785072327, + "learning_rate": 7.540953613591576e-05, + "loss": 0.6828, + "step": 4172 + }, + { + "epoch": 0.8578476719087265, + "grad_norm": 0.18829752504825592, + "learning_rate": 7.540217300459431e-05, + "loss": 0.6479, + "step": 4173 + }, + { + "epoch": 0.858053242882105, + "grad_norm": 0.19413256645202637, + "learning_rate": 7.539480837549991e-05, + "loss": 0.6429, + "step": 4174 + }, + { + "epoch": 0.8582588138554836, + "grad_norm": 0.19081558287143707, + "learning_rate": 7.538744224899536e-05, + "loss": 0.647, + "step": 4175 + }, + { + "epoch": 0.8584643848288621, + "grad_norm": 0.15339916944503784, + "learning_rate": 7.538007462544356e-05, + "loss": 0.5791, + "step": 4176 + }, + { + "epoch": 0.8586699558022407, + "grad_norm": 0.12977366149425507, + "learning_rate": 7.537270550520749e-05, + "loss": 0.6098, + "step": 4177 + }, + { + "epoch": 0.8588755267756193, + "grad_norm": 0.21286390721797943, + "learning_rate": 7.536533488865016e-05, + "loss": 0.6783, + "step": 4178 + }, + { + "epoch": 0.8590810977489979, + "grad_norm": 0.14268797636032104, + "learning_rate": 7.535796277613473e-05, + "loss": 0.5743, + "step": 4179 + }, + { + "epoch": 0.8592866687223764, + "grad_norm": 0.19620656967163086, + "learning_rate": 7.535058916802435e-05, + "loss": 0.6796, + "step": 4180 + }, + { + "epoch": 0.859492239695755, + "grad_norm": 0.18335068225860596, + "learning_rate": 7.534321406468231e-05, + "loss": 0.6621, + "step": 4181 + }, + { + "epoch": 0.8596978106691335, + "grad_norm": 0.19787956774234772, + "learning_rate": 7.533583746647194e-05, + "loss": 0.6775, + "step": 4182 + }, + { + "epoch": 0.8599033816425121, + "grad_norm": 0.19326303899288177, + "learning_rate": 7.532845937375664e-05, + "loss": 0.6674, + "step": 4183 + }, + { + "epoch": 0.8601089526158906, + "grad_norm": 0.1872076541185379, + "learning_rate": 7.532107978689988e-05, + "loss": 0.6777, + "step": 4184 + }, + { + "epoch": 0.8603145235892692, + "grad_norm": 0.18660016357898712, + "learning_rate": 7.531369870626528e-05, + "loss": 0.6712, + "step": 4185 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 0.19512499868869781, + "learning_rate": 7.53063161322164e-05, + "loss": 0.6848, + "step": 4186 + }, + { + "epoch": 0.8607256655360264, + "grad_norm": 0.19282682240009308, + "learning_rate": 7.5298932065117e-05, + "loss": 0.6611, + "step": 4187 + }, + { + "epoch": 0.8609312365094048, + "grad_norm": 0.2191070318222046, + "learning_rate": 7.529154650533081e-05, + "loss": 0.6792, + "step": 4188 + }, + { + "epoch": 0.8611368074827834, + "grad_norm": 0.1931408941745758, + "learning_rate": 7.528415945322172e-05, + "loss": 0.6362, + "step": 4189 + }, + { + "epoch": 0.861342378456162, + "grad_norm": 0.18459977209568024, + "learning_rate": 7.527677090915364e-05, + "loss": 0.5784, + "step": 4190 + }, + { + "epoch": 0.8615479494295405, + "grad_norm": 0.19997800886631012, + "learning_rate": 7.526938087349057e-05, + "loss": 0.677, + "step": 4191 + }, + { + "epoch": 0.8617535204029191, + "grad_norm": 0.19136178493499756, + "learning_rate": 7.52619893465966e-05, + "loss": 0.6854, + "step": 4192 + }, + { + "epoch": 0.8619590913762977, + "grad_norm": 0.18970435857772827, + "learning_rate": 7.525459632883582e-05, + "loss": 0.674, + "step": 4193 + }, + { + "epoch": 0.8621646623496763, + "grad_norm": 0.21736173331737518, + "learning_rate": 7.524720182057252e-05, + "loss": 0.6546, + "step": 4194 + }, + { + "epoch": 0.8623702333230547, + "grad_norm": 0.1582231968641281, + "learning_rate": 7.523980582217096e-05, + "loss": 0.5956, + "step": 4195 + }, + { + "epoch": 0.8625758042964333, + "grad_norm": 0.19707003235816956, + "learning_rate": 7.52324083339955e-05, + "loss": 0.6682, + "step": 4196 + }, + { + "epoch": 0.8627813752698119, + "grad_norm": 0.19862191379070282, + "learning_rate": 7.522500935641058e-05, + "loss": 0.6435, + "step": 4197 + }, + { + "epoch": 0.8629869462431905, + "grad_norm": 0.1881260871887207, + "learning_rate": 7.521760888978073e-05, + "loss": 0.6581, + "step": 4198 + }, + { + "epoch": 0.863192517216569, + "grad_norm": 0.1898849755525589, + "learning_rate": 7.521020693447052e-05, + "loss": 0.6645, + "step": 4199 + }, + { + "epoch": 0.8633980881899476, + "grad_norm": 0.1787111759185791, + "learning_rate": 7.520280349084462e-05, + "loss": 0.6113, + "step": 4200 + }, + { + "epoch": 0.8636036591633262, + "grad_norm": 0.19326132535934448, + "learning_rate": 7.519539855926777e-05, + "loss": 0.6772, + "step": 4201 + }, + { + "epoch": 0.8638092301367047, + "grad_norm": 0.18564841151237488, + "learning_rate": 7.518799214010474e-05, + "loss": 0.6657, + "step": 4202 + }, + { + "epoch": 0.8640148011100832, + "grad_norm": 0.2385823279619217, + "learning_rate": 7.518058423372045e-05, + "loss": 0.5945, + "step": 4203 + }, + { + "epoch": 0.8642203720834618, + "grad_norm": 0.199651300907135, + "learning_rate": 7.517317484047984e-05, + "loss": 0.664, + "step": 4204 + }, + { + "epoch": 0.8644259430568404, + "grad_norm": 0.194375678896904, + "learning_rate": 7.516576396074794e-05, + "loss": 0.6745, + "step": 4205 + }, + { + "epoch": 0.864631514030219, + "grad_norm": 0.18686725199222565, + "learning_rate": 7.515835159488984e-05, + "loss": 0.6897, + "step": 4206 + }, + { + "epoch": 0.8648370850035975, + "grad_norm": 0.18740524351596832, + "learning_rate": 7.515093774327071e-05, + "loss": 0.6931, + "step": 4207 + }, + { + "epoch": 0.865042655976976, + "grad_norm": 0.1922253668308258, + "learning_rate": 7.514352240625581e-05, + "loss": 0.6467, + "step": 4208 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 0.19109128415584564, + "learning_rate": 7.513610558421045e-05, + "loss": 0.6697, + "step": 4209 + }, + { + "epoch": 0.8654537979237331, + "grad_norm": 0.18134894967079163, + "learning_rate": 7.512868727750002e-05, + "loss": 0.6566, + "step": 4210 + }, + { + "epoch": 0.8656593688971117, + "grad_norm": 0.1900303065776825, + "learning_rate": 7.512126748648999e-05, + "loss": 0.6987, + "step": 4211 + }, + { + "epoch": 0.8658649398704903, + "grad_norm": 0.19076496362686157, + "learning_rate": 7.51138462115459e-05, + "loss": 0.6514, + "step": 4212 + }, + { + "epoch": 0.8660705108438689, + "grad_norm": 0.18519791960716248, + "learning_rate": 7.510642345303338e-05, + "loss": 0.6964, + "step": 4213 + }, + { + "epoch": 0.8662760818172474, + "grad_norm": 0.13831019401550293, + "learning_rate": 7.509899921131805e-05, + "loss": 0.5829, + "step": 4214 + }, + { + "epoch": 0.866481652790626, + "grad_norm": 0.20118573307991028, + "learning_rate": 7.509157348676574e-05, + "loss": 0.6699, + "step": 4215 + }, + { + "epoch": 0.8666872237640045, + "grad_norm": 0.18774531781673431, + "learning_rate": 7.508414627974225e-05, + "loss": 0.6612, + "step": 4216 + }, + { + "epoch": 0.8668927947373831, + "grad_norm": 0.17688573896884918, + "learning_rate": 7.507671759061346e-05, + "loss": 0.6519, + "step": 4217 + }, + { + "epoch": 0.8670983657107616, + "grad_norm": 0.18357358872890472, + "learning_rate": 7.50692874197454e-05, + "loss": 0.6792, + "step": 4218 + }, + { + "epoch": 0.8673039366841402, + "grad_norm": 0.19416451454162598, + "learning_rate": 7.506185576750409e-05, + "loss": 0.6708, + "step": 4219 + }, + { + "epoch": 0.8675095076575188, + "grad_norm": 0.18293076753616333, + "learning_rate": 7.505442263425565e-05, + "loss": 0.6843, + "step": 4220 + }, + { + "epoch": 0.8677150786308974, + "grad_norm": 0.18310247361660004, + "learning_rate": 7.504698802036629e-05, + "loss": 0.6409, + "step": 4221 + }, + { + "epoch": 0.8679206496042758, + "grad_norm": 0.18264919519424438, + "learning_rate": 7.503955192620225e-05, + "loss": 0.6709, + "step": 4222 + }, + { + "epoch": 0.8681262205776544, + "grad_norm": 0.19960664212703705, + "learning_rate": 7.50321143521299e-05, + "loss": 0.6537, + "step": 4223 + }, + { + "epoch": 0.868331791551033, + "grad_norm": 0.19281069934368134, + "learning_rate": 7.502467529851565e-05, + "loss": 0.6657, + "step": 4224 + }, + { + "epoch": 0.8685373625244116, + "grad_norm": 0.19561025500297546, + "learning_rate": 7.501723476572599e-05, + "loss": 0.6867, + "step": 4225 + }, + { + "epoch": 0.8687429334977901, + "grad_norm": 0.17898957431316376, + "learning_rate": 7.500979275412747e-05, + "loss": 0.6587, + "step": 4226 + }, + { + "epoch": 0.8689485044711687, + "grad_norm": 0.19035303592681885, + "learning_rate": 7.500234926408671e-05, + "loss": 0.6719, + "step": 4227 + }, + { + "epoch": 0.8691540754445473, + "grad_norm": 0.1813403069972992, + "learning_rate": 7.499490429597044e-05, + "loss": 0.6734, + "step": 4228 + }, + { + "epoch": 0.8693596464179257, + "grad_norm": 0.18334521353244781, + "learning_rate": 7.498745785014543e-05, + "loss": 0.6559, + "step": 4229 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.17807736992835999, + "learning_rate": 7.498000992697854e-05, + "loss": 0.6318, + "step": 4230 + }, + { + "epoch": 0.8697707883646829, + "grad_norm": 0.18650507926940918, + "learning_rate": 7.497256052683668e-05, + "loss": 0.666, + "step": 4231 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 0.18326011300086975, + "learning_rate": 7.496510965008686e-05, + "loss": 0.6587, + "step": 4232 + }, + { + "epoch": 0.87018193031144, + "grad_norm": 0.1905418336391449, + "learning_rate": 7.495765729709615e-05, + "loss": 0.6544, + "step": 4233 + }, + { + "epoch": 0.8703875012848186, + "grad_norm": 0.187713161110878, + "learning_rate": 7.495020346823168e-05, + "loss": 0.6711, + "step": 4234 + }, + { + "epoch": 0.8705930722581972, + "grad_norm": 0.1464671492576599, + "learning_rate": 7.494274816386066e-05, + "loss": 0.5836, + "step": 4235 + }, + { + "epoch": 0.8707986432315757, + "grad_norm": 0.5995880961418152, + "learning_rate": 7.49352913843504e-05, + "loss": 0.6806, + "step": 4236 + }, + { + "epoch": 0.8710042142049542, + "grad_norm": 0.18340499699115753, + "learning_rate": 7.492783313006827e-05, + "loss": 0.676, + "step": 4237 + }, + { + "epoch": 0.8712097851783328, + "grad_norm": 0.191572368144989, + "learning_rate": 7.492037340138165e-05, + "loss": 0.6651, + "step": 4238 + }, + { + "epoch": 0.8714153561517114, + "grad_norm": 0.13379883766174316, + "learning_rate": 7.49129121986581e-05, + "loss": 0.5498, + "step": 4239 + }, + { + "epoch": 0.87162092712509, + "grad_norm": 0.19760626554489136, + "learning_rate": 7.490544952226517e-05, + "loss": 0.6624, + "step": 4240 + }, + { + "epoch": 0.8718264980984685, + "grad_norm": 0.19867949187755585, + "learning_rate": 7.489798537257052e-05, + "loss": 0.6542, + "step": 4241 + }, + { + "epoch": 0.872032069071847, + "grad_norm": 0.13943122327327728, + "learning_rate": 7.489051974994188e-05, + "loss": 0.5833, + "step": 4242 + }, + { + "epoch": 0.8722376400452256, + "grad_norm": 0.20543548464775085, + "learning_rate": 7.488305265474704e-05, + "loss": 0.6621, + "step": 4243 + }, + { + "epoch": 0.8724432110186042, + "grad_norm": 0.19805829226970673, + "learning_rate": 7.487558408735387e-05, + "loss": 0.6489, + "step": 4244 + }, + { + "epoch": 0.8726487819919827, + "grad_norm": 0.1895926296710968, + "learning_rate": 7.486811404813032e-05, + "loss": 0.688, + "step": 4245 + }, + { + "epoch": 0.8728543529653613, + "grad_norm": 0.13180671632289886, + "learning_rate": 7.486064253744436e-05, + "loss": 0.587, + "step": 4246 + }, + { + "epoch": 0.8730599239387399, + "grad_norm": 0.20886261761188507, + "learning_rate": 7.485316955566414e-05, + "loss": 0.6347, + "step": 4247 + }, + { + "epoch": 0.8732654949121184, + "grad_norm": 0.20359115302562714, + "learning_rate": 7.484569510315778e-05, + "loss": 0.6872, + "step": 4248 + }, + { + "epoch": 0.873471065885497, + "grad_norm": 0.184517964720726, + "learning_rate": 7.483821918029351e-05, + "loss": 0.6556, + "step": 4249 + }, + { + "epoch": 0.8736766368588755, + "grad_norm": 0.1971379816532135, + "learning_rate": 7.483074178743966e-05, + "loss": 0.6817, + "step": 4250 + }, + { + "epoch": 0.8738822078322541, + "grad_norm": 0.19668948650360107, + "learning_rate": 7.482326292496458e-05, + "loss": 0.6625, + "step": 4251 + }, + { + "epoch": 0.8740877788056326, + "grad_norm": 0.1894627958536148, + "learning_rate": 7.481578259323674e-05, + "loss": 0.6445, + "step": 4252 + }, + { + "epoch": 0.8742933497790112, + "grad_norm": 0.1403988003730774, + "learning_rate": 7.480830079262465e-05, + "loss": 0.5633, + "step": 4253 + }, + { + "epoch": 0.8744989207523898, + "grad_norm": 0.12436271458864212, + "learning_rate": 7.48008175234969e-05, + "loss": 0.5708, + "step": 4254 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 0.7834026217460632, + "learning_rate": 7.479333278622216e-05, + "loss": 0.6563, + "step": 4255 + }, + { + "epoch": 0.8749100626991468, + "grad_norm": 0.1350373923778534, + "learning_rate": 7.478584658116915e-05, + "loss": 0.5961, + "step": 4256 + }, + { + "epoch": 0.8751156336725254, + "grad_norm": 0.1937408745288849, + "learning_rate": 7.477835890870672e-05, + "loss": 0.6703, + "step": 4257 + }, + { + "epoch": 0.875321204645904, + "grad_norm": 0.13636933267116547, + "learning_rate": 7.477086976920373e-05, + "loss": 0.5909, + "step": 4258 + }, + { + "epoch": 0.8755267756192826, + "grad_norm": 0.21809430420398712, + "learning_rate": 7.476337916302911e-05, + "loss": 0.6848, + "step": 4259 + }, + { + "epoch": 0.8757323465926611, + "grad_norm": 0.16706953942775726, + "learning_rate": 7.475588709055195e-05, + "loss": 0.5596, + "step": 4260 + }, + { + "epoch": 0.8759379175660397, + "grad_norm": 0.19577208161354065, + "learning_rate": 7.47483935521413e-05, + "loss": 0.6608, + "step": 4261 + }, + { + "epoch": 0.8761434885394183, + "grad_norm": 0.194346085190773, + "learning_rate": 7.474089854816633e-05, + "loss": 0.6508, + "step": 4262 + }, + { + "epoch": 0.8763490595127968, + "grad_norm": 0.20509304106235504, + "learning_rate": 7.47334020789963e-05, + "loss": 0.6794, + "step": 4263 + }, + { + "epoch": 0.8765546304861753, + "grad_norm": 0.20143075287342072, + "learning_rate": 7.472590414500053e-05, + "loss": 0.691, + "step": 4264 + }, + { + "epoch": 0.8767602014595539, + "grad_norm": 0.2505229711532593, + "learning_rate": 7.471840474654838e-05, + "loss": 0.652, + "step": 4265 + }, + { + "epoch": 0.8769657724329325, + "grad_norm": 0.18424780666828156, + "learning_rate": 7.471090388400936e-05, + "loss": 0.6396, + "step": 4266 + }, + { + "epoch": 0.877171343406311, + "grad_norm": 0.18971550464630127, + "learning_rate": 7.470340155775296e-05, + "loss": 0.6445, + "step": 4267 + }, + { + "epoch": 0.8773769143796896, + "grad_norm": 0.19411668181419373, + "learning_rate": 7.46958977681488e-05, + "loss": 0.6377, + "step": 4268 + }, + { + "epoch": 0.8775824853530682, + "grad_norm": 0.1822851151227951, + "learning_rate": 7.468839251556656e-05, + "loss": 0.6684, + "step": 4269 + }, + { + "epoch": 0.8777880563264467, + "grad_norm": 0.17239375412464142, + "learning_rate": 7.468088580037598e-05, + "loss": 0.5929, + "step": 4270 + }, + { + "epoch": 0.8779936272998252, + "grad_norm": 0.19313600659370422, + "learning_rate": 7.467337762294689e-05, + "loss": 0.659, + "step": 4271 + }, + { + "epoch": 0.8781991982732038, + "grad_norm": 0.18807615339756012, + "learning_rate": 7.466586798364918e-05, + "loss": 0.6608, + "step": 4272 + }, + { + "epoch": 0.8784047692465824, + "grad_norm": 0.1784089207649231, + "learning_rate": 7.46583568828528e-05, + "loss": 0.6781, + "step": 4273 + }, + { + "epoch": 0.878610340219961, + "grad_norm": 0.21919219195842743, + "learning_rate": 7.46508443209278e-05, + "loss": 0.6469, + "step": 4274 + }, + { + "epoch": 0.8788159111933395, + "grad_norm": 0.20207509398460388, + "learning_rate": 7.464333029824429e-05, + "loss": 0.6928, + "step": 4275 + }, + { + "epoch": 0.879021482166718, + "grad_norm": 0.18525582551956177, + "learning_rate": 7.463581481517245e-05, + "loss": 0.6391, + "step": 4276 + }, + { + "epoch": 0.8792270531400966, + "grad_norm": 0.1859021782875061, + "learning_rate": 7.462829787208254e-05, + "loss": 0.6515, + "step": 4277 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 0.1962486058473587, + "learning_rate": 7.462077946934488e-05, + "loss": 0.6575, + "step": 4278 + }, + { + "epoch": 0.8796381950868537, + "grad_norm": 0.1927611082792282, + "learning_rate": 7.461325960732984e-05, + "loss": 0.6696, + "step": 4279 + }, + { + "epoch": 0.8798437660602323, + "grad_norm": 0.1841474175453186, + "learning_rate": 7.460573828640791e-05, + "loss": 0.6796, + "step": 4280 + }, + { + "epoch": 0.8800493370336109, + "grad_norm": 0.17558540403842926, + "learning_rate": 7.459821550694965e-05, + "loss": 0.6047, + "step": 4281 + }, + { + "epoch": 0.8802549080069895, + "grad_norm": 0.19254080951213837, + "learning_rate": 7.459069126932565e-05, + "loss": 0.6795, + "step": 4282 + }, + { + "epoch": 0.8804604789803679, + "grad_norm": 0.21128569543361664, + "learning_rate": 7.45831655739066e-05, + "loss": 0.6753, + "step": 4283 + }, + { + "epoch": 0.8806660499537465, + "grad_norm": 0.18865573406219482, + "learning_rate": 7.457563842106324e-05, + "loss": 0.6917, + "step": 4284 + }, + { + "epoch": 0.8808716209271251, + "grad_norm": 0.14653199911117554, + "learning_rate": 7.456810981116643e-05, + "loss": 0.5964, + "step": 4285 + }, + { + "epoch": 0.8810771919005036, + "grad_norm": 0.19860735535621643, + "learning_rate": 7.456057974458704e-05, + "loss": 0.6534, + "step": 4286 + }, + { + "epoch": 0.8812827628738822, + "grad_norm": 0.1889762133359909, + "learning_rate": 7.455304822169606e-05, + "loss": 0.6638, + "step": 4287 + }, + { + "epoch": 0.8814883338472608, + "grad_norm": 0.19104063510894775, + "learning_rate": 7.454551524286451e-05, + "loss": 0.6779, + "step": 4288 + }, + { + "epoch": 0.8816939048206394, + "grad_norm": 0.14345437288284302, + "learning_rate": 7.453798080846353e-05, + "loss": 0.5678, + "step": 4289 + }, + { + "epoch": 0.8818994757940178, + "grad_norm": 0.1917407065629959, + "learning_rate": 7.453044491886429e-05, + "loss": 0.6866, + "step": 4290 + }, + { + "epoch": 0.8821050467673964, + "grad_norm": 0.18986758589744568, + "learning_rate": 7.452290757443806e-05, + "loss": 0.6745, + "step": 4291 + }, + { + "epoch": 0.882310617740775, + "grad_norm": 0.18070244789123535, + "learning_rate": 7.451536877555617e-05, + "loss": 0.6416, + "step": 4292 + }, + { + "epoch": 0.8825161887141536, + "grad_norm": 0.1812668889760971, + "learning_rate": 7.450782852259e-05, + "loss": 0.6547, + "step": 4293 + }, + { + "epoch": 0.8827217596875321, + "grad_norm": 0.19002775847911835, + "learning_rate": 7.450028681591104e-05, + "loss": 0.6392, + "step": 4294 + }, + { + "epoch": 0.8829273306609107, + "grad_norm": 0.18384869396686554, + "learning_rate": 7.449274365589083e-05, + "loss": 0.6789, + "step": 4295 + }, + { + "epoch": 0.8831329016342893, + "grad_norm": 0.13192251324653625, + "learning_rate": 7.4485199042901e-05, + "loss": 0.5635, + "step": 4296 + }, + { + "epoch": 0.8833384726076678, + "grad_norm": 0.20571434497833252, + "learning_rate": 7.447765297731322e-05, + "loss": 0.7032, + "step": 4297 + }, + { + "epoch": 0.8835440435810463, + "grad_norm": 0.1892521232366562, + "learning_rate": 7.447010545949926e-05, + "loss": 0.6616, + "step": 4298 + }, + { + "epoch": 0.8837496145544249, + "grad_norm": 0.1817133128643036, + "learning_rate": 7.446255648983095e-05, + "loss": 0.68, + "step": 4299 + }, + { + "epoch": 0.8839551855278035, + "grad_norm": 0.18332892656326294, + "learning_rate": 7.445500606868016e-05, + "loss": 0.6436, + "step": 4300 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 0.18680675327777863, + "learning_rate": 7.444745419641893e-05, + "loss": 0.6678, + "step": 4301 + }, + { + "epoch": 0.8843663274745606, + "grad_norm": 0.18525037169456482, + "learning_rate": 7.443990087341926e-05, + "loss": 0.6411, + "step": 4302 + }, + { + "epoch": 0.8845718984479392, + "grad_norm": 0.18258033692836761, + "learning_rate": 7.443234610005327e-05, + "loss": 0.6625, + "step": 4303 + }, + { + "epoch": 0.8847774694213177, + "grad_norm": 0.1923125982284546, + "learning_rate": 7.442478987669315e-05, + "loss": 0.646, + "step": 4304 + }, + { + "epoch": 0.8849830403946962, + "grad_norm": 0.18216663599014282, + "learning_rate": 7.441723220371118e-05, + "loss": 0.6628, + "step": 4305 + }, + { + "epoch": 0.8851886113680748, + "grad_norm": 0.15292415022850037, + "learning_rate": 7.440967308147966e-05, + "loss": 0.5989, + "step": 4306 + }, + { + "epoch": 0.8853941823414534, + "grad_norm": 0.187953382730484, + "learning_rate": 7.440211251037101e-05, + "loss": 0.6624, + "step": 4307 + }, + { + "epoch": 0.885599753314832, + "grad_norm": 0.1256251335144043, + "learning_rate": 7.439455049075771e-05, + "loss": 0.5845, + "step": 4308 + }, + { + "epoch": 0.8858053242882105, + "grad_norm": 0.19565753638744354, + "learning_rate": 7.438698702301229e-05, + "loss": 0.674, + "step": 4309 + }, + { + "epoch": 0.886010895261589, + "grad_norm": 0.1811288446187973, + "learning_rate": 7.437942210750737e-05, + "loss": 0.6772, + "step": 4310 + }, + { + "epoch": 0.8862164662349676, + "grad_norm": 0.18292637169361115, + "learning_rate": 7.437185574461564e-05, + "loss": 0.6611, + "step": 4311 + }, + { + "epoch": 0.8864220372083462, + "grad_norm": 0.18883992731571198, + "learning_rate": 7.436428793470987e-05, + "loss": 0.6885, + "step": 4312 + }, + { + "epoch": 0.8866276081817247, + "grad_norm": 0.17563700675964355, + "learning_rate": 7.435671867816288e-05, + "loss": 0.6364, + "step": 4313 + }, + { + "epoch": 0.8868331791551033, + "grad_norm": 0.1886730045080185, + "learning_rate": 7.434914797534758e-05, + "loss": 0.6734, + "step": 4314 + }, + { + "epoch": 0.8870387501284819, + "grad_norm": 0.18746259808540344, + "learning_rate": 7.434157582663691e-05, + "loss": 0.6793, + "step": 4315 + }, + { + "epoch": 0.8872443211018605, + "grad_norm": 0.16091619431972504, + "learning_rate": 7.433400223240397e-05, + "loss": 0.6101, + "step": 4316 + }, + { + "epoch": 0.8874498920752389, + "grad_norm": 0.1879081130027771, + "learning_rate": 7.432642719302184e-05, + "loss": 0.6706, + "step": 4317 + }, + { + "epoch": 0.8876554630486175, + "grad_norm": 0.1933298110961914, + "learning_rate": 7.431885070886372e-05, + "loss": 0.6647, + "step": 4318 + }, + { + "epoch": 0.8878610340219961, + "grad_norm": 0.12698352336883545, + "learning_rate": 7.431127278030285e-05, + "loss": 0.5725, + "step": 4319 + }, + { + "epoch": 0.8880666049953746, + "grad_norm": 0.18227995932102203, + "learning_rate": 7.430369340771258e-05, + "loss": 0.6751, + "step": 4320 + }, + { + "epoch": 0.8882721759687532, + "grad_norm": 0.12696510553359985, + "learning_rate": 7.429611259146628e-05, + "loss": 0.5934, + "step": 4321 + }, + { + "epoch": 0.8884777469421318, + "grad_norm": 0.12385066598653793, + "learning_rate": 7.428853033193745e-05, + "loss": 0.5753, + "step": 4322 + }, + { + "epoch": 0.8886833179155104, + "grad_norm": 0.189598947763443, + "learning_rate": 7.428094662949964e-05, + "loss": 0.6631, + "step": 4323 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.19134309887886047, + "learning_rate": 7.427336148452645e-05, + "loss": 0.6627, + "step": 4324 + }, + { + "epoch": 0.8890944598622674, + "grad_norm": 0.1795106679201126, + "learning_rate": 7.426577489739155e-05, + "loss": 0.6591, + "step": 4325 + }, + { + "epoch": 0.889300030835646, + "grad_norm": 0.18666116893291473, + "learning_rate": 7.425818686846872e-05, + "loss": 0.6704, + "step": 4326 + }, + { + "epoch": 0.8895056018090246, + "grad_norm": 0.18348322808742523, + "learning_rate": 7.425059739813177e-05, + "loss": 0.6872, + "step": 4327 + }, + { + "epoch": 0.8897111727824031, + "grad_norm": 0.18486203253269196, + "learning_rate": 7.424300648675459e-05, + "loss": 0.683, + "step": 4328 + }, + { + "epoch": 0.8899167437557817, + "grad_norm": 0.19054512679576874, + "learning_rate": 7.423541413471117e-05, + "loss": 0.6541, + "step": 4329 + }, + { + "epoch": 0.8901223147291603, + "grad_norm": 0.18132087588310242, + "learning_rate": 7.422782034237554e-05, + "loss": 0.6879, + "step": 4330 + }, + { + "epoch": 0.8903278857025388, + "grad_norm": 0.17876796424388885, + "learning_rate": 7.422022511012182e-05, + "loss": 0.6338, + "step": 4331 + }, + { + "epoch": 0.8905334566759173, + "grad_norm": 0.18260298669338226, + "learning_rate": 7.421262843832417e-05, + "loss": 0.6436, + "step": 4332 + }, + { + "epoch": 0.8907390276492959, + "grad_norm": 0.19324032962322235, + "learning_rate": 7.420503032735688e-05, + "loss": 0.6672, + "step": 4333 + }, + { + "epoch": 0.8909445986226745, + "grad_norm": 0.1886059194803238, + "learning_rate": 7.419743077759423e-05, + "loss": 0.6803, + "step": 4334 + }, + { + "epoch": 0.8911501695960531, + "grad_norm": 0.18304765224456787, + "learning_rate": 7.418982978941065e-05, + "loss": 0.682, + "step": 4335 + }, + { + "epoch": 0.8913557405694316, + "grad_norm": 0.17993968725204468, + "learning_rate": 7.418222736318057e-05, + "loss": 0.5898, + "step": 4336 + }, + { + "epoch": 0.8915613115428102, + "grad_norm": 0.21449178457260132, + "learning_rate": 7.417462349927855e-05, + "loss": 0.6657, + "step": 4337 + }, + { + "epoch": 0.8917668825161887, + "grad_norm": 0.1957646608352661, + "learning_rate": 7.41670181980792e-05, + "loss": 0.6752, + "step": 4338 + }, + { + "epoch": 0.8919724534895672, + "grad_norm": 0.1868593990802765, + "learning_rate": 7.415941145995719e-05, + "loss": 0.7023, + "step": 4339 + }, + { + "epoch": 0.8921780244629458, + "grad_norm": 0.17802853882312775, + "learning_rate": 7.415180328528726e-05, + "loss": 0.6407, + "step": 4340 + }, + { + "epoch": 0.8923835954363244, + "grad_norm": 0.1869519203901291, + "learning_rate": 7.414419367444425e-05, + "loss": 0.6797, + "step": 4341 + }, + { + "epoch": 0.892589166409703, + "grad_norm": 0.1857430785894394, + "learning_rate": 7.413658262780301e-05, + "loss": 0.6507, + "step": 4342 + }, + { + "epoch": 0.8927947373830815, + "grad_norm": 0.18577779829502106, + "learning_rate": 7.412897014573856e-05, + "loss": 0.6426, + "step": 4343 + }, + { + "epoch": 0.89300030835646, + "grad_norm": 0.18945308029651642, + "learning_rate": 7.412135622862588e-05, + "loss": 0.6654, + "step": 4344 + }, + { + "epoch": 0.8932058793298386, + "grad_norm": 0.19126087427139282, + "learning_rate": 7.41137408768401e-05, + "loss": 0.6727, + "step": 4345 + }, + { + "epoch": 0.8934114503032172, + "grad_norm": 0.18092653155326843, + "learning_rate": 7.410612409075639e-05, + "loss": 0.6423, + "step": 4346 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 0.18795473873615265, + "learning_rate": 7.409850587074997e-05, + "loss": 0.671, + "step": 4347 + }, + { + "epoch": 0.8938225922499743, + "grad_norm": 0.18189306557178497, + "learning_rate": 7.409088621719618e-05, + "loss": 0.6605, + "step": 4348 + }, + { + "epoch": 0.8940281632233529, + "grad_norm": 0.532122015953064, + "learning_rate": 7.40832651304704e-05, + "loss": 0.7076, + "step": 4349 + }, + { + "epoch": 0.8942337341967315, + "grad_norm": 0.18601365387439728, + "learning_rate": 7.407564261094808e-05, + "loss": 0.6822, + "step": 4350 + }, + { + "epoch": 0.8944393051701099, + "grad_norm": 0.17646148800849915, + "learning_rate": 7.406801865900474e-05, + "loss": 0.5773, + "step": 4351 + }, + { + "epoch": 0.8946448761434885, + "grad_norm": 0.19108296930789948, + "learning_rate": 7.406039327501599e-05, + "loss": 0.6699, + "step": 4352 + }, + { + "epoch": 0.8948504471168671, + "grad_norm": 0.13074934482574463, + "learning_rate": 7.40527664593575e-05, + "loss": 0.5759, + "step": 4353 + }, + { + "epoch": 0.8950560180902457, + "grad_norm": 0.2011304795742035, + "learning_rate": 7.4045138212405e-05, + "loss": 0.6647, + "step": 4354 + }, + { + "epoch": 0.8952615890636242, + "grad_norm": 0.20265452563762665, + "learning_rate": 7.403750853453428e-05, + "loss": 0.6872, + "step": 4355 + }, + { + "epoch": 0.8954671600370028, + "grad_norm": 0.14710208773612976, + "learning_rate": 7.402987742612124e-05, + "loss": 0.5707, + "step": 4356 + }, + { + "epoch": 0.8956727310103814, + "grad_norm": 0.1870591640472412, + "learning_rate": 7.402224488754184e-05, + "loss": 0.6863, + "step": 4357 + }, + { + "epoch": 0.8958783019837598, + "grad_norm": 0.18606425821781158, + "learning_rate": 7.401461091917206e-05, + "loss": 0.6825, + "step": 4358 + }, + { + "epoch": 0.8960838729571384, + "grad_norm": 0.18178561329841614, + "learning_rate": 7.400697552138803e-05, + "loss": 0.6685, + "step": 4359 + }, + { + "epoch": 0.896289443930517, + "grad_norm": 0.1832335740327835, + "learning_rate": 7.399933869456589e-05, + "loss": 0.6756, + "step": 4360 + }, + { + "epoch": 0.8964950149038956, + "grad_norm": 0.18786631524562836, + "learning_rate": 7.399170043908187e-05, + "loss": 0.6464, + "step": 4361 + }, + { + "epoch": 0.8967005858772741, + "grad_norm": 0.18036015331745148, + "learning_rate": 7.398406075531228e-05, + "loss": 0.6493, + "step": 4362 + }, + { + "epoch": 0.8969061568506527, + "grad_norm": 0.19510389864444733, + "learning_rate": 7.39764196436335e-05, + "loss": 0.6499, + "step": 4363 + }, + { + "epoch": 0.8971117278240313, + "grad_norm": 0.18855442106723785, + "learning_rate": 7.396877710442194e-05, + "loss": 0.6618, + "step": 4364 + }, + { + "epoch": 0.8973172987974098, + "grad_norm": 0.1755952090024948, + "learning_rate": 7.396113313805416e-05, + "loss": 0.5859, + "step": 4365 + }, + { + "epoch": 0.8975228697707883, + "grad_norm": 0.19632591307163239, + "learning_rate": 7.395348774490668e-05, + "loss": 0.6806, + "step": 4366 + }, + { + "epoch": 0.8977284407441669, + "grad_norm": 0.1848839372396469, + "learning_rate": 7.394584092535622e-05, + "loss": 0.6589, + "step": 4367 + }, + { + "epoch": 0.8979340117175455, + "grad_norm": 0.1884489208459854, + "learning_rate": 7.393819267977945e-05, + "loss": 0.6858, + "step": 4368 + }, + { + "epoch": 0.8981395826909241, + "grad_norm": 0.1883459985256195, + "learning_rate": 7.393054300855318e-05, + "loss": 0.6714, + "step": 4369 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 0.18213316798210144, + "learning_rate": 7.392289191205428e-05, + "loss": 0.6601, + "step": 4370 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 0.18287204205989838, + "learning_rate": 7.391523939065969e-05, + "loss": 0.6714, + "step": 4371 + }, + { + "epoch": 0.8987562956110597, + "grad_norm": 0.18707792460918427, + "learning_rate": 7.390758544474639e-05, + "loss": 0.6407, + "step": 4372 + }, + { + "epoch": 0.8989618665844383, + "grad_norm": 0.18532080948352814, + "learning_rate": 7.389993007469148e-05, + "loss": 0.6813, + "step": 4373 + }, + { + "epoch": 0.8991674375578168, + "grad_norm": 0.17980536818504333, + "learning_rate": 7.38922732808721e-05, + "loss": 0.6335, + "step": 4374 + }, + { + "epoch": 0.8993730085311954, + "grad_norm": 0.18949337303638458, + "learning_rate": 7.388461506366544e-05, + "loss": 0.6959, + "step": 4375 + }, + { + "epoch": 0.899578579504574, + "grad_norm": 0.18386761844158173, + "learning_rate": 7.387695542344881e-05, + "loss": 0.6337, + "step": 4376 + }, + { + "epoch": 0.8997841504779525, + "grad_norm": 0.18090958893299103, + "learning_rate": 7.386929436059956e-05, + "loss": 0.6445, + "step": 4377 + }, + { + "epoch": 0.899989721451331, + "grad_norm": 0.18790413439273834, + "learning_rate": 7.386163187549511e-05, + "loss": 0.6622, + "step": 4378 + }, + { + "epoch": 0.9001952924247096, + "grad_norm": 0.18693870306015015, + "learning_rate": 7.385396796851296e-05, + "loss": 0.6711, + "step": 4379 + }, + { + "epoch": 0.9004008633980882, + "grad_norm": 0.18476144969463348, + "learning_rate": 7.384630264003067e-05, + "loss": 0.6642, + "step": 4380 + }, + { + "epoch": 0.9006064343714667, + "grad_norm": 0.18623842298984528, + "learning_rate": 7.383863589042587e-05, + "loss": 0.6242, + "step": 4381 + }, + { + "epoch": 0.9008120053448453, + "grad_norm": 0.14655017852783203, + "learning_rate": 7.383096772007628e-05, + "loss": 0.5558, + "step": 4382 + }, + { + "epoch": 0.9010175763182239, + "grad_norm": 0.18449489772319794, + "learning_rate": 7.382329812935963e-05, + "loss": 0.6603, + "step": 4383 + }, + { + "epoch": 0.9012231472916025, + "grad_norm": 0.1364215761423111, + "learning_rate": 7.381562711865385e-05, + "loss": 0.5671, + "step": 4384 + }, + { + "epoch": 0.9014287182649809, + "grad_norm": 0.19321440160274506, + "learning_rate": 7.380795468833679e-05, + "loss": 0.6826, + "step": 4385 + }, + { + "epoch": 0.9016342892383595, + "grad_norm": 0.18807579576969147, + "learning_rate": 7.380028083878644e-05, + "loss": 0.6982, + "step": 4386 + }, + { + "epoch": 0.9018398602117381, + "grad_norm": 0.18062882125377655, + "learning_rate": 7.379260557038088e-05, + "loss": 0.6676, + "step": 4387 + }, + { + "epoch": 0.9020454311851167, + "grad_norm": 0.14082865417003632, + "learning_rate": 7.37849288834982e-05, + "loss": 0.6026, + "step": 4388 + }, + { + "epoch": 0.9022510021584952, + "grad_norm": 0.1912989616394043, + "learning_rate": 7.377725077851663e-05, + "loss": 0.6711, + "step": 4389 + }, + { + "epoch": 0.9024565731318738, + "grad_norm": 0.12428473681211472, + "learning_rate": 7.376957125581441e-05, + "loss": 0.5805, + "step": 4390 + }, + { + "epoch": 0.9026621441052524, + "grad_norm": 0.1931021362543106, + "learning_rate": 7.376189031576991e-05, + "loss": 0.6652, + "step": 4391 + }, + { + "epoch": 0.902867715078631, + "grad_norm": 0.1896105408668518, + "learning_rate": 7.375420795876148e-05, + "loss": 0.6592, + "step": 4392 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 0.18101376295089722, + "learning_rate": 7.374652418516761e-05, + "loss": 0.6803, + "step": 4393 + }, + { + "epoch": 0.903278857025388, + "grad_norm": 0.18925289809703827, + "learning_rate": 7.373883899536688e-05, + "loss": 0.6599, + "step": 4394 + }, + { + "epoch": 0.9034844279987666, + "grad_norm": 0.18771770596504211, + "learning_rate": 7.373115238973786e-05, + "loss": 0.6866, + "step": 4395 + }, + { + "epoch": 0.9036899989721451, + "grad_norm": 0.18801310658454895, + "learning_rate": 7.372346436865927e-05, + "loss": 0.6602, + "step": 4396 + }, + { + "epoch": 0.9038955699455237, + "grad_norm": 0.1810484528541565, + "learning_rate": 7.371577493250983e-05, + "loss": 0.6377, + "step": 4397 + }, + { + "epoch": 0.9041011409189023, + "grad_norm": 0.18624310195446014, + "learning_rate": 7.370808408166838e-05, + "loss": 0.6655, + "step": 4398 + }, + { + "epoch": 0.9043067118922808, + "grad_norm": 0.1851394772529602, + "learning_rate": 7.37003918165138e-05, + "loss": 0.6622, + "step": 4399 + }, + { + "epoch": 0.9045122828656593, + "grad_norm": 0.20104296505451202, + "learning_rate": 7.369269813742507e-05, + "loss": 0.6727, + "step": 4400 + }, + { + "epoch": 0.9047178538390379, + "grad_norm": 0.15082360804080963, + "learning_rate": 7.368500304478121e-05, + "loss": 0.5995, + "step": 4401 + }, + { + "epoch": 0.9049234248124165, + "grad_norm": 0.13055920600891113, + "learning_rate": 7.367730653896132e-05, + "loss": 0.5763, + "step": 4402 + }, + { + "epoch": 0.9051289957857951, + "grad_norm": 0.1956562101840973, + "learning_rate": 7.366960862034458e-05, + "loss": 0.6743, + "step": 4403 + }, + { + "epoch": 0.9053345667591736, + "grad_norm": 0.1876806765794754, + "learning_rate": 7.366190928931021e-05, + "loss": 0.6862, + "step": 4404 + }, + { + "epoch": 0.9055401377325522, + "grad_norm": 0.1496850550174713, + "learning_rate": 7.365420854623755e-05, + "loss": 0.5858, + "step": 4405 + }, + { + "epoch": 0.9057457087059307, + "grad_norm": 0.14271092414855957, + "learning_rate": 7.364650639150596e-05, + "loss": 0.6152, + "step": 4406 + }, + { + "epoch": 0.9059512796793093, + "grad_norm": 0.20220176875591278, + "learning_rate": 7.36388028254949e-05, + "loss": 0.6771, + "step": 4407 + }, + { + "epoch": 0.9061568506526878, + "grad_norm": 0.13460181653499603, + "learning_rate": 7.363109784858388e-05, + "loss": 0.5904, + "step": 4408 + }, + { + "epoch": 0.9063624216260664, + "grad_norm": 0.13429884612560272, + "learning_rate": 7.362339146115248e-05, + "loss": 0.5729, + "step": 4409 + }, + { + "epoch": 0.906567992599445, + "grad_norm": 0.18408654630184174, + "learning_rate": 7.361568366358038e-05, + "loss": 0.6534, + "step": 4410 + }, + { + "epoch": 0.9067735635728236, + "grad_norm": 0.20259039103984833, + "learning_rate": 7.360797445624729e-05, + "loss": 0.6585, + "step": 4411 + }, + { + "epoch": 0.906979134546202, + "grad_norm": 0.18721166253089905, + "learning_rate": 7.360026383953301e-05, + "loss": 0.6825, + "step": 4412 + }, + { + "epoch": 0.9071847055195806, + "grad_norm": 0.18604475259780884, + "learning_rate": 7.359255181381741e-05, + "loss": 0.6372, + "step": 4413 + }, + { + "epoch": 0.9073902764929592, + "grad_norm": 0.1993558555841446, + "learning_rate": 7.358483837948043e-05, + "loss": 0.653, + "step": 4414 + }, + { + "epoch": 0.9075958474663377, + "grad_norm": 0.18966707587242126, + "learning_rate": 7.357712353690205e-05, + "loss": 0.6598, + "step": 4415 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 0.18439289927482605, + "learning_rate": 7.35694072864624e-05, + "loss": 0.6923, + "step": 4416 + }, + { + "epoch": 0.9080069894130949, + "grad_norm": 0.1838844269514084, + "learning_rate": 7.356168962854155e-05, + "loss": 0.6617, + "step": 4417 + }, + { + "epoch": 0.9082125603864735, + "grad_norm": 0.18378853797912598, + "learning_rate": 7.355397056351975e-05, + "loss": 0.6939, + "step": 4418 + }, + { + "epoch": 0.9084181313598519, + "grad_norm": 0.1807030588388443, + "learning_rate": 7.354625009177729e-05, + "loss": 0.6425, + "step": 4419 + }, + { + "epoch": 0.9086237023332305, + "grad_norm": 0.18497875332832336, + "learning_rate": 7.353852821369452e-05, + "loss": 0.682, + "step": 4420 + }, + { + "epoch": 0.9088292733066091, + "grad_norm": 0.18819686770439148, + "learning_rate": 7.353080492965184e-05, + "loss": 0.6772, + "step": 4421 + }, + { + "epoch": 0.9090348442799877, + "grad_norm": 0.19138747453689575, + "learning_rate": 7.352308024002977e-05, + "loss": 0.5944, + "step": 4422 + }, + { + "epoch": 0.9092404152533662, + "grad_norm": 0.1951584368944168, + "learning_rate": 7.351535414520884e-05, + "loss": 0.6523, + "step": 4423 + }, + { + "epoch": 0.9094459862267448, + "grad_norm": 0.19077381491661072, + "learning_rate": 7.350762664556969e-05, + "loss": 0.6364, + "step": 4424 + }, + { + "epoch": 0.9096515572001234, + "grad_norm": 0.18306457996368408, + "learning_rate": 7.349989774149302e-05, + "loss": 0.6616, + "step": 4425 + }, + { + "epoch": 0.909857128173502, + "grad_norm": 0.18449269235134125, + "learning_rate": 7.349216743335961e-05, + "loss": 0.6431, + "step": 4426 + }, + { + "epoch": 0.9100626991468804, + "grad_norm": 0.18515408039093018, + "learning_rate": 7.348443572155027e-05, + "loss": 0.634, + "step": 4427 + }, + { + "epoch": 0.910268270120259, + "grad_norm": 0.16887053847312927, + "learning_rate": 7.347670260644592e-05, + "loss": 0.5846, + "step": 4428 + }, + { + "epoch": 0.9104738410936376, + "grad_norm": 0.19056186079978943, + "learning_rate": 7.346896808842753e-05, + "loss": 0.6496, + "step": 4429 + }, + { + "epoch": 0.9106794120670162, + "grad_norm": 0.19287076592445374, + "learning_rate": 7.346123216787616e-05, + "loss": 0.6689, + "step": 4430 + }, + { + "epoch": 0.9108849830403947, + "grad_norm": 0.18666431307792664, + "learning_rate": 7.34534948451729e-05, + "loss": 0.6639, + "step": 4431 + }, + { + "epoch": 0.9110905540137733, + "grad_norm": 0.1829727292060852, + "learning_rate": 7.344575612069893e-05, + "loss": 0.6354, + "step": 4432 + }, + { + "epoch": 0.9112961249871518, + "grad_norm": 0.19133751094341278, + "learning_rate": 7.343801599483554e-05, + "loss": 0.6718, + "step": 4433 + }, + { + "epoch": 0.9115016959605303, + "grad_norm": 0.19330398738384247, + "learning_rate": 7.3430274467964e-05, + "loss": 0.6725, + "step": 4434 + }, + { + "epoch": 0.9117072669339089, + "grad_norm": 0.18251781165599823, + "learning_rate": 7.342253154046571e-05, + "loss": 0.6553, + "step": 4435 + }, + { + "epoch": 0.9119128379072875, + "grad_norm": 0.1795288473367691, + "learning_rate": 7.341478721272215e-05, + "loss": 0.6338, + "step": 4436 + }, + { + "epoch": 0.9121184088806661, + "grad_norm": 0.19127197563648224, + "learning_rate": 7.340704148511483e-05, + "loss": 0.6715, + "step": 4437 + }, + { + "epoch": 0.9123239798540446, + "grad_norm": 0.1778208166360855, + "learning_rate": 7.339929435802536e-05, + "loss": 0.6374, + "step": 4438 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 0.18795600533485413, + "learning_rate": 7.339154583183538e-05, + "loss": 0.6714, + "step": 4439 + }, + { + "epoch": 0.9127351218008017, + "grad_norm": 0.1836930364370346, + "learning_rate": 7.338379590692665e-05, + "loss": 0.6638, + "step": 4440 + }, + { + "epoch": 0.9129406927741803, + "grad_norm": 0.1537027806043625, + "learning_rate": 7.337604458368095e-05, + "loss": 0.5768, + "step": 4441 + }, + { + "epoch": 0.9131462637475588, + "grad_norm": 0.1962456852197647, + "learning_rate": 7.336829186248018e-05, + "loss": 0.679, + "step": 4442 + }, + { + "epoch": 0.9133518347209374, + "grad_norm": 0.185762420296669, + "learning_rate": 7.336053774370626e-05, + "loss": 0.6342, + "step": 4443 + }, + { + "epoch": 0.913557405694316, + "grad_norm": 0.18311749398708344, + "learning_rate": 7.33527822277412e-05, + "loss": 0.6502, + "step": 4444 + }, + { + "epoch": 0.9137629766676946, + "grad_norm": 0.17960374057292938, + "learning_rate": 7.334502531496707e-05, + "loss": 0.6496, + "step": 4445 + }, + { + "epoch": 0.913968547641073, + "grad_norm": 0.17319880425930023, + "learning_rate": 7.333726700576603e-05, + "loss": 0.6354, + "step": 4446 + }, + { + "epoch": 0.9141741186144516, + "grad_norm": 0.19266557693481445, + "learning_rate": 7.332950730052029e-05, + "loss": 0.66, + "step": 4447 + }, + { + "epoch": 0.9143796895878302, + "grad_norm": 0.18681232631206512, + "learning_rate": 7.332174619961215e-05, + "loss": 0.6807, + "step": 4448 + }, + { + "epoch": 0.9145852605612088, + "grad_norm": 0.14952509105205536, + "learning_rate": 7.331398370342393e-05, + "loss": 0.5697, + "step": 4449 + }, + { + "epoch": 0.9147908315345873, + "grad_norm": 0.18955743312835693, + "learning_rate": 7.33062198123381e-05, + "loss": 0.6537, + "step": 4450 + }, + { + "epoch": 0.9149964025079659, + "grad_norm": 0.18778184056282043, + "learning_rate": 7.32984545267371e-05, + "loss": 0.6738, + "step": 4451 + }, + { + "epoch": 0.9152019734813445, + "grad_norm": 0.12955501675605774, + "learning_rate": 7.329068784700352e-05, + "loss": 0.5692, + "step": 4452 + }, + { + "epoch": 0.9154075444547229, + "grad_norm": 0.12300048768520355, + "learning_rate": 7.328291977351998e-05, + "loss": 0.5731, + "step": 4453 + }, + { + "epoch": 0.9156131154281015, + "grad_norm": 0.1227407306432724, + "learning_rate": 7.327515030666918e-05, + "loss": 0.5563, + "step": 4454 + }, + { + "epoch": 0.9158186864014801, + "grad_norm": 0.1943395435810089, + "learning_rate": 7.326737944683387e-05, + "loss": 0.6464, + "step": 4455 + }, + { + "epoch": 0.9160242573748587, + "grad_norm": 0.1225886195898056, + "learning_rate": 7.32596071943969e-05, + "loss": 0.5903, + "step": 4456 + }, + { + "epoch": 0.9162298283482372, + "grad_norm": 0.19221986830234528, + "learning_rate": 7.325183354974119e-05, + "loss": 0.6723, + "step": 4457 + }, + { + "epoch": 0.9164353993216158, + "grad_norm": 0.12794283032417297, + "learning_rate": 7.324405851324967e-05, + "loss": 0.5684, + "step": 4458 + }, + { + "epoch": 0.9166409702949944, + "grad_norm": 0.18591567873954773, + "learning_rate": 7.32362820853054e-05, + "loss": 0.6615, + "step": 4459 + }, + { + "epoch": 0.916846541268373, + "grad_norm": 0.1794724315404892, + "learning_rate": 7.32285042662915e-05, + "loss": 0.66, + "step": 4460 + }, + { + "epoch": 0.9170521122417514, + "grad_norm": 0.17996345460414886, + "learning_rate": 7.322072505659111e-05, + "loss": 0.6703, + "step": 4461 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 0.1860412210226059, + "learning_rate": 7.321294445658754e-05, + "loss": 0.6633, + "step": 4462 + }, + { + "epoch": 0.9174632541885086, + "grad_norm": 0.18460632860660553, + "learning_rate": 7.320516246666401e-05, + "loss": 0.6719, + "step": 4463 + }, + { + "epoch": 0.9176688251618872, + "grad_norm": 0.179931178689003, + "learning_rate": 7.3197379087204e-05, + "loss": 0.6874, + "step": 4464 + }, + { + "epoch": 0.9178743961352657, + "grad_norm": 0.17799624800682068, + "learning_rate": 7.31895943185909e-05, + "loss": 0.6829, + "step": 4465 + }, + { + "epoch": 0.9180799671086443, + "grad_norm": 0.17857834696769714, + "learning_rate": 7.318180816120825e-05, + "loss": 0.6732, + "step": 4466 + }, + { + "epoch": 0.9182855380820228, + "grad_norm": 0.18361206352710724, + "learning_rate": 7.317402061543963e-05, + "loss": 0.6628, + "step": 4467 + }, + { + "epoch": 0.9184911090554013, + "grad_norm": 0.18038511276245117, + "learning_rate": 7.316623168166869e-05, + "loss": 0.65, + "step": 4468 + }, + { + "epoch": 0.9186966800287799, + "grad_norm": 0.18144308030605316, + "learning_rate": 7.315844136027917e-05, + "loss": 0.6874, + "step": 4469 + }, + { + "epoch": 0.9189022510021585, + "grad_norm": 0.18093526363372803, + "learning_rate": 7.315064965165486e-05, + "loss": 0.6514, + "step": 4470 + }, + { + "epoch": 0.9191078219755371, + "grad_norm": 0.15748950839042664, + "learning_rate": 7.314285655617962e-05, + "loss": 0.5854, + "step": 4471 + }, + { + "epoch": 0.9193133929489156, + "grad_norm": 0.18608981370925903, + "learning_rate": 7.313506207423738e-05, + "loss": 0.6583, + "step": 4472 + }, + { + "epoch": 0.9195189639222942, + "grad_norm": 0.13079427182674408, + "learning_rate": 7.312726620621211e-05, + "loss": 0.5866, + "step": 4473 + }, + { + "epoch": 0.9197245348956727, + "grad_norm": 0.1949155479669571, + "learning_rate": 7.311946895248793e-05, + "loss": 0.6501, + "step": 4474 + }, + { + "epoch": 0.9199301058690513, + "grad_norm": 0.19749537110328674, + "learning_rate": 7.311167031344894e-05, + "loss": 0.6782, + "step": 4475 + }, + { + "epoch": 0.9201356768424298, + "grad_norm": 0.18449652194976807, + "learning_rate": 7.310387028947934e-05, + "loss": 0.6683, + "step": 4476 + }, + { + "epoch": 0.9203412478158084, + "grad_norm": 0.18260683119297028, + "learning_rate": 7.309606888096341e-05, + "loss": 0.6541, + "step": 4477 + }, + { + "epoch": 0.920546818789187, + "grad_norm": 0.16052718460559845, + "learning_rate": 7.308826608828548e-05, + "loss": 0.5706, + "step": 4478 + }, + { + "epoch": 0.9207523897625656, + "grad_norm": 0.1838260293006897, + "learning_rate": 7.308046191182998e-05, + "loss": 0.6577, + "step": 4479 + }, + { + "epoch": 0.920957960735944, + "grad_norm": 0.12468434125185013, + "learning_rate": 7.307265635198135e-05, + "loss": 0.5989, + "step": 4480 + }, + { + "epoch": 0.9211635317093226, + "grad_norm": 0.12467863410711288, + "learning_rate": 7.306484940912416e-05, + "loss": 0.5734, + "step": 4481 + }, + { + "epoch": 0.9213691026827012, + "grad_norm": 0.2048860639333725, + "learning_rate": 7.305704108364301e-05, + "loss": 0.6777, + "step": 4482 + }, + { + "epoch": 0.9215746736560798, + "grad_norm": 0.12955395877361298, + "learning_rate": 7.304923137592258e-05, + "loss": 0.5742, + "step": 4483 + }, + { + "epoch": 0.9217802446294583, + "grad_norm": 0.18409843742847443, + "learning_rate": 7.304142028634764e-05, + "loss": 0.6323, + "step": 4484 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 0.1324310153722763, + "learning_rate": 7.303360781530299e-05, + "loss": 0.5826, + "step": 4485 + }, + { + "epoch": 0.9221913865762155, + "grad_norm": 0.18649353086948395, + "learning_rate": 7.30257939631735e-05, + "loss": 0.6743, + "step": 4486 + }, + { + "epoch": 0.9223969575495939, + "grad_norm": 0.19196631014347076, + "learning_rate": 7.301797873034412e-05, + "loss": 0.6578, + "step": 4487 + }, + { + "epoch": 0.9226025285229725, + "grad_norm": 0.13730254769325256, + "learning_rate": 7.301016211719992e-05, + "loss": 0.5787, + "step": 4488 + }, + { + "epoch": 0.9228080994963511, + "grad_norm": 0.17747841775417328, + "learning_rate": 7.300234412412593e-05, + "loss": 0.6616, + "step": 4489 + }, + { + "epoch": 0.9230136704697297, + "grad_norm": 0.1930990219116211, + "learning_rate": 7.299452475150732e-05, + "loss": 0.6509, + "step": 4490 + }, + { + "epoch": 0.9232192414431082, + "grad_norm": 0.18891580402851105, + "learning_rate": 7.298670399972933e-05, + "loss": 0.6808, + "step": 4491 + }, + { + "epoch": 0.9234248124164868, + "grad_norm": 0.1751311719417572, + "learning_rate": 7.297888186917724e-05, + "loss": 0.649, + "step": 4492 + }, + { + "epoch": 0.9236303833898654, + "grad_norm": 0.37240174412727356, + "learning_rate": 7.297105836023642e-05, + "loss": 0.6677, + "step": 4493 + }, + { + "epoch": 0.923835954363244, + "grad_norm": 0.1758231371641159, + "learning_rate": 7.296323347329228e-05, + "loss": 0.6484, + "step": 4494 + }, + { + "epoch": 0.9240415253366224, + "grad_norm": 0.18870992958545685, + "learning_rate": 7.295540720873034e-05, + "loss": 0.6792, + "step": 4495 + }, + { + "epoch": 0.924247096310001, + "grad_norm": 0.17921528220176697, + "learning_rate": 7.294757956693616e-05, + "loss": 0.6595, + "step": 4496 + }, + { + "epoch": 0.9244526672833796, + "grad_norm": 0.18485888838768005, + "learning_rate": 7.293975054829534e-05, + "loss": 0.6875, + "step": 4497 + }, + { + "epoch": 0.9246582382567582, + "grad_norm": 0.1897556483745575, + "learning_rate": 7.293192015319359e-05, + "loss": 0.6486, + "step": 4498 + }, + { + "epoch": 0.9248638092301367, + "grad_norm": 0.18818815052509308, + "learning_rate": 7.29240883820167e-05, + "loss": 0.6567, + "step": 4499 + }, + { + "epoch": 0.9250693802035153, + "grad_norm": 0.1831316202878952, + "learning_rate": 7.291625523515051e-05, + "loss": 0.6784, + "step": 4500 + }, + { + "epoch": 0.9252749511768938, + "grad_norm": 0.18603403866291046, + "learning_rate": 7.290842071298088e-05, + "loss": 0.6519, + "step": 4501 + }, + { + "epoch": 0.9254805221502724, + "grad_norm": 0.18271493911743164, + "learning_rate": 7.290058481589381e-05, + "loss": 0.6522, + "step": 4502 + }, + { + "epoch": 0.9256860931236509, + "grad_norm": 0.1795085072517395, + "learning_rate": 7.289274754427536e-05, + "loss": 0.6418, + "step": 4503 + }, + { + "epoch": 0.9258916640970295, + "grad_norm": 0.18269407749176025, + "learning_rate": 7.288490889851158e-05, + "loss": 0.6724, + "step": 4504 + }, + { + "epoch": 0.9260972350704081, + "grad_norm": 0.18335239589214325, + "learning_rate": 7.287706887898867e-05, + "loss": 0.6758, + "step": 4505 + }, + { + "epoch": 0.9263028060437866, + "grad_norm": 0.17889541387557983, + "learning_rate": 7.28692274860929e-05, + "loss": 0.6228, + "step": 4506 + }, + { + "epoch": 0.9265083770171652, + "grad_norm": 0.1919565200805664, + "learning_rate": 7.286138472021053e-05, + "loss": 0.6629, + "step": 4507 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 0.162271648645401, + "learning_rate": 7.285354058172796e-05, + "loss": 0.5823, + "step": 4508 + }, + { + "epoch": 0.9269195189639223, + "grad_norm": 0.18953320384025574, + "learning_rate": 7.284569507103164e-05, + "loss": 0.687, + "step": 4509 + }, + { + "epoch": 0.9271250899373008, + "grad_norm": 0.18748800456523895, + "learning_rate": 7.283784818850807e-05, + "loss": 0.6741, + "step": 4510 + }, + { + "epoch": 0.9273306609106794, + "grad_norm": 0.20176881551742554, + "learning_rate": 7.282999993454383e-05, + "loss": 0.6658, + "step": 4511 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 0.19243447482585907, + "learning_rate": 7.282215030952558e-05, + "loss": 0.6633, + "step": 4512 + }, + { + "epoch": 0.9277418028574366, + "grad_norm": 0.18939968943595886, + "learning_rate": 7.281429931384001e-05, + "loss": 0.6858, + "step": 4513 + }, + { + "epoch": 0.927947373830815, + "grad_norm": 0.1850508600473404, + "learning_rate": 7.280644694787393e-05, + "loss": 0.6459, + "step": 4514 + }, + { + "epoch": 0.9281529448041936, + "grad_norm": 0.18251655995845795, + "learning_rate": 7.279859321201418e-05, + "loss": 0.6619, + "step": 4515 + }, + { + "epoch": 0.9283585157775722, + "grad_norm": 0.18301190435886383, + "learning_rate": 7.279073810664767e-05, + "loss": 0.6507, + "step": 4516 + }, + { + "epoch": 0.9285640867509508, + "grad_norm": 0.15676529705524445, + "learning_rate": 7.278288163216138e-05, + "loss": 0.5846, + "step": 4517 + }, + { + "epoch": 0.9287696577243293, + "grad_norm": 0.18805831670761108, + "learning_rate": 7.277502378894237e-05, + "loss": 0.6531, + "step": 4518 + }, + { + "epoch": 0.9289752286977079, + "grad_norm": 0.1892201006412506, + "learning_rate": 7.276716457737776e-05, + "loss": 0.659, + "step": 4519 + }, + { + "epoch": 0.9291807996710865, + "grad_norm": 0.13629117608070374, + "learning_rate": 7.275930399785473e-05, + "loss": 0.569, + "step": 4520 + }, + { + "epoch": 0.929386370644465, + "grad_norm": 0.20433087646961212, + "learning_rate": 7.275144205076053e-05, + "loss": 0.6686, + "step": 4521 + }, + { + "epoch": 0.9295919416178435, + "grad_norm": 0.1851327121257782, + "learning_rate": 7.274357873648252e-05, + "loss": 0.6472, + "step": 4522 + }, + { + "epoch": 0.9297975125912221, + "grad_norm": 0.19075118005275726, + "learning_rate": 7.273571405540802e-05, + "loss": 0.6702, + "step": 4523 + }, + { + "epoch": 0.9300030835646007, + "grad_norm": 0.1823331117630005, + "learning_rate": 7.272784800792457e-05, + "loss": 0.6637, + "step": 4524 + }, + { + "epoch": 0.9302086545379792, + "grad_norm": 0.18252402544021606, + "learning_rate": 7.271998059441962e-05, + "loss": 0.6553, + "step": 4525 + }, + { + "epoch": 0.9304142255113578, + "grad_norm": 0.18108795583248138, + "learning_rate": 7.27121118152808e-05, + "loss": 0.6487, + "step": 4526 + }, + { + "epoch": 0.9306197964847364, + "grad_norm": 0.18125282227993011, + "learning_rate": 7.270424167089574e-05, + "loss": 0.6674, + "step": 4527 + }, + { + "epoch": 0.930825367458115, + "grad_norm": 0.17340825498104095, + "learning_rate": 7.269637016165218e-05, + "loss": 0.6521, + "step": 4528 + }, + { + "epoch": 0.9310309384314934, + "grad_norm": 0.17793837189674377, + "learning_rate": 7.268849728793794e-05, + "loss": 0.6443, + "step": 4529 + }, + { + "epoch": 0.931236509404872, + "grad_norm": 0.1863885074853897, + "learning_rate": 7.268062305014085e-05, + "loss": 0.6374, + "step": 4530 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 0.1790206879377365, + "learning_rate": 7.267274744864883e-05, + "loss": 0.6463, + "step": 4531 + }, + { + "epoch": 0.9316476513516292, + "grad_norm": 0.19194868206977844, + "learning_rate": 7.266487048384987e-05, + "loss": 0.6575, + "step": 4532 + }, + { + "epoch": 0.9318532223250077, + "grad_norm": 0.17925460636615753, + "learning_rate": 7.265699215613208e-05, + "loss": 0.655, + "step": 4533 + }, + { + "epoch": 0.9320587932983863, + "grad_norm": 0.18405981361865997, + "learning_rate": 7.264911246588353e-05, + "loss": 0.6661, + "step": 4534 + }, + { + "epoch": 0.9322643642717648, + "grad_norm": 0.15504823625087738, + "learning_rate": 7.264123141349245e-05, + "loss": 0.5726, + "step": 4535 + }, + { + "epoch": 0.9324699352451434, + "grad_norm": 0.1932215392589569, + "learning_rate": 7.26333489993471e-05, + "loss": 0.659, + "step": 4536 + }, + { + "epoch": 0.9326755062185219, + "grad_norm": 0.182255357503891, + "learning_rate": 7.262546522383579e-05, + "loss": 0.6792, + "step": 4537 + }, + { + "epoch": 0.9328810771919005, + "grad_norm": 0.1837291121482849, + "learning_rate": 7.261758008734693e-05, + "loss": 0.6816, + "step": 4538 + }, + { + "epoch": 0.9330866481652791, + "grad_norm": 0.1409105658531189, + "learning_rate": 7.2609693590269e-05, + "loss": 0.5832, + "step": 4539 + }, + { + "epoch": 0.9332922191386577, + "grad_norm": 0.19682304561138153, + "learning_rate": 7.260180573299049e-05, + "loss": 0.6693, + "step": 4540 + }, + { + "epoch": 0.9334977901120362, + "grad_norm": 0.1264413744211197, + "learning_rate": 7.259391651590005e-05, + "loss": 0.5933, + "step": 4541 + }, + { + "epoch": 0.9337033610854147, + "grad_norm": 0.1842966377735138, + "learning_rate": 7.258602593938629e-05, + "loss": 0.6619, + "step": 4542 + }, + { + "epoch": 0.9339089320587933, + "grad_norm": 0.18830621242523193, + "learning_rate": 7.257813400383798e-05, + "loss": 0.6614, + "step": 4543 + }, + { + "epoch": 0.9341145030321718, + "grad_norm": 0.17995837330818176, + "learning_rate": 7.257024070964391e-05, + "loss": 0.6535, + "step": 4544 + }, + { + "epoch": 0.9343200740055504, + "grad_norm": 0.1838386356830597, + "learning_rate": 7.256234605719294e-05, + "loss": 0.6598, + "step": 4545 + }, + { + "epoch": 0.934525644978929, + "grad_norm": 0.18245835602283478, + "learning_rate": 7.2554450046874e-05, + "loss": 0.6377, + "step": 4546 + }, + { + "epoch": 0.9347312159523076, + "grad_norm": 0.18414458632469177, + "learning_rate": 7.254655267907611e-05, + "loss": 0.6616, + "step": 4547 + }, + { + "epoch": 0.934936786925686, + "grad_norm": 0.14779187738895416, + "learning_rate": 7.253865395418832e-05, + "loss": 0.574, + "step": 4548 + }, + { + "epoch": 0.9351423578990646, + "grad_norm": 0.13919095695018768, + "learning_rate": 7.253075387259975e-05, + "loss": 0.5738, + "step": 4549 + }, + { + "epoch": 0.9353479288724432, + "grad_norm": 0.20152714848518372, + "learning_rate": 7.252285243469962e-05, + "loss": 0.656, + "step": 4550 + }, + { + "epoch": 0.9355534998458218, + "grad_norm": 0.20961932837963104, + "learning_rate": 7.251494964087721e-05, + "loss": 0.6724, + "step": 4551 + }, + { + "epoch": 0.9357590708192003, + "grad_norm": 0.1847916692495346, + "learning_rate": 7.25070454915218e-05, + "loss": 0.6601, + "step": 4552 + }, + { + "epoch": 0.9359646417925789, + "grad_norm": 0.1776532083749771, + "learning_rate": 7.249913998702287e-05, + "loss": 0.645, + "step": 4553 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.18173301219940186, + "learning_rate": 7.249123312776982e-05, + "loss": 0.6983, + "step": 4554 + }, + { + "epoch": 0.936375783739336, + "grad_norm": 0.18323563039302826, + "learning_rate": 7.24833249141522e-05, + "loss": 0.6603, + "step": 4555 + }, + { + "epoch": 0.9365813547127145, + "grad_norm": 0.18385376036167145, + "learning_rate": 7.247541534655962e-05, + "loss": 0.6551, + "step": 4556 + }, + { + "epoch": 0.9367869256860931, + "grad_norm": 0.18663281202316284, + "learning_rate": 7.246750442538176e-05, + "loss": 0.6562, + "step": 4557 + }, + { + "epoch": 0.9369924966594717, + "grad_norm": 0.18781627714633942, + "learning_rate": 7.245959215100834e-05, + "loss": 0.6772, + "step": 4558 + }, + { + "epoch": 0.9371980676328503, + "grad_norm": 0.18314847350120544, + "learning_rate": 7.245167852382915e-05, + "loss": 0.6523, + "step": 4559 + }, + { + "epoch": 0.9374036386062288, + "grad_norm": 0.18462207913398743, + "learning_rate": 7.244376354423408e-05, + "loss": 0.6716, + "step": 4560 + }, + { + "epoch": 0.9376092095796074, + "grad_norm": 0.18789240717887878, + "learning_rate": 7.243584721261302e-05, + "loss": 0.6672, + "step": 4561 + }, + { + "epoch": 0.937814780552986, + "grad_norm": 0.18339639902114868, + "learning_rate": 7.242792952935604e-05, + "loss": 0.6526, + "step": 4562 + }, + { + "epoch": 0.9380203515263644, + "grad_norm": 0.18866628408432007, + "learning_rate": 7.242001049485314e-05, + "loss": 0.6739, + "step": 4563 + }, + { + "epoch": 0.938225922499743, + "grad_norm": 0.18578583002090454, + "learning_rate": 7.241209010949452e-05, + "loss": 0.6485, + "step": 4564 + }, + { + "epoch": 0.9384314934731216, + "grad_norm": 0.18551675975322723, + "learning_rate": 7.240416837367032e-05, + "loss": 0.6537, + "step": 4565 + }, + { + "epoch": 0.9386370644465002, + "grad_norm": 0.1823461502790451, + "learning_rate": 7.239624528777082e-05, + "loss": 0.6626, + "step": 4566 + }, + { + "epoch": 0.9388426354198787, + "grad_norm": 0.18426772952079773, + "learning_rate": 7.23883208521864e-05, + "loss": 0.6314, + "step": 4567 + }, + { + "epoch": 0.9390482063932573, + "grad_norm": 0.19278199970722198, + "learning_rate": 7.23803950673074e-05, + "loss": 0.6813, + "step": 4568 + }, + { + "epoch": 0.9392537773666358, + "grad_norm": 0.17879322171211243, + "learning_rate": 7.23724679335243e-05, + "loss": 0.6412, + "step": 4569 + }, + { + "epoch": 0.9394593483400144, + "grad_norm": 0.18191079795360565, + "learning_rate": 7.236453945122767e-05, + "loss": 0.6825, + "step": 4570 + }, + { + "epoch": 0.9396649193133929, + "grad_norm": 0.19142381846904755, + "learning_rate": 7.235660962080805e-05, + "loss": 0.6717, + "step": 4571 + }, + { + "epoch": 0.9398704902867715, + "grad_norm": 0.18709653615951538, + "learning_rate": 7.234867844265617e-05, + "loss": 0.6483, + "step": 4572 + }, + { + "epoch": 0.9400760612601501, + "grad_norm": 0.18491537868976593, + "learning_rate": 7.234074591716271e-05, + "loss": 0.6614, + "step": 4573 + }, + { + "epoch": 0.9402816322335287, + "grad_norm": 0.2287359982728958, + "learning_rate": 7.233281204471851e-05, + "loss": 0.5824, + "step": 4574 + }, + { + "epoch": 0.9404872032069072, + "grad_norm": 0.1951487511396408, + "learning_rate": 7.232487682571439e-05, + "loss": 0.6553, + "step": 4575 + }, + { + "epoch": 0.9406927741802857, + "grad_norm": 0.19920621812343597, + "learning_rate": 7.231694026054133e-05, + "loss": 0.6497, + "step": 4576 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 0.15709790587425232, + "learning_rate": 7.230900234959028e-05, + "loss": 0.5685, + "step": 4577 + }, + { + "epoch": 0.9411039161270429, + "grad_norm": 0.19238202273845673, + "learning_rate": 7.230106309325234e-05, + "loss": 0.6771, + "step": 4578 + }, + { + "epoch": 0.9413094871004214, + "grad_norm": 0.1886894553899765, + "learning_rate": 7.229312249191862e-05, + "loss": 0.6278, + "step": 4579 + }, + { + "epoch": 0.9415150580738, + "grad_norm": 0.19003844261169434, + "learning_rate": 7.228518054598032e-05, + "loss": 0.6583, + "step": 4580 + }, + { + "epoch": 0.9417206290471786, + "grad_norm": 0.15929581224918365, + "learning_rate": 7.227723725582871e-05, + "loss": 0.5738, + "step": 4581 + }, + { + "epoch": 0.941926200020557, + "grad_norm": 0.19181537628173828, + "learning_rate": 7.226929262185511e-05, + "loss": 0.6692, + "step": 4582 + }, + { + "epoch": 0.9421317709939356, + "grad_norm": 0.19717134535312653, + "learning_rate": 7.226134664445093e-05, + "loss": 0.665, + "step": 4583 + }, + { + "epoch": 0.9423373419673142, + "grad_norm": 0.17591415345668793, + "learning_rate": 7.22533993240076e-05, + "loss": 0.6358, + "step": 4584 + }, + { + "epoch": 0.9425429129406928, + "grad_norm": 0.18756897747516632, + "learning_rate": 7.224545066091669e-05, + "loss": 0.6755, + "step": 4585 + }, + { + "epoch": 0.9427484839140713, + "grad_norm": 0.18418292701244354, + "learning_rate": 7.223750065556977e-05, + "loss": 0.6498, + "step": 4586 + }, + { + "epoch": 0.9429540548874499, + "grad_norm": 0.14689591526985168, + "learning_rate": 7.222954930835849e-05, + "loss": 0.5795, + "step": 4587 + }, + { + "epoch": 0.9431596258608285, + "grad_norm": 0.18386992812156677, + "learning_rate": 7.222159661967459e-05, + "loss": 0.6699, + "step": 4588 + }, + { + "epoch": 0.943365196834207, + "grad_norm": 0.1894700974225998, + "learning_rate": 7.221364258990985e-05, + "loss": 0.6571, + "step": 4589 + }, + { + "epoch": 0.9435707678075855, + "grad_norm": 0.17809130251407623, + "learning_rate": 7.220568721945614e-05, + "loss": 0.6409, + "step": 4590 + }, + { + "epoch": 0.9437763387809641, + "grad_norm": 0.18572141230106354, + "learning_rate": 7.219773050870537e-05, + "loss": 0.6774, + "step": 4591 + }, + { + "epoch": 0.9439819097543427, + "grad_norm": 0.1781856119632721, + "learning_rate": 7.218977245804955e-05, + "loss": 0.6939, + "step": 4592 + }, + { + "epoch": 0.9441874807277213, + "grad_norm": 0.1840573400259018, + "learning_rate": 7.218181306788074e-05, + "loss": 0.6654, + "step": 4593 + }, + { + "epoch": 0.9443930517010998, + "grad_norm": 0.1829008311033249, + "learning_rate": 7.217385233859102e-05, + "loss": 0.6673, + "step": 4594 + }, + { + "epoch": 0.9445986226744784, + "grad_norm": 0.18518169224262238, + "learning_rate": 7.216589027057262e-05, + "loss": 0.6902, + "step": 4595 + }, + { + "epoch": 0.944804193647857, + "grad_norm": 0.18205900490283966, + "learning_rate": 7.215792686421779e-05, + "loss": 0.6773, + "step": 4596 + }, + { + "epoch": 0.9450097646212354, + "grad_norm": 0.14691917598247528, + "learning_rate": 7.214996211991883e-05, + "loss": 0.5941, + "step": 4597 + }, + { + "epoch": 0.945215335594614, + "grad_norm": 0.18329864740371704, + "learning_rate": 7.214199603806812e-05, + "loss": 0.6699, + "step": 4598 + }, + { + "epoch": 0.9454209065679926, + "grad_norm": 0.19199949502944946, + "learning_rate": 7.213402861905814e-05, + "loss": 0.6787, + "step": 4599 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 0.15589165687561035, + "learning_rate": 7.21260598632814e-05, + "loss": 0.592, + "step": 4600 + }, + { + "epoch": 0.9458320485147497, + "grad_norm": 0.12858957052230835, + "learning_rate": 7.211808977113046e-05, + "loss": 0.5699, + "step": 4601 + }, + { + "epoch": 0.9460376194881283, + "grad_norm": 0.19628196954727173, + "learning_rate": 7.2110118342998e-05, + "loss": 0.6516, + "step": 4602 + }, + { + "epoch": 0.9462431904615068, + "grad_norm": 0.1841566562652588, + "learning_rate": 7.210214557927672e-05, + "loss": 0.6473, + "step": 4603 + }, + { + "epoch": 0.9464487614348854, + "grad_norm": 0.17483435571193695, + "learning_rate": 7.20941714803594e-05, + "loss": 0.6634, + "step": 4604 + }, + { + "epoch": 0.9466543324082639, + "grad_norm": 0.18066710233688354, + "learning_rate": 7.20861960466389e-05, + "loss": 0.6443, + "step": 4605 + }, + { + "epoch": 0.9468599033816425, + "grad_norm": 0.18473312258720398, + "learning_rate": 7.207821927850811e-05, + "loss": 0.6632, + "step": 4606 + }, + { + "epoch": 0.9470654743550211, + "grad_norm": 0.18283243477344513, + "learning_rate": 7.207024117636002e-05, + "loss": 0.6703, + "step": 4607 + }, + { + "epoch": 0.9472710453283997, + "grad_norm": 0.16648972034454346, + "learning_rate": 7.206226174058766e-05, + "loss": 0.5717, + "step": 4608 + }, + { + "epoch": 0.9474766163017782, + "grad_norm": 0.19748379290103912, + "learning_rate": 7.205428097158419e-05, + "loss": 0.6838, + "step": 4609 + }, + { + "epoch": 0.9476821872751567, + "grad_norm": 0.19934770464897156, + "learning_rate": 7.204629886974271e-05, + "loss": 0.6729, + "step": 4610 + }, + { + "epoch": 0.9478877582485353, + "grad_norm": 0.19454436004161835, + "learning_rate": 7.203831543545651e-05, + "loss": 0.6693, + "step": 4611 + }, + { + "epoch": 0.9480933292219139, + "grad_norm": 0.18130190670490265, + "learning_rate": 7.203033066911889e-05, + "loss": 0.6533, + "step": 4612 + }, + { + "epoch": 0.9482989001952924, + "grad_norm": 0.17981968820095062, + "learning_rate": 7.202234457112322e-05, + "loss": 0.6574, + "step": 4613 + }, + { + "epoch": 0.948504471168671, + "grad_norm": 0.1938825398683548, + "learning_rate": 7.201435714186294e-05, + "loss": 0.6517, + "step": 4614 + }, + { + "epoch": 0.9487100421420496, + "grad_norm": 0.18407849967479706, + "learning_rate": 7.200636838173153e-05, + "loss": 0.6561, + "step": 4615 + }, + { + "epoch": 0.948915613115428, + "grad_norm": 0.186232790350914, + "learning_rate": 7.199837829112259e-05, + "loss": 0.6417, + "step": 4616 + }, + { + "epoch": 0.9491211840888066, + "grad_norm": 0.14791084825992584, + "learning_rate": 7.199038687042973e-05, + "loss": 0.591, + "step": 4617 + }, + { + "epoch": 0.9493267550621852, + "grad_norm": 0.1812361627817154, + "learning_rate": 7.198239412004667e-05, + "loss": 0.6669, + "step": 4618 + }, + { + "epoch": 0.9495323260355638, + "grad_norm": 0.18593737483024597, + "learning_rate": 7.197440004036716e-05, + "loss": 0.6999, + "step": 4619 + }, + { + "epoch": 0.9497378970089423, + "grad_norm": 0.17995983362197876, + "learning_rate": 7.196640463178506e-05, + "loss": 0.6708, + "step": 4620 + }, + { + "epoch": 0.9499434679823209, + "grad_norm": 0.19636765122413635, + "learning_rate": 7.195840789469422e-05, + "loss": 0.6667, + "step": 4621 + }, + { + "epoch": 0.9501490389556995, + "grad_norm": 0.18465958535671234, + "learning_rate": 7.195040982948865e-05, + "loss": 0.6646, + "step": 4622 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 0.1823161542415619, + "learning_rate": 7.194241043656234e-05, + "loss": 0.636, + "step": 4623 + }, + { + "epoch": 0.9505601809024565, + "grad_norm": 0.19029709696769714, + "learning_rate": 7.19344097163094e-05, + "loss": 0.6731, + "step": 4624 + }, + { + "epoch": 0.9507657518758351, + "grad_norm": 0.19934682548046112, + "learning_rate": 7.192640766912397e-05, + "loss": 0.6384, + "step": 4625 + }, + { + "epoch": 0.9509713228492137, + "grad_norm": 0.14476045966148376, + "learning_rate": 7.19184042954003e-05, + "loss": 0.5821, + "step": 4626 + }, + { + "epoch": 0.9511768938225923, + "grad_norm": 0.1817658394575119, + "learning_rate": 7.191039959553266e-05, + "loss": 0.6815, + "step": 4627 + }, + { + "epoch": 0.9513824647959708, + "grad_norm": 0.1834515631198883, + "learning_rate": 7.190239356991542e-05, + "loss": 0.6393, + "step": 4628 + }, + { + "epoch": 0.9515880357693494, + "grad_norm": 0.18767185509204865, + "learning_rate": 7.189438621894298e-05, + "loss": 0.6657, + "step": 4629 + }, + { + "epoch": 0.951793606742728, + "grad_norm": 0.18882089853286743, + "learning_rate": 7.188637754300984e-05, + "loss": 0.6531, + "step": 4630 + }, + { + "epoch": 0.9519991777161065, + "grad_norm": 0.18064305186271667, + "learning_rate": 7.187836754251055e-05, + "loss": 0.6739, + "step": 4631 + }, + { + "epoch": 0.952204748689485, + "grad_norm": 0.18906618654727936, + "learning_rate": 7.187035621783972e-05, + "loss": 0.68, + "step": 4632 + }, + { + "epoch": 0.9524103196628636, + "grad_norm": 0.1903999000787735, + "learning_rate": 7.186234356939204e-05, + "loss": 0.6503, + "step": 4633 + }, + { + "epoch": 0.9526158906362422, + "grad_norm": 0.184392049908638, + "learning_rate": 7.185432959756222e-05, + "loss": 0.6723, + "step": 4634 + }, + { + "epoch": 0.9528214616096207, + "grad_norm": 0.19594880938529968, + "learning_rate": 7.184631430274512e-05, + "loss": 0.6487, + "step": 4635 + }, + { + "epoch": 0.9530270325829993, + "grad_norm": 0.1459794044494629, + "learning_rate": 7.183829768533558e-05, + "loss": 0.5766, + "step": 4636 + }, + { + "epoch": 0.9532326035563778, + "grad_norm": 0.19931526482105255, + "learning_rate": 7.183027974572856e-05, + "loss": 0.6702, + "step": 4637 + }, + { + "epoch": 0.9534381745297564, + "grad_norm": 0.18936146795749664, + "learning_rate": 7.182226048431907e-05, + "loss": 0.6409, + "step": 4638 + }, + { + "epoch": 0.9536437455031349, + "grad_norm": 0.12762728333473206, + "learning_rate": 7.181423990150215e-05, + "loss": 0.5624, + "step": 4639 + }, + { + "epoch": 0.9538493164765135, + "grad_norm": 0.1938938945531845, + "learning_rate": 7.180621799767298e-05, + "loss": 0.6835, + "step": 4640 + }, + { + "epoch": 0.9540548874498921, + "grad_norm": 0.1908787190914154, + "learning_rate": 7.179819477322673e-05, + "loss": 0.679, + "step": 4641 + }, + { + "epoch": 0.9542604584232707, + "grad_norm": 0.17859888076782227, + "learning_rate": 7.179017022855868e-05, + "loss": 0.6604, + "step": 4642 + }, + { + "epoch": 0.9544660293966492, + "grad_norm": 0.14399871230125427, + "learning_rate": 7.178214436406416e-05, + "loss": 0.5768, + "step": 4643 + }, + { + "epoch": 0.9546716003700277, + "grad_norm": 0.19949081540107727, + "learning_rate": 7.177411718013858e-05, + "loss": 0.6536, + "step": 4644 + }, + { + "epoch": 0.9548771713434063, + "grad_norm": 0.12567096948623657, + "learning_rate": 7.176608867717738e-05, + "loss": 0.579, + "step": 4645 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 0.1978704035282135, + "learning_rate": 7.175805885557608e-05, + "loss": 0.654, + "step": 4646 + }, + { + "epoch": 0.9552883132901634, + "grad_norm": 0.1830187439918518, + "learning_rate": 7.175002771573031e-05, + "loss": 0.665, + "step": 4647 + }, + { + "epoch": 0.955493884263542, + "grad_norm": 0.14475895464420319, + "learning_rate": 7.17419952580357e-05, + "loss": 0.5824, + "step": 4648 + }, + { + "epoch": 0.9556994552369206, + "grad_norm": 0.2026558667421341, + "learning_rate": 7.173396148288796e-05, + "loss": 0.6604, + "step": 4649 + }, + { + "epoch": 0.9559050262102992, + "grad_norm": 0.18734343349933624, + "learning_rate": 7.172592639068291e-05, + "loss": 0.6658, + "step": 4650 + }, + { + "epoch": 0.9561105971836776, + "grad_norm": 0.18206274509429932, + "learning_rate": 7.171788998181637e-05, + "loss": 0.6371, + "step": 4651 + }, + { + "epoch": 0.9563161681570562, + "grad_norm": 0.18837840855121613, + "learning_rate": 7.170985225668428e-05, + "loss": 0.6306, + "step": 4652 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.19700245559215546, + "learning_rate": 7.17018132156826e-05, + "loss": 0.6645, + "step": 4653 + }, + { + "epoch": 0.9567273101038133, + "grad_norm": 0.18174946308135986, + "learning_rate": 7.169377285920738e-05, + "loss": 0.6657, + "step": 4654 + }, + { + "epoch": 0.9569328810771919, + "grad_norm": 0.1869078427553177, + "learning_rate": 7.168573118765476e-05, + "loss": 0.6752, + "step": 4655 + }, + { + "epoch": 0.9571384520505705, + "grad_norm": 0.19436730444431305, + "learning_rate": 7.167768820142088e-05, + "loss": 0.6694, + "step": 4656 + }, + { + "epoch": 0.957344023023949, + "grad_norm": 0.18894408643245697, + "learning_rate": 7.166964390090199e-05, + "loss": 0.6644, + "step": 4657 + }, + { + "epoch": 0.9575495939973275, + "grad_norm": 0.18464897572994232, + "learning_rate": 7.16615982864944e-05, + "loss": 0.6457, + "step": 4658 + }, + { + "epoch": 0.9577551649707061, + "grad_norm": 0.1893334686756134, + "learning_rate": 7.16535513585945e-05, + "loss": 0.6692, + "step": 4659 + }, + { + "epoch": 0.9579607359440847, + "grad_norm": 0.151536226272583, + "learning_rate": 7.164550311759869e-05, + "loss": 0.5774, + "step": 4660 + }, + { + "epoch": 0.9581663069174633, + "grad_norm": 0.20720963180065155, + "learning_rate": 7.163745356390347e-05, + "loss": 0.6608, + "step": 4661 + }, + { + "epoch": 0.9583718778908418, + "grad_norm": 0.18656425178050995, + "learning_rate": 7.162940269790543e-05, + "loss": 0.6502, + "step": 4662 + }, + { + "epoch": 0.9585774488642204, + "grad_norm": 0.18301479518413544, + "learning_rate": 7.162135052000116e-05, + "loss": 0.6854, + "step": 4663 + }, + { + "epoch": 0.958783019837599, + "grad_norm": 0.14167705178260803, + "learning_rate": 7.161329703058742e-05, + "loss": 0.5932, + "step": 4664 + }, + { + "epoch": 0.9589885908109775, + "grad_norm": 0.13432294130325317, + "learning_rate": 7.16052422300609e-05, + "loss": 0.5758, + "step": 4665 + }, + { + "epoch": 0.959194161784356, + "grad_norm": 0.2055593878030777, + "learning_rate": 7.159718611881845e-05, + "loss": 0.6646, + "step": 4666 + }, + { + "epoch": 0.9593997327577346, + "grad_norm": 0.19777736067771912, + "learning_rate": 7.158912869725695e-05, + "loss": 0.6821, + "step": 4667 + }, + { + "epoch": 0.9596053037311132, + "grad_norm": 0.18612885475158691, + "learning_rate": 7.158106996577336e-05, + "loss": 0.6758, + "step": 4668 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 0.1979762464761734, + "learning_rate": 7.15730099247647e-05, + "loss": 0.6779, + "step": 4669 + }, + { + "epoch": 0.9600164456778703, + "grad_norm": 0.1957666128873825, + "learning_rate": 7.156494857462803e-05, + "loss": 0.657, + "step": 4670 + }, + { + "epoch": 0.9602220166512488, + "grad_norm": 0.16183792054653168, + "learning_rate": 7.155688591576051e-05, + "loss": 0.5905, + "step": 4671 + }, + { + "epoch": 0.9604275876246274, + "grad_norm": 0.181317538022995, + "learning_rate": 7.154882194855936e-05, + "loss": 0.633, + "step": 4672 + }, + { + "epoch": 0.9606331585980059, + "grad_norm": 0.1878432035446167, + "learning_rate": 7.154075667342183e-05, + "loss": 0.6703, + "step": 4673 + }, + { + "epoch": 0.9608387295713845, + "grad_norm": 0.19090843200683594, + "learning_rate": 7.153269009074528e-05, + "loss": 0.6737, + "step": 4674 + }, + { + "epoch": 0.9610443005447631, + "grad_norm": 0.18672534823417664, + "learning_rate": 7.15246222009271e-05, + "loss": 0.6585, + "step": 4675 + }, + { + "epoch": 0.9612498715181417, + "grad_norm": 0.18867382407188416, + "learning_rate": 7.151655300436475e-05, + "loss": 0.6403, + "step": 4676 + }, + { + "epoch": 0.9614554424915202, + "grad_norm": 0.18556974828243256, + "learning_rate": 7.150848250145578e-05, + "loss": 0.6543, + "step": 4677 + }, + { + "epoch": 0.9616610134648987, + "grad_norm": 0.18414060771465302, + "learning_rate": 7.150041069259777e-05, + "loss": 0.6671, + "step": 4678 + }, + { + "epoch": 0.9618665844382773, + "grad_norm": 0.14137166738510132, + "learning_rate": 7.14923375781884e-05, + "loss": 0.5742, + "step": 4679 + }, + { + "epoch": 0.9620721554116559, + "grad_norm": 0.19371961057186127, + "learning_rate": 7.148426315862537e-05, + "loss": 0.6423, + "step": 4680 + }, + { + "epoch": 0.9622777263850344, + "grad_norm": 0.1935972273349762, + "learning_rate": 7.147618743430648e-05, + "loss": 0.6896, + "step": 4681 + }, + { + "epoch": 0.962483297358413, + "grad_norm": 0.19424404203891754, + "learning_rate": 7.14681104056296e-05, + "loss": 0.6634, + "step": 4682 + }, + { + "epoch": 0.9626888683317916, + "grad_norm": 0.18401269614696503, + "learning_rate": 7.146003207299263e-05, + "loss": 0.6301, + "step": 4683 + }, + { + "epoch": 0.9628944393051702, + "grad_norm": 0.18967941403388977, + "learning_rate": 7.145195243679354e-05, + "loss": 0.6583, + "step": 4684 + }, + { + "epoch": 0.9631000102785486, + "grad_norm": 0.18623065948486328, + "learning_rate": 7.14438714974304e-05, + "loss": 0.6411, + "step": 4685 + }, + { + "epoch": 0.9633055812519272, + "grad_norm": 0.18971066176891327, + "learning_rate": 7.14357892553013e-05, + "loss": 0.6237, + "step": 4686 + }, + { + "epoch": 0.9635111522253058, + "grad_norm": 0.13411302864551544, + "learning_rate": 7.142770571080443e-05, + "loss": 0.5835, + "step": 4687 + }, + { + "epoch": 0.9637167231986844, + "grad_norm": 0.19216755032539368, + "learning_rate": 7.141962086433802e-05, + "loss": 0.6423, + "step": 4688 + }, + { + "epoch": 0.9639222941720629, + "grad_norm": 0.18625061213970184, + "learning_rate": 7.141153471630038e-05, + "loss": 0.6641, + "step": 4689 + }, + { + "epoch": 0.9641278651454415, + "grad_norm": 0.19605115056037903, + "learning_rate": 7.140344726708988e-05, + "loss": 0.6713, + "step": 4690 + }, + { + "epoch": 0.96433343611882, + "grad_norm": 0.18507269024848938, + "learning_rate": 7.139535851710492e-05, + "loss": 0.6626, + "step": 4691 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 0.1852700561285019, + "learning_rate": 7.138726846674403e-05, + "loss": 0.6751, + "step": 4692 + }, + { + "epoch": 0.9647445780655771, + "grad_norm": 0.17961280047893524, + "learning_rate": 7.137917711640575e-05, + "loss": 0.6648, + "step": 4693 + }, + { + "epoch": 0.9649501490389557, + "grad_norm": 0.20353473722934723, + "learning_rate": 7.137108446648873e-05, + "loss": 0.6485, + "step": 4694 + }, + { + "epoch": 0.9651557200123343, + "grad_norm": 0.18588435649871826, + "learning_rate": 7.136299051739162e-05, + "loss": 0.6377, + "step": 4695 + }, + { + "epoch": 0.9653612909857128, + "grad_norm": 0.14005832374095917, + "learning_rate": 7.135489526951318e-05, + "loss": 0.5717, + "step": 4696 + }, + { + "epoch": 0.9655668619590914, + "grad_norm": 0.18969693779945374, + "learning_rate": 7.134679872325224e-05, + "loss": 0.6724, + "step": 4697 + }, + { + "epoch": 0.96577243293247, + "grad_norm": 0.12744298577308655, + "learning_rate": 7.133870087900768e-05, + "loss": 0.5729, + "step": 4698 + }, + { + "epoch": 0.9659780039058485, + "grad_norm": 0.18571147322654724, + "learning_rate": 7.133060173717842e-05, + "loss": 0.6547, + "step": 4699 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 0.19381268322467804, + "learning_rate": 7.13225012981635e-05, + "loss": 0.672, + "step": 4700 + }, + { + "epoch": 0.9663891458526056, + "grad_norm": 0.18004442751407623, + "learning_rate": 7.131439956236194e-05, + "loss": 0.6923, + "step": 4701 + }, + { + "epoch": 0.9665947168259842, + "grad_norm": 0.18902912735939026, + "learning_rate": 7.130629653017293e-05, + "loss": 0.6709, + "step": 4702 + }, + { + "epoch": 0.9668002877993628, + "grad_norm": 0.1331816166639328, + "learning_rate": 7.129819220199566e-05, + "loss": 0.5755, + "step": 4703 + }, + { + "epoch": 0.9670058587727413, + "grad_norm": 0.1306556910276413, + "learning_rate": 7.129008657822936e-05, + "loss": 0.5504, + "step": 4704 + }, + { + "epoch": 0.9672114297461198, + "grad_norm": 0.12396678328514099, + "learning_rate": 7.128197965927337e-05, + "loss": 0.5786, + "step": 4705 + }, + { + "epoch": 0.9674170007194984, + "grad_norm": 0.2061123251914978, + "learning_rate": 7.127387144552709e-05, + "loss": 0.6777, + "step": 4706 + }, + { + "epoch": 0.967622571692877, + "grad_norm": 0.1253053843975067, + "learning_rate": 7.126576193738997e-05, + "loss": 0.5862, + "step": 4707 + }, + { + "epoch": 0.9678281426662555, + "grad_norm": 0.1296505630016327, + "learning_rate": 7.125765113526151e-05, + "loss": 0.5758, + "step": 4708 + }, + { + "epoch": 0.9680337136396341, + "grad_norm": 0.1793881356716156, + "learning_rate": 7.124953903954132e-05, + "loss": 0.6242, + "step": 4709 + }, + { + "epoch": 0.9682392846130127, + "grad_norm": 0.13087224960327148, + "learning_rate": 7.124142565062903e-05, + "loss": 0.5745, + "step": 4710 + }, + { + "epoch": 0.9684448555863912, + "grad_norm": 0.12415716052055359, + "learning_rate": 7.123331096892434e-05, + "loss": 0.5658, + "step": 4711 + }, + { + "epoch": 0.9686504265597697, + "grad_norm": 0.18544363975524902, + "learning_rate": 7.122519499482706e-05, + "loss": 0.6601, + "step": 4712 + }, + { + "epoch": 0.9688559975331483, + "grad_norm": 0.17584609985351562, + "learning_rate": 7.121707772873699e-05, + "loss": 0.6448, + "step": 4713 + }, + { + "epoch": 0.9690615685065269, + "grad_norm": 0.18083997070789337, + "learning_rate": 7.120895917105402e-05, + "loss": 0.6701, + "step": 4714 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 0.17472274601459503, + "learning_rate": 7.120083932217815e-05, + "loss": 0.6957, + "step": 4715 + }, + { + "epoch": 0.969472710453284, + "grad_norm": 0.15184062719345093, + "learning_rate": 7.119271818250936e-05, + "loss": 0.5817, + "step": 4716 + }, + { + "epoch": 0.9696782814266626, + "grad_norm": 0.18855705857276917, + "learning_rate": 7.11845957524478e-05, + "loss": 0.6749, + "step": 4717 + }, + { + "epoch": 0.9698838524000412, + "grad_norm": 0.18199525773525238, + "learning_rate": 7.117647203239358e-05, + "loss": 0.6665, + "step": 4718 + }, + { + "epoch": 0.9700894233734196, + "grad_norm": 0.1904478669166565, + "learning_rate": 7.116834702274693e-05, + "loss": 0.6339, + "step": 4719 + }, + { + "epoch": 0.9702949943467982, + "grad_norm": 0.18044617772102356, + "learning_rate": 7.116022072390815e-05, + "loss": 0.6575, + "step": 4720 + }, + { + "epoch": 0.9705005653201768, + "grad_norm": 0.17925746738910675, + "learning_rate": 7.115209313627755e-05, + "loss": 0.6639, + "step": 4721 + }, + { + "epoch": 0.9707061362935554, + "grad_norm": 0.18334949016571045, + "learning_rate": 7.114396426025557e-05, + "loss": 0.6716, + "step": 4722 + }, + { + "epoch": 0.9709117072669339, + "grad_norm": 0.17840418219566345, + "learning_rate": 7.113583409624265e-05, + "loss": 0.6672, + "step": 4723 + }, + { + "epoch": 0.9711172782403125, + "grad_norm": 0.14346054196357727, + "learning_rate": 7.112770264463936e-05, + "loss": 0.6005, + "step": 4724 + }, + { + "epoch": 0.971322849213691, + "grad_norm": 0.20740103721618652, + "learning_rate": 7.111956990584626e-05, + "loss": 0.6906, + "step": 4725 + }, + { + "epoch": 0.9715284201870696, + "grad_norm": 0.1770005226135254, + "learning_rate": 7.111143588026406e-05, + "loss": 0.6421, + "step": 4726 + }, + { + "epoch": 0.9717339911604481, + "grad_norm": 0.18892593681812286, + "learning_rate": 7.110330056829344e-05, + "loss": 0.6357, + "step": 4727 + }, + { + "epoch": 0.9719395621338267, + "grad_norm": 0.18768368661403656, + "learning_rate": 7.109516397033522e-05, + "loss": 0.6538, + "step": 4728 + }, + { + "epoch": 0.9721451331072053, + "grad_norm": 0.1885116994380951, + "learning_rate": 7.108702608679022e-05, + "loss": 0.6792, + "step": 4729 + }, + { + "epoch": 0.9723507040805838, + "grad_norm": 0.19139648973941803, + "learning_rate": 7.10788869180594e-05, + "loss": 0.6307, + "step": 4730 + }, + { + "epoch": 0.9725562750539624, + "grad_norm": 0.18306319415569305, + "learning_rate": 7.107074646454368e-05, + "loss": 0.6564, + "step": 4731 + }, + { + "epoch": 0.972761846027341, + "grad_norm": 0.18829376995563507, + "learning_rate": 7.106260472664417e-05, + "loss": 0.6439, + "step": 4732 + }, + { + "epoch": 0.9729674170007195, + "grad_norm": 0.18569877743721008, + "learning_rate": 7.105446170476193e-05, + "loss": 0.6301, + "step": 4733 + }, + { + "epoch": 0.973172987974098, + "grad_norm": 0.17895673215389252, + "learning_rate": 7.104631739929814e-05, + "loss": 0.6752, + "step": 4734 + }, + { + "epoch": 0.9733785589474766, + "grad_norm": 0.14281636476516724, + "learning_rate": 7.103817181065402e-05, + "loss": 0.5585, + "step": 4735 + }, + { + "epoch": 0.9735841299208552, + "grad_norm": 0.18822631239891052, + "learning_rate": 7.103002493923089e-05, + "loss": 0.6773, + "step": 4736 + }, + { + "epoch": 0.9737897008942338, + "grad_norm": 0.19597260653972626, + "learning_rate": 7.102187678543009e-05, + "loss": 0.6525, + "step": 4737 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 0.18155749142169952, + "learning_rate": 7.101372734965306e-05, + "loss": 0.6369, + "step": 4738 + }, + { + "epoch": 0.9742008428409908, + "grad_norm": 0.1390804648399353, + "learning_rate": 7.100557663230125e-05, + "loss": 0.5831, + "step": 4739 + }, + { + "epoch": 0.9744064138143694, + "grad_norm": 0.19495947659015656, + "learning_rate": 7.099742463377626e-05, + "loss": 0.6545, + "step": 4740 + }, + { + "epoch": 0.974611984787748, + "grad_norm": 0.12120307236909866, + "learning_rate": 7.098927135447965e-05, + "loss": 0.5725, + "step": 4741 + }, + { + "epoch": 0.9748175557611265, + "grad_norm": 0.18559974431991577, + "learning_rate": 7.09811167948131e-05, + "loss": 0.6441, + "step": 4742 + }, + { + "epoch": 0.9750231267345051, + "grad_norm": 0.18373870849609375, + "learning_rate": 7.097296095517838e-05, + "loss": 0.6765, + "step": 4743 + }, + { + "epoch": 0.9752286977078837, + "grad_norm": 0.13721033930778503, + "learning_rate": 7.096480383597725e-05, + "loss": 0.5717, + "step": 4744 + }, + { + "epoch": 0.9754342686812622, + "grad_norm": 0.25049659609794617, + "learning_rate": 7.095664543761162e-05, + "loss": 0.653, + "step": 4745 + }, + { + "epoch": 0.9756398396546407, + "grad_norm": 0.17938856780529022, + "learning_rate": 7.094848576048339e-05, + "loss": 0.6455, + "step": 4746 + }, + { + "epoch": 0.9758454106280193, + "grad_norm": 0.18572570383548737, + "learning_rate": 7.094032480499454e-05, + "loss": 0.634, + "step": 4747 + }, + { + "epoch": 0.9760509816013979, + "grad_norm": 0.18931305408477783, + "learning_rate": 7.093216257154713e-05, + "loss": 0.6397, + "step": 4748 + }, + { + "epoch": 0.9762565525747764, + "grad_norm": 0.178866446018219, + "learning_rate": 7.092399906054328e-05, + "loss": 0.6501, + "step": 4749 + }, + { + "epoch": 0.976462123548155, + "grad_norm": 0.13158805668354034, + "learning_rate": 7.091583427238515e-05, + "loss": 0.5743, + "step": 4750 + }, + { + "epoch": 0.9766676945215336, + "grad_norm": 0.18385621905326843, + "learning_rate": 7.090766820747502e-05, + "loss": 0.6433, + "step": 4751 + }, + { + "epoch": 0.9768732654949122, + "grad_norm": 0.18304161727428436, + "learning_rate": 7.089950086621515e-05, + "loss": 0.6304, + "step": 4752 + }, + { + "epoch": 0.9770788364682906, + "grad_norm": 0.14201928675174713, + "learning_rate": 7.089133224900794e-05, + "loss": 0.5821, + "step": 4753 + }, + { + "epoch": 0.9772844074416692, + "grad_norm": 0.18627387285232544, + "learning_rate": 7.08831623562558e-05, + "loss": 0.6627, + "step": 4754 + }, + { + "epoch": 0.9774899784150478, + "grad_norm": 0.18866147100925446, + "learning_rate": 7.087499118836123e-05, + "loss": 0.6627, + "step": 4755 + }, + { + "epoch": 0.9776955493884264, + "grad_norm": 0.1349857598543167, + "learning_rate": 7.086681874572677e-05, + "loss": 0.5733, + "step": 4756 + }, + { + "epoch": 0.9779011203618049, + "grad_norm": 0.18248964846134186, + "learning_rate": 7.085864502875506e-05, + "loss": 0.6549, + "step": 4757 + }, + { + "epoch": 0.9781066913351835, + "grad_norm": 0.188977912068367, + "learning_rate": 7.085047003784879e-05, + "loss": 0.6531, + "step": 4758 + }, + { + "epoch": 0.978312262308562, + "grad_norm": 0.1410411149263382, + "learning_rate": 7.084229377341068e-05, + "loss": 0.5773, + "step": 4759 + }, + { + "epoch": 0.9785178332819406, + "grad_norm": 0.18209992349147797, + "learning_rate": 7.083411623584352e-05, + "loss": 0.6653, + "step": 4760 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 0.18571458756923676, + "learning_rate": 7.082593742555023e-05, + "loss": 0.6621, + "step": 4761 + }, + { + "epoch": 0.9789289752286977, + "grad_norm": 0.18278199434280396, + "learning_rate": 7.08177573429337e-05, + "loss": 0.6688, + "step": 4762 + }, + { + "epoch": 0.9791345462020763, + "grad_norm": 0.17872901260852814, + "learning_rate": 7.080957598839693e-05, + "loss": 0.6442, + "step": 4763 + }, + { + "epoch": 0.9793401171754548, + "grad_norm": 0.17393019795417786, + "learning_rate": 7.080139336234299e-05, + "loss": 0.6474, + "step": 4764 + }, + { + "epoch": 0.9795456881488334, + "grad_norm": 0.18381252884864807, + "learning_rate": 7.0793209465175e-05, + "loss": 0.6469, + "step": 4765 + }, + { + "epoch": 0.979751259122212, + "grad_norm": 0.18060103058815002, + "learning_rate": 7.078502429729614e-05, + "loss": 0.6635, + "step": 4766 + }, + { + "epoch": 0.9799568300955905, + "grad_norm": 0.18748174607753754, + "learning_rate": 7.077683785910964e-05, + "loss": 0.6695, + "step": 4767 + }, + { + "epoch": 0.980162401068969, + "grad_norm": 0.18352623283863068, + "learning_rate": 7.076865015101882e-05, + "loss": 0.6475, + "step": 4768 + }, + { + "epoch": 0.9803679720423476, + "grad_norm": 0.14373265206813812, + "learning_rate": 7.076046117342705e-05, + "loss": 0.5666, + "step": 4769 + }, + { + "epoch": 0.9805735430157262, + "grad_norm": 0.18376867473125458, + "learning_rate": 7.075227092673777e-05, + "loss": 0.6542, + "step": 4770 + }, + { + "epoch": 0.9807791139891048, + "grad_norm": 0.1273968666791916, + "learning_rate": 7.074407941135447e-05, + "loss": 0.5939, + "step": 4771 + }, + { + "epoch": 0.9809846849624833, + "grad_norm": 0.19144566357135773, + "learning_rate": 7.073588662768069e-05, + "loss": 0.655, + "step": 4772 + }, + { + "epoch": 0.9811902559358618, + "grad_norm": 0.18799123167991638, + "learning_rate": 7.072769257612007e-05, + "loss": 0.6726, + "step": 4773 + }, + { + "epoch": 0.9813958269092404, + "grad_norm": 0.19798782467842102, + "learning_rate": 7.071949725707628e-05, + "loss": 0.6438, + "step": 4774 + }, + { + "epoch": 0.981601397882619, + "grad_norm": 0.18581277132034302, + "learning_rate": 7.07113006709531e-05, + "loss": 0.6562, + "step": 4775 + }, + { + "epoch": 0.9818069688559975, + "grad_norm": 0.1861695498228073, + "learning_rate": 7.070310281815429e-05, + "loss": 0.6693, + "step": 4776 + }, + { + "epoch": 0.9820125398293761, + "grad_norm": 0.15388214588165283, + "learning_rate": 7.069490369908374e-05, + "loss": 0.5852, + "step": 4777 + }, + { + "epoch": 0.9822181108027547, + "grad_norm": 0.19053393602371216, + "learning_rate": 7.068670331414539e-05, + "loss": 0.6512, + "step": 4778 + }, + { + "epoch": 0.9824236817761333, + "grad_norm": 0.19945350289344788, + "learning_rate": 7.067850166374322e-05, + "loss": 0.6898, + "step": 4779 + }, + { + "epoch": 0.9826292527495117, + "grad_norm": 0.12717384099960327, + "learning_rate": 7.067029874828131e-05, + "loss": 0.5656, + "step": 4780 + }, + { + "epoch": 0.9828348237228903, + "grad_norm": 0.18999631702899933, + "learning_rate": 7.066209456816373e-05, + "loss": 0.6775, + "step": 4781 + }, + { + "epoch": 0.9830403946962689, + "grad_norm": 0.1788676530122757, + "learning_rate": 7.065388912379472e-05, + "loss": 0.6573, + "step": 4782 + }, + { + "epoch": 0.9832459656696474, + "grad_norm": 0.1846441626548767, + "learning_rate": 7.06456824155785e-05, + "loss": 0.6285, + "step": 4783 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 0.18069574236869812, + "learning_rate": 7.063747444391937e-05, + "loss": 0.6477, + "step": 4784 + }, + { + "epoch": 0.9836571076164046, + "grad_norm": 0.18519815802574158, + "learning_rate": 7.062926520922171e-05, + "loss": 0.6372, + "step": 4785 + }, + { + "epoch": 0.9838626785897832, + "grad_norm": 0.1890534907579422, + "learning_rate": 7.062105471188993e-05, + "loss": 0.6727, + "step": 4786 + }, + { + "epoch": 0.9840682495631616, + "grad_norm": 0.14361847937107086, + "learning_rate": 7.061284295232854e-05, + "loss": 0.5637, + "step": 4787 + }, + { + "epoch": 0.9842738205365402, + "grad_norm": 0.20357996225357056, + "learning_rate": 7.060462993094209e-05, + "loss": 0.643, + "step": 4788 + }, + { + "epoch": 0.9844793915099188, + "grad_norm": 0.12664301693439484, + "learning_rate": 7.059641564813521e-05, + "loss": 0.5653, + "step": 4789 + }, + { + "epoch": 0.9846849624832974, + "grad_norm": 0.11784827709197998, + "learning_rate": 7.058820010431256e-05, + "loss": 0.5801, + "step": 4790 + }, + { + "epoch": 0.9848905334566759, + "grad_norm": 0.19485826790332794, + "learning_rate": 7.057998329987889e-05, + "loss": 0.6846, + "step": 4791 + }, + { + "epoch": 0.9850961044300545, + "grad_norm": 0.19157935678958893, + "learning_rate": 7.057176523523901e-05, + "loss": 0.6641, + "step": 4792 + }, + { + "epoch": 0.985301675403433, + "grad_norm": 0.17738407850265503, + "learning_rate": 7.056354591079778e-05, + "loss": 0.646, + "step": 4793 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 0.18637309968471527, + "learning_rate": 7.055532532696012e-05, + "loss": 0.6406, + "step": 4794 + }, + { + "epoch": 0.9857128173501901, + "grad_norm": 0.18436288833618164, + "learning_rate": 7.054710348413103e-05, + "loss": 0.6349, + "step": 4795 + }, + { + "epoch": 0.9859183883235687, + "grad_norm": 0.1875494122505188, + "learning_rate": 7.053888038271555e-05, + "loss": 0.6585, + "step": 4796 + }, + { + "epoch": 0.9861239592969473, + "grad_norm": 0.18584869801998138, + "learning_rate": 7.053065602311882e-05, + "loss": 0.6729, + "step": 4797 + }, + { + "epoch": 0.9863295302703259, + "grad_norm": 0.1785837858915329, + "learning_rate": 7.052243040574597e-05, + "loss": 0.6571, + "step": 4798 + }, + { + "epoch": 0.9865351012437044, + "grad_norm": 0.18055270612239838, + "learning_rate": 7.051420353100228e-05, + "loss": 0.6732, + "step": 4799 + }, + { + "epoch": 0.986740672217083, + "grad_norm": 0.17629997432231903, + "learning_rate": 7.050597539929304e-05, + "loss": 0.6463, + "step": 4800 + }, + { + "epoch": 0.9869462431904615, + "grad_norm": 0.17916452884674072, + "learning_rate": 7.049774601102361e-05, + "loss": 0.664, + "step": 4801 + }, + { + "epoch": 0.98715181416384, + "grad_norm": 0.17072445154190063, + "learning_rate": 7.04895153665994e-05, + "loss": 0.6522, + "step": 4802 + }, + { + "epoch": 0.9873573851372186, + "grad_norm": 0.18947453796863556, + "learning_rate": 7.048128346642591e-05, + "loss": 0.6475, + "step": 4803 + }, + { + "epoch": 0.9875629561105972, + "grad_norm": 0.18113267421722412, + "learning_rate": 7.047305031090869e-05, + "loss": 0.6505, + "step": 4804 + }, + { + "epoch": 0.9877685270839758, + "grad_norm": 0.18333598971366882, + "learning_rate": 7.046481590045331e-05, + "loss": 0.6697, + "step": 4805 + }, + { + "epoch": 0.9879740980573543, + "grad_norm": 0.17988578975200653, + "learning_rate": 7.045658023546551e-05, + "loss": 0.6637, + "step": 4806 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 0.18621614575386047, + "learning_rate": 7.044834331635098e-05, + "loss": 0.6762, + "step": 4807 + }, + { + "epoch": 0.9883852400041114, + "grad_norm": 0.17191414535045624, + "learning_rate": 7.04401051435155e-05, + "loss": 0.6615, + "step": 4808 + }, + { + "epoch": 0.98859081097749, + "grad_norm": 0.1875917762517929, + "learning_rate": 7.043186571736496e-05, + "loss": 0.6757, + "step": 4809 + }, + { + "epoch": 0.9887963819508685, + "grad_norm": 0.18261222541332245, + "learning_rate": 7.042362503830527e-05, + "loss": 0.635, + "step": 4810 + }, + { + "epoch": 0.9890019529242471, + "grad_norm": 0.18588493764400482, + "learning_rate": 7.04153831067424e-05, + "loss": 0.6719, + "step": 4811 + }, + { + "epoch": 0.9892075238976257, + "grad_norm": 0.1783093363046646, + "learning_rate": 7.040713992308239e-05, + "loss": 0.6538, + "step": 4812 + }, + { + "epoch": 0.9894130948710043, + "grad_norm": 0.18314149975776672, + "learning_rate": 7.039889548773136e-05, + "loss": 0.6912, + "step": 4813 + }, + { + "epoch": 0.9896186658443827, + "grad_norm": 0.18791064620018005, + "learning_rate": 7.039064980109544e-05, + "loss": 0.6627, + "step": 4814 + }, + { + "epoch": 0.9898242368177613, + "grad_norm": 0.18856315314769745, + "learning_rate": 7.038240286358089e-05, + "loss": 0.5847, + "step": 4815 + }, + { + "epoch": 0.9900298077911399, + "grad_norm": 0.19757792353630066, + "learning_rate": 7.0374154675594e-05, + "loss": 0.6815, + "step": 4816 + }, + { + "epoch": 0.9902353787645185, + "grad_norm": 0.18688839673995972, + "learning_rate": 7.036590523754109e-05, + "loss": 0.6686, + "step": 4817 + }, + { + "epoch": 0.990440949737897, + "grad_norm": 0.1844862699508667, + "learning_rate": 7.035765454982861e-05, + "loss": 0.6518, + "step": 4818 + }, + { + "epoch": 0.9906465207112756, + "grad_norm": 0.18571245670318604, + "learning_rate": 7.0349402612863e-05, + "loss": 0.642, + "step": 4819 + }, + { + "epoch": 0.9908520916846542, + "grad_norm": 0.1918804794549942, + "learning_rate": 7.034114942705081e-05, + "loss": 0.6427, + "step": 4820 + }, + { + "epoch": 0.9910576626580326, + "grad_norm": 0.19276823103427887, + "learning_rate": 7.033289499279863e-05, + "loss": 0.6943, + "step": 4821 + }, + { + "epoch": 0.9912632336314112, + "grad_norm": 0.18441876769065857, + "learning_rate": 7.032463931051311e-05, + "loss": 0.6596, + "step": 4822 + }, + { + "epoch": 0.9914688046047898, + "grad_norm": 0.1893150806427002, + "learning_rate": 7.031638238060099e-05, + "loss": 0.6599, + "step": 4823 + }, + { + "epoch": 0.9916743755781684, + "grad_norm": 0.18663519620895386, + "learning_rate": 7.030812420346902e-05, + "loss": 0.6508, + "step": 4824 + }, + { + "epoch": 0.9918799465515469, + "grad_norm": 0.18189288675785065, + "learning_rate": 7.029986477952409e-05, + "loss": 0.5656, + "step": 4825 + }, + { + "epoch": 0.9920855175249255, + "grad_norm": 0.1928027868270874, + "learning_rate": 7.029160410917305e-05, + "loss": 0.6758, + "step": 4826 + }, + { + "epoch": 0.992291088498304, + "grad_norm": 0.19040422141551971, + "learning_rate": 7.028334219282291e-05, + "loss": 0.6546, + "step": 4827 + }, + { + "epoch": 0.9924966594716826, + "grad_norm": 0.15369752049446106, + "learning_rate": 7.027507903088066e-05, + "loss": 0.5874, + "step": 4828 + }, + { + "epoch": 0.9927022304450611, + "grad_norm": 0.13231946527957916, + "learning_rate": 7.026681462375339e-05, + "loss": 0.5761, + "step": 4829 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 0.1998869776725769, + "learning_rate": 7.025854897184828e-05, + "loss": 0.6637, + "step": 4830 + }, + { + "epoch": 0.9931133723918183, + "grad_norm": 0.18532314896583557, + "learning_rate": 7.025028207557251e-05, + "loss": 0.6492, + "step": 4831 + }, + { + "epoch": 0.9933189433651969, + "grad_norm": 0.1902119517326355, + "learning_rate": 7.024201393533337e-05, + "loss": 0.6405, + "step": 4832 + }, + { + "epoch": 0.9935245143385754, + "grad_norm": 0.17781443893909454, + "learning_rate": 7.023374455153817e-05, + "loss": 0.6644, + "step": 4833 + }, + { + "epoch": 0.993730085311954, + "grad_norm": 0.1855769008398056, + "learning_rate": 7.022547392459434e-05, + "loss": 0.6642, + "step": 4834 + }, + { + "epoch": 0.9939356562853325, + "grad_norm": 0.18379683792591095, + "learning_rate": 7.02172020549093e-05, + "loss": 0.6377, + "step": 4835 + }, + { + "epoch": 0.9941412272587111, + "grad_norm": 0.17909419536590576, + "learning_rate": 7.020892894289058e-05, + "loss": 0.6393, + "step": 4836 + }, + { + "epoch": 0.9943467982320896, + "grad_norm": 0.17869077622890472, + "learning_rate": 7.020065458894575e-05, + "loss": 0.6718, + "step": 4837 + }, + { + "epoch": 0.9945523692054682, + "grad_norm": 0.18221206963062286, + "learning_rate": 7.019237899348247e-05, + "loss": 0.6812, + "step": 4838 + }, + { + "epoch": 0.9947579401788468, + "grad_norm": 0.1849188208580017, + "learning_rate": 7.018410215690841e-05, + "loss": 0.586, + "step": 4839 + }, + { + "epoch": 0.9949635111522253, + "grad_norm": 0.2003324180841446, + "learning_rate": 7.017582407963136e-05, + "loss": 0.6561, + "step": 4840 + }, + { + "epoch": 0.9951690821256038, + "grad_norm": 0.19074849784374237, + "learning_rate": 7.016754476205913e-05, + "loss": 0.6509, + "step": 4841 + }, + { + "epoch": 0.9953746530989824, + "grad_norm": 0.14551801979541779, + "learning_rate": 7.01592642045996e-05, + "loss": 0.5873, + "step": 4842 + }, + { + "epoch": 0.995580224072361, + "grad_norm": 0.1880098134279251, + "learning_rate": 7.01509824076607e-05, + "loss": 0.6588, + "step": 4843 + }, + { + "epoch": 0.9957857950457395, + "grad_norm": 0.18471471965312958, + "learning_rate": 7.014269937165048e-05, + "loss": 0.6426, + "step": 4844 + }, + { + "epoch": 0.9959913660191181, + "grad_norm": 1.0276774168014526, + "learning_rate": 7.013441509697696e-05, + "loss": 0.6583, + "step": 4845 + }, + { + "epoch": 0.9961969369924967, + "grad_norm": 0.13645489513874054, + "learning_rate": 7.01261295840483e-05, + "loss": 0.5675, + "step": 4846 + }, + { + "epoch": 0.9964025079658753, + "grad_norm": 0.12980090081691742, + "learning_rate": 7.011784283327266e-05, + "loss": 0.5932, + "step": 4847 + }, + { + "epoch": 0.9966080789392537, + "grad_norm": 0.21611304581165314, + "learning_rate": 7.010955484505831e-05, + "loss": 0.6966, + "step": 4848 + }, + { + "epoch": 0.9968136499126323, + "grad_norm": 0.21331064403057098, + "learning_rate": 7.010126561981356e-05, + "loss": 0.6875, + "step": 4849 + }, + { + "epoch": 0.9970192208860109, + "grad_norm": 0.19772310554981232, + "learning_rate": 7.009297515794678e-05, + "loss": 0.6665, + "step": 4850 + }, + { + "epoch": 0.9972247918593895, + "grad_norm": 0.20528611540794373, + "learning_rate": 7.008468345986637e-05, + "loss": 0.6648, + "step": 4851 + }, + { + "epoch": 0.997430362832768, + "grad_norm": 0.2074098140001297, + "learning_rate": 7.007639052598088e-05, + "loss": 0.6817, + "step": 4852 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 0.19538044929504395, + "learning_rate": 7.006809635669882e-05, + "loss": 0.6414, + "step": 4853 + }, + { + "epoch": 0.9978415047795252, + "grad_norm": 0.189046248793602, + "learning_rate": 7.005980095242883e-05, + "loss": 0.5861, + "step": 4854 + }, + { + "epoch": 0.9980470757529037, + "grad_norm": 0.14991891384124756, + "learning_rate": 7.005150431357957e-05, + "loss": 0.5907, + "step": 4855 + }, + { + "epoch": 0.9982526467262822, + "grad_norm": 0.24315035343170166, + "learning_rate": 7.004320644055979e-05, + "loss": 0.6664, + "step": 4856 + }, + { + "epoch": 0.9984582176996608, + "grad_norm": 0.183399498462677, + "learning_rate": 7.003490733377827e-05, + "loss": 0.5846, + "step": 4857 + }, + { + "epoch": 0.9986637886730394, + "grad_norm": 0.20931483805179596, + "learning_rate": 7.002660699364389e-05, + "loss": 0.6624, + "step": 4858 + }, + { + "epoch": 0.9988693596464179, + "grad_norm": 0.19488368928432465, + "learning_rate": 7.001830542056555e-05, + "loss": 0.6757, + "step": 4859 + }, + { + "epoch": 0.9990749306197965, + "grad_norm": 0.16465352475643158, + "learning_rate": 7.001000261495223e-05, + "loss": 0.5584, + "step": 4860 + }, + { + "epoch": 0.999280501593175, + "grad_norm": 0.17341670393943787, + "learning_rate": 7.0001698577213e-05, + "loss": 0.5857, + "step": 4861 + }, + { + "epoch": 0.9994860725665536, + "grad_norm": 0.21987827122211456, + "learning_rate": 6.99933933077569e-05, + "loss": 0.6423, + "step": 4862 + }, + { + "epoch": 0.9996916435399321, + "grad_norm": 0.21325050294399261, + "learning_rate": 6.998508680699317e-05, + "loss": 0.6558, + "step": 4863 + }, + { + "epoch": 0.9998972145133107, + "grad_norm": 0.1891472041606903, + "learning_rate": 6.997677907533099e-05, + "loss": 0.6461, + "step": 4864 + }, + { + "epoch": 1.0001027854866893, + "grad_norm": 0.20316524803638458, + "learning_rate": 6.996847011317963e-05, + "loss": 0.5995, + "step": 4865 + }, + { + "epoch": 1.0003083564600679, + "grad_norm": 0.271843820810318, + "learning_rate": 6.996015992094846e-05, + "loss": 0.5709, + "step": 4866 + }, + { + "epoch": 1.0005139274334465, + "grad_norm": 0.22854308784008026, + "learning_rate": 6.995184849904686e-05, + "loss": 0.5628, + "step": 4867 + }, + { + "epoch": 1.000719498406825, + "grad_norm": 0.20615056157112122, + "learning_rate": 6.994353584788431e-05, + "loss": 0.5559, + "step": 4868 + }, + { + "epoch": 1.0009250693802034, + "grad_norm": 0.24276702105998993, + "learning_rate": 6.993522196787035e-05, + "loss": 0.5764, + "step": 4869 + }, + { + "epoch": 1.001130640353582, + "grad_norm": 0.28377482295036316, + "learning_rate": 6.992690685941454e-05, + "loss": 0.5666, + "step": 4870 + }, + { + "epoch": 1.0013362113269606, + "grad_norm": 0.2509450912475586, + "learning_rate": 6.991859052292654e-05, + "loss": 0.5716, + "step": 4871 + }, + { + "epoch": 1.0015417823003392, + "grad_norm": 0.20262686908245087, + "learning_rate": 6.991027295881606e-05, + "loss": 0.5314, + "step": 4872 + }, + { + "epoch": 1.0017473532737178, + "grad_norm": 0.1771395355463028, + "learning_rate": 6.990195416749287e-05, + "loss": 0.5826, + "step": 4873 + }, + { + "epoch": 1.0019529242470964, + "grad_norm": 0.22669513523578644, + "learning_rate": 6.989363414936676e-05, + "loss": 0.5785, + "step": 4874 + }, + { + "epoch": 1.002158495220475, + "grad_norm": 0.18329079449176788, + "learning_rate": 6.988531290484768e-05, + "loss": 0.5626, + "step": 4875 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 0.17352893948554993, + "learning_rate": 6.987699043434552e-05, + "loss": 0.5549, + "step": 4876 + }, + { + "epoch": 1.002569637167232, + "grad_norm": 0.2029443383216858, + "learning_rate": 6.986866673827032e-05, + "loss": 0.5679, + "step": 4877 + }, + { + "epoch": 1.0027752081406105, + "grad_norm": 0.21238186955451965, + "learning_rate": 6.986034181703216e-05, + "loss": 0.579, + "step": 4878 + }, + { + "epoch": 1.002980779113989, + "grad_norm": 0.20517666637897491, + "learning_rate": 6.985201567104115e-05, + "loss": 0.5578, + "step": 4879 + }, + { + "epoch": 1.0031863500873677, + "grad_norm": 0.221823588013649, + "learning_rate": 6.984368830070747e-05, + "loss": 0.5559, + "step": 4880 + }, + { + "epoch": 1.0033919210607463, + "grad_norm": 0.21827368438243866, + "learning_rate": 6.98353597064414e-05, + "loss": 0.5811, + "step": 4881 + }, + { + "epoch": 1.0035974920341248, + "grad_norm": 0.20785865187644958, + "learning_rate": 6.982702988865326e-05, + "loss": 0.5226, + "step": 4882 + }, + { + "epoch": 1.0038030630075034, + "grad_norm": 0.18137192726135254, + "learning_rate": 6.981869884775336e-05, + "loss": 0.5244, + "step": 4883 + }, + { + "epoch": 1.0040086339808818, + "grad_norm": 0.18444296717643738, + "learning_rate": 6.981036658415218e-05, + "loss": 0.5603, + "step": 4884 + }, + { + "epoch": 1.0042142049542604, + "grad_norm": 0.22535867989063263, + "learning_rate": 6.980203309826021e-05, + "loss": 0.5684, + "step": 4885 + }, + { + "epoch": 1.004419775927639, + "grad_norm": 0.21289990842342377, + "learning_rate": 6.979369839048799e-05, + "loss": 0.5214, + "step": 4886 + }, + { + "epoch": 1.0046253469010176, + "grad_norm": 0.16380147635936737, + "learning_rate": 6.978536246124615e-05, + "loss": 0.5145, + "step": 4887 + }, + { + "epoch": 1.0048309178743962, + "grad_norm": 0.170881450176239, + "learning_rate": 6.977702531094534e-05, + "loss": 0.5329, + "step": 4888 + }, + { + "epoch": 1.0050364888477747, + "grad_norm": 0.17499133944511414, + "learning_rate": 6.976868693999629e-05, + "loss": 0.5228, + "step": 4889 + }, + { + "epoch": 1.0052420598211533, + "grad_norm": 0.20944778621196747, + "learning_rate": 6.976034734880981e-05, + "loss": 0.5465, + "step": 4890 + }, + { + "epoch": 1.005447630794532, + "grad_norm": 0.20664618909358978, + "learning_rate": 6.975200653779674e-05, + "loss": 0.5645, + "step": 4891 + }, + { + "epoch": 1.0056532017679103, + "grad_norm": 0.20452165603637695, + "learning_rate": 6.974366450736801e-05, + "loss": 0.536, + "step": 4892 + }, + { + "epoch": 1.0058587727412889, + "grad_norm": 0.20522767305374146, + "learning_rate": 6.973532125793457e-05, + "loss": 0.5524, + "step": 4893 + }, + { + "epoch": 1.0060643437146675, + "grad_norm": 0.20214490592479706, + "learning_rate": 6.972697678990747e-05, + "loss": 0.5829, + "step": 4894 + }, + { + "epoch": 1.006269914688046, + "grad_norm": 0.19097676873207092, + "learning_rate": 6.971863110369778e-05, + "loss": 0.5589, + "step": 4895 + }, + { + "epoch": 1.0064754856614246, + "grad_norm": 0.19728168845176697, + "learning_rate": 6.97102841997167e-05, + "loss": 0.5546, + "step": 4896 + }, + { + "epoch": 1.0066810566348032, + "grad_norm": 0.1733403503894806, + "learning_rate": 6.97019360783754e-05, + "loss": 0.5264, + "step": 4897 + }, + { + "epoch": 1.0068866276081818, + "grad_norm": 0.17879877984523773, + "learning_rate": 6.969358674008516e-05, + "loss": 0.5623, + "step": 4898 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 0.19583040475845337, + "learning_rate": 6.968523618525733e-05, + "loss": 0.5773, + "step": 4899 + }, + { + "epoch": 1.0072977695549388, + "grad_norm": 0.193648099899292, + "learning_rate": 6.967688441430328e-05, + "loss": 0.576, + "step": 4900 + }, + { + "epoch": 1.0075033405283174, + "grad_norm": 0.1968041956424713, + "learning_rate": 6.966853142763448e-05, + "loss": 0.5513, + "step": 4901 + }, + { + "epoch": 1.007708911501696, + "grad_norm": 0.196999654173851, + "learning_rate": 6.966017722566246e-05, + "loss": 0.5576, + "step": 4902 + }, + { + "epoch": 1.0079144824750745, + "grad_norm": 0.19729197025299072, + "learning_rate": 6.965182180879873e-05, + "loss": 0.5689, + "step": 4903 + }, + { + "epoch": 1.0081200534484531, + "grad_norm": 0.20436379313468933, + "learning_rate": 6.964346517745498e-05, + "loss": 0.5766, + "step": 4904 + }, + { + "epoch": 1.0083256244218317, + "grad_norm": 0.19463692605495453, + "learning_rate": 6.963510733204288e-05, + "loss": 0.5477, + "step": 4905 + }, + { + "epoch": 1.0085311953952103, + "grad_norm": 0.19440148770809174, + "learning_rate": 6.962674827297418e-05, + "loss": 0.5578, + "step": 4906 + }, + { + "epoch": 1.0087367663685887, + "grad_norm": 0.19789615273475647, + "learning_rate": 6.961838800066072e-05, + "loss": 0.5559, + "step": 4907 + }, + { + "epoch": 1.0089423373419673, + "grad_norm": 0.19245871901512146, + "learning_rate": 6.961002651551432e-05, + "loss": 0.5484, + "step": 4908 + }, + { + "epoch": 1.0091479083153458, + "grad_norm": 0.19907300174236298, + "learning_rate": 6.960166381794697e-05, + "loss": 0.5343, + "step": 4909 + }, + { + "epoch": 1.0093534792887244, + "grad_norm": 0.19037161767482758, + "learning_rate": 6.959329990837061e-05, + "loss": 0.5389, + "step": 4910 + }, + { + "epoch": 1.009559050262103, + "grad_norm": 0.18873098492622375, + "learning_rate": 6.958493478719733e-05, + "loss": 0.5582, + "step": 4911 + }, + { + "epoch": 1.0097646212354816, + "grad_norm": 0.19992291927337646, + "learning_rate": 6.95765684548392e-05, + "loss": 0.5746, + "step": 4912 + }, + { + "epoch": 1.0099701922088602, + "grad_norm": 0.2006637305021286, + "learning_rate": 6.956820091170844e-05, + "loss": 0.5731, + "step": 4913 + }, + { + "epoch": 1.0101757631822386, + "grad_norm": 0.1814371794462204, + "learning_rate": 6.955983215821724e-05, + "loss": 0.5409, + "step": 4914 + }, + { + "epoch": 1.0103813341556172, + "grad_norm": 0.16987626254558563, + "learning_rate": 6.955146219477788e-05, + "loss": 0.5778, + "step": 4915 + }, + { + "epoch": 1.0105869051289957, + "grad_norm": 0.19764257967472076, + "learning_rate": 6.954309102180276e-05, + "loss": 0.5729, + "step": 4916 + }, + { + "epoch": 1.0107924761023743, + "grad_norm": 0.19731703400611877, + "learning_rate": 6.953471863970424e-05, + "loss": 0.5507, + "step": 4917 + }, + { + "epoch": 1.010998047075753, + "grad_norm": 0.18993428349494934, + "learning_rate": 6.952634504889484e-05, + "loss": 0.5448, + "step": 4918 + }, + { + "epoch": 1.0112036180491315, + "grad_norm": 0.1911395788192749, + "learning_rate": 6.951797024978703e-05, + "loss": 0.5319, + "step": 4919 + }, + { + "epoch": 1.01140918902251, + "grad_norm": 0.2100227028131485, + "learning_rate": 6.950959424279342e-05, + "loss": 0.5865, + "step": 4920 + }, + { + "epoch": 1.0116147599958887, + "grad_norm": 0.1891854703426361, + "learning_rate": 6.950121702832666e-05, + "loss": 0.5353, + "step": 4921 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 0.16457346081733704, + "learning_rate": 6.949283860679946e-05, + "loss": 0.519, + "step": 4922 + }, + { + "epoch": 1.0120259019426456, + "grad_norm": 0.13976171612739563, + "learning_rate": 6.948445897862458e-05, + "loss": 0.5277, + "step": 4923 + }, + { + "epoch": 1.0122314729160242, + "grad_norm": 0.17471906542778015, + "learning_rate": 6.947607814421486e-05, + "loss": 0.5693, + "step": 4924 + }, + { + "epoch": 1.0124370438894028, + "grad_norm": 0.22914856672286987, + "learning_rate": 6.946769610398316e-05, + "loss": 0.5756, + "step": 4925 + }, + { + "epoch": 1.0126426148627814, + "grad_norm": 0.20704926550388336, + "learning_rate": 6.945931285834242e-05, + "loss": 0.5726, + "step": 4926 + }, + { + "epoch": 1.01284818583616, + "grad_norm": 0.19101639091968536, + "learning_rate": 6.945092840770567e-05, + "loss": 0.556, + "step": 4927 + }, + { + "epoch": 1.0130537568095386, + "grad_norm": 0.2030901461839676, + "learning_rate": 6.944254275248597e-05, + "loss": 0.5723, + "step": 4928 + }, + { + "epoch": 1.013259327782917, + "grad_norm": 0.24590568244457245, + "learning_rate": 6.943415589309642e-05, + "loss": 0.551, + "step": 4929 + }, + { + "epoch": 1.0134648987562955, + "grad_norm": 0.1951897293329239, + "learning_rate": 6.942576782995022e-05, + "loss": 0.5712, + "step": 4930 + }, + { + "epoch": 1.0136704697296741, + "grad_norm": 0.19376187026500702, + "learning_rate": 6.94173785634606e-05, + "loss": 0.5565, + "step": 4931 + }, + { + "epoch": 1.0138760407030527, + "grad_norm": 0.20647601783275604, + "learning_rate": 6.940898809404086e-05, + "loss": 0.5822, + "step": 4932 + }, + { + "epoch": 1.0140816116764313, + "grad_norm": 0.1940944641828537, + "learning_rate": 6.940059642210438e-05, + "loss": 0.5529, + "step": 4933 + }, + { + "epoch": 1.0142871826498099, + "grad_norm": 0.18651318550109863, + "learning_rate": 6.939220354806455e-05, + "loss": 0.519, + "step": 4934 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.17096173763275146, + "learning_rate": 6.938380947233487e-05, + "loss": 0.5716, + "step": 4935 + }, + { + "epoch": 1.014698324596567, + "grad_norm": 0.20484822988510132, + "learning_rate": 6.937541419532885e-05, + "loss": 0.569, + "step": 4936 + }, + { + "epoch": 1.0149038955699454, + "grad_norm": 2.4529898166656494, + "learning_rate": 6.936701771746012e-05, + "loss": 0.5871, + "step": 4937 + }, + { + "epoch": 1.015109466543324, + "grad_norm": 0.1707240641117096, + "learning_rate": 6.935862003914231e-05, + "loss": 0.5322, + "step": 4938 + }, + { + "epoch": 1.0153150375167026, + "grad_norm": 0.20285925269126892, + "learning_rate": 6.935022116078915e-05, + "loss": 0.5767, + "step": 4939 + }, + { + "epoch": 1.0155206084900812, + "grad_norm": 0.23825408518314362, + "learning_rate": 6.93418210828144e-05, + "loss": 0.5537, + "step": 4940 + }, + { + "epoch": 1.0157261794634598, + "grad_norm": 0.25726380944252014, + "learning_rate": 6.93334198056319e-05, + "loss": 0.5728, + "step": 4941 + }, + { + "epoch": 1.0159317504368384, + "grad_norm": 0.2844366431236267, + "learning_rate": 6.932501732965554e-05, + "loss": 0.5752, + "step": 4942 + }, + { + "epoch": 1.016137321410217, + "grad_norm": 0.24454839527606964, + "learning_rate": 6.931661365529926e-05, + "loss": 0.5687, + "step": 4943 + }, + { + "epoch": 1.0163428923835955, + "grad_norm": 0.2527025043964386, + "learning_rate": 6.930820878297711e-05, + "loss": 0.5439, + "step": 4944 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 0.5170005559921265, + "learning_rate": 6.92998027131031e-05, + "loss": 0.5863, + "step": 4945 + }, + { + "epoch": 1.0167540343303525, + "grad_norm": 0.2004466950893402, + "learning_rate": 6.92913954460914e-05, + "loss": 0.542, + "step": 4946 + }, + { + "epoch": 1.016959605303731, + "grad_norm": 0.2018880397081375, + "learning_rate": 6.928298698235619e-05, + "loss": 0.5909, + "step": 4947 + }, + { + "epoch": 1.0171651762771097, + "grad_norm": 0.21628795564174652, + "learning_rate": 6.927457732231169e-05, + "loss": 0.5622, + "step": 4948 + }, + { + "epoch": 1.0173707472504883, + "grad_norm": 0.21719391644001007, + "learning_rate": 6.926616646637225e-05, + "loss": 0.5624, + "step": 4949 + }, + { + "epoch": 1.0175763182238668, + "grad_norm": 0.20705457031726837, + "learning_rate": 6.92577544149522e-05, + "loss": 0.56, + "step": 4950 + }, + { + "epoch": 1.0177818891972454, + "grad_norm": 0.1947423666715622, + "learning_rate": 6.924934116846596e-05, + "loss": 0.5193, + "step": 4951 + }, + { + "epoch": 1.0179874601706238, + "grad_norm": 0.1868080198764801, + "learning_rate": 6.924092672732802e-05, + "loss": 0.5699, + "step": 4952 + }, + { + "epoch": 1.0181930311440024, + "grad_norm": 0.2158852070569992, + "learning_rate": 6.923251109195293e-05, + "loss": 0.5611, + "step": 4953 + }, + { + "epoch": 1.018398602117381, + "grad_norm": 0.17527857422828674, + "learning_rate": 6.922409426275528e-05, + "loss": 0.5361, + "step": 4954 + }, + { + "epoch": 1.0186041730907596, + "grad_norm": 0.16154874861240387, + "learning_rate": 6.921567624014973e-05, + "loss": 0.5337, + "step": 4955 + }, + { + "epoch": 1.0188097440641382, + "grad_norm": 0.18655456602573395, + "learning_rate": 6.920725702455099e-05, + "loss": 0.5684, + "step": 4956 + }, + { + "epoch": 1.0190153150375167, + "grad_norm": 0.22478148341178894, + "learning_rate": 6.919883661637383e-05, + "loss": 0.5722, + "step": 4957 + }, + { + "epoch": 1.0192208860108953, + "grad_norm": 0.20847651362419128, + "learning_rate": 6.919041501603313e-05, + "loss": 0.5891, + "step": 4958 + }, + { + "epoch": 1.019426456984274, + "grad_norm": 0.17430467903614044, + "learning_rate": 6.918199222394373e-05, + "loss": 0.5449, + "step": 4959 + }, + { + "epoch": 1.0196320279576523, + "grad_norm": 0.17865034937858582, + "learning_rate": 6.917356824052059e-05, + "loss": 0.54, + "step": 4960 + }, + { + "epoch": 1.0198375989310309, + "grad_norm": 0.19386602938175201, + "learning_rate": 6.916514306617874e-05, + "loss": 0.5582, + "step": 4961 + }, + { + "epoch": 1.0200431699044095, + "grad_norm": 0.1756899505853653, + "learning_rate": 6.915671670133324e-05, + "loss": 0.521, + "step": 4962 + }, + { + "epoch": 1.020248740877788, + "grad_norm": 0.16583296656608582, + "learning_rate": 6.914828914639922e-05, + "loss": 0.5647, + "step": 4963 + }, + { + "epoch": 1.0204543118511666, + "grad_norm": 0.19850464165210724, + "learning_rate": 6.913986040179185e-05, + "loss": 0.5415, + "step": 4964 + }, + { + "epoch": 1.0206598828245452, + "grad_norm": 0.2507860064506531, + "learning_rate": 6.913143046792639e-05, + "loss": 0.5441, + "step": 4965 + }, + { + "epoch": 1.0208654537979238, + "grad_norm": 0.19658030569553375, + "learning_rate": 6.912299934521814e-05, + "loss": 0.5782, + "step": 4966 + }, + { + "epoch": 1.0210710247713024, + "grad_norm": 0.19466283917427063, + "learning_rate": 6.911456703408246e-05, + "loss": 0.5552, + "step": 4967 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.16281276941299438, + "learning_rate": 6.910613353493479e-05, + "loss": 0.5291, + "step": 4968 + }, + { + "epoch": 1.0214821667180594, + "grad_norm": 0.1634058803319931, + "learning_rate": 6.909769884819057e-05, + "loss": 0.5497, + "step": 4969 + }, + { + "epoch": 1.021687737691438, + "grad_norm": 0.1930556446313858, + "learning_rate": 6.908926297426537e-05, + "loss": 0.5608, + "step": 4970 + }, + { + "epoch": 1.0218933086648165, + "grad_norm": 0.19795656204223633, + "learning_rate": 6.908082591357478e-05, + "loss": 0.5729, + "step": 4971 + }, + { + "epoch": 1.0220988796381951, + "grad_norm": 0.19776557385921478, + "learning_rate": 6.907238766653445e-05, + "loss": 0.5634, + "step": 4972 + }, + { + "epoch": 1.0223044506115737, + "grad_norm": 0.19151826202869415, + "learning_rate": 6.90639482335601e-05, + "loss": 0.571, + "step": 4973 + }, + { + "epoch": 1.0225100215849523, + "grad_norm": 0.18954800069332123, + "learning_rate": 6.905550761506747e-05, + "loss": 0.5519, + "step": 4974 + }, + { + "epoch": 1.0227155925583307, + "grad_norm": 0.19335106015205383, + "learning_rate": 6.904706581147243e-05, + "loss": 0.5452, + "step": 4975 + }, + { + "epoch": 1.0229211635317093, + "grad_norm": 0.20168174803256989, + "learning_rate": 6.903862282319087e-05, + "loss": 0.5838, + "step": 4976 + }, + { + "epoch": 1.0231267345050878, + "grad_norm": 0.20087262988090515, + "learning_rate": 6.90301786506387e-05, + "loss": 0.5656, + "step": 4977 + }, + { + "epoch": 1.0233323054784664, + "grad_norm": 0.1917273849248886, + "learning_rate": 6.902173329423195e-05, + "loss": 0.5679, + "step": 4978 + }, + { + "epoch": 1.023537876451845, + "grad_norm": 0.1951013058423996, + "learning_rate": 6.901328675438669e-05, + "loss": 0.5635, + "step": 4979 + }, + { + "epoch": 1.0237434474252236, + "grad_norm": 0.20168475806713104, + "learning_rate": 6.9004839031519e-05, + "loss": 0.5826, + "step": 4980 + }, + { + "epoch": 1.0239490183986022, + "grad_norm": 0.19177857041358948, + "learning_rate": 6.899639012604512e-05, + "loss": 0.5675, + "step": 4981 + }, + { + "epoch": 1.0241545893719808, + "grad_norm": 0.15916599333286285, + "learning_rate": 6.898794003838124e-05, + "loss": 0.5457, + "step": 4982 + }, + { + "epoch": 1.0243601603453591, + "grad_norm": 0.1605004519224167, + "learning_rate": 6.897948876894369e-05, + "loss": 0.5663, + "step": 4983 + }, + { + "epoch": 1.0245657313187377, + "grad_norm": 0.19485965371131897, + "learning_rate": 6.897103631814878e-05, + "loss": 0.5683, + "step": 4984 + }, + { + "epoch": 1.0247713022921163, + "grad_norm": 0.1926756501197815, + "learning_rate": 6.896258268641298e-05, + "loss": 0.5525, + "step": 4985 + }, + { + "epoch": 1.024976873265495, + "grad_norm": 0.19675122201442719, + "learning_rate": 6.895412787415272e-05, + "loss": 0.5811, + "step": 4986 + }, + { + "epoch": 1.0251824442388735, + "grad_norm": 0.19753362238407135, + "learning_rate": 6.894567188178454e-05, + "loss": 0.582, + "step": 4987 + }, + { + "epoch": 1.025388015212252, + "grad_norm": 0.195309117436409, + "learning_rate": 6.893721470972502e-05, + "loss": 0.5446, + "step": 4988 + }, + { + "epoch": 1.0255935861856307, + "grad_norm": 0.19395774602890015, + "learning_rate": 6.892875635839081e-05, + "loss": 0.574, + "step": 4989 + }, + { + "epoch": 1.025799157159009, + "grad_norm": 0.1611412912607193, + "learning_rate": 6.892029682819864e-05, + "loss": 0.5342, + "step": 4990 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 0.1334841102361679, + "learning_rate": 6.891183611956523e-05, + "loss": 0.5458, + "step": 4991 + }, + { + "epoch": 1.0262102991057662, + "grad_norm": 0.18414229154586792, + "learning_rate": 6.890337423290743e-05, + "loss": 0.5658, + "step": 4992 + }, + { + "epoch": 1.0264158700791448, + "grad_norm": 0.20537151396274567, + "learning_rate": 6.88949111686421e-05, + "loss": 0.6111, + "step": 4993 + }, + { + "epoch": 1.0266214410525234, + "grad_norm": 0.18854451179504395, + "learning_rate": 6.88864469271862e-05, + "loss": 0.5655, + "step": 4994 + }, + { + "epoch": 1.026827012025902, + "grad_norm": 0.19057627022266388, + "learning_rate": 6.887798150895667e-05, + "loss": 0.5645, + "step": 4995 + }, + { + "epoch": 1.0270325829992806, + "grad_norm": 0.22320972383022308, + "learning_rate": 6.886951491437062e-05, + "loss": 0.5688, + "step": 4996 + }, + { + "epoch": 1.0272381539726592, + "grad_norm": 0.2112189084291458, + "learning_rate": 6.886104714384512e-05, + "loss": 0.5599, + "step": 4997 + }, + { + "epoch": 1.0274437249460375, + "grad_norm": 0.1889009028673172, + "learning_rate": 6.885257819779736e-05, + "loss": 0.5472, + "step": 4998 + }, + { + "epoch": 1.0276492959194161, + "grad_norm": 0.18562033772468567, + "learning_rate": 6.884410807664456e-05, + "loss": 0.5478, + "step": 4999 + }, + { + "epoch": 1.0278548668927947, + "grad_norm": 0.1892947107553482, + "learning_rate": 6.8835636780804e-05, + "loss": 0.5561, + "step": 5000 + }, + { + "epoch": 1.0280604378661733, + "grad_norm": 0.19414404034614563, + "learning_rate": 6.882716431069303e-05, + "loss": 0.5769, + "step": 5001 + }, + { + "epoch": 1.0282660088395519, + "grad_norm": 0.194126158952713, + "learning_rate": 6.881869066672904e-05, + "loss": 0.5609, + "step": 5002 + }, + { + "epoch": 1.0284715798129305, + "grad_norm": 0.1930353194475174, + "learning_rate": 6.881021584932949e-05, + "loss": 0.57, + "step": 5003 + }, + { + "epoch": 1.028677150786309, + "grad_norm": 0.18623441457748413, + "learning_rate": 6.88017398589119e-05, + "loss": 0.5429, + "step": 5004 + }, + { + "epoch": 1.0288827217596874, + "grad_norm": 0.1921243667602539, + "learning_rate": 6.879326269589382e-05, + "loss": 0.5579, + "step": 5005 + }, + { + "epoch": 1.029088292733066, + "grad_norm": 0.18247570097446442, + "learning_rate": 6.87847843606929e-05, + "loss": 0.5402, + "step": 5006 + }, + { + "epoch": 1.0292938637064446, + "grad_norm": 0.17088961601257324, + "learning_rate": 6.877630485372684e-05, + "loss": 0.5483, + "step": 5007 + }, + { + "epoch": 1.0294994346798232, + "grad_norm": 0.20109815895557404, + "learning_rate": 6.876782417541334e-05, + "loss": 0.5541, + "step": 5008 + }, + { + "epoch": 1.0297050056532018, + "grad_norm": 0.19609639048576355, + "learning_rate": 6.875934232617027e-05, + "loss": 0.5629, + "step": 5009 + }, + { + "epoch": 1.0299105766265804, + "grad_norm": 0.19312147796154022, + "learning_rate": 6.875085930641543e-05, + "loss": 0.5603, + "step": 5010 + }, + { + "epoch": 1.030116147599959, + "grad_norm": 0.1975242644548416, + "learning_rate": 6.874237511656677e-05, + "loss": 0.5763, + "step": 5011 + }, + { + "epoch": 1.0303217185733375, + "grad_norm": 0.1999368965625763, + "learning_rate": 6.873388975704225e-05, + "loss": 0.5884, + "step": 5012 + }, + { + "epoch": 1.030527289546716, + "grad_norm": 0.16335703432559967, + "learning_rate": 6.872540322825994e-05, + "loss": 0.5181, + "step": 5013 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 0.17105185985565186, + "learning_rate": 6.871691553063788e-05, + "loss": 0.566, + "step": 5014 + }, + { + "epoch": 1.030938431493473, + "grad_norm": 0.21174640953540802, + "learning_rate": 6.870842666459425e-05, + "loss": 0.5851, + "step": 5015 + }, + { + "epoch": 1.0311440024668517, + "grad_norm": 0.19627945125102997, + "learning_rate": 6.869993663054725e-05, + "loss": 0.5655, + "step": 5016 + }, + { + "epoch": 1.0313495734402303, + "grad_norm": 0.18857216835021973, + "learning_rate": 6.869144542891517e-05, + "loss": 0.5448, + "step": 5017 + }, + { + "epoch": 1.0315551444136088, + "grad_norm": 0.16696830093860626, + "learning_rate": 6.868295306011628e-05, + "loss": 0.5241, + "step": 5018 + }, + { + "epoch": 1.0317607153869874, + "grad_norm": 0.16772493720054626, + "learning_rate": 6.867445952456899e-05, + "loss": 0.5759, + "step": 5019 + }, + { + "epoch": 1.0319662863603658, + "grad_norm": 0.19662131369113922, + "learning_rate": 6.866596482269175e-05, + "loss": 0.5647, + "step": 5020 + }, + { + "epoch": 1.0321718573337444, + "grad_norm": 0.20151005685329437, + "learning_rate": 6.8657468954903e-05, + "loss": 0.5577, + "step": 5021 + }, + { + "epoch": 1.032377428307123, + "grad_norm": 0.16064363718032837, + "learning_rate": 6.864897192162136e-05, + "loss": 0.5182, + "step": 5022 + }, + { + "epoch": 1.0325829992805016, + "grad_norm": 0.16647809743881226, + "learning_rate": 6.864047372326539e-05, + "loss": 0.5572, + "step": 5023 + }, + { + "epoch": 1.0327885702538802, + "grad_norm": 0.18600209057331085, + "learning_rate": 6.86319743602538e-05, + "loss": 0.5426, + "step": 5024 + }, + { + "epoch": 1.0329941412272587, + "grad_norm": 0.15970858931541443, + "learning_rate": 6.862347383300529e-05, + "loss": 0.5119, + "step": 5025 + }, + { + "epoch": 1.0331997122006373, + "grad_norm": 0.16188785433769226, + "learning_rate": 6.861497214193861e-05, + "loss": 0.5732, + "step": 5026 + }, + { + "epoch": 1.033405283174016, + "grad_norm": 0.19583800435066223, + "learning_rate": 6.860646928747265e-05, + "loss": 0.5387, + "step": 5027 + }, + { + "epoch": 1.0336108541473943, + "grad_norm": 0.19050170481204987, + "learning_rate": 6.859796527002627e-05, + "loss": 0.5715, + "step": 5028 + }, + { + "epoch": 1.0338164251207729, + "grad_norm": 0.19282078742980957, + "learning_rate": 6.858946009001844e-05, + "loss": 0.5717, + "step": 5029 + }, + { + "epoch": 1.0340219960941515, + "grad_norm": 0.19418777525424957, + "learning_rate": 6.858095374786818e-05, + "loss": 0.558, + "step": 5030 + }, + { + "epoch": 1.03422756706753, + "grad_norm": 0.2037775069475174, + "learning_rate": 6.857244624399455e-05, + "loss": 0.5487, + "step": 5031 + }, + { + "epoch": 1.0344331380409086, + "grad_norm": 0.20054981112480164, + "learning_rate": 6.856393757881665e-05, + "loss": 0.5565, + "step": 5032 + }, + { + "epoch": 1.0346387090142872, + "grad_norm": 0.2035524547100067, + "learning_rate": 6.855542775275369e-05, + "loss": 0.5952, + "step": 5033 + }, + { + "epoch": 1.0348442799876658, + "grad_norm": 0.1957496851682663, + "learning_rate": 6.854691676622492e-05, + "loss": 0.5563, + "step": 5034 + }, + { + "epoch": 1.0350498509610444, + "grad_norm": 0.21326994895935059, + "learning_rate": 6.853840461964961e-05, + "loss": 0.5745, + "step": 5035 + }, + { + "epoch": 1.0352554219344228, + "grad_norm": 0.16329696774482727, + "learning_rate": 6.852989131344712e-05, + "loss": 0.5331, + "step": 5036 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 0.16014549136161804, + "learning_rate": 6.852137684803686e-05, + "loss": 0.5432, + "step": 5037 + }, + { + "epoch": 1.03566656388118, + "grad_norm": 0.164669468998909, + "learning_rate": 6.851286122383831e-05, + "loss": 0.5325, + "step": 5038 + }, + { + "epoch": 1.0358721348545585, + "grad_norm": 0.16093246638774872, + "learning_rate": 6.850434444127098e-05, + "loss": 0.5639, + "step": 5039 + }, + { + "epoch": 1.0360777058279371, + "grad_norm": 0.20291577279567719, + "learning_rate": 6.849582650075445e-05, + "loss": 0.5414, + "step": 5040 + }, + { + "epoch": 1.0362832768013157, + "grad_norm": 0.16935724020004272, + "learning_rate": 6.848730740270839e-05, + "loss": 0.5082, + "step": 5041 + }, + { + "epoch": 1.0364888477746943, + "grad_norm": 0.1641445755958557, + "learning_rate": 6.847878714755244e-05, + "loss": 0.5472, + "step": 5042 + }, + { + "epoch": 1.0366944187480727, + "grad_norm": 0.196893572807312, + "learning_rate": 6.847026573570642e-05, + "loss": 0.5856, + "step": 5043 + }, + { + "epoch": 1.0368999897214513, + "grad_norm": 0.19081740081310272, + "learning_rate": 6.846174316759007e-05, + "loss": 0.5622, + "step": 5044 + }, + { + "epoch": 1.0371055606948298, + "grad_norm": 0.16471846401691437, + "learning_rate": 6.845321944362332e-05, + "loss": 0.5485, + "step": 5045 + }, + { + "epoch": 1.0373111316682084, + "grad_norm": 0.16739195585250854, + "learning_rate": 6.844469456422606e-05, + "loss": 0.5717, + "step": 5046 + }, + { + "epoch": 1.037516702641587, + "grad_norm": 0.19602900743484497, + "learning_rate": 6.843616852981831e-05, + "loss": 0.563, + "step": 5047 + }, + { + "epoch": 1.0377222736149656, + "grad_norm": 0.19288770854473114, + "learning_rate": 6.842764134082004e-05, + "loss": 0.5641, + "step": 5048 + }, + { + "epoch": 1.0379278445883442, + "grad_norm": 0.18607531487941742, + "learning_rate": 6.841911299765141e-05, + "loss": 0.5437, + "step": 5049 + }, + { + "epoch": 1.0381334155617228, + "grad_norm": 0.20571644604206085, + "learning_rate": 6.84105835007325e-05, + "loss": 0.5444, + "step": 5050 + }, + { + "epoch": 1.0383389865351011, + "grad_norm": 0.2017316222190857, + "learning_rate": 6.840205285048359e-05, + "loss": 0.5615, + "step": 5051 + }, + { + "epoch": 1.0385445575084797, + "grad_norm": 0.19289067387580872, + "learning_rate": 6.839352104732492e-05, + "loss": 0.5715, + "step": 5052 + }, + { + "epoch": 1.0387501284818583, + "grad_norm": 0.19483251869678497, + "learning_rate": 6.838498809167681e-05, + "loss": 0.5936, + "step": 5053 + }, + { + "epoch": 1.038955699455237, + "grad_norm": 0.17118024826049805, + "learning_rate": 6.837645398395962e-05, + "loss": 0.5091, + "step": 5054 + }, + { + "epoch": 1.0391612704286155, + "grad_norm": 0.15377415716648102, + "learning_rate": 6.836791872459382e-05, + "loss": 0.5493, + "step": 5055 + }, + { + "epoch": 1.039366841401994, + "grad_norm": 0.16883303225040436, + "learning_rate": 6.835938231399989e-05, + "loss": 0.5309, + "step": 5056 + }, + { + "epoch": 1.0395724123753727, + "grad_norm": 0.18094538152217865, + "learning_rate": 6.835084475259835e-05, + "loss": 0.5579, + "step": 5057 + }, + { + "epoch": 1.0397779833487513, + "grad_norm": 0.1928682029247284, + "learning_rate": 6.834230604080986e-05, + "loss": 0.5608, + "step": 5058 + }, + { + "epoch": 1.0399835543221296, + "grad_norm": 0.16030603647232056, + "learning_rate": 6.833376617905504e-05, + "loss": 0.5228, + "step": 5059 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 0.16263644397258759, + "learning_rate": 6.832522516775462e-05, + "loss": 0.5724, + "step": 5060 + }, + { + "epoch": 1.0403946962688868, + "grad_norm": 0.2015358805656433, + "learning_rate": 6.831668300732938e-05, + "loss": 0.5652, + "step": 5061 + }, + { + "epoch": 1.0406002672422654, + "grad_norm": 0.19693398475646973, + "learning_rate": 6.830813969820015e-05, + "loss": 0.5457, + "step": 5062 + }, + { + "epoch": 1.040805838215644, + "grad_norm": 0.19118033349514008, + "learning_rate": 6.829959524078782e-05, + "loss": 0.5615, + "step": 5063 + }, + { + "epoch": 1.0410114091890226, + "grad_norm": 0.2018975466489792, + "learning_rate": 6.829104963551332e-05, + "loss": 0.5883, + "step": 5064 + }, + { + "epoch": 1.0412169801624012, + "grad_norm": 0.1714175045490265, + "learning_rate": 6.828250288279768e-05, + "loss": 0.514, + "step": 5065 + }, + { + "epoch": 1.0414225511357795, + "grad_norm": 0.1588568538427353, + "learning_rate": 6.827395498306195e-05, + "loss": 0.5603, + "step": 5066 + }, + { + "epoch": 1.0416281221091581, + "grad_norm": 0.18845230340957642, + "learning_rate": 6.826540593672724e-05, + "loss": 0.5685, + "step": 5067 + }, + { + "epoch": 1.0418336930825367, + "grad_norm": 0.19369468092918396, + "learning_rate": 6.825685574421471e-05, + "loss": 0.5599, + "step": 5068 + }, + { + "epoch": 1.0420392640559153, + "grad_norm": 0.19052891433238983, + "learning_rate": 6.824830440594561e-05, + "loss": 0.5593, + "step": 5069 + }, + { + "epoch": 1.0422448350292939, + "grad_norm": 0.19876545667648315, + "learning_rate": 6.823975192234123e-05, + "loss": 0.5911, + "step": 5070 + }, + { + "epoch": 1.0424504060026725, + "grad_norm": 0.19648174941539764, + "learning_rate": 6.823119829382285e-05, + "loss": 0.544, + "step": 5071 + }, + { + "epoch": 1.042655976976051, + "grad_norm": 0.20026130974292755, + "learning_rate": 6.822264352081194e-05, + "loss": 0.5574, + "step": 5072 + }, + { + "epoch": 1.0428615479494296, + "grad_norm": 0.18708030879497528, + "learning_rate": 6.821408760372994e-05, + "loss": 0.5367, + "step": 5073 + }, + { + "epoch": 1.043067118922808, + "grad_norm": 0.18605582416057587, + "learning_rate": 6.820553054299832e-05, + "loss": 0.5383, + "step": 5074 + }, + { + "epoch": 1.0432726898961866, + "grad_norm": 0.19726519286632538, + "learning_rate": 6.81969723390387e-05, + "loss": 0.5922, + "step": 5075 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.1689324975013733, + "learning_rate": 6.818841299227264e-05, + "loss": 0.5238, + "step": 5076 + }, + { + "epoch": 1.0436838318429438, + "grad_norm": 0.16380931437015533, + "learning_rate": 6.817985250312187e-05, + "loss": 0.5622, + "step": 5077 + }, + { + "epoch": 1.0438894028163224, + "grad_norm": 0.18795007467269897, + "learning_rate": 6.817129087200812e-05, + "loss": 0.5529, + "step": 5078 + }, + { + "epoch": 1.044094973789701, + "grad_norm": 0.21080972254276276, + "learning_rate": 6.816272809935315e-05, + "loss": 0.5553, + "step": 5079 + }, + { + "epoch": 1.0443005447630795, + "grad_norm": 0.19788028299808502, + "learning_rate": 6.815416418557885e-05, + "loss": 0.5868, + "step": 5080 + }, + { + "epoch": 1.044506115736458, + "grad_norm": 0.17419970035552979, + "learning_rate": 6.81455991311071e-05, + "loss": 0.5412, + "step": 5081 + }, + { + "epoch": 1.0447116867098365, + "grad_norm": 0.1657952070236206, + "learning_rate": 6.813703293635986e-05, + "loss": 0.557, + "step": 5082 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 0.19872242212295532, + "learning_rate": 6.812846560175916e-05, + "loss": 0.5702, + "step": 5083 + }, + { + "epoch": 1.0451228286565937, + "grad_norm": 0.18654018640518188, + "learning_rate": 6.811989712772704e-05, + "loss": 0.5414, + "step": 5084 + }, + { + "epoch": 1.0453283996299723, + "grad_norm": 0.1918267160654068, + "learning_rate": 6.811132751468566e-05, + "loss": 0.5687, + "step": 5085 + }, + { + "epoch": 1.0455339706033508, + "grad_norm": 0.1659933179616928, + "learning_rate": 6.810275676305719e-05, + "loss": 0.5324, + "step": 5086 + }, + { + "epoch": 1.0457395415767294, + "grad_norm": 0.13235360383987427, + "learning_rate": 6.809418487326388e-05, + "loss": 0.5161, + "step": 5087 + }, + { + "epoch": 1.045945112550108, + "grad_norm": 0.17006467282772064, + "learning_rate": 6.808561184572802e-05, + "loss": 0.5641, + "step": 5088 + }, + { + "epoch": 1.0461506835234864, + "grad_norm": 0.20476646721363068, + "learning_rate": 6.807703768087196e-05, + "loss": 0.5604, + "step": 5089 + }, + { + "epoch": 1.046356254496865, + "grad_norm": 0.2010469287633896, + "learning_rate": 6.806846237911815e-05, + "loss": 0.559, + "step": 5090 + }, + { + "epoch": 1.0465618254702436, + "grad_norm": 0.1989106982946396, + "learning_rate": 6.805988594088898e-05, + "loss": 0.5642, + "step": 5091 + }, + { + "epoch": 1.0467673964436222, + "grad_norm": 0.17077521979808807, + "learning_rate": 6.805130836660703e-05, + "loss": 0.537, + "step": 5092 + }, + { + "epoch": 1.0469729674170007, + "grad_norm": 0.16533872485160828, + "learning_rate": 6.804272965669486e-05, + "loss": 0.5552, + "step": 5093 + }, + { + "epoch": 1.0471785383903793, + "grad_norm": 0.19124990701675415, + "learning_rate": 6.80341498115751e-05, + "loss": 0.5574, + "step": 5094 + }, + { + "epoch": 1.047384109363758, + "grad_norm": 0.19139717519283295, + "learning_rate": 6.802556883167043e-05, + "loss": 0.5446, + "step": 5095 + }, + { + "epoch": 1.0475896803371363, + "grad_norm": 0.19126202166080475, + "learning_rate": 6.801698671740362e-05, + "loss": 0.5634, + "step": 5096 + }, + { + "epoch": 1.0477952513105149, + "grad_norm": 0.19115038216114044, + "learning_rate": 6.800840346919744e-05, + "loss": 0.5393, + "step": 5097 + }, + { + "epoch": 1.0480008222838935, + "grad_norm": 0.1927635818719864, + "learning_rate": 6.799981908747476e-05, + "loss": 0.5527, + "step": 5098 + }, + { + "epoch": 1.048206393257272, + "grad_norm": 0.20182408392429352, + "learning_rate": 6.799123357265852e-05, + "loss": 0.5691, + "step": 5099 + }, + { + "epoch": 1.0484119642306506, + "grad_norm": 0.1980399638414383, + "learning_rate": 6.798264692517165e-05, + "loss": 0.5593, + "step": 5100 + }, + { + "epoch": 1.0486175352040292, + "grad_norm": 0.19788923859596252, + "learning_rate": 6.797405914543717e-05, + "loss": 0.571, + "step": 5101 + }, + { + "epoch": 1.0488231061774078, + "grad_norm": 0.18928498029708862, + "learning_rate": 6.79654702338782e-05, + "loss": 0.5616, + "step": 5102 + }, + { + "epoch": 1.0490286771507864, + "grad_norm": 0.18653394281864166, + "learning_rate": 6.795688019091784e-05, + "loss": 0.5553, + "step": 5103 + }, + { + "epoch": 1.0492342481241648, + "grad_norm": 0.17353960871696472, + "learning_rate": 6.79482890169793e-05, + "loss": 0.5348, + "step": 5104 + }, + { + "epoch": 1.0494398190975434, + "grad_norm": 0.1659521758556366, + "learning_rate": 6.79396967124858e-05, + "loss": 0.5799, + "step": 5105 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 0.16796258091926575, + "learning_rate": 6.79311032778607e-05, + "loss": 0.5178, + "step": 5106 + }, + { + "epoch": 1.0498509610443005, + "grad_norm": 0.16122405230998993, + "learning_rate": 6.79225087135273e-05, + "loss": 0.555, + "step": 5107 + }, + { + "epoch": 1.0500565320176791, + "grad_norm": 0.20025917887687683, + "learning_rate": 6.791391301990902e-05, + "loss": 0.5649, + "step": 5108 + }, + { + "epoch": 1.0502621029910577, + "grad_norm": 0.19537703692913055, + "learning_rate": 6.790531619742936e-05, + "loss": 0.5517, + "step": 5109 + }, + { + "epoch": 1.0504676739644363, + "grad_norm": 0.18383800983428955, + "learning_rate": 6.789671824651183e-05, + "loss": 0.5673, + "step": 5110 + }, + { + "epoch": 1.0506732449378149, + "grad_norm": 0.19807079434394836, + "learning_rate": 6.788811916758002e-05, + "loss": 0.5811, + "step": 5111 + }, + { + "epoch": 1.0508788159111933, + "grad_norm": 0.19630371034145355, + "learning_rate": 6.787951896105754e-05, + "loss": 0.5306, + "step": 5112 + }, + { + "epoch": 1.0510843868845718, + "grad_norm": 0.18975067138671875, + "learning_rate": 6.78709176273681e-05, + "loss": 0.569, + "step": 5113 + }, + { + "epoch": 1.0512899578579504, + "grad_norm": 0.20035415887832642, + "learning_rate": 6.786231516693547e-05, + "loss": 0.5387, + "step": 5114 + }, + { + "epoch": 1.051495528831329, + "grad_norm": 0.1959179788827896, + "learning_rate": 6.785371158018341e-05, + "loss": 0.5653, + "step": 5115 + }, + { + "epoch": 1.0517010998047076, + "grad_norm": 0.16765445470809937, + "learning_rate": 6.78451068675358e-05, + "loss": 0.5406, + "step": 5116 + }, + { + "epoch": 1.0519066707780862, + "grad_norm": 0.15998391807079315, + "learning_rate": 6.783650102941656e-05, + "loss": 0.5506, + "step": 5117 + }, + { + "epoch": 1.0521122417514648, + "grad_norm": 0.19628630578517914, + "learning_rate": 6.782789406624964e-05, + "loss": 0.5581, + "step": 5118 + }, + { + "epoch": 1.0523178127248431, + "grad_norm": 0.20828314125537872, + "learning_rate": 6.781928597845909e-05, + "loss": 0.549, + "step": 5119 + }, + { + "epoch": 1.0525233836982217, + "grad_norm": 0.1985846757888794, + "learning_rate": 6.781067676646896e-05, + "loss": 0.5625, + "step": 5120 + }, + { + "epoch": 1.0527289546716003, + "grad_norm": 0.21041736006736755, + "learning_rate": 6.780206643070343e-05, + "loss": 0.5387, + "step": 5121 + }, + { + "epoch": 1.052934525644979, + "grad_norm": 0.20188267529010773, + "learning_rate": 6.779345497158664e-05, + "loss": 0.5511, + "step": 5122 + }, + { + "epoch": 1.0531400966183575, + "grad_norm": 0.19628292322158813, + "learning_rate": 6.778484238954287e-05, + "loss": 0.5509, + "step": 5123 + }, + { + "epoch": 1.053345667591736, + "grad_norm": 0.19556038081645966, + "learning_rate": 6.77762286849964e-05, + "loss": 0.5563, + "step": 5124 + }, + { + "epoch": 1.0535512385651147, + "grad_norm": 0.18803851306438446, + "learning_rate": 6.776761385837161e-05, + "loss": 0.5833, + "step": 5125 + }, + { + "epoch": 1.0537568095384933, + "grad_norm": 0.20179903507232666, + "learning_rate": 6.77589979100929e-05, + "loss": 0.5745, + "step": 5126 + }, + { + "epoch": 1.0539623805118716, + "grad_norm": 0.19985097646713257, + "learning_rate": 6.775038084058473e-05, + "loss": 0.5741, + "step": 5127 + }, + { + "epoch": 1.0541679514852502, + "grad_norm": 0.19471241533756256, + "learning_rate": 6.774176265027164e-05, + "loss": 0.569, + "step": 5128 + }, + { + "epoch": 1.0543735224586288, + "grad_norm": 0.18633931875228882, + "learning_rate": 6.77331433395782e-05, + "loss": 0.5274, + "step": 5129 + }, + { + "epoch": 1.0545790934320074, + "grad_norm": 0.18471461534500122, + "learning_rate": 6.772452290892902e-05, + "loss": 0.5643, + "step": 5130 + }, + { + "epoch": 1.054784664405386, + "grad_norm": 0.16938886046409607, + "learning_rate": 6.771590135874883e-05, + "loss": 0.5321, + "step": 5131 + }, + { + "epoch": 1.0549902353787646, + "grad_norm": 0.16139011085033417, + "learning_rate": 6.770727868946237e-05, + "loss": 0.5531, + "step": 5132 + }, + { + "epoch": 1.0551958063521432, + "grad_norm": 0.1963978409767151, + "learning_rate": 6.769865490149439e-05, + "loss": 0.5727, + "step": 5133 + }, + { + "epoch": 1.0554013773255218, + "grad_norm": 0.16599130630493164, + "learning_rate": 6.76900299952698e-05, + "loss": 0.5472, + "step": 5134 + }, + { + "epoch": 1.0556069482989001, + "grad_norm": 0.16204878687858582, + "learning_rate": 6.768140397121347e-05, + "loss": 0.5799, + "step": 5135 + }, + { + "epoch": 1.0558125192722787, + "grad_norm": 0.19888906180858612, + "learning_rate": 6.767277682975037e-05, + "loss": 0.571, + "step": 5136 + }, + { + "epoch": 1.0560180902456573, + "grad_norm": 0.19019466638565063, + "learning_rate": 6.766414857130556e-05, + "loss": 0.5547, + "step": 5137 + }, + { + "epoch": 1.0562236612190359, + "grad_norm": 0.19364401698112488, + "learning_rate": 6.765551919630407e-05, + "loss": 0.5889, + "step": 5138 + }, + { + "epoch": 1.0564292321924145, + "grad_norm": 0.19653116166591644, + "learning_rate": 6.764688870517104e-05, + "loss": 0.5778, + "step": 5139 + }, + { + "epoch": 1.056634803165793, + "grad_norm": 0.20161549746990204, + "learning_rate": 6.763825709833164e-05, + "loss": 0.5708, + "step": 5140 + }, + { + "epoch": 1.0568403741391716, + "grad_norm": 0.19760295748710632, + "learning_rate": 6.762962437621112e-05, + "loss": 0.5555, + "step": 5141 + }, + { + "epoch": 1.05704594511255, + "grad_norm": 0.1907760202884674, + "learning_rate": 6.76209905392348e-05, + "loss": 0.5496, + "step": 5142 + }, + { + "epoch": 1.0572515160859286, + "grad_norm": 0.19071267545223236, + "learning_rate": 6.7612355587828e-05, + "loss": 0.5601, + "step": 5143 + }, + { + "epoch": 1.0574570870593072, + "grad_norm": 0.1920759677886963, + "learning_rate": 6.760371952241613e-05, + "loss": 0.5783, + "step": 5144 + }, + { + "epoch": 1.0576626580326858, + "grad_norm": 0.17088396847248077, + "learning_rate": 6.759508234342465e-05, + "loss": 0.5436, + "step": 5145 + }, + { + "epoch": 1.0578682290060644, + "grad_norm": 0.13676489889621735, + "learning_rate": 6.758644405127908e-05, + "loss": 0.5261, + "step": 5146 + }, + { + "epoch": 1.058073799979443, + "grad_norm": 0.16138045489788055, + "learning_rate": 6.757780464640496e-05, + "loss": 0.5751, + "step": 5147 + }, + { + "epoch": 1.0582793709528215, + "grad_norm": 0.191030353307724, + "learning_rate": 6.756916412922794e-05, + "loss": 0.5597, + "step": 5148 + }, + { + "epoch": 1.0584849419262001, + "grad_norm": 0.19613182544708252, + "learning_rate": 6.75605225001737e-05, + "loss": 0.5644, + "step": 5149 + }, + { + "epoch": 1.0586905128995785, + "grad_norm": 0.1948171854019165, + "learning_rate": 6.755187975966795e-05, + "loss": 0.5637, + "step": 5150 + }, + { + "epoch": 1.058896083872957, + "grad_norm": 0.1817820966243744, + "learning_rate": 6.754323590813649e-05, + "loss": 0.5389, + "step": 5151 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 0.1902126520872116, + "learning_rate": 6.753459094600518e-05, + "loss": 0.5745, + "step": 5152 + }, + { + "epoch": 1.0593072258197143, + "grad_norm": 0.19361747801303864, + "learning_rate": 6.752594487369989e-05, + "loss": 0.5834, + "step": 5153 + }, + { + "epoch": 1.0595127967930928, + "grad_norm": 0.176842600107193, + "learning_rate": 6.751729769164659e-05, + "loss": 0.5306, + "step": 5154 + }, + { + "epoch": 1.0597183677664714, + "grad_norm": 0.16082750260829926, + "learning_rate": 6.750864940027127e-05, + "loss": 0.5461, + "step": 5155 + }, + { + "epoch": 1.05992393873985, + "grad_norm": 0.17407187819480896, + "learning_rate": 6.75e-05, + "loss": 0.5479, + "step": 5156 + }, + { + "epoch": 1.0601295097132284, + "grad_norm": 0.2048111855983734, + "learning_rate": 6.74913494912589e-05, + "loss": 0.5636, + "step": 5157 + }, + { + "epoch": 1.060335080686607, + "grad_norm": 0.19643527269363403, + "learning_rate": 6.748269787447414e-05, + "loss": 0.577, + "step": 5158 + }, + { + "epoch": 1.0605406516599856, + "grad_norm": 0.19927412271499634, + "learning_rate": 6.747404515007194e-05, + "loss": 0.5753, + "step": 5159 + }, + { + "epoch": 1.0607462226333642, + "grad_norm": 0.20352114737033844, + "learning_rate": 6.746539131847856e-05, + "loss": 0.5699, + "step": 5160 + }, + { + "epoch": 1.0609517936067427, + "grad_norm": 0.19683125615119934, + "learning_rate": 6.745673638012037e-05, + "loss": 0.5847, + "step": 5161 + }, + { + "epoch": 1.0611573645801213, + "grad_norm": 0.19472289085388184, + "learning_rate": 6.744808033542373e-05, + "loss": 0.5613, + "step": 5162 + }, + { + "epoch": 1.0613629355535, + "grad_norm": 0.19928975403308868, + "learning_rate": 6.74394231848151e-05, + "loss": 0.5638, + "step": 5163 + }, + { + "epoch": 1.0615685065268785, + "grad_norm": 0.2239234298467636, + "learning_rate": 6.743076492872096e-05, + "loss": 0.5674, + "step": 5164 + }, + { + "epoch": 1.0617740775002569, + "grad_norm": 0.18867623805999756, + "learning_rate": 6.742210556756789e-05, + "loss": 0.5242, + "step": 5165 + }, + { + "epoch": 1.0619796484736355, + "grad_norm": 0.17035862803459167, + "learning_rate": 6.741344510178247e-05, + "loss": 0.5613, + "step": 5166 + }, + { + "epoch": 1.062185219447014, + "grad_norm": 0.20985183119773865, + "learning_rate": 6.740478353179138e-05, + "loss": 0.5737, + "step": 5167 + }, + { + "epoch": 1.0623907904203926, + "grad_norm": 0.21184340119361877, + "learning_rate": 6.739612085802131e-05, + "loss": 0.5656, + "step": 5168 + }, + { + "epoch": 1.0625963613937712, + "grad_norm": 0.19316667318344116, + "learning_rate": 6.738745708089905e-05, + "loss": 0.5726, + "step": 5169 + }, + { + "epoch": 1.0628019323671498, + "grad_norm": 0.19877000153064728, + "learning_rate": 6.737879220085143e-05, + "loss": 0.5813, + "step": 5170 + }, + { + "epoch": 1.0630075033405284, + "grad_norm": 0.20379842817783356, + "learning_rate": 6.73701262183053e-05, + "loss": 0.5675, + "step": 5171 + }, + { + "epoch": 1.0632130743139068, + "grad_norm": 0.2046133428812027, + "learning_rate": 6.736145913368762e-05, + "loss": 0.5525, + "step": 5172 + }, + { + "epoch": 1.0634186452872854, + "grad_norm": 0.19095589220523834, + "learning_rate": 6.735279094742535e-05, + "loss": 0.549, + "step": 5173 + }, + { + "epoch": 1.063624216260664, + "grad_norm": 0.20165902376174927, + "learning_rate": 6.734412165994556e-05, + "loss": 0.5807, + "step": 5174 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 0.19797958433628082, + "learning_rate": 6.733545127167533e-05, + "loss": 0.5532, + "step": 5175 + }, + { + "epoch": 1.0640353582074211, + "grad_norm": 0.1996331512928009, + "learning_rate": 6.732677978304182e-05, + "loss": 0.5686, + "step": 5176 + }, + { + "epoch": 1.0642409291807997, + "grad_norm": 0.19206491112709045, + "learning_rate": 6.731810719447222e-05, + "loss": 0.5458, + "step": 5177 + }, + { + "epoch": 1.0644465001541783, + "grad_norm": 0.2006731927394867, + "learning_rate": 6.730943350639379e-05, + "loss": 0.5576, + "step": 5178 + }, + { + "epoch": 1.0646520711275569, + "grad_norm": 0.20075003802776337, + "learning_rate": 6.730075871923384e-05, + "loss": 0.615, + "step": 5179 + }, + { + "epoch": 1.0648576421009353, + "grad_norm": 0.1874108761548996, + "learning_rate": 6.729208283341975e-05, + "loss": 0.5487, + "step": 5180 + }, + { + "epoch": 1.0650632130743138, + "grad_norm": 0.19633813202381134, + "learning_rate": 6.728340584937892e-05, + "loss": 0.57, + "step": 5181 + }, + { + "epoch": 1.0652687840476924, + "grad_norm": 0.18520689010620117, + "learning_rate": 6.727472776753885e-05, + "loss": 0.5608, + "step": 5182 + }, + { + "epoch": 1.065474355021071, + "grad_norm": 0.19115997850894928, + "learning_rate": 6.726604858832704e-05, + "loss": 0.5627, + "step": 5183 + }, + { + "epoch": 1.0656799259944496, + "grad_norm": 0.19133557379245758, + "learning_rate": 6.725736831217111e-05, + "loss": 0.5502, + "step": 5184 + }, + { + "epoch": 1.0658854969678282, + "grad_norm": 0.19521182775497437, + "learning_rate": 6.724868693949864e-05, + "loss": 0.5613, + "step": 5185 + }, + { + "epoch": 1.0660910679412068, + "grad_norm": 0.19791938364505768, + "learning_rate": 6.724000447073739e-05, + "loss": 0.5791, + "step": 5186 + }, + { + "epoch": 1.0662966389145851, + "grad_norm": 0.20099619030952454, + "learning_rate": 6.723132090631505e-05, + "loss": 0.5536, + "step": 5187 + }, + { + "epoch": 1.0665022098879637, + "grad_norm": 0.19620858132839203, + "learning_rate": 6.722263624665944e-05, + "loss": 0.564, + "step": 5188 + }, + { + "epoch": 1.0667077808613423, + "grad_norm": 0.19696368277072906, + "learning_rate": 6.721395049219841e-05, + "loss": 0.5791, + "step": 5189 + }, + { + "epoch": 1.066913351834721, + "grad_norm": 0.1899513453245163, + "learning_rate": 6.720526364335987e-05, + "loss": 0.5405, + "step": 5190 + }, + { + "epoch": 1.0671189228080995, + "grad_norm": 0.1824042797088623, + "learning_rate": 6.719657570057178e-05, + "loss": 0.5631, + "step": 5191 + }, + { + "epoch": 1.067324493781478, + "grad_norm": 0.18800349533557892, + "learning_rate": 6.718788666426216e-05, + "loss": 0.5563, + "step": 5192 + }, + { + "epoch": 1.0675300647548567, + "grad_norm": 0.18425163626670837, + "learning_rate": 6.717919653485905e-05, + "loss": 0.5475, + "step": 5193 + }, + { + "epoch": 1.0677356357282353, + "grad_norm": 0.1711423397064209, + "learning_rate": 6.71705053127906e-05, + "loss": 0.5814, + "step": 5194 + }, + { + "epoch": 1.0679412067016136, + "grad_norm": 0.2124950885772705, + "learning_rate": 6.716181299848497e-05, + "loss": 0.5802, + "step": 5195 + }, + { + "epoch": 1.0681467776749922, + "grad_norm": 0.16117288172245026, + "learning_rate": 6.715311959237042e-05, + "loss": 0.5269, + "step": 5196 + }, + { + "epoch": 1.0683523486483708, + "grad_norm": 0.16499152779579163, + "learning_rate": 6.714442509487519e-05, + "loss": 0.5588, + "step": 5197 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 0.20156873762607574, + "learning_rate": 6.713572950642765e-05, + "loss": 0.575, + "step": 5198 + }, + { + "epoch": 1.068763490595128, + "grad_norm": 0.19442300498485565, + "learning_rate": 6.712703282745618e-05, + "loss": 0.5727, + "step": 5199 + }, + { + "epoch": 1.0689690615685066, + "grad_norm": 0.19860216975212097, + "learning_rate": 6.711833505838921e-05, + "loss": 0.5325, + "step": 5200 + }, + { + "epoch": 1.0691746325418852, + "grad_norm": 0.1945996880531311, + "learning_rate": 6.710963619965526e-05, + "loss": 0.569, + "step": 5201 + }, + { + "epoch": 1.0693802035152635, + "grad_norm": 0.20251908898353577, + "learning_rate": 6.710093625168289e-05, + "loss": 0.5884, + "step": 5202 + }, + { + "epoch": 1.0695857744886421, + "grad_norm": 0.20491336286067963, + "learning_rate": 6.709223521490067e-05, + "loss": 0.5788, + "step": 5203 + }, + { + "epoch": 1.0697913454620207, + "grad_norm": 0.19236387312412262, + "learning_rate": 6.708353308973728e-05, + "loss": 0.5606, + "step": 5204 + }, + { + "epoch": 1.0699969164353993, + "grad_norm": 0.16769090294837952, + "learning_rate": 6.707482987662144e-05, + "loss": 0.5143, + "step": 5205 + }, + { + "epoch": 1.0702024874087779, + "grad_norm": 0.1578342467546463, + "learning_rate": 6.70661255759819e-05, + "loss": 0.556, + "step": 5206 + }, + { + "epoch": 1.0704080583821565, + "grad_norm": 0.19746308028697968, + "learning_rate": 6.705742018824751e-05, + "loss": 0.5619, + "step": 5207 + }, + { + "epoch": 1.070613629355535, + "grad_norm": 0.1942613422870636, + "learning_rate": 6.704871371384711e-05, + "loss": 0.5738, + "step": 5208 + }, + { + "epoch": 1.0708192003289136, + "grad_norm": 0.1871325820684433, + "learning_rate": 6.704000615320964e-05, + "loss": 0.578, + "step": 5209 + }, + { + "epoch": 1.0710247713022922, + "grad_norm": 0.19533534348011017, + "learning_rate": 6.703129750676409e-05, + "loss": 0.574, + "step": 5210 + }, + { + "epoch": 1.0712303422756706, + "grad_norm": 0.19535306096076965, + "learning_rate": 6.702258777493947e-05, + "loss": 0.5756, + "step": 5211 + }, + { + "epoch": 1.0714359132490492, + "grad_norm": 0.18510495126247406, + "learning_rate": 6.70138769581649e-05, + "loss": 0.5639, + "step": 5212 + }, + { + "epoch": 1.0716414842224278, + "grad_norm": 0.18865178525447845, + "learning_rate": 6.70051650568695e-05, + "loss": 0.5588, + "step": 5213 + }, + { + "epoch": 1.0718470551958064, + "grad_norm": 0.19118379056453705, + "learning_rate": 6.699645207148247e-05, + "loss": 0.5695, + "step": 5214 + }, + { + "epoch": 1.072052626169185, + "grad_norm": 0.19162502884864807, + "learning_rate": 6.698773800243305e-05, + "loss": 0.5647, + "step": 5215 + }, + { + "epoch": 1.0722581971425635, + "grad_norm": 0.18810777366161346, + "learning_rate": 6.697902285015056e-05, + "loss": 0.5608, + "step": 5216 + }, + { + "epoch": 1.0724637681159421, + "grad_norm": 0.16454185545444489, + "learning_rate": 6.697030661506433e-05, + "loss": 0.5383, + "step": 5217 + }, + { + "epoch": 1.0726693390893205, + "grad_norm": 0.15946544706821442, + "learning_rate": 6.69615892976038e-05, + "loss": 0.551, + "step": 5218 + }, + { + "epoch": 1.072874910062699, + "grad_norm": 0.20395736396312714, + "learning_rate": 6.695287089819838e-05, + "loss": 0.5643, + "step": 5219 + }, + { + "epoch": 1.0730804810360777, + "grad_norm": 0.19649049639701843, + "learning_rate": 6.694415141727766e-05, + "loss": 0.5594, + "step": 5220 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 0.18935894966125488, + "learning_rate": 6.693543085527115e-05, + "loss": 0.5627, + "step": 5221 + }, + { + "epoch": 1.0734916229828348, + "grad_norm": 0.21237939596176147, + "learning_rate": 6.69267092126085e-05, + "loss": 0.5703, + "step": 5222 + }, + { + "epoch": 1.0736971939562134, + "grad_norm": 0.19148610532283783, + "learning_rate": 6.691798648971935e-05, + "loss": 0.5535, + "step": 5223 + }, + { + "epoch": 1.073902764929592, + "grad_norm": 2.203571319580078, + "learning_rate": 6.690926268703345e-05, + "loss": 0.6328, + "step": 5224 + }, + { + "epoch": 1.0741083359029706, + "grad_norm": 0.20368382334709167, + "learning_rate": 6.69005378049806e-05, + "loss": 0.559, + "step": 5225 + }, + { + "epoch": 1.074313906876349, + "grad_norm": 0.22243089973926544, + "learning_rate": 6.68918118439906e-05, + "loss": 0.5621, + "step": 5226 + }, + { + "epoch": 1.0745194778497276, + "grad_norm": 0.25355663895606995, + "learning_rate": 6.688308480449335e-05, + "loss": 0.5876, + "step": 5227 + }, + { + "epoch": 1.0747250488231062, + "grad_norm": 0.2832355201244354, + "learning_rate": 6.68743566869188e-05, + "loss": 0.5542, + "step": 5228 + }, + { + "epoch": 1.0749306197964847, + "grad_norm": 0.1938430219888687, + "learning_rate": 6.686562749169694e-05, + "loss": 0.5282, + "step": 5229 + }, + { + "epoch": 1.0751361907698633, + "grad_norm": 0.263157457113266, + "learning_rate": 6.685689721925782e-05, + "loss": 0.5783, + "step": 5230 + }, + { + "epoch": 1.075341761743242, + "grad_norm": 0.2489389330148697, + "learning_rate": 6.684816587003152e-05, + "loss": 0.5584, + "step": 5231 + }, + { + "epoch": 1.0755473327166205, + "grad_norm": 0.18948674201965332, + "learning_rate": 6.683943344444821e-05, + "loss": 0.5549, + "step": 5232 + }, + { + "epoch": 1.0757529036899989, + "grad_norm": 0.18527735769748688, + "learning_rate": 6.683069994293808e-05, + "loss": 0.5533, + "step": 5233 + }, + { + "epoch": 1.0759584746633775, + "grad_norm": 0.23963753879070282, + "learning_rate": 6.682196536593142e-05, + "loss": 0.5675, + "step": 5234 + }, + { + "epoch": 1.076164045636756, + "grad_norm": 0.23618869483470917, + "learning_rate": 6.681322971385852e-05, + "loss": 0.581, + "step": 5235 + }, + { + "epoch": 1.0763696166101346, + "grad_norm": 0.18081532418727875, + "learning_rate": 6.680449298714974e-05, + "loss": 0.5488, + "step": 5236 + }, + { + "epoch": 1.0765751875835132, + "grad_norm": 0.17601439356803894, + "learning_rate": 6.679575518623549e-05, + "loss": 0.5718, + "step": 5237 + }, + { + "epoch": 1.0767807585568918, + "grad_norm": 0.22532643377780914, + "learning_rate": 6.678701631154627e-05, + "loss": 0.5777, + "step": 5238 + }, + { + "epoch": 1.0769863295302704, + "grad_norm": 0.21322833001613617, + "learning_rate": 6.677827636351259e-05, + "loss": 0.5803, + "step": 5239 + }, + { + "epoch": 1.077191900503649, + "grad_norm": 0.19407659769058228, + "learning_rate": 6.676953534256501e-05, + "loss": 0.525, + "step": 5240 + }, + { + "epoch": 1.0773974714770274, + "grad_norm": 0.32453683018684387, + "learning_rate": 6.676079324913419e-05, + "loss": 0.5812, + "step": 5241 + }, + { + "epoch": 1.077603042450406, + "grad_norm": 0.2009628862142563, + "learning_rate": 6.675205008365081e-05, + "loss": 0.557, + "step": 5242 + }, + { + "epoch": 1.0778086134237845, + "grad_norm": 0.17055638134479523, + "learning_rate": 6.674330584654557e-05, + "loss": 0.5503, + "step": 5243 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 0.21184168756008148, + "learning_rate": 6.673456053824928e-05, + "loss": 0.5644, + "step": 5244 + }, + { + "epoch": 1.0782197553705417, + "grad_norm": 0.20383425056934357, + "learning_rate": 6.672581415919279e-05, + "loss": 0.586, + "step": 5245 + }, + { + "epoch": 1.0784253263439203, + "grad_norm": 0.20638933777809143, + "learning_rate": 6.671706670980697e-05, + "loss": 0.5577, + "step": 5246 + }, + { + "epoch": 1.0786308973172989, + "grad_norm": 0.2084139883518219, + "learning_rate": 6.670831819052278e-05, + "loss": 0.5818, + "step": 5247 + }, + { + "epoch": 1.0788364682906773, + "grad_norm": 0.1990043967962265, + "learning_rate": 6.669956860177122e-05, + "loss": 0.5694, + "step": 5248 + }, + { + "epoch": 1.0790420392640558, + "grad_norm": 0.22096286714076996, + "learning_rate": 6.669081794398334e-05, + "loss": 0.5815, + "step": 5249 + }, + { + "epoch": 1.0792476102374344, + "grad_norm": 0.19977039098739624, + "learning_rate": 6.668206621759023e-05, + "loss": 0.552, + "step": 5250 + }, + { + "epoch": 1.079453181210813, + "grad_norm": 0.16734324395656586, + "learning_rate": 6.667331342302308e-05, + "loss": 0.5378, + "step": 5251 + }, + { + "epoch": 1.0796587521841916, + "grad_norm": 0.1833125352859497, + "learning_rate": 6.666455956071307e-05, + "loss": 0.5605, + "step": 5252 + }, + { + "epoch": 1.0798643231575702, + "grad_norm": 0.2064771205186844, + "learning_rate": 6.665580463109147e-05, + "loss": 0.5548, + "step": 5253 + }, + { + "epoch": 1.0800698941309488, + "grad_norm": 0.19935967028141022, + "learning_rate": 6.664704863458959e-05, + "loss": 0.5673, + "step": 5254 + }, + { + "epoch": 1.0802754651043274, + "grad_norm": 0.20248223841190338, + "learning_rate": 6.66382915716388e-05, + "loss": 0.5799, + "step": 5255 + }, + { + "epoch": 1.0804810360777057, + "grad_norm": 0.19460590183734894, + "learning_rate": 6.662953344267054e-05, + "loss": 0.5549, + "step": 5256 + }, + { + "epoch": 1.0806866070510843, + "grad_norm": 0.19697195291519165, + "learning_rate": 6.662077424811624e-05, + "loss": 0.5809, + "step": 5257 + }, + { + "epoch": 1.080892178024463, + "grad_norm": 0.19642481207847595, + "learning_rate": 6.661201398840747e-05, + "loss": 0.5708, + "step": 5258 + }, + { + "epoch": 1.0810977489978415, + "grad_norm": 0.1706124097108841, + "learning_rate": 6.660325266397576e-05, + "loss": 0.5569, + "step": 5259 + }, + { + "epoch": 1.08130331997122, + "grad_norm": 0.16144689917564392, + "learning_rate": 6.659449027525279e-05, + "loss": 0.5646, + "step": 5260 + }, + { + "epoch": 1.0815088909445987, + "grad_norm": 0.17022046446800232, + "learning_rate": 6.658572682267019e-05, + "loss": 0.5469, + "step": 5261 + }, + { + "epoch": 1.0817144619179773, + "grad_norm": 0.16290414333343506, + "learning_rate": 6.657696230665974e-05, + "loss": 0.5779, + "step": 5262 + }, + { + "epoch": 1.0819200328913556, + "grad_norm": 0.20431680977344513, + "learning_rate": 6.656819672765321e-05, + "loss": 0.5886, + "step": 5263 + }, + { + "epoch": 1.0821256038647342, + "grad_norm": 0.19603441655635834, + "learning_rate": 6.655943008608243e-05, + "loss": 0.5559, + "step": 5264 + }, + { + "epoch": 1.0823311748381128, + "grad_norm": 0.19341200590133667, + "learning_rate": 6.65506623823793e-05, + "loss": 0.5685, + "step": 5265 + }, + { + "epoch": 1.0825367458114914, + "grad_norm": 0.17218126356601715, + "learning_rate": 6.654189361697576e-05, + "loss": 0.5443, + "step": 5266 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 0.13615413010120392, + "learning_rate": 6.653312379030381e-05, + "loss": 0.5515, + "step": 5267 + }, + { + "epoch": 1.0829478877582486, + "grad_norm": 0.1653267741203308, + "learning_rate": 6.652435290279549e-05, + "loss": 0.5599, + "step": 5268 + }, + { + "epoch": 1.0831534587316272, + "grad_norm": 0.1650351732969284, + "learning_rate": 6.651558095488292e-05, + "loss": 0.5362, + "step": 5269 + }, + { + "epoch": 1.0833590297050057, + "grad_norm": 0.16134947538375854, + "learning_rate": 6.650680794699823e-05, + "loss": 0.559, + "step": 5270 + }, + { + "epoch": 1.0835646006783841, + "grad_norm": 0.1984068751335144, + "learning_rate": 6.649803387957362e-05, + "loss": 0.5631, + "step": 5271 + }, + { + "epoch": 1.0837701716517627, + "grad_norm": 0.1926686316728592, + "learning_rate": 6.648925875304139e-05, + "loss": 0.5864, + "step": 5272 + }, + { + "epoch": 1.0839757426251413, + "grad_norm": 0.1906096339225769, + "learning_rate": 6.648048256783382e-05, + "loss": 0.557, + "step": 5273 + }, + { + "epoch": 1.0841813135985199, + "grad_norm": 0.1856287568807602, + "learning_rate": 6.647170532438327e-05, + "loss": 0.5717, + "step": 5274 + }, + { + "epoch": 1.0843868845718985, + "grad_norm": 0.17093409597873688, + "learning_rate": 6.646292702312214e-05, + "loss": 0.5314, + "step": 5275 + }, + { + "epoch": 1.084592455545277, + "grad_norm": 0.17193061113357544, + "learning_rate": 6.645414766448293e-05, + "loss": 0.5795, + "step": 5276 + }, + { + "epoch": 1.0847980265186556, + "grad_norm": 0.1909974366426468, + "learning_rate": 6.644536724889814e-05, + "loss": 0.5638, + "step": 5277 + }, + { + "epoch": 1.085003597492034, + "grad_norm": 0.16503417491912842, + "learning_rate": 6.643658577680033e-05, + "loss": 0.5364, + "step": 5278 + }, + { + "epoch": 1.0852091684654126, + "grad_norm": 0.12726576626300812, + "learning_rate": 6.642780324862215e-05, + "loss": 0.5181, + "step": 5279 + }, + { + "epoch": 1.0854147394387912, + "grad_norm": 0.15936200320720673, + "learning_rate": 6.641901966479623e-05, + "loss": 0.5601, + "step": 5280 + }, + { + "epoch": 1.0856203104121698, + "grad_norm": 0.1608133465051651, + "learning_rate": 6.641023502575535e-05, + "loss": 0.5125, + "step": 5281 + }, + { + "epoch": 1.0858258813855484, + "grad_norm": 0.15383280813694, + "learning_rate": 6.640144933193223e-05, + "loss": 0.5724, + "step": 5282 + }, + { + "epoch": 1.086031452358927, + "grad_norm": 0.15517185628414154, + "learning_rate": 6.639266258375977e-05, + "loss": 0.5275, + "step": 5283 + }, + { + "epoch": 1.0862370233323055, + "grad_norm": 0.16167797148227692, + "learning_rate": 6.63838747816708e-05, + "loss": 0.5644, + "step": 5284 + }, + { + "epoch": 1.0864425943056841, + "grad_norm": 0.1940879076719284, + "learning_rate": 6.637508592609827e-05, + "loss": 0.5677, + "step": 5285 + }, + { + "epoch": 1.0866481652790625, + "grad_norm": 0.18758495151996613, + "learning_rate": 6.636629601747515e-05, + "loss": 0.5874, + "step": 5286 + }, + { + "epoch": 1.086853736252441, + "grad_norm": 0.1906895488500595, + "learning_rate": 6.635750505623451e-05, + "loss": 0.5747, + "step": 5287 + }, + { + "epoch": 1.0870593072258197, + "grad_norm": 0.18820390105247498, + "learning_rate": 6.63487130428094e-05, + "loss": 0.5581, + "step": 5288 + }, + { + "epoch": 1.0872648781991983, + "grad_norm": 0.19802720844745636, + "learning_rate": 6.633991997763299e-05, + "loss": 0.5604, + "step": 5289 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 0.16742005944252014, + "learning_rate": 6.633112586113847e-05, + "loss": 0.5223, + "step": 5290 + }, + { + "epoch": 1.0876760201459554, + "grad_norm": 0.14373008906841278, + "learning_rate": 6.632233069375907e-05, + "loss": 0.5368, + "step": 5291 + }, + { + "epoch": 1.087881591119334, + "grad_norm": 0.1611548662185669, + "learning_rate": 6.63135344759281e-05, + "loss": 0.5546, + "step": 5292 + }, + { + "epoch": 1.0880871620927126, + "grad_norm": 0.19752389192581177, + "learning_rate": 6.630473720807892e-05, + "loss": 0.5597, + "step": 5293 + }, + { + "epoch": 1.088292733066091, + "grad_norm": 0.19738554954528809, + "learning_rate": 6.62959388906449e-05, + "loss": 0.5787, + "step": 5294 + }, + { + "epoch": 1.0884983040394696, + "grad_norm": 0.1929868459701538, + "learning_rate": 6.628713952405951e-05, + "loss": 0.5614, + "step": 5295 + }, + { + "epoch": 1.0887038750128482, + "grad_norm": 0.2048940360546112, + "learning_rate": 6.627833910875626e-05, + "loss": 0.5715, + "step": 5296 + }, + { + "epoch": 1.0889094459862267, + "grad_norm": 0.19857628643512726, + "learning_rate": 6.62695376451687e-05, + "loss": 0.5694, + "step": 5297 + }, + { + "epoch": 1.0891150169596053, + "grad_norm": 0.19346579909324646, + "learning_rate": 6.626073513373043e-05, + "loss": 0.5612, + "step": 5298 + }, + { + "epoch": 1.089320587932984, + "grad_norm": 0.1919691264629364, + "learning_rate": 6.62519315748751e-05, + "loss": 0.5584, + "step": 5299 + }, + { + "epoch": 1.0895261589063625, + "grad_norm": 0.1884642243385315, + "learning_rate": 6.624312696903644e-05, + "loss": 0.5576, + "step": 5300 + }, + { + "epoch": 1.089731729879741, + "grad_norm": 0.1730055809020996, + "learning_rate": 6.623432131664822e-05, + "loss": 0.5565, + "step": 5301 + }, + { + "epoch": 1.0899373008531195, + "grad_norm": 0.19262228906154633, + "learning_rate": 6.62255146181442e-05, + "loss": 0.5645, + "step": 5302 + }, + { + "epoch": 1.090142871826498, + "grad_norm": 0.19675207138061523, + "learning_rate": 6.62167068739583e-05, + "loss": 0.5845, + "step": 5303 + }, + { + "epoch": 1.0903484427998766, + "grad_norm": 0.1958772838115692, + "learning_rate": 6.620789808452443e-05, + "loss": 0.5653, + "step": 5304 + }, + { + "epoch": 1.0905540137732552, + "grad_norm": 0.18935401737689972, + "learning_rate": 6.619908825027655e-05, + "loss": 0.5523, + "step": 5305 + }, + { + "epoch": 1.0907595847466338, + "grad_norm": 0.19371245801448822, + "learning_rate": 6.619027737164865e-05, + "loss": 0.551, + "step": 5306 + }, + { + "epoch": 1.0909651557200124, + "grad_norm": 0.19392549991607666, + "learning_rate": 6.618146544907485e-05, + "loss": 0.5731, + "step": 5307 + }, + { + "epoch": 1.091170726693391, + "grad_norm": 0.19857439398765564, + "learning_rate": 6.617265248298926e-05, + "loss": 0.5364, + "step": 5308 + }, + { + "epoch": 1.0913762976667694, + "grad_norm": 0.2129819244146347, + "learning_rate": 6.616383847382601e-05, + "loss": 0.5635, + "step": 5309 + }, + { + "epoch": 1.091581868640148, + "grad_norm": 0.18669261038303375, + "learning_rate": 6.615502342201938e-05, + "loss": 0.5533, + "step": 5310 + }, + { + "epoch": 1.0917874396135265, + "grad_norm": 0.20277494192123413, + "learning_rate": 6.614620732800363e-05, + "loss": 0.5874, + "step": 5311 + }, + { + "epoch": 1.0919930105869051, + "grad_norm": 0.19775375723838806, + "learning_rate": 6.613739019221306e-05, + "loss": 0.5737, + "step": 5312 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 0.19743028283119202, + "learning_rate": 6.612857201508208e-05, + "loss": 0.5853, + "step": 5313 + }, + { + "epoch": 1.0924041525336623, + "grad_norm": 0.18763835728168488, + "learning_rate": 6.611975279704511e-05, + "loss": 0.5728, + "step": 5314 + }, + { + "epoch": 1.0926097235070409, + "grad_norm": 0.19164253771305084, + "learning_rate": 6.611093253853664e-05, + "loss": 0.5734, + "step": 5315 + }, + { + "epoch": 1.0928152944804195, + "grad_norm": 0.19013293087482452, + "learning_rate": 6.610211123999119e-05, + "loss": 0.5647, + "step": 5316 + }, + { + "epoch": 1.0930208654537978, + "grad_norm": 0.19846196472644806, + "learning_rate": 6.609328890184334e-05, + "loss": 0.5613, + "step": 5317 + }, + { + "epoch": 1.0932264364271764, + "grad_norm": 0.18824782967567444, + "learning_rate": 6.608446552452777e-05, + "loss": 0.5496, + "step": 5318 + }, + { + "epoch": 1.093432007400555, + "grad_norm": 0.19030706584453583, + "learning_rate": 6.60756411084791e-05, + "loss": 0.5488, + "step": 5319 + }, + { + "epoch": 1.0936375783739336, + "grad_norm": 0.16634370386600494, + "learning_rate": 6.606681565413211e-05, + "loss": 0.5618, + "step": 5320 + }, + { + "epoch": 1.0938431493473122, + "grad_norm": 0.20281003415584564, + "learning_rate": 6.605798916192157e-05, + "loss": 0.5718, + "step": 5321 + }, + { + "epoch": 1.0940487203206908, + "grad_norm": 0.16052670776844025, + "learning_rate": 6.604916163228235e-05, + "loss": 0.5356, + "step": 5322 + }, + { + "epoch": 1.0942542912940694, + "grad_norm": 0.19108809530735016, + "learning_rate": 6.60403330656493e-05, + "loss": 0.5525, + "step": 5323 + }, + { + "epoch": 1.0944598622674477, + "grad_norm": 0.20535770058631897, + "learning_rate": 6.603150346245738e-05, + "loss": 0.5542, + "step": 5324 + }, + { + "epoch": 1.0946654332408263, + "grad_norm": 0.20727907121181488, + "learning_rate": 6.60226728231416e-05, + "loss": 0.583, + "step": 5325 + }, + { + "epoch": 1.094871004214205, + "grad_norm": 0.19222858548164368, + "learning_rate": 6.601384114813699e-05, + "loss": 0.5626, + "step": 5326 + }, + { + "epoch": 1.0950765751875835, + "grad_norm": 0.19487687945365906, + "learning_rate": 6.600500843787864e-05, + "loss": 0.5649, + "step": 5327 + }, + { + "epoch": 1.095282146160962, + "grad_norm": 0.1915174126625061, + "learning_rate": 6.599617469280171e-05, + "loss": 0.5561, + "step": 5328 + }, + { + "epoch": 1.0954877171343407, + "grad_norm": 0.17732886970043182, + "learning_rate": 6.598733991334137e-05, + "loss": 0.5102, + "step": 5329 + }, + { + "epoch": 1.0956932881077193, + "grad_norm": 0.17406459152698517, + "learning_rate": 6.59785040999329e-05, + "loss": 0.5818, + "step": 5330 + }, + { + "epoch": 1.0958988590810979, + "grad_norm": 0.19681067764759064, + "learning_rate": 6.596966725301158e-05, + "loss": 0.5573, + "step": 5331 + }, + { + "epoch": 1.0961044300544762, + "grad_norm": 0.2060333490371704, + "learning_rate": 6.596082937301277e-05, + "loss": 0.5757, + "step": 5332 + }, + { + "epoch": 1.0963100010278548, + "grad_norm": 0.1740088164806366, + "learning_rate": 6.595199046037187e-05, + "loss": 0.5204, + "step": 5333 + }, + { + "epoch": 1.0965155720012334, + "grad_norm": 0.15425589680671692, + "learning_rate": 6.594315051552434e-05, + "loss": 0.549, + "step": 5334 + }, + { + "epoch": 1.096721142974612, + "grad_norm": 0.20004071295261383, + "learning_rate": 6.593430953890564e-05, + "loss": 0.5745, + "step": 5335 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 0.21765153110027313, + "learning_rate": 6.592546753095138e-05, + "loss": 0.5779, + "step": 5336 + }, + { + "epoch": 1.0971322849213692, + "grad_norm": 0.2069845348596573, + "learning_rate": 6.591662449209714e-05, + "loss": 0.5681, + "step": 5337 + }, + { + "epoch": 1.0973378558947477, + "grad_norm": 0.16083793342113495, + "learning_rate": 6.590778042277856e-05, + "loss": 0.522, + "step": 5338 + }, + { + "epoch": 1.0975434268681261, + "grad_norm": 0.13454684615135193, + "learning_rate": 6.589893532343137e-05, + "loss": 0.5234, + "step": 5339 + }, + { + "epoch": 1.0977489978415047, + "grad_norm": 0.16017797589302063, + "learning_rate": 6.589008919449132e-05, + "loss": 0.5456, + "step": 5340 + }, + { + "epoch": 1.0979545688148833, + "grad_norm": 0.19706310331821442, + "learning_rate": 6.588124203639421e-05, + "loss": 0.5598, + "step": 5341 + }, + { + "epoch": 1.0981601397882619, + "grad_norm": 0.19830232858657837, + "learning_rate": 6.587239384957593e-05, + "loss": 0.5587, + "step": 5342 + }, + { + "epoch": 1.0983657107616405, + "grad_norm": 0.19337981939315796, + "learning_rate": 6.586354463447233e-05, + "loss": 0.5635, + "step": 5343 + }, + { + "epoch": 1.098571281735019, + "grad_norm": 0.19587767124176025, + "learning_rate": 6.585469439151942e-05, + "loss": 0.5626, + "step": 5344 + }, + { + "epoch": 1.0987768527083976, + "grad_norm": 0.20316268503665924, + "learning_rate": 6.584584312115318e-05, + "loss": 0.5743, + "step": 5345 + }, + { + "epoch": 1.0989824236817762, + "grad_norm": 0.19595171511173248, + "learning_rate": 6.583699082380969e-05, + "loss": 0.5579, + "step": 5346 + }, + { + "epoch": 1.0991879946551546, + "grad_norm": 0.18075229227542877, + "learning_rate": 6.582813749992504e-05, + "loss": 0.5336, + "step": 5347 + }, + { + "epoch": 1.0993935656285332, + "grad_norm": 0.1714819073677063, + "learning_rate": 6.581928314993542e-05, + "loss": 0.5727, + "step": 5348 + }, + { + "epoch": 1.0995991366019118, + "grad_norm": 0.2072882503271103, + "learning_rate": 6.581042777427703e-05, + "loss": 0.5859, + "step": 5349 + }, + { + "epoch": 1.0998047075752904, + "grad_norm": 0.19539666175842285, + "learning_rate": 6.580157137338613e-05, + "loss": 0.5764, + "step": 5350 + }, + { + "epoch": 1.100010278548669, + "grad_norm": 0.20228314399719238, + "learning_rate": 6.579271394769901e-05, + "loss": 0.5831, + "step": 5351 + }, + { + "epoch": 1.1002158495220475, + "grad_norm": 0.16698522865772247, + "learning_rate": 6.578385549765209e-05, + "loss": 0.5292, + "step": 5352 + }, + { + "epoch": 1.1004214204954261, + "grad_norm": 0.15886962413787842, + "learning_rate": 6.577499602368176e-05, + "loss": 0.5586, + "step": 5353 + }, + { + "epoch": 1.1006269914688045, + "grad_norm": 0.16973358392715454, + "learning_rate": 6.576613552622443e-05, + "loss": 0.5373, + "step": 5354 + }, + { + "epoch": 1.100832562442183, + "grad_norm": 0.16206781566143036, + "learning_rate": 6.575727400571672e-05, + "loss": 0.5407, + "step": 5355 + }, + { + "epoch": 1.1010381334155617, + "grad_norm": 0.19393891096115112, + "learning_rate": 6.57484114625951e-05, + "loss": 0.5683, + "step": 5356 + }, + { + "epoch": 1.1012437043889403, + "grad_norm": 0.19983628392219543, + "learning_rate": 6.573954789729625e-05, + "loss": 0.5771, + "step": 5357 + }, + { + "epoch": 1.1014492753623188, + "grad_norm": 0.19126926362514496, + "learning_rate": 6.573068331025679e-05, + "loss": 0.559, + "step": 5358 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 0.19822482764720917, + "learning_rate": 6.572181770191347e-05, + "loss": 0.5587, + "step": 5359 + }, + { + "epoch": 1.101860417309076, + "grad_norm": 0.19558537006378174, + "learning_rate": 6.571295107270304e-05, + "loss": 0.562, + "step": 5360 + }, + { + "epoch": 1.1020659882824546, + "grad_norm": 0.1923174262046814, + "learning_rate": 6.570408342306233e-05, + "loss": 0.545, + "step": 5361 + }, + { + "epoch": 1.102271559255833, + "grad_norm": 0.19644415378570557, + "learning_rate": 6.569521475342819e-05, + "loss": 0.5765, + "step": 5362 + }, + { + "epoch": 1.1024771302292116, + "grad_norm": 0.16399532556533813, + "learning_rate": 6.568634506423757e-05, + "loss": 0.5231, + "step": 5363 + }, + { + "epoch": 1.1026827012025902, + "grad_norm": 0.13997915387153625, + "learning_rate": 6.567747435592738e-05, + "loss": 0.5166, + "step": 5364 + }, + { + "epoch": 1.1028882721759687, + "grad_norm": 0.16803216934204102, + "learning_rate": 6.56686026289347e-05, + "loss": 0.5639, + "step": 5365 + }, + { + "epoch": 1.1030938431493473, + "grad_norm": 0.16553597152233124, + "learning_rate": 6.565972988369658e-05, + "loss": 0.5339, + "step": 5366 + }, + { + "epoch": 1.103299414122726, + "grad_norm": 0.16284188628196716, + "learning_rate": 6.565085612065012e-05, + "loss": 0.5559, + "step": 5367 + }, + { + "epoch": 1.1035049850961045, + "grad_norm": 0.20028123259544373, + "learning_rate": 6.56419813402325e-05, + "loss": 0.5803, + "step": 5368 + }, + { + "epoch": 1.1037105560694829, + "grad_norm": 0.1928570717573166, + "learning_rate": 6.563310554288094e-05, + "loss": 0.5508, + "step": 5369 + }, + { + "epoch": 1.1039161270428615, + "grad_norm": 0.1684267520904541, + "learning_rate": 6.562422872903271e-05, + "loss": 0.5431, + "step": 5370 + }, + { + "epoch": 1.10412169801624, + "grad_norm": 0.13885952532291412, + "learning_rate": 6.561535089912512e-05, + "loss": 0.5324, + "step": 5371 + }, + { + "epoch": 1.1043272689896186, + "grad_norm": 0.4177161455154419, + "learning_rate": 6.560647205359556e-05, + "loss": 0.5474, + "step": 5372 + }, + { + "epoch": 1.1045328399629972, + "grad_norm": 0.201374813914299, + "learning_rate": 6.559759219288145e-05, + "loss": 0.5477, + "step": 5373 + }, + { + "epoch": 1.1047384109363758, + "grad_norm": 0.19698698818683624, + "learning_rate": 6.558871131742022e-05, + "loss": 0.5579, + "step": 5374 + }, + { + "epoch": 1.1049439819097544, + "grad_norm": 0.19668418169021606, + "learning_rate": 6.557982942764941e-05, + "loss": 0.592, + "step": 5375 + }, + { + "epoch": 1.105149552883133, + "grad_norm": 0.18167072534561157, + "learning_rate": 6.557094652400662e-05, + "loss": 0.5506, + "step": 5376 + }, + { + "epoch": 1.1053551238565114, + "grad_norm": 0.15802860260009766, + "learning_rate": 6.556206260692943e-05, + "loss": 0.5303, + "step": 5377 + }, + { + "epoch": 1.10556069482989, + "grad_norm": 0.1602732092142105, + "learning_rate": 6.55531776768555e-05, + "loss": 0.5617, + "step": 5378 + }, + { + "epoch": 1.1057662658032685, + "grad_norm": 0.20059829950332642, + "learning_rate": 6.55442917342226e-05, + "loss": 0.5598, + "step": 5379 + }, + { + "epoch": 1.1059718367766471, + "grad_norm": 0.19668720662593842, + "learning_rate": 6.553540477946846e-05, + "loss": 0.5747, + "step": 5380 + }, + { + "epoch": 1.1061774077500257, + "grad_norm": 0.19892635941505432, + "learning_rate": 6.552651681303091e-05, + "loss": 0.5767, + "step": 5381 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.2011987864971161, + "learning_rate": 6.551762783534783e-05, + "loss": 0.5782, + "step": 5382 + }, + { + "epoch": 1.1065885496967829, + "grad_norm": 0.18638330698013306, + "learning_rate": 6.550873784685711e-05, + "loss": 0.5516, + "step": 5383 + }, + { + "epoch": 1.1067941206701615, + "grad_norm": 0.1961633563041687, + "learning_rate": 6.549984684799675e-05, + "loss": 0.5462, + "step": 5384 + }, + { + "epoch": 1.1069996916435398, + "grad_norm": 0.18275189399719238, + "learning_rate": 6.549095483920473e-05, + "loss": 0.5296, + "step": 5385 + }, + { + "epoch": 1.1072052626169184, + "grad_norm": 0.16657038033008575, + "learning_rate": 6.548206182091915e-05, + "loss": 0.5184, + "step": 5386 + }, + { + "epoch": 1.107410833590297, + "grad_norm": 0.16570012271404266, + "learning_rate": 6.547316779357812e-05, + "loss": 0.5533, + "step": 5387 + }, + { + "epoch": 1.1076164045636756, + "grad_norm": 0.21582432091236115, + "learning_rate": 6.546427275761979e-05, + "loss": 0.5526, + "step": 5388 + }, + { + "epoch": 1.1078219755370542, + "grad_norm": 0.19760467112064362, + "learning_rate": 6.54553767134824e-05, + "loss": 0.5558, + "step": 5389 + }, + { + "epoch": 1.1080275465104328, + "grad_norm": 0.19710463285446167, + "learning_rate": 6.544647966160421e-05, + "loss": 0.5413, + "step": 5390 + }, + { + "epoch": 1.1082331174838114, + "grad_norm": 0.195608451962471, + "learning_rate": 6.543758160242353e-05, + "loss": 0.581, + "step": 5391 + }, + { + "epoch": 1.10843868845719, + "grad_norm": 0.1914118230342865, + "learning_rate": 6.542868253637873e-05, + "loss": 0.5282, + "step": 5392 + }, + { + "epoch": 1.1086442594305683, + "grad_norm": 0.16971172392368317, + "learning_rate": 6.541978246390823e-05, + "loss": 0.5427, + "step": 5393 + }, + { + "epoch": 1.108849830403947, + "grad_norm": 0.19938012957572937, + "learning_rate": 6.541088138545049e-05, + "loss": 0.5378, + "step": 5394 + }, + { + "epoch": 1.1090554013773255, + "grad_norm": 0.2031160593032837, + "learning_rate": 6.540197930144403e-05, + "loss": 0.5679, + "step": 5395 + }, + { + "epoch": 1.109260972350704, + "grad_norm": 0.19984202086925507, + "learning_rate": 6.53930762123274e-05, + "loss": 0.5565, + "step": 5396 + }, + { + "epoch": 1.1094665433240827, + "grad_norm": 0.17272289097309113, + "learning_rate": 6.538417211853923e-05, + "loss": 0.5411, + "step": 5397 + }, + { + "epoch": 1.1096721142974613, + "grad_norm": 0.17256368696689606, + "learning_rate": 6.537526702051815e-05, + "loss": 0.5649, + "step": 5398 + }, + { + "epoch": 1.1098776852708399, + "grad_norm": 0.1994207799434662, + "learning_rate": 6.536636091870292e-05, + "loss": 0.5794, + "step": 5399 + }, + { + "epoch": 1.1100832562442182, + "grad_norm": 0.18973985314369202, + "learning_rate": 6.535745381353226e-05, + "loss": 0.5726, + "step": 5400 + }, + { + "epoch": 1.1102888272175968, + "grad_norm": 0.18999481201171875, + "learning_rate": 6.534854570544502e-05, + "loss": 0.582, + "step": 5401 + }, + { + "epoch": 1.1104943981909754, + "grad_norm": 0.19933466613292694, + "learning_rate": 6.533963659488005e-05, + "loss": 0.5795, + "step": 5402 + }, + { + "epoch": 1.110699969164354, + "grad_norm": 0.16730111837387085, + "learning_rate": 6.533072648227623e-05, + "loss": 0.5339, + "step": 5403 + }, + { + "epoch": 1.1109055401377326, + "grad_norm": 0.13518232107162476, + "learning_rate": 6.532181536807256e-05, + "loss": 0.5358, + "step": 5404 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.13433937728405, + "learning_rate": 6.531290325270802e-05, + "loss": 0.5316, + "step": 5405 + }, + { + "epoch": 1.1113166820844897, + "grad_norm": 0.16764889657497406, + "learning_rate": 6.530399013662168e-05, + "loss": 0.5494, + "step": 5406 + }, + { + "epoch": 1.1115222530578683, + "grad_norm": 0.20979219675064087, + "learning_rate": 6.529507602025265e-05, + "loss": 0.5727, + "step": 5407 + }, + { + "epoch": 1.1117278240312467, + "grad_norm": 0.196084126830101, + "learning_rate": 6.528616090404008e-05, + "loss": 0.5542, + "step": 5408 + }, + { + "epoch": 1.1119333950046253, + "grad_norm": 0.19469597935676575, + "learning_rate": 6.527724478842318e-05, + "loss": 0.5717, + "step": 5409 + }, + { + "epoch": 1.1121389659780039, + "grad_norm": 0.19987237453460693, + "learning_rate": 6.526832767384121e-05, + "loss": 0.5782, + "step": 5410 + }, + { + "epoch": 1.1123445369513825, + "grad_norm": 0.20070701837539673, + "learning_rate": 6.525940956073347e-05, + "loss": 0.5578, + "step": 5411 + }, + { + "epoch": 1.112550107924761, + "grad_norm": 0.20500093698501587, + "learning_rate": 6.52504904495393e-05, + "loss": 0.566, + "step": 5412 + }, + { + "epoch": 1.1127556788981396, + "grad_norm": 0.17960810661315918, + "learning_rate": 6.524157034069813e-05, + "loss": 0.5331, + "step": 5413 + }, + { + "epoch": 1.1129612498715182, + "grad_norm": 0.17575471103191376, + "learning_rate": 6.523264923464939e-05, + "loss": 0.5575, + "step": 5414 + }, + { + "epoch": 1.1131668208448966, + "grad_norm": 0.2222844958305359, + "learning_rate": 6.522372713183259e-05, + "loss": 0.5928, + "step": 5415 + }, + { + "epoch": 1.1133723918182752, + "grad_norm": 0.19698132574558258, + "learning_rate": 6.521480403268727e-05, + "loss": 0.5484, + "step": 5416 + }, + { + "epoch": 1.1135779627916538, + "grad_norm": 0.16682282090187073, + "learning_rate": 6.520587993765305e-05, + "loss": 0.5474, + "step": 5417 + }, + { + "epoch": 1.1137835337650324, + "grad_norm": 0.5412604808807373, + "learning_rate": 6.519695484716958e-05, + "loss": 0.5692, + "step": 5418 + }, + { + "epoch": 1.113989104738411, + "grad_norm": 0.1983635425567627, + "learning_rate": 6.518802876167654e-05, + "loss": 0.5231, + "step": 5419 + }, + { + "epoch": 1.1141946757117895, + "grad_norm": 0.1765107810497284, + "learning_rate": 6.517910168161367e-05, + "loss": 0.5307, + "step": 5420 + }, + { + "epoch": 1.1144002466851681, + "grad_norm": 0.17099499702453613, + "learning_rate": 6.517017360742077e-05, + "loss": 0.5787, + "step": 5421 + }, + { + "epoch": 1.1146058176585467, + "grad_norm": 0.174418643116951, + "learning_rate": 6.51612445395377e-05, + "loss": 0.524, + "step": 5422 + }, + { + "epoch": 1.114811388631925, + "grad_norm": 0.1620262712240219, + "learning_rate": 6.515231447840435e-05, + "loss": 0.5454, + "step": 5423 + }, + { + "epoch": 1.1150169596053037, + "grad_norm": 0.20404332876205444, + "learning_rate": 6.514338342446066e-05, + "loss": 0.5735, + "step": 5424 + }, + { + "epoch": 1.1152225305786823, + "grad_norm": 0.19146005809307098, + "learning_rate": 6.513445137814661e-05, + "loss": 0.5627, + "step": 5425 + }, + { + "epoch": 1.1154281015520608, + "grad_norm": 0.1799180954694748, + "learning_rate": 6.512551833990226e-05, + "loss": 0.5394, + "step": 5426 + }, + { + "epoch": 1.1156336725254394, + "grad_norm": 0.14841869473457336, + "learning_rate": 6.511658431016768e-05, + "loss": 0.5174, + "step": 5427 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 0.1631341278553009, + "learning_rate": 6.510764928938301e-05, + "loss": 0.5401, + "step": 5428 + }, + { + "epoch": 1.1160448144721966, + "grad_norm": 0.202928826212883, + "learning_rate": 6.509871327798846e-05, + "loss": 0.5576, + "step": 5429 + }, + { + "epoch": 1.116250385445575, + "grad_norm": 0.20302633941173553, + "learning_rate": 6.508977627642423e-05, + "loss": 0.5684, + "step": 5430 + }, + { + "epoch": 1.1164559564189536, + "grad_norm": 0.2004452496767044, + "learning_rate": 6.508083828513062e-05, + "loss": 0.5695, + "step": 5431 + }, + { + "epoch": 1.1166615273923322, + "grad_norm": 0.21618925034999847, + "learning_rate": 6.507189930454797e-05, + "loss": 0.5447, + "step": 5432 + }, + { + "epoch": 1.1168670983657107, + "grad_norm": 0.41041454672813416, + "learning_rate": 6.506295933511667e-05, + "loss": 0.5416, + "step": 5433 + }, + { + "epoch": 1.1170726693390893, + "grad_norm": 0.17429187893867493, + "learning_rate": 6.505401837727712e-05, + "loss": 0.5784, + "step": 5434 + }, + { + "epoch": 1.117278240312468, + "grad_norm": 0.20878252387046814, + "learning_rate": 6.504507643146983e-05, + "loss": 0.5594, + "step": 5435 + }, + { + "epoch": 1.1174838112858465, + "grad_norm": 0.21358852088451385, + "learning_rate": 6.503613349813532e-05, + "loss": 0.5902, + "step": 5436 + }, + { + "epoch": 1.117689382259225, + "grad_norm": 0.1675240397453308, + "learning_rate": 6.502718957771415e-05, + "loss": 0.5253, + "step": 5437 + }, + { + "epoch": 1.1178949532326035, + "grad_norm": 0.1364658623933792, + "learning_rate": 6.501824467064695e-05, + "loss": 0.5097, + "step": 5438 + }, + { + "epoch": 1.118100524205982, + "grad_norm": 0.170567587018013, + "learning_rate": 6.500929877737442e-05, + "loss": 0.5704, + "step": 5439 + }, + { + "epoch": 1.1183060951793606, + "grad_norm": 0.2064054310321808, + "learning_rate": 6.500035189833725e-05, + "loss": 0.5945, + "step": 5440 + }, + { + "epoch": 1.1185116661527392, + "grad_norm": 0.19481217861175537, + "learning_rate": 6.499140403397623e-05, + "loss": 0.5454, + "step": 5441 + }, + { + "epoch": 1.1187172371261178, + "grad_norm": 0.19907177984714508, + "learning_rate": 6.498245518473216e-05, + "loss": 0.5479, + "step": 5442 + }, + { + "epoch": 1.1189228080994964, + "grad_norm": 0.20047436654567719, + "learning_rate": 6.497350535104592e-05, + "loss": 0.5321, + "step": 5443 + }, + { + "epoch": 1.119128379072875, + "grad_norm": 0.19042466580867767, + "learning_rate": 6.496455453335842e-05, + "loss": 0.5252, + "step": 5444 + }, + { + "epoch": 1.1193339500462534, + "grad_norm": 0.17058107256889343, + "learning_rate": 6.495560273211066e-05, + "loss": 0.5588, + "step": 5445 + }, + { + "epoch": 1.119539521019632, + "grad_norm": 0.21633589267730713, + "learning_rate": 6.494664994774363e-05, + "loss": 0.5613, + "step": 5446 + }, + { + "epoch": 1.1197450919930105, + "grad_norm": 0.18286935985088348, + "learning_rate": 6.493769618069835e-05, + "loss": 0.5415, + "step": 5447 + }, + { + "epoch": 1.1199506629663891, + "grad_norm": 0.17194852232933044, + "learning_rate": 6.492874143141599e-05, + "loss": 0.5713, + "step": 5448 + }, + { + "epoch": 1.1201562339397677, + "grad_norm": 0.1954166442155838, + "learning_rate": 6.49197857003377e-05, + "loss": 0.5635, + "step": 5449 + }, + { + "epoch": 1.1203618049131463, + "grad_norm": 0.22501884400844574, + "learning_rate": 6.491082898790465e-05, + "loss": 0.5615, + "step": 5450 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 0.19493956863880157, + "learning_rate": 6.490187129455813e-05, + "loss": 0.5409, + "step": 5451 + }, + { + "epoch": 1.1207729468599035, + "grad_norm": 0.19634434580802917, + "learning_rate": 6.489291262073942e-05, + "loss": 0.5698, + "step": 5452 + }, + { + "epoch": 1.1209785178332818, + "grad_norm": 0.22804930806159973, + "learning_rate": 6.48839529668899e-05, + "loss": 0.5685, + "step": 5453 + }, + { + "epoch": 1.1211840888066604, + "grad_norm": 0.18841257691383362, + "learning_rate": 6.487499233345094e-05, + "loss": 0.5362, + "step": 5454 + }, + { + "epoch": 1.121389659780039, + "grad_norm": 0.16956526041030884, + "learning_rate": 6.4866030720864e-05, + "loss": 0.5396, + "step": 5455 + }, + { + "epoch": 1.1215952307534176, + "grad_norm": 0.1671314686536789, + "learning_rate": 6.48570681295706e-05, + "loss": 0.5741, + "step": 5456 + }, + { + "epoch": 1.1218008017267962, + "grad_norm": 0.16687725484371185, + "learning_rate": 6.484810456001226e-05, + "loss": 0.5651, + "step": 5457 + }, + { + "epoch": 1.1220063727001748, + "grad_norm": 0.16390031576156616, + "learning_rate": 6.483914001263058e-05, + "loss": 0.5638, + "step": 5458 + }, + { + "epoch": 1.1222119436735534, + "grad_norm": 0.16974018514156342, + "learning_rate": 6.483017448786719e-05, + "loss": 0.5198, + "step": 5459 + }, + { + "epoch": 1.1224175146469317, + "grad_norm": 0.17362362146377563, + "learning_rate": 6.48212079861638e-05, + "loss": 0.5437, + "step": 5460 + }, + { + "epoch": 1.1226230856203103, + "grad_norm": 0.19400741159915924, + "learning_rate": 6.481224050796213e-05, + "loss": 0.5481, + "step": 5461 + }, + { + "epoch": 1.122828656593689, + "grad_norm": 0.1908549964427948, + "learning_rate": 6.480327205370397e-05, + "loss": 0.5593, + "step": 5462 + }, + { + "epoch": 1.1230342275670675, + "grad_norm": 0.1656675636768341, + "learning_rate": 6.479430262383116e-05, + "loss": 0.5369, + "step": 5463 + }, + { + "epoch": 1.123239798540446, + "grad_norm": 0.16304363310337067, + "learning_rate": 6.478533221878556e-05, + "loss": 0.5697, + "step": 5464 + }, + { + "epoch": 1.1234453695138247, + "grad_norm": 0.19559811055660248, + "learning_rate": 6.477636083900914e-05, + "loss": 0.5856, + "step": 5465 + }, + { + "epoch": 1.1236509404872033, + "grad_norm": 0.17000918090343475, + "learning_rate": 6.476738848494385e-05, + "loss": 0.5545, + "step": 5466 + }, + { + "epoch": 1.1238565114605819, + "grad_norm": 0.1553100198507309, + "learning_rate": 6.475841515703172e-05, + "loss": 0.5531, + "step": 5467 + }, + { + "epoch": 1.1240620824339604, + "grad_norm": 0.1977023035287857, + "learning_rate": 6.474944085571482e-05, + "loss": 0.5735, + "step": 5468 + }, + { + "epoch": 1.1242676534073388, + "grad_norm": 0.1898386925458908, + "learning_rate": 6.47404655814353e-05, + "loss": 0.5487, + "step": 5469 + }, + { + "epoch": 1.1244732243807174, + "grad_norm": 0.18860745429992676, + "learning_rate": 6.473148933463529e-05, + "loss": 0.5634, + "step": 5470 + }, + { + "epoch": 1.124678795354096, + "grad_norm": 0.1715293824672699, + "learning_rate": 6.472251211575704e-05, + "loss": 0.546, + "step": 5471 + }, + { + "epoch": 1.1248843663274746, + "grad_norm": 0.13662804663181305, + "learning_rate": 6.471353392524277e-05, + "loss": 0.5186, + "step": 5472 + }, + { + "epoch": 1.1250899373008532, + "grad_norm": 0.16437150537967682, + "learning_rate": 6.470455476353486e-05, + "loss": 0.5628, + "step": 5473 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 0.20408563315868378, + "learning_rate": 6.469557463107562e-05, + "loss": 0.5723, + "step": 5474 + }, + { + "epoch": 1.1255010792476101, + "grad_norm": 0.19299282133579254, + "learning_rate": 6.468659352830746e-05, + "loss": 0.5923, + "step": 5475 + }, + { + "epoch": 1.1257066502209887, + "grad_norm": 0.18226416409015656, + "learning_rate": 6.467761145567286e-05, + "loss": 0.5624, + "step": 5476 + }, + { + "epoch": 1.1259122211943673, + "grad_norm": 0.18840613961219788, + "learning_rate": 6.466862841361432e-05, + "loss": 0.5697, + "step": 5477 + }, + { + "epoch": 1.1261177921677459, + "grad_norm": 0.1877157837152481, + "learning_rate": 6.465964440257438e-05, + "loss": 0.5625, + "step": 5478 + }, + { + "epoch": 1.1263233631411245, + "grad_norm": 0.25759997963905334, + "learning_rate": 6.465065942299567e-05, + "loss": 0.5425, + "step": 5479 + }, + { + "epoch": 1.126528934114503, + "grad_norm": 0.19235903024673462, + "learning_rate": 6.46416734753208e-05, + "loss": 0.5562, + "step": 5480 + }, + { + "epoch": 1.1267345050878816, + "grad_norm": 0.20213893055915833, + "learning_rate": 6.46326865599925e-05, + "loss": 0.5467, + "step": 5481 + }, + { + "epoch": 1.1269400760612602, + "grad_norm": 0.19602036476135254, + "learning_rate": 6.462369867745348e-05, + "loss": 0.5814, + "step": 5482 + }, + { + "epoch": 1.1271456470346388, + "grad_norm": 0.19586962461471558, + "learning_rate": 6.461470982814657e-05, + "loss": 0.5604, + "step": 5483 + }, + { + "epoch": 1.1273512180080172, + "grad_norm": 0.18865470588207245, + "learning_rate": 6.460572001251456e-05, + "loss": 0.5345, + "step": 5484 + }, + { + "epoch": 1.1275567889813958, + "grad_norm": 0.19333775341510773, + "learning_rate": 6.459672923100036e-05, + "loss": 0.572, + "step": 5485 + }, + { + "epoch": 1.1277623599547744, + "grad_norm": 0.1982879638671875, + "learning_rate": 6.458773748404693e-05, + "loss": 0.593, + "step": 5486 + }, + { + "epoch": 1.127967930928153, + "grad_norm": 0.19559934735298157, + "learning_rate": 6.457874477209722e-05, + "loss": 0.5625, + "step": 5487 + }, + { + "epoch": 1.1281735019015315, + "grad_norm": 0.190285325050354, + "learning_rate": 6.456975109559425e-05, + "loss": 0.5579, + "step": 5488 + }, + { + "epoch": 1.1283790728749101, + "grad_norm": 0.1935376673936844, + "learning_rate": 6.456075645498113e-05, + "loss": 0.5611, + "step": 5489 + }, + { + "epoch": 1.1285846438482887, + "grad_norm": 0.1908402442932129, + "learning_rate": 6.455176085070095e-05, + "loss": 0.5556, + "step": 5490 + }, + { + "epoch": 1.128790214821667, + "grad_norm": 0.1894407868385315, + "learning_rate": 6.45427642831969e-05, + "loss": 0.5655, + "step": 5491 + }, + { + "epoch": 1.1289957857950457, + "grad_norm": 0.16991651058197021, + "learning_rate": 6.453376675291221e-05, + "loss": 0.5269, + "step": 5492 + }, + { + "epoch": 1.1292013567684243, + "grad_norm": 0.14893554151058197, + "learning_rate": 6.452476826029012e-05, + "loss": 0.5192, + "step": 5493 + }, + { + "epoch": 1.1294069277418028, + "grad_norm": 0.15781262516975403, + "learning_rate": 6.451576880577397e-05, + "loss": 0.5827, + "step": 5494 + }, + { + "epoch": 1.1296124987151814, + "grad_norm": 0.16692712903022766, + "learning_rate": 6.45067683898071e-05, + "loss": 0.5338, + "step": 5495 + }, + { + "epoch": 1.12981806968856, + "grad_norm": 0.18039274215698242, + "learning_rate": 6.449776701283292e-05, + "loss": 0.5598, + "step": 5496 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 0.20324920117855072, + "learning_rate": 6.448876467529488e-05, + "loss": 0.5711, + "step": 5497 + }, + { + "epoch": 1.1302292116353172, + "grad_norm": 0.19356949627399445, + "learning_rate": 6.447976137763652e-05, + "loss": 0.5498, + "step": 5498 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.19591811299324036, + "learning_rate": 6.447075712030135e-05, + "loss": 0.5585, + "step": 5499 + }, + { + "epoch": 1.1306403535820742, + "grad_norm": 0.18893134593963623, + "learning_rate": 6.4461751903733e-05, + "loss": 0.5425, + "step": 5500 + }, + { + "epoch": 1.1308459245554527, + "grad_norm": 0.1979568600654602, + "learning_rate": 6.445274572837509e-05, + "loss": 0.5395, + "step": 5501 + }, + { + "epoch": 1.1310514955288313, + "grad_norm": 0.20097365975379944, + "learning_rate": 6.444373859467131e-05, + "loss": 0.5571, + "step": 5502 + }, + { + "epoch": 1.13125706650221, + "grad_norm": 0.1974884420633316, + "learning_rate": 6.443473050306541e-05, + "loss": 0.5778, + "step": 5503 + }, + { + "epoch": 1.1314626374755885, + "grad_norm": 0.19491606950759888, + "learning_rate": 6.442572145400119e-05, + "loss": 0.5408, + "step": 5504 + }, + { + "epoch": 1.131668208448967, + "grad_norm": 0.2038283497095108, + "learning_rate": 6.441671144792245e-05, + "loss": 0.5597, + "step": 5505 + }, + { + "epoch": 1.1318737794223455, + "grad_norm": 0.2011345475912094, + "learning_rate": 6.440770048527311e-05, + "loss": 0.5645, + "step": 5506 + }, + { + "epoch": 1.132079350395724, + "grad_norm": 0.20046375691890717, + "learning_rate": 6.439868856649706e-05, + "loss": 0.565, + "step": 5507 + }, + { + "epoch": 1.1322849213691026, + "grad_norm": 0.19624361395835876, + "learning_rate": 6.438967569203831e-05, + "loss": 0.5556, + "step": 5508 + }, + { + "epoch": 1.1324904923424812, + "grad_norm": 0.19601401686668396, + "learning_rate": 6.438066186234086e-05, + "loss": 0.5608, + "step": 5509 + }, + { + "epoch": 1.1326960633158598, + "grad_norm": 0.19871017336845398, + "learning_rate": 6.437164707784877e-05, + "loss": 0.5616, + "step": 5510 + }, + { + "epoch": 1.1329016342892384, + "grad_norm": 0.19127802550792694, + "learning_rate": 6.43626313390062e-05, + "loss": 0.5778, + "step": 5511 + }, + { + "epoch": 1.133107205262617, + "grad_norm": 0.19276481866836548, + "learning_rate": 6.435361464625726e-05, + "loss": 0.5488, + "step": 5512 + }, + { + "epoch": 1.1333127762359956, + "grad_norm": 0.19331035017967224, + "learning_rate": 6.434459700004619e-05, + "loss": 0.5149, + "step": 5513 + }, + { + "epoch": 1.133518347209374, + "grad_norm": 0.19106508791446686, + "learning_rate": 6.433557840081726e-05, + "loss": 0.5277, + "step": 5514 + }, + { + "epoch": 1.1337239181827525, + "grad_norm": 0.1550726294517517, + "learning_rate": 6.432655884901473e-05, + "loss": 0.5596, + "step": 5515 + }, + { + "epoch": 1.1339294891561311, + "grad_norm": 0.20075179636478424, + "learning_rate": 6.431753834508299e-05, + "loss": 0.5461, + "step": 5516 + }, + { + "epoch": 1.1341350601295097, + "grad_norm": 0.20653320848941803, + "learning_rate": 6.430851688946643e-05, + "loss": 0.6038, + "step": 5517 + }, + { + "epoch": 1.1343406311028883, + "grad_norm": 0.19482316076755524, + "learning_rate": 6.42994944826095e-05, + "loss": 0.5716, + "step": 5518 + }, + { + "epoch": 1.1345462020762669, + "grad_norm": 0.18027710914611816, + "learning_rate": 6.429047112495667e-05, + "loss": 0.5531, + "step": 5519 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 0.15849310159683228, + "learning_rate": 6.428144681695247e-05, + "loss": 0.5674, + "step": 5520 + }, + { + "epoch": 1.1349573440230238, + "grad_norm": 0.19099898636341095, + "learning_rate": 6.427242155904154e-05, + "loss": 0.5405, + "step": 5521 + }, + { + "epoch": 1.1351629149964024, + "grad_norm": 0.2118232399225235, + "learning_rate": 6.426339535166847e-05, + "loss": 0.5569, + "step": 5522 + }, + { + "epoch": 1.135368485969781, + "grad_norm": 0.19552506506443024, + "learning_rate": 6.425436819527792e-05, + "loss": 0.5575, + "step": 5523 + }, + { + "epoch": 1.1355740569431596, + "grad_norm": 0.19680903851985931, + "learning_rate": 6.424534009031468e-05, + "loss": 0.5644, + "step": 5524 + }, + { + "epoch": 1.1357796279165382, + "grad_norm": 0.19150924682617188, + "learning_rate": 6.423631103722348e-05, + "loss": 0.5453, + "step": 5525 + }, + { + "epoch": 1.1359851988899168, + "grad_norm": 0.19185394048690796, + "learning_rate": 6.422728103644915e-05, + "loss": 0.5408, + "step": 5526 + }, + { + "epoch": 1.1361907698632954, + "grad_norm": 0.19700084626674652, + "learning_rate": 6.421825008843652e-05, + "loss": 0.5664, + "step": 5527 + }, + { + "epoch": 1.136396340836674, + "grad_norm": 0.19622080028057098, + "learning_rate": 6.420921819363057e-05, + "loss": 0.5848, + "step": 5528 + }, + { + "epoch": 1.1366019118100525, + "grad_norm": 0.19052082300186157, + "learning_rate": 6.420018535247621e-05, + "loss": 0.5607, + "step": 5529 + }, + { + "epoch": 1.136807482783431, + "grad_norm": 0.18648898601531982, + "learning_rate": 6.419115156541846e-05, + "loss": 0.5627, + "step": 5530 + }, + { + "epoch": 1.1370130537568095, + "grad_norm": 0.20063170790672302, + "learning_rate": 6.418211683290235e-05, + "loss": 0.5857, + "step": 5531 + }, + { + "epoch": 1.137218624730188, + "grad_norm": 0.18962214887142181, + "learning_rate": 6.417308115537303e-05, + "loss": 0.5854, + "step": 5532 + }, + { + "epoch": 1.1374241957035667, + "grad_norm": 0.20246468484401703, + "learning_rate": 6.41640445332756e-05, + "loss": 0.5883, + "step": 5533 + }, + { + "epoch": 1.1376297666769453, + "grad_norm": 0.18931740522384644, + "learning_rate": 6.415500696705528e-05, + "loss": 0.5262, + "step": 5534 + }, + { + "epoch": 1.1378353376503239, + "grad_norm": 0.19331716001033783, + "learning_rate": 6.41459684571573e-05, + "loss": 0.5534, + "step": 5535 + }, + { + "epoch": 1.1380409086237022, + "grad_norm": 0.19788740575313568, + "learning_rate": 6.413692900402693e-05, + "loss": 0.5702, + "step": 5536 + }, + { + "epoch": 1.1382464795970808, + "grad_norm": 0.19547824561595917, + "learning_rate": 6.41278886081095e-05, + "loss": 0.5647, + "step": 5537 + }, + { + "epoch": 1.1384520505704594, + "grad_norm": 0.1888136863708496, + "learning_rate": 6.411884726985043e-05, + "loss": 0.5445, + "step": 5538 + }, + { + "epoch": 1.138657621543838, + "grad_norm": 0.19497732818126678, + "learning_rate": 6.410980498969512e-05, + "loss": 0.5777, + "step": 5539 + }, + { + "epoch": 1.1388631925172166, + "grad_norm": 0.18465173244476318, + "learning_rate": 6.410076176808901e-05, + "loss": 0.5299, + "step": 5540 + }, + { + "epoch": 1.1390687634905952, + "grad_norm": 0.1675313413143158, + "learning_rate": 6.409171760547765e-05, + "loss": 0.5722, + "step": 5541 + }, + { + "epoch": 1.1392743344639737, + "grad_norm": 0.2085336148738861, + "learning_rate": 6.408267250230661e-05, + "loss": 0.5745, + "step": 5542 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 0.19899022579193115, + "learning_rate": 6.407362645902148e-05, + "loss": 0.5709, + "step": 5543 + }, + { + "epoch": 1.139685476410731, + "grad_norm": 0.1954008936882019, + "learning_rate": 6.406457947606792e-05, + "loss": 0.5704, + "step": 5544 + }, + { + "epoch": 1.1398910473841093, + "grad_norm": 0.17613859474658966, + "learning_rate": 6.405553155389165e-05, + "loss": 0.5395, + "step": 5545 + }, + { + "epoch": 1.1400966183574879, + "grad_norm": 0.1824086457490921, + "learning_rate": 6.40464826929384e-05, + "loss": 0.5558, + "step": 5546 + }, + { + "epoch": 1.1403021893308665, + "grad_norm": 0.20690536499023438, + "learning_rate": 6.403743289365398e-05, + "loss": 0.5626, + "step": 5547 + }, + { + "epoch": 1.140507760304245, + "grad_norm": 0.20793819427490234, + "learning_rate": 6.40283821564842e-05, + "loss": 0.5819, + "step": 5548 + }, + { + "epoch": 1.1407133312776236, + "grad_norm": 0.19500964879989624, + "learning_rate": 6.401933048187499e-05, + "loss": 0.5696, + "step": 5549 + }, + { + "epoch": 1.1409189022510022, + "grad_norm": 0.19967152178287506, + "learning_rate": 6.401027787027225e-05, + "loss": 0.5567, + "step": 5550 + }, + { + "epoch": 1.1411244732243806, + "grad_norm": 0.19124870002269745, + "learning_rate": 6.400122432212198e-05, + "loss": 0.5276, + "step": 5551 + }, + { + "epoch": 1.1413300441977592, + "grad_norm": 0.1926882117986679, + "learning_rate": 6.399216983787019e-05, + "loss": 0.5785, + "step": 5552 + }, + { + "epoch": 1.1415356151711378, + "grad_norm": 0.1873985081911087, + "learning_rate": 6.398311441796297e-05, + "loss": 0.5496, + "step": 5553 + }, + { + "epoch": 1.1417411861445164, + "grad_norm": 0.164115771651268, + "learning_rate": 6.397405806284642e-05, + "loss": 0.5343, + "step": 5554 + }, + { + "epoch": 1.141946757117895, + "grad_norm": 0.16665267944335938, + "learning_rate": 6.396500077296673e-05, + "loss": 0.5769, + "step": 5555 + }, + { + "epoch": 1.1421523280912735, + "grad_norm": 0.1954329013824463, + "learning_rate": 6.395594254877009e-05, + "loss": 0.5652, + "step": 5556 + }, + { + "epoch": 1.1423578990646521, + "grad_norm": 0.19422973692417145, + "learning_rate": 6.394688339070277e-05, + "loss": 0.5596, + "step": 5557 + }, + { + "epoch": 1.1425634700380307, + "grad_norm": 0.19732142984867096, + "learning_rate": 6.393782329921104e-05, + "loss": 0.5887, + "step": 5558 + }, + { + "epoch": 1.1427690410114093, + "grad_norm": 0.195445254445076, + "learning_rate": 6.392876227474128e-05, + "loss": 0.5737, + "step": 5559 + }, + { + "epoch": 1.1429746119847877, + "grad_norm": 0.18976660072803497, + "learning_rate": 6.391970031773988e-05, + "loss": 0.5693, + "step": 5560 + }, + { + "epoch": 1.1431801829581663, + "grad_norm": 0.18721553683280945, + "learning_rate": 6.391063742865327e-05, + "loss": 0.5393, + "step": 5561 + }, + { + "epoch": 1.1433857539315448, + "grad_norm": 0.19081273674964905, + "learning_rate": 6.390157360792794e-05, + "loss": 0.5565, + "step": 5562 + }, + { + "epoch": 1.1435913249049234, + "grad_norm": 0.19391131401062012, + "learning_rate": 6.389250885601043e-05, + "loss": 0.5571, + "step": 5563 + }, + { + "epoch": 1.143796895878302, + "grad_norm": 0.18970650434494019, + "learning_rate": 6.388344317334732e-05, + "loss": 0.571, + "step": 5564 + }, + { + "epoch": 1.1440024668516806, + "grad_norm": 0.1937200129032135, + "learning_rate": 6.38743765603852e-05, + "loss": 0.5619, + "step": 5565 + }, + { + "epoch": 1.1442080378250592, + "grad_norm": 0.189813494682312, + "learning_rate": 6.386530901757078e-05, + "loss": 0.562, + "step": 5566 + }, + { + "epoch": 1.1444136087984376, + "grad_norm": 0.19848157465457916, + "learning_rate": 6.385624054535078e-05, + "loss": 0.5776, + "step": 5567 + }, + { + "epoch": 1.1446191797718162, + "grad_norm": 0.19412924349308014, + "learning_rate": 6.384717114417191e-05, + "loss": 0.5637, + "step": 5568 + }, + { + "epoch": 1.1448247507451947, + "grad_norm": 0.20294548571109772, + "learning_rate": 6.383810081448103e-05, + "loss": 0.5626, + "step": 5569 + }, + { + "epoch": 1.1450303217185733, + "grad_norm": 0.17422319948673248, + "learning_rate": 6.382902955672496e-05, + "loss": 0.5506, + "step": 5570 + }, + { + "epoch": 1.145235892691952, + "grad_norm": 0.15921704471111298, + "learning_rate": 6.381995737135062e-05, + "loss": 0.5882, + "step": 5571 + }, + { + "epoch": 1.1454414636653305, + "grad_norm": 0.20002704858779907, + "learning_rate": 6.381088425880495e-05, + "loss": 0.5677, + "step": 5572 + }, + { + "epoch": 1.145647034638709, + "grad_norm": 0.1957893818616867, + "learning_rate": 6.38018102195349e-05, + "loss": 0.5629, + "step": 5573 + }, + { + "epoch": 1.1458526056120877, + "grad_norm": 0.19180312752723694, + "learning_rate": 6.379273525398758e-05, + "loss": 0.5645, + "step": 5574 + }, + { + "epoch": 1.146058176585466, + "grad_norm": 0.18908941745758057, + "learning_rate": 6.378365936261e-05, + "loss": 0.558, + "step": 5575 + }, + { + "epoch": 1.1462637475588446, + "grad_norm": 0.19693338871002197, + "learning_rate": 6.377458254584934e-05, + "loss": 0.5741, + "step": 5576 + }, + { + "epoch": 1.1464693185322232, + "grad_norm": 0.190039724111557, + "learning_rate": 6.376550480415275e-05, + "loss": 0.5431, + "step": 5577 + }, + { + "epoch": 1.1466748895056018, + "grad_norm": 0.1961604356765747, + "learning_rate": 6.375642613796745e-05, + "loss": 0.563, + "step": 5578 + }, + { + "epoch": 1.1468804604789804, + "grad_norm": 0.19689500331878662, + "learning_rate": 6.374734654774068e-05, + "loss": 0.5579, + "step": 5579 + }, + { + "epoch": 1.147086031452359, + "grad_norm": 0.1831909865140915, + "learning_rate": 6.373826603391979e-05, + "loss": 0.5688, + "step": 5580 + }, + { + "epoch": 1.1472916024257376, + "grad_norm": 0.18834874033927917, + "learning_rate": 6.372918459695212e-05, + "loss": 0.55, + "step": 5581 + }, + { + "epoch": 1.147497173399116, + "grad_norm": 0.19605682790279388, + "learning_rate": 6.372010223728504e-05, + "loss": 0.5774, + "step": 5582 + }, + { + "epoch": 1.1477027443724945, + "grad_norm": 0.19253866374492645, + "learning_rate": 6.371101895536605e-05, + "loss": 0.589, + "step": 5583 + }, + { + "epoch": 1.1479083153458731, + "grad_norm": 0.18777357041835785, + "learning_rate": 6.370193475164258e-05, + "loss": 0.5665, + "step": 5584 + }, + { + "epoch": 1.1481138863192517, + "grad_norm": 0.1673029363155365, + "learning_rate": 6.36928496265622e-05, + "loss": 0.5298, + "step": 5585 + }, + { + "epoch": 1.1483194572926303, + "grad_norm": 0.15895609557628632, + "learning_rate": 6.36837635805725e-05, + "loss": 0.5323, + "step": 5586 + }, + { + "epoch": 1.1485250282660089, + "grad_norm": 0.19794027507305145, + "learning_rate": 6.367467661412111e-05, + "loss": 0.5677, + "step": 5587 + }, + { + "epoch": 1.1487305992393875, + "grad_norm": 0.20095770061016083, + "learning_rate": 6.366558872765569e-05, + "loss": 0.5562, + "step": 5588 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 0.16546010971069336, + "learning_rate": 6.365649992162393e-05, + "loss": 0.5212, + "step": 5589 + }, + { + "epoch": 1.1491417411861444, + "grad_norm": 0.16688905656337738, + "learning_rate": 6.364741019647363e-05, + "loss": 0.5421, + "step": 5590 + }, + { + "epoch": 1.149347312159523, + "grad_norm": 0.12764035165309906, + "learning_rate": 6.36383195526526e-05, + "loss": 0.5154, + "step": 5591 + }, + { + "epoch": 1.1495528831329016, + "grad_norm": 0.16854409873485565, + "learning_rate": 6.362922799060866e-05, + "loss": 0.5689, + "step": 5592 + }, + { + "epoch": 1.1497584541062802, + "grad_norm": 0.19557413458824158, + "learning_rate": 6.362013551078974e-05, + "loss": 0.5581, + "step": 5593 + }, + { + "epoch": 1.1499640250796588, + "grad_norm": 0.19200196862220764, + "learning_rate": 6.361104211364377e-05, + "loss": 0.5744, + "step": 5594 + }, + { + "epoch": 1.1501695960530374, + "grad_norm": 0.19255445897579193, + "learning_rate": 6.360194779961875e-05, + "loss": 0.5677, + "step": 5595 + }, + { + "epoch": 1.150375167026416, + "grad_norm": 0.1860598772764206, + "learning_rate": 6.359285256916269e-05, + "loss": 0.5239, + "step": 5596 + }, + { + "epoch": 1.1505807379997943, + "grad_norm": 0.18977835774421692, + "learning_rate": 6.358375642272371e-05, + "loss": 0.5502, + "step": 5597 + }, + { + "epoch": 1.150786308973173, + "grad_norm": 0.18825951218605042, + "learning_rate": 6.35746593607499e-05, + "loss": 0.5701, + "step": 5598 + }, + { + "epoch": 1.1509918799465515, + "grad_norm": 0.1946858912706375, + "learning_rate": 6.356556138368945e-05, + "loss": 0.5735, + "step": 5599 + }, + { + "epoch": 1.15119745091993, + "grad_norm": 0.1934114545583725, + "learning_rate": 6.355646249199055e-05, + "loss": 0.57, + "step": 5600 + }, + { + "epoch": 1.1514030218933087, + "grad_norm": 0.19345784187316895, + "learning_rate": 6.354736268610148e-05, + "loss": 0.568, + "step": 5601 + }, + { + "epoch": 1.1516085928666873, + "grad_norm": 0.1907486766576767, + "learning_rate": 6.353826196647056e-05, + "loss": 0.5609, + "step": 5602 + }, + { + "epoch": 1.1518141638400659, + "grad_norm": 0.19529633224010468, + "learning_rate": 6.35291603335461e-05, + "loss": 0.5531, + "step": 5603 + }, + { + "epoch": 1.1520197348134444, + "grad_norm": 0.19347496330738068, + "learning_rate": 6.352005778777652e-05, + "loss": 0.5748, + "step": 5604 + }, + { + "epoch": 1.1522253057868228, + "grad_norm": 0.1948879212141037, + "learning_rate": 6.351095432961024e-05, + "loss": 0.565, + "step": 5605 + }, + { + "epoch": 1.1524308767602014, + "grad_norm": 0.19510291516780853, + "learning_rate": 6.350184995949578e-05, + "loss": 0.5492, + "step": 5606 + }, + { + "epoch": 1.15263644773358, + "grad_norm": 0.198397696018219, + "learning_rate": 6.349274467788165e-05, + "loss": 0.5506, + "step": 5607 + }, + { + "epoch": 1.1528420187069586, + "grad_norm": 0.1937544345855713, + "learning_rate": 6.348363848521643e-05, + "loss": 0.556, + "step": 5608 + }, + { + "epoch": 1.1530475896803372, + "grad_norm": 0.1949324756860733, + "learning_rate": 6.347453138194872e-05, + "loss": 0.5608, + "step": 5609 + }, + { + "epoch": 1.1532531606537157, + "grad_norm": 0.18160304427146912, + "learning_rate": 6.34654233685272e-05, + "loss": 0.5396, + "step": 5610 + }, + { + "epoch": 1.1534587316270943, + "grad_norm": 0.1651293933391571, + "learning_rate": 6.345631444540058e-05, + "loss": 0.5618, + "step": 5611 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 0.19430503249168396, + "learning_rate": 6.344720461301761e-05, + "loss": 0.5766, + "step": 5612 + }, + { + "epoch": 1.1538698735738513, + "grad_norm": 0.19232423603534698, + "learning_rate": 6.34380938718271e-05, + "loss": 0.5543, + "step": 5613 + }, + { + "epoch": 1.1540754445472299, + "grad_norm": 0.19700485467910767, + "learning_rate": 6.342898222227788e-05, + "loss": 0.6007, + "step": 5614 + }, + { + "epoch": 1.1542810155206085, + "grad_norm": 0.18897385895252228, + "learning_rate": 6.341986966481883e-05, + "loss": 0.5658, + "step": 5615 + }, + { + "epoch": 1.154486586493987, + "grad_norm": 0.18891417980194092, + "learning_rate": 6.341075619989891e-05, + "loss": 0.5725, + "step": 5616 + }, + { + "epoch": 1.1546921574673656, + "grad_norm": 0.17012788355350494, + "learning_rate": 6.340164182796707e-05, + "loss": 0.5365, + "step": 5617 + }, + { + "epoch": 1.1548977284407442, + "grad_norm": 0.16269567608833313, + "learning_rate": 6.339252654947236e-05, + "loss": 0.5708, + "step": 5618 + }, + { + "epoch": 1.1551032994141228, + "grad_norm": 0.19354234635829926, + "learning_rate": 6.338341036486385e-05, + "loss": 0.5645, + "step": 5619 + }, + { + "epoch": 1.1553088703875014, + "grad_norm": 0.19386227428913116, + "learning_rate": 6.33742932745906e-05, + "loss": 0.5772, + "step": 5620 + }, + { + "epoch": 1.1555144413608798, + "grad_norm": 0.17871583998203278, + "learning_rate": 6.336517527910182e-05, + "loss": 0.5568, + "step": 5621 + }, + { + "epoch": 1.1557200123342584, + "grad_norm": 0.18921589851379395, + "learning_rate": 6.335605637884668e-05, + "loss": 0.5555, + "step": 5622 + }, + { + "epoch": 1.155925583307637, + "grad_norm": 0.19476552307605743, + "learning_rate": 6.334693657427446e-05, + "loss": 0.5581, + "step": 5623 + }, + { + "epoch": 1.1561311542810155, + "grad_norm": 0.18380312621593475, + "learning_rate": 6.333781586583441e-05, + "loss": 0.5322, + "step": 5624 + }, + { + "epoch": 1.1563367252543941, + "grad_norm": 0.18677309155464172, + "learning_rate": 6.332869425397588e-05, + "loss": 0.5712, + "step": 5625 + }, + { + "epoch": 1.1565422962277727, + "grad_norm": 0.19158649444580078, + "learning_rate": 6.331957173914826e-05, + "loss": 0.5846, + "step": 5626 + }, + { + "epoch": 1.156747867201151, + "grad_norm": 0.19586919248104095, + "learning_rate": 6.331044832180098e-05, + "loss": 0.5589, + "step": 5627 + }, + { + "epoch": 1.1569534381745297, + "grad_norm": 0.15967777371406555, + "learning_rate": 6.330132400238347e-05, + "loss": 0.5268, + "step": 5628 + }, + { + "epoch": 1.1571590091479083, + "grad_norm": 0.1551171988248825, + "learning_rate": 6.329219878134528e-05, + "loss": 0.5509, + "step": 5629 + }, + { + "epoch": 1.1573645801212868, + "grad_norm": 0.18467473983764648, + "learning_rate": 6.328307265913595e-05, + "loss": 0.5574, + "step": 5630 + }, + { + "epoch": 1.1575701510946654, + "grad_norm": 0.19859431684017181, + "learning_rate": 6.327394563620509e-05, + "loss": 0.5613, + "step": 5631 + }, + { + "epoch": 1.157775722068044, + "grad_norm": 0.19411081075668335, + "learning_rate": 6.326481771300234e-05, + "loss": 0.5589, + "step": 5632 + }, + { + "epoch": 1.1579812930414226, + "grad_norm": 0.18985684216022491, + "learning_rate": 6.325568888997739e-05, + "loss": 0.5673, + "step": 5633 + }, + { + "epoch": 1.1581868640148012, + "grad_norm": 0.1923382729291916, + "learning_rate": 6.324655916757997e-05, + "loss": 0.558, + "step": 5634 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 0.20484760403633118, + "learning_rate": 6.323742854625986e-05, + "loss": 0.5561, + "step": 5635 + }, + { + "epoch": 1.1585980059615582, + "grad_norm": 0.15869790315628052, + "learning_rate": 6.32282970264669e-05, + "loss": 0.5412, + "step": 5636 + }, + { + "epoch": 1.1588035769349367, + "grad_norm": 0.16667144000530243, + "learning_rate": 6.321916460865092e-05, + "loss": 0.5605, + "step": 5637 + }, + { + "epoch": 1.1590091479083153, + "grad_norm": 0.1636246144771576, + "learning_rate": 6.321003129326187e-05, + "loss": 0.5297, + "step": 5638 + }, + { + "epoch": 1.159214718881694, + "grad_norm": 0.1557888388633728, + "learning_rate": 6.320089708074971e-05, + "loss": 0.5433, + "step": 5639 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.18941344320774078, + "learning_rate": 6.31917619715644e-05, + "loss": 0.552, + "step": 5640 + }, + { + "epoch": 1.159625860828451, + "grad_norm": 0.18825402855873108, + "learning_rate": 6.318262596615602e-05, + "loss": 0.5447, + "step": 5641 + }, + { + "epoch": 1.1598314318018295, + "grad_norm": 0.16153112053871155, + "learning_rate": 6.317348906497463e-05, + "loss": 0.5363, + "step": 5642 + }, + { + "epoch": 1.160037002775208, + "grad_norm": 0.15847079455852509, + "learning_rate": 6.31643512684704e-05, + "loss": 0.5523, + "step": 5643 + }, + { + "epoch": 1.1602425737485866, + "grad_norm": 0.19507279992103577, + "learning_rate": 6.315521257709345e-05, + "loss": 0.5575, + "step": 5644 + }, + { + "epoch": 1.1604481447219652, + "grad_norm": 0.19227170944213867, + "learning_rate": 6.314607299129406e-05, + "loss": 0.5725, + "step": 5645 + }, + { + "epoch": 1.1606537156953438, + "grad_norm": 0.18888604640960693, + "learning_rate": 6.313693251152247e-05, + "loss": 0.5532, + "step": 5646 + }, + { + "epoch": 1.1608592866687224, + "grad_norm": 0.20485495030879974, + "learning_rate": 6.312779113822896e-05, + "loss": 0.5469, + "step": 5647 + }, + { + "epoch": 1.161064857642101, + "grad_norm": 0.1900404691696167, + "learning_rate": 6.311864887186393e-05, + "loss": 0.5593, + "step": 5648 + }, + { + "epoch": 1.1612704286154796, + "grad_norm": 0.1928151249885559, + "learning_rate": 6.310950571287774e-05, + "loss": 0.553, + "step": 5649 + }, + { + "epoch": 1.1614759995888582, + "grad_norm": 0.2048550844192505, + "learning_rate": 6.310036166172086e-05, + "loss": 0.5602, + "step": 5650 + }, + { + "epoch": 1.1616815705622365, + "grad_norm": 0.16492126882076263, + "learning_rate": 6.309121671884375e-05, + "loss": 0.5306, + "step": 5651 + }, + { + "epoch": 1.1618871415356151, + "grad_norm": 0.1620352864265442, + "learning_rate": 6.308207088469697e-05, + "loss": 0.5384, + "step": 5652 + }, + { + "epoch": 1.1620927125089937, + "grad_norm": 0.19016869366168976, + "learning_rate": 6.307292415973108e-05, + "loss": 0.5666, + "step": 5653 + }, + { + "epoch": 1.1622982834823723, + "grad_norm": 0.18808448314666748, + "learning_rate": 6.306377654439666e-05, + "loss": 0.5522, + "step": 5654 + }, + { + "epoch": 1.1625038544557509, + "grad_norm": 0.1816331297159195, + "learning_rate": 6.305462803914441e-05, + "loss": 0.543, + "step": 5655 + }, + { + "epoch": 1.1627094254291295, + "grad_norm": 0.18618269264698029, + "learning_rate": 6.304547864442503e-05, + "loss": 0.5674, + "step": 5656 + }, + { + "epoch": 1.162914996402508, + "grad_norm": 0.1989988088607788, + "learning_rate": 6.303632836068925e-05, + "loss": 0.5658, + "step": 5657 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 0.1896226555109024, + "learning_rate": 6.302717718838788e-05, + "loss": 0.572, + "step": 5658 + }, + { + "epoch": 1.163326138349265, + "grad_norm": 0.16433072090148926, + "learning_rate": 6.301802512797176e-05, + "loss": 0.542, + "step": 5659 + }, + { + "epoch": 1.1635317093226436, + "grad_norm": 0.1611773520708084, + "learning_rate": 6.300887217989174e-05, + "loss": 0.5528, + "step": 5660 + }, + { + "epoch": 1.1637372802960222, + "grad_norm": 0.1935834139585495, + "learning_rate": 6.299971834459877e-05, + "loss": 0.5699, + "step": 5661 + }, + { + "epoch": 1.1639428512694008, + "grad_norm": 0.19088830053806305, + "learning_rate": 6.29905636225438e-05, + "loss": 0.5742, + "step": 5662 + }, + { + "epoch": 1.1641484222427794, + "grad_norm": 0.1988966315984726, + "learning_rate": 6.298140801417786e-05, + "loss": 0.566, + "step": 5663 + }, + { + "epoch": 1.164353993216158, + "grad_norm": 0.2001844048500061, + "learning_rate": 6.297225151995198e-05, + "loss": 0.5765, + "step": 5664 + }, + { + "epoch": 1.1645595641895365, + "grad_norm": 0.16796830296516418, + "learning_rate": 6.296309414031727e-05, + "loss": 0.5534, + "step": 5665 + }, + { + "epoch": 1.164765135162915, + "grad_norm": 0.15863637626171112, + "learning_rate": 6.295393587572489e-05, + "loss": 0.576, + "step": 5666 + }, + { + "epoch": 1.1649707061362935, + "grad_norm": 0.19147972762584686, + "learning_rate": 6.2944776726626e-05, + "loss": 0.5694, + "step": 5667 + }, + { + "epoch": 1.165176277109672, + "grad_norm": 0.18630050122737885, + "learning_rate": 6.293561669347181e-05, + "loss": 0.561, + "step": 5668 + }, + { + "epoch": 1.1653818480830507, + "grad_norm": 0.1899455487728119, + "learning_rate": 6.292645577671364e-05, + "loss": 0.5807, + "step": 5669 + }, + { + "epoch": 1.1655874190564293, + "grad_norm": 0.19663108885288239, + "learning_rate": 6.291729397680277e-05, + "loss": 0.5594, + "step": 5670 + }, + { + "epoch": 1.1657929900298079, + "grad_norm": 0.18838699162006378, + "learning_rate": 6.290813129419058e-05, + "loss": 0.5572, + "step": 5671 + }, + { + "epoch": 1.1659985610031864, + "grad_norm": 0.19074362516403198, + "learning_rate": 6.289896772932845e-05, + "loss": 0.5593, + "step": 5672 + }, + { + "epoch": 1.1662041319765648, + "grad_norm": 0.1634715497493744, + "learning_rate": 6.288980328266785e-05, + "loss": 0.5333, + "step": 5673 + }, + { + "epoch": 1.1664097029499434, + "grad_norm": 0.13483376801013947, + "learning_rate": 6.288063795466027e-05, + "loss": 0.5092, + "step": 5674 + }, + { + "epoch": 1.166615273923322, + "grad_norm": 0.18257947266101837, + "learning_rate": 6.28714717457572e-05, + "loss": 0.5607, + "step": 5675 + }, + { + "epoch": 1.1668208448967006, + "grad_norm": 0.19993911683559418, + "learning_rate": 6.286230465641028e-05, + "loss": 0.5628, + "step": 5676 + }, + { + "epoch": 1.1670264158700792, + "grad_norm": 0.1948871612548828, + "learning_rate": 6.28531366870711e-05, + "loss": 0.5566, + "step": 5677 + }, + { + "epoch": 1.1672319868434577, + "grad_norm": 0.1864452362060547, + "learning_rate": 6.28439678381913e-05, + "loss": 0.5496, + "step": 5678 + }, + { + "epoch": 1.1674375578168363, + "grad_norm": 0.17033499479293823, + "learning_rate": 6.28347981102226e-05, + "loss": 0.5291, + "step": 5679 + }, + { + "epoch": 1.167643128790215, + "grad_norm": 0.16329137980937958, + "learning_rate": 6.282562750361679e-05, + "loss": 0.5538, + "step": 5680 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 0.20135296881198883, + "learning_rate": 6.281645601882561e-05, + "loss": 0.5409, + "step": 5681 + }, + { + "epoch": 1.1680542707369719, + "grad_norm": 0.16525396704673767, + "learning_rate": 6.28072836563009e-05, + "loss": 0.5034, + "step": 5682 + }, + { + "epoch": 1.1682598417103505, + "grad_norm": 0.16303305327892303, + "learning_rate": 6.279811041649457e-05, + "loss": 0.5464, + "step": 5683 + }, + { + "epoch": 1.168465412683729, + "grad_norm": 0.20432288944721222, + "learning_rate": 6.278893629985854e-05, + "loss": 0.5617, + "step": 5684 + }, + { + "epoch": 1.1686709836571076, + "grad_norm": 0.19627077877521515, + "learning_rate": 6.277976130684476e-05, + "loss": 0.5516, + "step": 5685 + }, + { + "epoch": 1.1688765546304862, + "grad_norm": 0.19442994892597198, + "learning_rate": 6.277058543790522e-05, + "loss": 0.5859, + "step": 5686 + }, + { + "epoch": 1.1690821256038648, + "grad_norm": 0.1668756902217865, + "learning_rate": 6.276140869349202e-05, + "loss": 0.5412, + "step": 5687 + }, + { + "epoch": 1.1692876965772432, + "grad_norm": 0.16319718956947327, + "learning_rate": 6.275223107405723e-05, + "loss": 0.5365, + "step": 5688 + }, + { + "epoch": 1.1694932675506218, + "grad_norm": 0.20029065012931824, + "learning_rate": 6.274305258005296e-05, + "loss": 0.5555, + "step": 5689 + }, + { + "epoch": 1.1696988385240004, + "grad_norm": 0.16278813779354095, + "learning_rate": 6.273387321193146e-05, + "loss": 0.5314, + "step": 5690 + }, + { + "epoch": 1.169904409497379, + "grad_norm": 0.16741250455379486, + "learning_rate": 6.272469297014488e-05, + "loss": 0.5435, + "step": 5691 + }, + { + "epoch": 1.1701099804707575, + "grad_norm": 0.2003338634967804, + "learning_rate": 6.271551185514553e-05, + "loss": 0.5842, + "step": 5692 + }, + { + "epoch": 1.1703155514441361, + "grad_norm": 0.17789803445339203, + "learning_rate": 6.270632986738573e-05, + "loss": 0.5276, + "step": 5693 + }, + { + "epoch": 1.1705211224175147, + "grad_norm": 0.16743101179599762, + "learning_rate": 6.269714700731782e-05, + "loss": 0.5777, + "step": 5694 + }, + { + "epoch": 1.1707266933908933, + "grad_norm": 0.19358138740062714, + "learning_rate": 6.268796327539417e-05, + "loss": 0.5585, + "step": 5695 + }, + { + "epoch": 1.1709322643642717, + "grad_norm": 0.16014361381530762, + "learning_rate": 6.267877867206724e-05, + "loss": 0.506, + "step": 5696 + }, + { + "epoch": 1.1711378353376503, + "grad_norm": 0.15720070898532867, + "learning_rate": 6.266959319778953e-05, + "loss": 0.5688, + "step": 5697 + }, + { + "epoch": 1.1713434063110288, + "grad_norm": 0.1944281905889511, + "learning_rate": 6.266040685301356e-05, + "loss": 0.5611, + "step": 5698 + }, + { + "epoch": 1.1715489772844074, + "grad_norm": 0.19197237491607666, + "learning_rate": 6.265121963819189e-05, + "loss": 0.5491, + "step": 5699 + }, + { + "epoch": 1.171754548257786, + "grad_norm": 0.1880941390991211, + "learning_rate": 6.26420315537771e-05, + "loss": 0.5478, + "step": 5700 + }, + { + "epoch": 1.1719601192311646, + "grad_norm": 0.18762564659118652, + "learning_rate": 6.26328426002219e-05, + "loss": 0.5592, + "step": 5701 + }, + { + "epoch": 1.1721656902045432, + "grad_norm": 0.19078297913074493, + "learning_rate": 6.262365277797894e-05, + "loss": 0.5801, + "step": 5702 + }, + { + "epoch": 1.1723712611779216, + "grad_norm": 0.15825822949409485, + "learning_rate": 6.2614462087501e-05, + "loss": 0.5238, + "step": 5703 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 0.16313259303569794, + "learning_rate": 6.260527052924083e-05, + "loss": 0.5675, + "step": 5704 + }, + { + "epoch": 1.1727824031246787, + "grad_norm": 0.20915348827838898, + "learning_rate": 6.259607810365128e-05, + "loss": 0.5871, + "step": 5705 + }, + { + "epoch": 1.1729879740980573, + "grad_norm": 0.1840449571609497, + "learning_rate": 6.258688481118519e-05, + "loss": 0.5617, + "step": 5706 + }, + { + "epoch": 1.173193545071436, + "grad_norm": 0.19125378131866455, + "learning_rate": 6.257769065229551e-05, + "loss": 0.5525, + "step": 5707 + }, + { + "epoch": 1.1733991160448145, + "grad_norm": 0.16844969987869263, + "learning_rate": 6.256849562743514e-05, + "loss": 0.5422, + "step": 5708 + }, + { + "epoch": 1.173604687018193, + "grad_norm": 0.17428073287010193, + "learning_rate": 6.255929973705714e-05, + "loss": 0.5564, + "step": 5709 + }, + { + "epoch": 1.1738102579915717, + "grad_norm": 0.1962093710899353, + "learning_rate": 6.255010298161448e-05, + "loss": 0.5671, + "step": 5710 + }, + { + "epoch": 1.1740158289649503, + "grad_norm": 0.19688303768634796, + "learning_rate": 6.254090536156028e-05, + "loss": 0.5736, + "step": 5711 + }, + { + "epoch": 1.1742213999383286, + "grad_norm": 0.19924046099185944, + "learning_rate": 6.253170687734769e-05, + "loss": 0.5536, + "step": 5712 + }, + { + "epoch": 1.1744269709117072, + "grad_norm": 0.21053309738636017, + "learning_rate": 6.252250752942981e-05, + "loss": 0.5725, + "step": 5713 + }, + { + "epoch": 1.1746325418850858, + "grad_norm": 0.15548844635486603, + "learning_rate": 6.251330731825989e-05, + "loss": 0.5061, + "step": 5714 + }, + { + "epoch": 1.1748381128584644, + "grad_norm": 0.16448529064655304, + "learning_rate": 6.250410624429118e-05, + "loss": 0.5618, + "step": 5715 + }, + { + "epoch": 1.175043683831843, + "grad_norm": 0.19345583021640778, + "learning_rate": 6.249490430797699e-05, + "loss": 0.548, + "step": 5716 + }, + { + "epoch": 1.1752492548052216, + "grad_norm": 0.19691455364227295, + "learning_rate": 6.248570150977061e-05, + "loss": 0.5466, + "step": 5717 + }, + { + "epoch": 1.1754548257786, + "grad_norm": 0.19735218584537506, + "learning_rate": 6.247649785012545e-05, + "loss": 0.5595, + "step": 5718 + }, + { + "epoch": 1.1756603967519785, + "grad_norm": 0.19617964327335358, + "learning_rate": 6.246729332949493e-05, + "loss": 0.5774, + "step": 5719 + }, + { + "epoch": 1.1758659677253571, + "grad_norm": 0.19635650515556335, + "learning_rate": 6.24580879483325e-05, + "loss": 0.5542, + "step": 5720 + }, + { + "epoch": 1.1760715386987357, + "grad_norm": 0.19671329855918884, + "learning_rate": 6.244888170709169e-05, + "loss": 0.5775, + "step": 5721 + }, + { + "epoch": 1.1762771096721143, + "grad_norm": 0.20057837665081024, + "learning_rate": 6.243967460622603e-05, + "loss": 0.5706, + "step": 5722 + }, + { + "epoch": 1.1764826806454929, + "grad_norm": 0.1965552419424057, + "learning_rate": 6.243046664618911e-05, + "loss": 0.5698, + "step": 5723 + }, + { + "epoch": 1.1766882516188715, + "grad_norm": 0.19308249652385712, + "learning_rate": 6.242125782743456e-05, + "loss": 0.5642, + "step": 5724 + }, + { + "epoch": 1.17689382259225, + "grad_norm": 0.19306235015392303, + "learning_rate": 6.241204815041608e-05, + "loss": 0.576, + "step": 5725 + }, + { + "epoch": 1.1770993935656286, + "grad_norm": 0.18735530972480774, + "learning_rate": 6.240283761558737e-05, + "loss": 0.5678, + "step": 5726 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 0.1929217427968979, + "learning_rate": 6.239362622340218e-05, + "loss": 0.5542, + "step": 5727 + }, + { + "epoch": 1.1775105355123856, + "grad_norm": 0.19190043210983276, + "learning_rate": 6.238441397431433e-05, + "loss": 0.5836, + "step": 5728 + }, + { + "epoch": 1.1777161064857642, + "grad_norm": 0.1934564858675003, + "learning_rate": 6.237520086877766e-05, + "loss": 0.5532, + "step": 5729 + }, + { + "epoch": 1.1779216774591428, + "grad_norm": 0.16846685111522675, + "learning_rate": 6.236598690724606e-05, + "loss": 0.5279, + "step": 5730 + }, + { + "epoch": 1.1781272484325214, + "grad_norm": 0.1717388778924942, + "learning_rate": 6.235677209017345e-05, + "loss": 0.5595, + "step": 5731 + }, + { + "epoch": 1.1783328194059, + "grad_norm": 0.18958315253257751, + "learning_rate": 6.234755641801379e-05, + "loss": 0.5657, + "step": 5732 + }, + { + "epoch": 1.1785383903792783, + "grad_norm": 0.19686202704906464, + "learning_rate": 6.233833989122112e-05, + "loss": 0.5983, + "step": 5733 + }, + { + "epoch": 1.178743961352657, + "grad_norm": 0.1927022784948349, + "learning_rate": 6.232912251024948e-05, + "loss": 0.5968, + "step": 5734 + }, + { + "epoch": 1.1789495323260355, + "grad_norm": 0.19848833978176117, + "learning_rate": 6.231990427555297e-05, + "loss": 0.5491, + "step": 5735 + }, + { + "epoch": 1.179155103299414, + "grad_norm": 0.189555823802948, + "learning_rate": 6.231068518758572e-05, + "loss": 0.5525, + "step": 5736 + }, + { + "epoch": 1.1793606742727927, + "grad_norm": 0.19321559369564056, + "learning_rate": 6.230146524680194e-05, + "loss": 0.5792, + "step": 5737 + }, + { + "epoch": 1.1795662452461713, + "grad_norm": 0.19412335753440857, + "learning_rate": 6.229224445365582e-05, + "loss": 0.5731, + "step": 5738 + }, + { + "epoch": 1.1797718162195499, + "grad_norm": 0.20160719752311707, + "learning_rate": 6.228302280860166e-05, + "loss": 0.5931, + "step": 5739 + }, + { + "epoch": 1.1799773871929284, + "grad_norm": 0.19900692999362946, + "learning_rate": 6.227380031209373e-05, + "loss": 0.5437, + "step": 5740 + }, + { + "epoch": 1.180182958166307, + "grad_norm": 0.19047874212265015, + "learning_rate": 6.226457696458639e-05, + "loss": 0.5529, + "step": 5741 + }, + { + "epoch": 1.1803885291396854, + "grad_norm": 0.19529984891414642, + "learning_rate": 6.225535276653405e-05, + "loss": 0.5672, + "step": 5742 + }, + { + "epoch": 1.180594100113064, + "grad_norm": 0.19696053862571716, + "learning_rate": 6.224612771839113e-05, + "loss": 0.572, + "step": 5743 + }, + { + "epoch": 1.1807996710864426, + "grad_norm": 0.19073131680488586, + "learning_rate": 6.22369018206121e-05, + "loss": 0.5524, + "step": 5744 + }, + { + "epoch": 1.1810052420598212, + "grad_norm": 0.18917502462863922, + "learning_rate": 6.222767507365148e-05, + "loss": 0.5542, + "step": 5745 + }, + { + "epoch": 1.1812108130331997, + "grad_norm": 0.19207759201526642, + "learning_rate": 6.221844747796384e-05, + "loss": 0.5594, + "step": 5746 + }, + { + "epoch": 1.1814163840065783, + "grad_norm": 0.1916734278202057, + "learning_rate": 6.220921903400376e-05, + "loss": 0.554, + "step": 5747 + }, + { + "epoch": 1.181621954979957, + "grad_norm": 0.1720525622367859, + "learning_rate": 6.21999897422259e-05, + "loss": 0.517, + "step": 5748 + }, + { + "epoch": 1.1818275259533353, + "grad_norm": 0.1582804173231125, + "learning_rate": 6.219075960308494e-05, + "loss": 0.5714, + "step": 5749 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 0.20018833875656128, + "learning_rate": 6.218152861703561e-05, + "loss": 0.5783, + "step": 5750 + }, + { + "epoch": 1.1822386679000925, + "grad_norm": 0.16681919991970062, + "learning_rate": 6.217229678453265e-05, + "loss": 0.5182, + "step": 5751 + }, + { + "epoch": 1.182444238873471, + "grad_norm": 0.1674472838640213, + "learning_rate": 6.21630641060309e-05, + "loss": 0.5756, + "step": 5752 + }, + { + "epoch": 1.1826498098468496, + "grad_norm": 0.19080859422683716, + "learning_rate": 6.215383058198521e-05, + "loss": 0.5616, + "step": 5753 + }, + { + "epoch": 1.1828553808202282, + "grad_norm": 0.18792377412319183, + "learning_rate": 6.214459621285047e-05, + "loss": 0.5482, + "step": 5754 + }, + { + "epoch": 1.1830609517936068, + "grad_norm": 0.1907912641763687, + "learning_rate": 6.21353609990816e-05, + "loss": 0.5613, + "step": 5755 + }, + { + "epoch": 1.1832665227669854, + "grad_norm": 0.1828346997499466, + "learning_rate": 6.212612494113358e-05, + "loss": 0.5496, + "step": 5756 + }, + { + "epoch": 1.1834720937403638, + "grad_norm": 0.19093002378940582, + "learning_rate": 6.211688803946142e-05, + "loss": 0.5769, + "step": 5757 + }, + { + "epoch": 1.1836776647137424, + "grad_norm": 0.1904676854610443, + "learning_rate": 6.21076502945202e-05, + "loss": 0.5385, + "step": 5758 + }, + { + "epoch": 1.183883235687121, + "grad_norm": 0.1881975680589676, + "learning_rate": 6.209841170676502e-05, + "loss": 0.5633, + "step": 5759 + }, + { + "epoch": 1.1840888066604995, + "grad_norm": 0.20327463746070862, + "learning_rate": 6.208917227665102e-05, + "loss": 0.5714, + "step": 5760 + }, + { + "epoch": 1.1842943776338781, + "grad_norm": 0.18997357785701752, + "learning_rate": 6.207993200463335e-05, + "loss": 0.551, + "step": 5761 + }, + { + "epoch": 1.1844999486072567, + "grad_norm": 0.1653435230255127, + "learning_rate": 6.207069089116728e-05, + "loss": 0.5465, + "step": 5762 + }, + { + "epoch": 1.1847055195806353, + "grad_norm": 0.1645163893699646, + "learning_rate": 6.206144893670805e-05, + "loss": 0.5411, + "step": 5763 + }, + { + "epoch": 1.1849110905540137, + "grad_norm": 0.18971189856529236, + "learning_rate": 6.205220614171098e-05, + "loss": 0.5724, + "step": 5764 + }, + { + "epoch": 1.1851166615273923, + "grad_norm": 0.19266551733016968, + "learning_rate": 6.204296250663142e-05, + "loss": 0.544, + "step": 5765 + }, + { + "epoch": 1.1853222325007708, + "grad_norm": 0.1676861196756363, + "learning_rate": 6.203371803192475e-05, + "loss": 0.5232, + "step": 5766 + }, + { + "epoch": 1.1855278034741494, + "grad_norm": 0.16158527135849, + "learning_rate": 6.20244727180464e-05, + "loss": 0.5324, + "step": 5767 + }, + { + "epoch": 1.185733374447528, + "grad_norm": 0.16184964776039124, + "learning_rate": 6.201522656545186e-05, + "loss": 0.5454, + "step": 5768 + }, + { + "epoch": 1.1859389454209066, + "grad_norm": 0.16072934865951538, + "learning_rate": 6.200597957459664e-05, + "loss": 0.5676, + "step": 5769 + }, + { + "epoch": 1.1861445163942852, + "grad_norm": 0.19808636605739594, + "learning_rate": 6.199673174593629e-05, + "loss": 0.5426, + "step": 5770 + }, + { + "epoch": 1.1863500873676638, + "grad_norm": 0.19355566799640656, + "learning_rate": 6.19874830799264e-05, + "loss": 0.5601, + "step": 5771 + }, + { + "epoch": 1.1865556583410422, + "grad_norm": 0.1977650374174118, + "learning_rate": 6.197823357702263e-05, + "loss": 0.5749, + "step": 5772 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 0.17442461848258972, + "learning_rate": 6.196898323768065e-05, + "loss": 0.5253, + "step": 5773 + }, + { + "epoch": 1.1869668002877993, + "grad_norm": 0.15890754759311676, + "learning_rate": 6.195973206235616e-05, + "loss": 0.5509, + "step": 5774 + }, + { + "epoch": 1.187172371261178, + "grad_norm": 0.18826748430728912, + "learning_rate": 6.195048005150496e-05, + "loss": 0.54, + "step": 5775 + }, + { + "epoch": 1.1873779422345565, + "grad_norm": 0.18961307406425476, + "learning_rate": 6.194122720558282e-05, + "loss": 0.5505, + "step": 5776 + }, + { + "epoch": 1.187583513207935, + "grad_norm": 0.19002290070056915, + "learning_rate": 6.193197352504561e-05, + "loss": 0.5637, + "step": 5777 + }, + { + "epoch": 1.1877890841813137, + "grad_norm": 0.1975557655096054, + "learning_rate": 6.19227190103492e-05, + "loss": 0.5667, + "step": 5778 + }, + { + "epoch": 1.187994655154692, + "grad_norm": 0.20086504518985748, + "learning_rate": 6.191346366194952e-05, + "loss": 0.5792, + "step": 5779 + }, + { + "epoch": 1.1882002261280706, + "grad_norm": 0.19469043612480164, + "learning_rate": 6.190420748030253e-05, + "loss": 0.562, + "step": 5780 + }, + { + "epoch": 1.1884057971014492, + "grad_norm": 0.19469872117042542, + "learning_rate": 6.189495046586427e-05, + "loss": 0.5725, + "step": 5781 + }, + { + "epoch": 1.1886113680748278, + "grad_norm": 0.1903071254491806, + "learning_rate": 6.188569261909076e-05, + "loss": 0.5604, + "step": 5782 + }, + { + "epoch": 1.1888169390482064, + "grad_norm": 0.18922393023967743, + "learning_rate": 6.187643394043808e-05, + "loss": 0.5336, + "step": 5783 + }, + { + "epoch": 1.189022510021585, + "grad_norm": 0.19879461824893951, + "learning_rate": 6.186717443036239e-05, + "loss": 0.5699, + "step": 5784 + }, + { + "epoch": 1.1892280809949636, + "grad_norm": 0.19611231982707977, + "learning_rate": 6.185791408931986e-05, + "loss": 0.533, + "step": 5785 + }, + { + "epoch": 1.1894336519683422, + "grad_norm": 0.17245331406593323, + "learning_rate": 6.18486529177667e-05, + "loss": 0.5268, + "step": 5786 + }, + { + "epoch": 1.1896392229417208, + "grad_norm": 0.15049666166305542, + "learning_rate": 6.183939091615915e-05, + "loss": 0.5324, + "step": 5787 + }, + { + "epoch": 1.1898447939150991, + "grad_norm": 0.1296570748090744, + "learning_rate": 6.183012808495353e-05, + "loss": 0.5245, + "step": 5788 + }, + { + "epoch": 1.1900503648884777, + "grad_norm": 0.1654006838798523, + "learning_rate": 6.182086442460614e-05, + "loss": 0.5405, + "step": 5789 + }, + { + "epoch": 1.1902559358618563, + "grad_norm": 0.20028263330459595, + "learning_rate": 6.181159993557338e-05, + "loss": 0.5792, + "step": 5790 + }, + { + "epoch": 1.1904615068352349, + "grad_norm": 0.19533969461917877, + "learning_rate": 6.18023346183117e-05, + "loss": 0.5698, + "step": 5791 + }, + { + "epoch": 1.1906670778086135, + "grad_norm": 0.16536763310432434, + "learning_rate": 6.17930684732775e-05, + "loss": 0.5253, + "step": 5792 + }, + { + "epoch": 1.190872648781992, + "grad_norm": 0.16189715266227722, + "learning_rate": 6.178380150092732e-05, + "loss": 0.5759, + "step": 5793 + }, + { + "epoch": 1.1910782197553704, + "grad_norm": 0.1967983990907669, + "learning_rate": 6.177453370171768e-05, + "loss": 0.5721, + "step": 5794 + }, + { + "epoch": 1.191283790728749, + "grad_norm": 0.1946103274822235, + "learning_rate": 6.176526507610518e-05, + "loss": 0.5587, + "step": 5795 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.20200130343437195, + "learning_rate": 6.175599562454641e-05, + "loss": 0.571, + "step": 5796 + }, + { + "epoch": 1.1916949326755062, + "grad_norm": 0.19911526143550873, + "learning_rate": 6.174672534749808e-05, + "loss": 0.5615, + "step": 5797 + }, + { + "epoch": 1.1919005036488848, + "grad_norm": 0.19905459880828857, + "learning_rate": 6.173745424541684e-05, + "loss": 0.5793, + "step": 5798 + }, + { + "epoch": 1.1921060746222634, + "grad_norm": 0.1912047415971756, + "learning_rate": 6.172818231875947e-05, + "loss": 0.5543, + "step": 5799 + }, + { + "epoch": 1.192311645595642, + "grad_norm": 0.16958840191364288, + "learning_rate": 6.171890956798275e-05, + "loss": 0.5339, + "step": 5800 + }, + { + "epoch": 1.1925172165690205, + "grad_norm": 0.1356760561466217, + "learning_rate": 6.170963599354349e-05, + "loss": 0.5175, + "step": 5801 + }, + { + "epoch": 1.1927227875423991, + "grad_norm": 0.1700810045003891, + "learning_rate": 6.170036159589856e-05, + "loss": 0.554, + "step": 5802 + }, + { + "epoch": 1.1929283585157775, + "grad_norm": 0.17295996844768524, + "learning_rate": 6.169108637550488e-05, + "loss": 0.5169, + "step": 5803 + }, + { + "epoch": 1.193133929489156, + "grad_norm": 0.1662554293870926, + "learning_rate": 6.16818103328194e-05, + "loss": 0.5882, + "step": 5804 + }, + { + "epoch": 1.1933395004625347, + "grad_norm": 0.1974506676197052, + "learning_rate": 6.167253346829909e-05, + "loss": 0.5556, + "step": 5805 + }, + { + "epoch": 1.1935450714359133, + "grad_norm": 0.19866618514060974, + "learning_rate": 6.166325578240098e-05, + "loss": 0.5748, + "step": 5806 + }, + { + "epoch": 1.1937506424092919, + "grad_norm": 0.19283287227153778, + "learning_rate": 6.165397727558214e-05, + "loss": 0.5611, + "step": 5807 + }, + { + "epoch": 1.1939562133826704, + "grad_norm": 0.19626696407794952, + "learning_rate": 6.164469794829967e-05, + "loss": 0.5579, + "step": 5808 + }, + { + "epoch": 1.1941617843560488, + "grad_norm": 0.19367843866348267, + "learning_rate": 6.163541780101075e-05, + "loss": 0.5642, + "step": 5809 + }, + { + "epoch": 1.1943673553294274, + "grad_norm": 0.19207385182380676, + "learning_rate": 6.162613683417253e-05, + "loss": 0.5586, + "step": 5810 + }, + { + "epoch": 1.194572926302806, + "grad_norm": 0.19212685525417328, + "learning_rate": 6.161685504824227e-05, + "loss": 0.5427, + "step": 5811 + }, + { + "epoch": 1.1947784972761846, + "grad_norm": 0.1972237080335617, + "learning_rate": 6.160757244367723e-05, + "loss": 0.5595, + "step": 5812 + }, + { + "epoch": 1.1949840682495632, + "grad_norm": 0.2040352076292038, + "learning_rate": 6.159828902093471e-05, + "loss": 0.5384, + "step": 5813 + }, + { + "epoch": 1.1951896392229417, + "grad_norm": 0.1992282271385193, + "learning_rate": 6.158900478047206e-05, + "loss": 0.5757, + "step": 5814 + }, + { + "epoch": 1.1953952101963203, + "grad_norm": 0.18852105736732483, + "learning_rate": 6.15797197227467e-05, + "loss": 0.5714, + "step": 5815 + }, + { + "epoch": 1.195600781169699, + "grad_norm": 0.18910926580429077, + "learning_rate": 6.157043384821604e-05, + "loss": 0.5506, + "step": 5816 + }, + { + "epoch": 1.1958063521430775, + "grad_norm": 0.19245147705078125, + "learning_rate": 6.156114715733756e-05, + "loss": 0.5513, + "step": 5817 + }, + { + "epoch": 1.1960119231164559, + "grad_norm": 0.19064119458198547, + "learning_rate": 6.155185965056875e-05, + "loss": 0.5643, + "step": 5818 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 0.2007809430360794, + "learning_rate": 6.15425713283672e-05, + "loss": 0.5773, + "step": 5819 + }, + { + "epoch": 1.196423065063213, + "grad_norm": 0.1933142989873886, + "learning_rate": 6.153328219119048e-05, + "loss": 0.5504, + "step": 5820 + }, + { + "epoch": 1.1966286360365916, + "grad_norm": 0.16889862716197968, + "learning_rate": 6.152399223949619e-05, + "loss": 0.5338, + "step": 5821 + }, + { + "epoch": 1.1968342070099702, + "grad_norm": 0.16849687695503235, + "learning_rate": 6.151470147374206e-05, + "loss": 0.5679, + "step": 5822 + }, + { + "epoch": 1.1970397779833488, + "grad_norm": 0.19202522933483124, + "learning_rate": 6.150540989438577e-05, + "loss": 0.5656, + "step": 5823 + }, + { + "epoch": 1.1972453489567274, + "grad_norm": 0.19393931329250336, + "learning_rate": 6.149611750188508e-05, + "loss": 0.5745, + "step": 5824 + }, + { + "epoch": 1.1974509199301058, + "grad_norm": 0.15858381986618042, + "learning_rate": 6.14868242966978e-05, + "loss": 0.5202, + "step": 5825 + }, + { + "epoch": 1.1976564909034844, + "grad_norm": 0.15841448307037354, + "learning_rate": 6.147753027928173e-05, + "loss": 0.5518, + "step": 5826 + }, + { + "epoch": 1.197862061876863, + "grad_norm": 0.18990083038806915, + "learning_rate": 6.146823545009475e-05, + "loss": 0.5576, + "step": 5827 + }, + { + "epoch": 1.1980676328502415, + "grad_norm": 0.1819765716791153, + "learning_rate": 6.14589398095948e-05, + "loss": 0.5608, + "step": 5828 + }, + { + "epoch": 1.1982732038236201, + "grad_norm": 0.1861831545829773, + "learning_rate": 6.144964335823981e-05, + "loss": 0.5659, + "step": 5829 + }, + { + "epoch": 1.1984787747969987, + "grad_norm": 0.18785440921783447, + "learning_rate": 6.14403460964878e-05, + "loss": 0.5752, + "step": 5830 + }, + { + "epoch": 1.1986843457703773, + "grad_norm": 0.1981627196073532, + "learning_rate": 6.143104802479673e-05, + "loss": 0.578, + "step": 5831 + }, + { + "epoch": 1.198889916743756, + "grad_norm": 0.19505171477794647, + "learning_rate": 6.142174914362476e-05, + "loss": 0.542, + "step": 5832 + }, + { + "epoch": 1.1990954877171343, + "grad_norm": 0.1755106896162033, + "learning_rate": 6.141244945342995e-05, + "loss": 0.53, + "step": 5833 + }, + { + "epoch": 1.1993010586905128, + "grad_norm": 0.1715668886899948, + "learning_rate": 6.140314895467045e-05, + "loss": 0.5479, + "step": 5834 + }, + { + "epoch": 1.1995066296638914, + "grad_norm": 0.19255517423152924, + "learning_rate": 6.13938476478045e-05, + "loss": 0.5572, + "step": 5835 + }, + { + "epoch": 1.19971220063727, + "grad_norm": 0.1867235153913498, + "learning_rate": 6.13845455332903e-05, + "loss": 0.5865, + "step": 5836 + }, + { + "epoch": 1.1999177716106486, + "grad_norm": 0.18764084577560425, + "learning_rate": 6.137524261158612e-05, + "loss": 0.5437, + "step": 5837 + }, + { + "epoch": 1.2001233425840272, + "grad_norm": 0.20819789171218872, + "learning_rate": 6.136593888315025e-05, + "loss": 0.5891, + "step": 5838 + }, + { + "epoch": 1.2003289135574058, + "grad_norm": 0.1949729472398758, + "learning_rate": 6.13566343484411e-05, + "loss": 0.5662, + "step": 5839 + }, + { + "epoch": 1.2005344845307842, + "grad_norm": 0.18804004788398743, + "learning_rate": 6.1347329007917e-05, + "loss": 0.5601, + "step": 5840 + }, + { + "epoch": 1.2007400555041627, + "grad_norm": 0.18714557588100433, + "learning_rate": 6.133802286203642e-05, + "loss": 0.5637, + "step": 5841 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 0.19639329612255096, + "learning_rate": 6.132871591125781e-05, + "loss": 0.5698, + "step": 5842 + }, + { + "epoch": 1.20115119745092, + "grad_norm": 0.20430424809455872, + "learning_rate": 6.131940815603969e-05, + "loss": 0.5739, + "step": 5843 + }, + { + "epoch": 1.2013567684242985, + "grad_norm": 0.19093136489391327, + "learning_rate": 6.13100995968406e-05, + "loss": 0.5455, + "step": 5844 + }, + { + "epoch": 1.201562339397677, + "grad_norm": 0.1929858773946762, + "learning_rate": 6.130079023411915e-05, + "loss": 0.5741, + "step": 5845 + }, + { + "epoch": 1.2017679103710557, + "grad_norm": 0.19032742083072662, + "learning_rate": 6.129148006833394e-05, + "loss": 0.5586, + "step": 5846 + }, + { + "epoch": 1.2019734813444343, + "grad_norm": 0.19212977588176727, + "learning_rate": 6.128216909994367e-05, + "loss": 0.5655, + "step": 5847 + }, + { + "epoch": 1.2021790523178126, + "grad_norm": 0.19061528146266937, + "learning_rate": 6.127285732940702e-05, + "loss": 0.5499, + "step": 5848 + }, + { + "epoch": 1.2023846232911912, + "grad_norm": 0.19122721254825592, + "learning_rate": 6.126354475718275e-05, + "loss": 0.5456, + "step": 5849 + }, + { + "epoch": 1.2025901942645698, + "grad_norm": 0.17146308720111847, + "learning_rate": 6.125423138372965e-05, + "loss": 0.5346, + "step": 5850 + }, + { + "epoch": 1.2027957652379484, + "grad_norm": 0.1573454737663269, + "learning_rate": 6.124491720950655e-05, + "loss": 0.5312, + "step": 5851 + }, + { + "epoch": 1.203001336211327, + "grad_norm": 0.16374094784259796, + "learning_rate": 6.123560223497228e-05, + "loss": 0.5587, + "step": 5852 + }, + { + "epoch": 1.2032069071847056, + "grad_norm": 0.18009409308433533, + "learning_rate": 6.12262864605858e-05, + "loss": 0.5452, + "step": 5853 + }, + { + "epoch": 1.2034124781580842, + "grad_norm": 0.17497576773166656, + "learning_rate": 6.1216969886806e-05, + "loss": 0.5535, + "step": 5854 + }, + { + "epoch": 1.2036180491314625, + "grad_norm": 0.2043164074420929, + "learning_rate": 6.120765251409191e-05, + "loss": 0.591, + "step": 5855 + }, + { + "epoch": 1.2038236201048411, + "grad_norm": 0.1914680004119873, + "learning_rate": 6.119833434290255e-05, + "loss": 0.5526, + "step": 5856 + }, + { + "epoch": 1.2040291910782197, + "grad_norm": 0.1849730759859085, + "learning_rate": 6.118901537369694e-05, + "loss": 0.5739, + "step": 5857 + }, + { + "epoch": 1.2042347620515983, + "grad_norm": 0.1906820684671402, + "learning_rate": 6.117969560693423e-05, + "loss": 0.5544, + "step": 5858 + }, + { + "epoch": 1.2044403330249769, + "grad_norm": 0.19102442264556885, + "learning_rate": 6.117037504307351e-05, + "loss": 0.5478, + "step": 5859 + }, + { + "epoch": 1.2046459039983555, + "grad_norm": 0.1686401218175888, + "learning_rate": 6.116105368257403e-05, + "loss": 0.5448, + "step": 5860 + }, + { + "epoch": 1.204851474971734, + "grad_norm": 0.13795730471611023, + "learning_rate": 6.115173152589495e-05, + "loss": 0.5262, + "step": 5861 + }, + { + "epoch": 1.2050570459451126, + "grad_norm": 0.164164200425148, + "learning_rate": 6.114240857349556e-05, + "loss": 0.5684, + "step": 5862 + }, + { + "epoch": 1.205262616918491, + "grad_norm": 0.19996531307697296, + "learning_rate": 6.113308482583514e-05, + "loss": 0.5608, + "step": 5863 + }, + { + "epoch": 1.2054681878918696, + "grad_norm": 0.19715693593025208, + "learning_rate": 6.112376028337305e-05, + "loss": 0.566, + "step": 5864 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 0.1752108633518219, + "learning_rate": 6.111443494656864e-05, + "loss": 0.5366, + "step": 5865 + }, + { + "epoch": 1.2058793298386268, + "grad_norm": 0.16722378134727478, + "learning_rate": 6.110510881588135e-05, + "loss": 0.5602, + "step": 5866 + }, + { + "epoch": 1.2060849008120054, + "grad_norm": 0.18732362985610962, + "learning_rate": 6.10957818917706e-05, + "loss": 0.5498, + "step": 5867 + }, + { + "epoch": 1.206290471785384, + "grad_norm": 0.1660609394311905, + "learning_rate": 6.108645417469593e-05, + "loss": 0.5257, + "step": 5868 + }, + { + "epoch": 1.2064960427587625, + "grad_norm": 0.1357351690530777, + "learning_rate": 6.107712566511685e-05, + "loss": 0.5126, + "step": 5869 + }, + { + "epoch": 1.206701613732141, + "grad_norm": 0.1652655005455017, + "learning_rate": 6.106779636349292e-05, + "loss": 0.5602, + "step": 5870 + }, + { + "epoch": 1.2069071847055195, + "grad_norm": 0.20981089770793915, + "learning_rate": 6.105846627028379e-05, + "loss": 0.5616, + "step": 5871 + }, + { + "epoch": 1.207112755678898, + "grad_norm": 0.19564464688301086, + "learning_rate": 6.104913538594905e-05, + "loss": 0.5609, + "step": 5872 + }, + { + "epoch": 1.2073183266522767, + "grad_norm": 0.19752687215805054, + "learning_rate": 6.103980371094844e-05, + "loss": 0.5766, + "step": 5873 + }, + { + "epoch": 1.2075238976256553, + "grad_norm": 0.20465241372585297, + "learning_rate": 6.103047124574167e-05, + "loss": 0.5877, + "step": 5874 + }, + { + "epoch": 1.2077294685990339, + "grad_norm": 0.19926784932613373, + "learning_rate": 6.102113799078851e-05, + "loss": 0.5558, + "step": 5875 + }, + { + "epoch": 1.2079350395724124, + "grad_norm": 0.1923745572566986, + "learning_rate": 6.1011803946548774e-05, + "loss": 0.5595, + "step": 5876 + }, + { + "epoch": 1.208140610545791, + "grad_norm": 0.16840709745883942, + "learning_rate": 6.100246911348227e-05, + "loss": 0.5261, + "step": 5877 + }, + { + "epoch": 1.2083461815191696, + "grad_norm": 0.16660816967487335, + "learning_rate": 6.099313349204893e-05, + "loss": 0.5633, + "step": 5878 + }, + { + "epoch": 1.208551752492548, + "grad_norm": 0.1967456340789795, + "learning_rate": 6.098379708270863e-05, + "loss": 0.5616, + "step": 5879 + }, + { + "epoch": 1.2087573234659266, + "grad_norm": 0.19242748618125916, + "learning_rate": 6.097445988592138e-05, + "loss": 0.5474, + "step": 5880 + }, + { + "epoch": 1.2089628944393052, + "grad_norm": 0.2012694627046585, + "learning_rate": 6.096512190214715e-05, + "loss": 0.5508, + "step": 5881 + }, + { + "epoch": 1.2091684654126837, + "grad_norm": 0.1632763296365738, + "learning_rate": 6.0955783131845994e-05, + "loss": 0.5535, + "step": 5882 + }, + { + "epoch": 1.2093740363860623, + "grad_norm": 0.16215071082115173, + "learning_rate": 6.094644357547796e-05, + "loss": 0.5579, + "step": 5883 + }, + { + "epoch": 1.209579607359441, + "grad_norm": 0.19483166933059692, + "learning_rate": 6.09371032335032e-05, + "loss": 0.5576, + "step": 5884 + }, + { + "epoch": 1.2097851783328193, + "grad_norm": 0.18877603113651276, + "learning_rate": 6.092776210638185e-05, + "loss": 0.5426, + "step": 5885 + }, + { + "epoch": 1.2099907493061979, + "grad_norm": 0.1930856853723526, + "learning_rate": 6.0918420194574104e-05, + "loss": 0.5597, + "step": 5886 + }, + { + "epoch": 1.2101963202795765, + "grad_norm": 0.1913139820098877, + "learning_rate": 6.0909077498540194e-05, + "loss": 0.5747, + "step": 5887 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 0.16376695036888123, + "learning_rate": 6.0899734018740396e-05, + "loss": 0.502, + "step": 5888 + }, + { + "epoch": 1.2106074622263336, + "grad_norm": 0.15658964216709137, + "learning_rate": 6.0890389755635035e-05, + "loss": 0.5453, + "step": 5889 + }, + { + "epoch": 1.2108130331997122, + "grad_norm": 0.1946595311164856, + "learning_rate": 6.088104470968441e-05, + "loss": 0.5533, + "step": 5890 + }, + { + "epoch": 1.2110186041730908, + "grad_norm": 0.19284933805465698, + "learning_rate": 6.0871698881348966e-05, + "loss": 0.5385, + "step": 5891 + }, + { + "epoch": 1.2112241751464694, + "grad_norm": 0.19203589856624603, + "learning_rate": 6.0862352271089104e-05, + "loss": 0.5533, + "step": 5892 + }, + { + "epoch": 1.211429746119848, + "grad_norm": 0.19579070806503296, + "learning_rate": 6.0853004879365265e-05, + "loss": 0.5648, + "step": 5893 + }, + { + "epoch": 1.2116353170932264, + "grad_norm": 0.19746367633342743, + "learning_rate": 6.084365670663799e-05, + "loss": 0.5473, + "step": 5894 + }, + { + "epoch": 1.211840888066605, + "grad_norm": 0.199397012591362, + "learning_rate": 6.08343077533678e-05, + "loss": 0.5522, + "step": 5895 + }, + { + "epoch": 1.2120464590399835, + "grad_norm": 0.16631294786930084, + "learning_rate": 6.082495802001527e-05, + "loss": 0.5414, + "step": 5896 + }, + { + "epoch": 1.2122520300133621, + "grad_norm": 0.15855452418327332, + "learning_rate": 6.0815607507041024e-05, + "loss": 0.5403, + "step": 5897 + }, + { + "epoch": 1.2124576009867407, + "grad_norm": 0.196935772895813, + "learning_rate": 6.08062562149057e-05, + "loss": 0.5665, + "step": 5898 + }, + { + "epoch": 1.2126631719601193, + "grad_norm": 0.19539684057235718, + "learning_rate": 6.079690414407004e-05, + "loss": 0.5524, + "step": 5899 + }, + { + "epoch": 1.2128687429334977, + "grad_norm": 0.19079557061195374, + "learning_rate": 6.078755129499475e-05, + "loss": 0.5628, + "step": 5900 + }, + { + "epoch": 1.2130743139068763, + "grad_norm": 0.19366958737373352, + "learning_rate": 6.077819766814058e-05, + "loss": 0.5889, + "step": 5901 + }, + { + "epoch": 1.2132798848802548, + "grad_norm": 0.19458188116550446, + "learning_rate": 6.076884326396837e-05, + "loss": 0.571, + "step": 5902 + }, + { + "epoch": 1.2134854558536334, + "grad_norm": 0.16850589215755463, + "learning_rate": 6.075948808293894e-05, + "loss": 0.5335, + "step": 5903 + }, + { + "epoch": 1.213691026827012, + "grad_norm": 0.16787506639957428, + "learning_rate": 6.075013212551321e-05, + "loss": 0.5353, + "step": 5904 + }, + { + "epoch": 1.2138965978003906, + "grad_norm": 0.1945338398218155, + "learning_rate": 6.074077539215208e-05, + "loss": 0.5491, + "step": 5905 + }, + { + "epoch": 1.2141021687737692, + "grad_norm": 0.19000251591205597, + "learning_rate": 6.0731417883316524e-05, + "loss": 0.5523, + "step": 5906 + }, + { + "epoch": 1.2143077397471478, + "grad_norm": 0.18971100449562073, + "learning_rate": 6.0722059599467525e-05, + "loss": 0.5531, + "step": 5907 + }, + { + "epoch": 1.2145133107205264, + "grad_norm": 0.16435407102108002, + "learning_rate": 6.071270054106613e-05, + "loss": 0.5286, + "step": 5908 + }, + { + "epoch": 1.2147188816939047, + "grad_norm": 0.17342285811901093, + "learning_rate": 6.070334070857343e-05, + "loss": 0.5616, + "step": 5909 + }, + { + "epoch": 1.2149244526672833, + "grad_norm": 0.19488383829593658, + "learning_rate": 6.069398010245053e-05, + "loss": 0.5584, + "step": 5910 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 0.1964189112186432, + "learning_rate": 6.068461872315858e-05, + "loss": 0.5744, + "step": 5911 + }, + { + "epoch": 1.2153355946140405, + "grad_norm": 0.19528479874134064, + "learning_rate": 6.067525657115879e-05, + "loss": 0.557, + "step": 5912 + }, + { + "epoch": 1.215541165587419, + "grad_norm": 0.19183097779750824, + "learning_rate": 6.066589364691237e-05, + "loss": 0.5591, + "step": 5913 + }, + { + "epoch": 1.2157467365607977, + "grad_norm": 0.19744020700454712, + "learning_rate": 6.065652995088058e-05, + "loss": 0.5627, + "step": 5914 + }, + { + "epoch": 1.2159523075341763, + "grad_norm": 0.18547560274600983, + "learning_rate": 6.064716548352475e-05, + "loss": 0.5539, + "step": 5915 + }, + { + "epoch": 1.2161578785075546, + "grad_norm": 0.19087590277194977, + "learning_rate": 6.063780024530621e-05, + "loss": 0.5627, + "step": 5916 + }, + { + "epoch": 1.2163634494809332, + "grad_norm": 0.19286733865737915, + "learning_rate": 6.0628434236686325e-05, + "loss": 0.5523, + "step": 5917 + }, + { + "epoch": 1.2165690204543118, + "grad_norm": 0.1942092925310135, + "learning_rate": 6.061906745812655e-05, + "loss": 0.574, + "step": 5918 + }, + { + "epoch": 1.2167745914276904, + "grad_norm": 0.19682841002941132, + "learning_rate": 6.060969991008832e-05, + "loss": 0.5768, + "step": 5919 + }, + { + "epoch": 1.216980162401069, + "grad_norm": 0.194288969039917, + "learning_rate": 6.060033159303314e-05, + "loss": 0.5704, + "step": 5920 + }, + { + "epoch": 1.2171857333744476, + "grad_norm": 0.20371194183826447, + "learning_rate": 6.059096250742252e-05, + "loss": 0.5677, + "step": 5921 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.20336924493312836, + "learning_rate": 6.058159265371807e-05, + "loss": 0.5228, + "step": 5922 + }, + { + "epoch": 1.2175968753212048, + "grad_norm": 0.1702810525894165, + "learning_rate": 6.0572222032381374e-05, + "loss": 0.5534, + "step": 5923 + }, + { + "epoch": 1.2178024462945831, + "grad_norm": 0.13445743918418884, + "learning_rate": 6.056285064387407e-05, + "loss": 0.5294, + "step": 5924 + }, + { + "epoch": 1.2180080172679617, + "grad_norm": 0.12932245433330536, + "learning_rate": 6.055347848865787e-05, + "loss": 0.5243, + "step": 5925 + }, + { + "epoch": 1.2182135882413403, + "grad_norm": 0.16721323132514954, + "learning_rate": 6.054410556719448e-05, + "loss": 0.5473, + "step": 5926 + }, + { + "epoch": 1.2184191592147189, + "grad_norm": 0.2189573496580124, + "learning_rate": 6.053473187994566e-05, + "loss": 0.566, + "step": 5927 + }, + { + "epoch": 1.2186247301880975, + "grad_norm": 0.19731007516384125, + "learning_rate": 6.052535742737321e-05, + "loss": 0.533, + "step": 5928 + }, + { + "epoch": 1.218830301161476, + "grad_norm": 0.19551746547222137, + "learning_rate": 6.051598220993896e-05, + "loss": 0.5785, + "step": 5929 + }, + { + "epoch": 1.2190358721348546, + "grad_norm": 0.2288779616355896, + "learning_rate": 6.0506606228104784e-05, + "loss": 0.5354, + "step": 5930 + }, + { + "epoch": 1.219241443108233, + "grad_norm": 0.17528457939624786, + "learning_rate": 6.0497229482332605e-05, + "loss": 0.5383, + "step": 5931 + }, + { + "epoch": 1.2194470140816116, + "grad_norm": 0.17240411043167114, + "learning_rate": 6.0487851973084365e-05, + "loss": 0.5693, + "step": 5932 + }, + { + "epoch": 1.2196525850549902, + "grad_norm": 0.199370875954628, + "learning_rate": 6.047847370082204e-05, + "loss": 0.548, + "step": 5933 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 0.20105613768100739, + "learning_rate": 6.046909466600768e-05, + "loss": 0.5604, + "step": 5934 + }, + { + "epoch": 1.2200637270017474, + "grad_norm": 0.16920122504234314, + "learning_rate": 6.0459714869103304e-05, + "loss": 0.5377, + "step": 5935 + }, + { + "epoch": 1.220269297975126, + "grad_norm": 0.17022979259490967, + "learning_rate": 6.0450334310571046e-05, + "loss": 0.556, + "step": 5936 + }, + { + "epoch": 1.2204748689485045, + "grad_norm": 0.22041717171669006, + "learning_rate": 6.044095299087304e-05, + "loss": 0.5874, + "step": 5937 + }, + { + "epoch": 1.2206804399218831, + "grad_norm": 0.20872265100479126, + "learning_rate": 6.0431570910471436e-05, + "loss": 0.5687, + "step": 5938 + }, + { + "epoch": 1.2208860108952615, + "grad_norm": 0.18911628425121307, + "learning_rate": 6.042218806982847e-05, + "loss": 0.5712, + "step": 5939 + }, + { + "epoch": 1.22109158186864, + "grad_norm": 0.19167855381965637, + "learning_rate": 6.0412804469406384e-05, + "loss": 0.5601, + "step": 5940 + }, + { + "epoch": 1.2212971528420187, + "grad_norm": 0.19254928827285767, + "learning_rate": 6.040342010966745e-05, + "loss": 0.5746, + "step": 5941 + }, + { + "epoch": 1.2215027238153973, + "grad_norm": 0.19120313227176666, + "learning_rate": 6.0394034991073994e-05, + "loss": 0.5502, + "step": 5942 + }, + { + "epoch": 1.2217082947887759, + "grad_norm": 0.1880388706922531, + "learning_rate": 6.038464911408841e-05, + "loss": 0.5629, + "step": 5943 + }, + { + "epoch": 1.2219138657621544, + "grad_norm": 0.19094626605510712, + "learning_rate": 6.0375262479173064e-05, + "loss": 0.5742, + "step": 5944 + }, + { + "epoch": 1.222119436735533, + "grad_norm": 0.19934087991714478, + "learning_rate": 6.0365875086790386e-05, + "loss": 0.6047, + "step": 5945 + }, + { + "epoch": 1.2223250077089114, + "grad_norm": 0.16785962879657745, + "learning_rate": 6.035648693740287e-05, + "loss": 0.5404, + "step": 5946 + }, + { + "epoch": 1.22253057868229, + "grad_norm": 0.160533607006073, + "learning_rate": 6.0347098031473025e-05, + "loss": 0.5391, + "step": 5947 + }, + { + "epoch": 1.2227361496556686, + "grad_norm": 0.201270192861557, + "learning_rate": 6.033770836946339e-05, + "loss": 0.5811, + "step": 5948 + }, + { + "epoch": 1.2229417206290472, + "grad_norm": 0.1920137256383896, + "learning_rate": 6.0328317951836554e-05, + "loss": 0.5595, + "step": 5949 + }, + { + "epoch": 1.2231472916024257, + "grad_norm": 0.19600927829742432, + "learning_rate": 6.031892677905513e-05, + "loss": 0.5679, + "step": 5950 + }, + { + "epoch": 1.2233528625758043, + "grad_norm": 0.19393356144428253, + "learning_rate": 6.030953485158178e-05, + "loss": 0.5586, + "step": 5951 + }, + { + "epoch": 1.223558433549183, + "grad_norm": 0.19558121263980865, + "learning_rate": 6.030014216987922e-05, + "loss": 0.5584, + "step": 5952 + }, + { + "epoch": 1.2237640045225615, + "grad_norm": 0.1591499000787735, + "learning_rate": 6.029074873441015e-05, + "loss": 0.512, + "step": 5953 + }, + { + "epoch": 1.2239695754959399, + "grad_norm": 0.1601012945175171, + "learning_rate": 6.028135454563737e-05, + "loss": 0.5482, + "step": 5954 + }, + { + "epoch": 1.2241751464693185, + "grad_norm": 0.1917879432439804, + "learning_rate": 6.027195960402367e-05, + "loss": 0.5619, + "step": 5955 + }, + { + "epoch": 1.224380717442697, + "grad_norm": 0.16363351047039032, + "learning_rate": 6.026256391003192e-05, + "loss": 0.5272, + "step": 5956 + }, + { + "epoch": 1.2245862884160756, + "grad_norm": 0.1613667905330658, + "learning_rate": 6.0253167464124965e-05, + "loss": 0.5448, + "step": 5957 + }, + { + "epoch": 1.2247918593894542, + "grad_norm": 0.19327108561992645, + "learning_rate": 6.0243770266765754e-05, + "loss": 0.5631, + "step": 5958 + }, + { + "epoch": 1.2249974303628328, + "grad_norm": 0.20113897323608398, + "learning_rate": 6.023437231841721e-05, + "loss": 0.5433, + "step": 5959 + }, + { + "epoch": 1.2252030013362114, + "grad_norm": 0.19953328371047974, + "learning_rate": 6.022497361954237e-05, + "loss": 0.5555, + "step": 5960 + }, + { + "epoch": 1.2254085723095898, + "grad_norm": 0.16104325652122498, + "learning_rate": 6.021557417060423e-05, + "loss": 0.5269, + "step": 5961 + }, + { + "epoch": 1.2256141432829684, + "grad_norm": 0.16105084121227264, + "learning_rate": 6.0206173972065865e-05, + "loss": 0.5649, + "step": 5962 + }, + { + "epoch": 1.225819714256347, + "grad_norm": 0.1889335662126541, + "learning_rate": 6.0196773024390374e-05, + "loss": 0.5536, + "step": 5963 + }, + { + "epoch": 1.2260252852297255, + "grad_norm": 0.19481204450130463, + "learning_rate": 6.018737132804093e-05, + "loss": 0.5673, + "step": 5964 + }, + { + "epoch": 1.2262308562031041, + "grad_norm": 0.16492706537246704, + "learning_rate": 6.017796888348068e-05, + "loss": 0.548, + "step": 5965 + }, + { + "epoch": 1.2264364271764827, + "grad_norm": 0.1624189019203186, + "learning_rate": 6.016856569117283e-05, + "loss": 0.5659, + "step": 5966 + }, + { + "epoch": 1.2266419981498613, + "grad_norm": 0.19174005091190338, + "learning_rate": 6.015916175158066e-05, + "loss": 0.5483, + "step": 5967 + }, + { + "epoch": 1.22684756912324, + "grad_norm": 0.19172148406505585, + "learning_rate": 6.014975706516744e-05, + "loss": 0.5629, + "step": 5968 + }, + { + "epoch": 1.2270531400966185, + "grad_norm": 0.20126576721668243, + "learning_rate": 6.014035163239649e-05, + "loss": 0.5609, + "step": 5969 + }, + { + "epoch": 1.2272587110699968, + "grad_norm": 0.19356362521648407, + "learning_rate": 6.0130945453731196e-05, + "loss": 0.557, + "step": 5970 + }, + { + "epoch": 1.2274642820433754, + "grad_norm": 0.19379346072673798, + "learning_rate": 6.012153852963494e-05, + "loss": 0.5644, + "step": 5971 + }, + { + "epoch": 1.227669853016754, + "grad_norm": 0.18843898177146912, + "learning_rate": 6.011213086057114e-05, + "loss": 0.5655, + "step": 5972 + }, + { + "epoch": 1.2278754239901326, + "grad_norm": 0.1895827353000641, + "learning_rate": 6.010272244700331e-05, + "loss": 0.5324, + "step": 5973 + }, + { + "epoch": 1.2280809949635112, + "grad_norm": 0.19657573103904724, + "learning_rate": 6.009331328939492e-05, + "loss": 0.5604, + "step": 5974 + }, + { + "epoch": 1.2282865659368898, + "grad_norm": 0.1885729730129242, + "learning_rate": 6.0083903388209536e-05, + "loss": 0.5601, + "step": 5975 + }, + { + "epoch": 1.2284921369102682, + "grad_norm": 0.16260753571987152, + "learning_rate": 6.007449274391073e-05, + "loss": 0.5245, + "step": 5976 + }, + { + "epoch": 1.2286977078836467, + "grad_norm": 0.13464370369911194, + "learning_rate": 6.0065081356962124e-05, + "loss": 0.5164, + "step": 5977 + }, + { + "epoch": 1.2289032788570253, + "grad_norm": 0.17227724194526672, + "learning_rate": 6.0055669227827384e-05, + "loss": 0.5848, + "step": 5978 + }, + { + "epoch": 1.229108849830404, + "grad_norm": 0.19165630638599396, + "learning_rate": 6.0046256356970185e-05, + "loss": 0.5713, + "step": 5979 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 0.191480353474617, + "learning_rate": 6.003684274485426e-05, + "loss": 0.5564, + "step": 5980 + }, + { + "epoch": 1.229519991777161, + "grad_norm": 0.19356124103069305, + "learning_rate": 6.002742839194338e-05, + "loss": 0.5711, + "step": 5981 + }, + { + "epoch": 1.2297255627505397, + "grad_norm": 0.18836161494255066, + "learning_rate": 6.001801329870134e-05, + "loss": 0.551, + "step": 5982 + }, + { + "epoch": 1.2299311337239183, + "grad_norm": 0.18804924190044403, + "learning_rate": 6.0008597465591966e-05, + "loss": 0.5641, + "step": 5983 + }, + { + "epoch": 1.2301367046972969, + "grad_norm": 0.20674586296081543, + "learning_rate": 5.999918089307915e-05, + "loss": 0.5664, + "step": 5984 + }, + { + "epoch": 1.2303422756706752, + "grad_norm": 0.1936078518629074, + "learning_rate": 5.9989763581626806e-05, + "loss": 0.552, + "step": 5985 + }, + { + "epoch": 1.2305478466440538, + "grad_norm": 0.19843873381614685, + "learning_rate": 5.998034553169886e-05, + "loss": 0.562, + "step": 5986 + }, + { + "epoch": 1.2307534176174324, + "grad_norm": 0.18645739555358887, + "learning_rate": 5.997092674375932e-05, + "loss": 0.5424, + "step": 5987 + }, + { + "epoch": 1.230958988590811, + "grad_norm": 0.18855836987495422, + "learning_rate": 5.9961507218272196e-05, + "loss": 0.5706, + "step": 5988 + }, + { + "epoch": 1.2311645595641896, + "grad_norm": 0.18944047391414642, + "learning_rate": 5.9952086955701535e-05, + "loss": 0.5564, + "step": 5989 + }, + { + "epoch": 1.2313701305375682, + "grad_norm": 0.1880870759487152, + "learning_rate": 5.994266595651143e-05, + "loss": 0.5662, + "step": 5990 + }, + { + "epoch": 1.2315757015109465, + "grad_norm": 0.19140774011611938, + "learning_rate": 5.993324422116602e-05, + "loss": 0.5469, + "step": 5991 + }, + { + "epoch": 1.2317812724843251, + "grad_norm": 0.1923801451921463, + "learning_rate": 5.9923821750129466e-05, + "loss": 0.5715, + "step": 5992 + }, + { + "epoch": 1.2319868434577037, + "grad_norm": 0.18575525283813477, + "learning_rate": 5.991439854386597e-05, + "loss": 0.5325, + "step": 5993 + }, + { + "epoch": 1.2321924144310823, + "grad_norm": 0.19030645489692688, + "learning_rate": 5.9904974602839764e-05, + "loss": 0.5366, + "step": 5994 + }, + { + "epoch": 1.2323979854044609, + "grad_norm": 0.19156965613365173, + "learning_rate": 5.9895549927515114e-05, + "loss": 0.5741, + "step": 5995 + }, + { + "epoch": 1.2326035563778395, + "grad_norm": 0.1905066967010498, + "learning_rate": 5.988612451835636e-05, + "loss": 0.5452, + "step": 5996 + }, + { + "epoch": 1.232809127351218, + "grad_norm": 0.18837079405784607, + "learning_rate": 5.987669837582782e-05, + "loss": 0.5644, + "step": 5997 + }, + { + "epoch": 1.2330146983245966, + "grad_norm": 0.1969577670097351, + "learning_rate": 5.9867271500393884e-05, + "loss": 0.5653, + "step": 5998 + }, + { + "epoch": 1.2332202692979752, + "grad_norm": 0.1714939922094345, + "learning_rate": 5.9857843892518975e-05, + "loss": 0.5255, + "step": 5999 + }, + { + "epoch": 1.2334258402713536, + "grad_norm": 0.16838547587394714, + "learning_rate": 5.984841555266753e-05, + "loss": 0.5574, + "step": 6000 + }, + { + "epoch": 1.2336314112447322, + "grad_norm": 0.18724249303340912, + "learning_rate": 5.983898648130407e-05, + "loss": 0.5286, + "step": 6001 + }, + { + "epoch": 1.2338369822181108, + "grad_norm": 0.1969245970249176, + "learning_rate": 5.98295566788931e-05, + "loss": 0.5673, + "step": 6002 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 0.1898987591266632, + "learning_rate": 5.982012614589917e-05, + "loss": 0.5545, + "step": 6003 + }, + { + "epoch": 1.234248124164868, + "grad_norm": 0.1573200672864914, + "learning_rate": 5.9810694882786916e-05, + "loss": 0.5205, + "step": 6004 + }, + { + "epoch": 1.2344536951382465, + "grad_norm": 0.1741228699684143, + "learning_rate": 5.9801262890020935e-05, + "loss": 0.567, + "step": 6005 + }, + { + "epoch": 1.2346592661116251, + "grad_norm": 0.19393646717071533, + "learning_rate": 5.9791830168065914e-05, + "loss": 0.5476, + "step": 6006 + }, + { + "epoch": 1.2348648370850035, + "grad_norm": 0.19462937116622925, + "learning_rate": 5.978239671738655e-05, + "loss": 0.5361, + "step": 6007 + }, + { + "epoch": 1.235070408058382, + "grad_norm": 0.18887047469615936, + "learning_rate": 5.9772962538447604e-05, + "loss": 0.5682, + "step": 6008 + }, + { + "epoch": 1.2352759790317607, + "grad_norm": 0.19533561170101166, + "learning_rate": 5.976352763171385e-05, + "loss": 0.5776, + "step": 6009 + }, + { + "epoch": 1.2354815500051393, + "grad_norm": 0.2016497403383255, + "learning_rate": 5.975409199765008e-05, + "loss": 0.5768, + "step": 6010 + }, + { + "epoch": 1.2356871209785179, + "grad_norm": 0.19525597989559174, + "learning_rate": 5.9744655636721166e-05, + "loss": 0.5774, + "step": 6011 + }, + { + "epoch": 1.2358926919518964, + "grad_norm": 0.19392353296279907, + "learning_rate": 5.973521854939198e-05, + "loss": 0.5451, + "step": 6012 + }, + { + "epoch": 1.236098262925275, + "grad_norm": 0.1947338730096817, + "learning_rate": 5.9725780736127456e-05, + "loss": 0.5697, + "step": 6013 + }, + { + "epoch": 1.2363038338986536, + "grad_norm": 0.20187315344810486, + "learning_rate": 5.971634219739253e-05, + "loss": 0.5441, + "step": 6014 + }, + { + "epoch": 1.236509404872032, + "grad_norm": 0.1915546953678131, + "learning_rate": 5.970690293365222e-05, + "loss": 0.5692, + "step": 6015 + }, + { + "epoch": 1.2367149758454106, + "grad_norm": 0.18739596009254456, + "learning_rate": 5.969746294537153e-05, + "loss": 0.5582, + "step": 6016 + }, + { + "epoch": 1.2369205468187892, + "grad_norm": 0.18742164969444275, + "learning_rate": 5.968802223301554e-05, + "loss": 0.5538, + "step": 6017 + }, + { + "epoch": 1.2371261177921677, + "grad_norm": 0.18883053958415985, + "learning_rate": 5.967858079704935e-05, + "loss": 0.5569, + "step": 6018 + }, + { + "epoch": 1.2373316887655463, + "grad_norm": 0.1861804723739624, + "learning_rate": 5.966913863793809e-05, + "loss": 0.5506, + "step": 6019 + }, + { + "epoch": 1.237537259738925, + "grad_norm": 0.1672678142786026, + "learning_rate": 5.965969575614694e-05, + "loss": 0.5207, + "step": 6020 + }, + { + "epoch": 1.2377428307123035, + "grad_norm": 0.1628050059080124, + "learning_rate": 5.965025215214109e-05, + "loss": 0.564, + "step": 6021 + }, + { + "epoch": 1.2379484016856819, + "grad_norm": 0.16974832117557526, + "learning_rate": 5.964080782638579e-05, + "loss": 0.5396, + "step": 6022 + }, + { + "epoch": 1.2381539726590605, + "grad_norm": 0.1564965546131134, + "learning_rate": 5.963136277934634e-05, + "loss": 0.5456, + "step": 6023 + }, + { + "epoch": 1.238359543632439, + "grad_norm": 0.19115638732910156, + "learning_rate": 5.962191701148801e-05, + "loss": 0.5821, + "step": 6024 + }, + { + "epoch": 1.2385651146058176, + "grad_norm": 0.1846878081560135, + "learning_rate": 5.9612470523276176e-05, + "loss": 0.5708, + "step": 6025 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 0.1887466162443161, + "learning_rate": 5.9603023315176224e-05, + "loss": 0.5633, + "step": 6026 + }, + { + "epoch": 1.2389762565525748, + "grad_norm": 0.1877734214067459, + "learning_rate": 5.959357538765356e-05, + "loss": 0.5343, + "step": 6027 + }, + { + "epoch": 1.2391818275259534, + "grad_norm": 0.1928664743900299, + "learning_rate": 5.958412674117365e-05, + "loss": 0.553, + "step": 6028 + }, + { + "epoch": 1.239387398499332, + "grad_norm": 0.19139814376831055, + "learning_rate": 5.957467737620199e-05, + "loss": 0.5586, + "step": 6029 + }, + { + "epoch": 1.2395929694727104, + "grad_norm": 0.18959654867649078, + "learning_rate": 5.9565227293204084e-05, + "loss": 0.5756, + "step": 6030 + }, + { + "epoch": 1.239798540446089, + "grad_norm": 0.17210416495800018, + "learning_rate": 5.9555776492645513e-05, + "loss": 0.5649, + "step": 6031 + }, + { + "epoch": 1.2400041114194675, + "grad_norm": 0.160491481423378, + "learning_rate": 5.954632497499187e-05, + "loss": 0.5464, + "step": 6032 + }, + { + "epoch": 1.2402096823928461, + "grad_norm": 0.19676798582077026, + "learning_rate": 5.9536872740708777e-05, + "loss": 0.5877, + "step": 6033 + }, + { + "epoch": 1.2404152533662247, + "grad_norm": 0.20140545070171356, + "learning_rate": 5.952741979026192e-05, + "loss": 0.5762, + "step": 6034 + }, + { + "epoch": 1.2406208243396033, + "grad_norm": 0.19546420872211456, + "learning_rate": 5.951796612411698e-05, + "loss": 0.5576, + "step": 6035 + }, + { + "epoch": 1.240826395312982, + "grad_norm": 0.16486842930316925, + "learning_rate": 5.9508511742739716e-05, + "loss": 0.5115, + "step": 6036 + }, + { + "epoch": 1.2410319662863603, + "grad_norm": 0.13164182007312775, + "learning_rate": 5.94990566465959e-05, + "loss": 0.5294, + "step": 6037 + }, + { + "epoch": 1.2412375372597388, + "grad_norm": 0.15759903192520142, + "learning_rate": 5.9489600836151305e-05, + "loss": 0.5432, + "step": 6038 + }, + { + "epoch": 1.2414431082331174, + "grad_norm": 0.2032260000705719, + "learning_rate": 5.948014431187181e-05, + "loss": 0.5613, + "step": 6039 + }, + { + "epoch": 1.241648679206496, + "grad_norm": 0.19559217989444733, + "learning_rate": 5.947068707422329e-05, + "loss": 0.5402, + "step": 6040 + }, + { + "epoch": 1.2418542501798746, + "grad_norm": 0.19073714315891266, + "learning_rate": 5.9461229123671654e-05, + "loss": 0.534, + "step": 6041 + }, + { + "epoch": 1.2420598211532532, + "grad_norm": 0.1976533830165863, + "learning_rate": 5.9451770460682846e-05, + "loss": 0.5591, + "step": 6042 + }, + { + "epoch": 1.2422653921266318, + "grad_norm": 0.2046486884355545, + "learning_rate": 5.944231108572287e-05, + "loss": 0.5668, + "step": 6043 + }, + { + "epoch": 1.2424709631000104, + "grad_norm": 0.19867998361587524, + "learning_rate": 5.9432850999257705e-05, + "loss": 0.5453, + "step": 6044 + }, + { + "epoch": 1.242676534073389, + "grad_norm": 0.18936549127101898, + "learning_rate": 5.9423390201753446e-05, + "loss": 0.5649, + "step": 6045 + }, + { + "epoch": 1.2428821050467673, + "grad_norm": 0.19626031816005707, + "learning_rate": 5.941392869367616e-05, + "loss": 0.5673, + "step": 6046 + }, + { + "epoch": 1.243087676020146, + "grad_norm": 0.19594736397266388, + "learning_rate": 5.9404466475492e-05, + "loss": 0.5673, + "step": 6047 + }, + { + "epoch": 1.2432932469935245, + "grad_norm": 0.19246500730514526, + "learning_rate": 5.939500354766707e-05, + "loss": 0.5708, + "step": 6048 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 0.18370835483074188, + "learning_rate": 5.9385539910667615e-05, + "loss": 0.5339, + "step": 6049 + }, + { + "epoch": 1.2437043889402817, + "grad_norm": 0.1910664439201355, + "learning_rate": 5.9376075564959836e-05, + "loss": 0.5801, + "step": 6050 + }, + { + "epoch": 1.2439099599136603, + "grad_norm": 0.19655410945415497, + "learning_rate": 5.936661051101002e-05, + "loss": 0.5389, + "step": 6051 + }, + { + "epoch": 1.2441155308870386, + "grad_norm": 0.23548901081085205, + "learning_rate": 5.9357144749284446e-05, + "loss": 0.5509, + "step": 6052 + }, + { + "epoch": 1.2443211018604172, + "grad_norm": 0.1724226176738739, + "learning_rate": 5.934767828024946e-05, + "loss": 0.5405, + "step": 6053 + }, + { + "epoch": 1.2445266728337958, + "grad_norm": 0.16652943193912506, + "learning_rate": 5.9338211104371424e-05, + "loss": 0.5401, + "step": 6054 + }, + { + "epoch": 1.2447322438071744, + "grad_norm": 0.20364424586296082, + "learning_rate": 5.932874322211674e-05, + "loss": 0.5624, + "step": 6055 + }, + { + "epoch": 1.244937814780553, + "grad_norm": 0.1893276572227478, + "learning_rate": 5.931927463395186e-05, + "loss": 0.541, + "step": 6056 + }, + { + "epoch": 1.2451433857539316, + "grad_norm": 0.1932743936777115, + "learning_rate": 5.930980534034323e-05, + "loss": 0.5789, + "step": 6057 + }, + { + "epoch": 1.2453489567273102, + "grad_norm": 0.192164346575737, + "learning_rate": 5.930033534175739e-05, + "loss": 0.5711, + "step": 6058 + }, + { + "epoch": 1.2455545277006888, + "grad_norm": 0.18755845725536346, + "learning_rate": 5.9290864638660864e-05, + "loss": 0.5503, + "step": 6059 + }, + { + "epoch": 1.2457600986740673, + "grad_norm": 0.19044922292232513, + "learning_rate": 5.928139323152022e-05, + "loss": 0.5441, + "step": 6060 + }, + { + "epoch": 1.2459656696474457, + "grad_norm": 0.16590002179145813, + "learning_rate": 5.9271921120802106e-05, + "loss": 0.5255, + "step": 6061 + }, + { + "epoch": 1.2461712406208243, + "grad_norm": 0.16867230832576752, + "learning_rate": 5.926244830697312e-05, + "loss": 0.5825, + "step": 6062 + }, + { + "epoch": 1.2463768115942029, + "grad_norm": 0.20571991801261902, + "learning_rate": 5.925297479049999e-05, + "loss": 0.552, + "step": 6063 + }, + { + "epoch": 1.2465823825675815, + "grad_norm": 0.20340660214424133, + "learning_rate": 5.92435005718494e-05, + "loss": 0.5572, + "step": 6064 + }, + { + "epoch": 1.24678795354096, + "grad_norm": 0.19198235869407654, + "learning_rate": 5.923402565148811e-05, + "loss": 0.5569, + "step": 6065 + }, + { + "epoch": 1.2469935245143386, + "grad_norm": 0.1904488056898117, + "learning_rate": 5.92245500298829e-05, + "loss": 0.5641, + "step": 6066 + }, + { + "epoch": 1.247199095487717, + "grad_norm": 0.1928306370973587, + "learning_rate": 5.921507370750061e-05, + "loss": 0.5613, + "step": 6067 + }, + { + "epoch": 1.2474046664610956, + "grad_norm": 0.18856725096702576, + "learning_rate": 5.920559668480808e-05, + "loss": 0.5478, + "step": 6068 + }, + { + "epoch": 1.2476102374344742, + "grad_norm": 0.19025270640850067, + "learning_rate": 5.919611896227218e-05, + "loss": 0.553, + "step": 6069 + }, + { + "epoch": 1.2478158084078528, + "grad_norm": 0.18751074373722076, + "learning_rate": 5.918664054035987e-05, + "loss": 0.5571, + "step": 6070 + }, + { + "epoch": 1.2480213793812314, + "grad_norm": 0.18929120898246765, + "learning_rate": 5.917716141953807e-05, + "loss": 0.5674, + "step": 6071 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 0.19729354977607727, + "learning_rate": 5.916768160027381e-05, + "loss": 0.5493, + "step": 6072 + }, + { + "epoch": 1.2484325213279885, + "grad_norm": 0.1939440220594406, + "learning_rate": 5.9158201083034086e-05, + "loss": 0.5617, + "step": 6073 + }, + { + "epoch": 1.2486380923013671, + "grad_norm": 0.19020439684391022, + "learning_rate": 5.914871986828596e-05, + "loss": 0.551, + "step": 6074 + }, + { + "epoch": 1.2488436632747457, + "grad_norm": 0.19423425197601318, + "learning_rate": 5.913923795649656e-05, + "loss": 0.5513, + "step": 6075 + }, + { + "epoch": 1.249049234248124, + "grad_norm": 0.1902787834405899, + "learning_rate": 5.912975534813298e-05, + "loss": 0.5467, + "step": 6076 + }, + { + "epoch": 1.2492548052215027, + "grad_norm": 0.16620683670043945, + "learning_rate": 5.91202720436624e-05, + "loss": 0.5262, + "step": 6077 + }, + { + "epoch": 1.2494603761948813, + "grad_norm": 0.15968933701515198, + "learning_rate": 5.911078804355202e-05, + "loss": 0.5616, + "step": 6078 + }, + { + "epoch": 1.2496659471682598, + "grad_norm": 0.19238422811031342, + "learning_rate": 5.910130334826906e-05, + "loss": 0.5515, + "step": 6079 + }, + { + "epoch": 1.2498715181416384, + "grad_norm": 0.19091928005218506, + "learning_rate": 5.9091817958280786e-05, + "loss": 0.5648, + "step": 6080 + }, + { + "epoch": 1.250077089115017, + "grad_norm": 0.19049179553985596, + "learning_rate": 5.908233187405452e-05, + "loss": 0.55, + "step": 6081 + }, + { + "epoch": 1.2502826600883954, + "grad_norm": 0.19400426745414734, + "learning_rate": 5.907284509605757e-05, + "loss": 0.5554, + "step": 6082 + }, + { + "epoch": 1.250488231061774, + "grad_norm": 0.19264687597751617, + "learning_rate": 5.9063357624757316e-05, + "loss": 0.5693, + "step": 6083 + }, + { + "epoch": 1.2506938020351526, + "grad_norm": 0.1882631927728653, + "learning_rate": 5.905386946062118e-05, + "loss": 0.5509, + "step": 6084 + }, + { + "epoch": 1.2508993730085312, + "grad_norm": 0.1930553913116455, + "learning_rate": 5.9044380604116575e-05, + "loss": 0.5667, + "step": 6085 + }, + { + "epoch": 1.2511049439819097, + "grad_norm": 0.19695702195167542, + "learning_rate": 5.9034891055710985e-05, + "loss": 0.5592, + "step": 6086 + }, + { + "epoch": 1.2513105149552883, + "grad_norm": 0.19834263622760773, + "learning_rate": 5.90254008158719e-05, + "loss": 0.5621, + "step": 6087 + }, + { + "epoch": 1.251516085928667, + "grad_norm": 0.19930176436901093, + "learning_rate": 5.9015909885066885e-05, + "loss": 0.5845, + "step": 6088 + }, + { + "epoch": 1.2517216569020455, + "grad_norm": 0.1929783821105957, + "learning_rate": 5.90064182637635e-05, + "loss": 0.5658, + "step": 6089 + }, + { + "epoch": 1.251927227875424, + "grad_norm": 0.2053227424621582, + "learning_rate": 5.899692595242934e-05, + "loss": 0.559, + "step": 6090 + }, + { + "epoch": 1.2521327988488027, + "grad_norm": 0.1878289431333542, + "learning_rate": 5.898743295153208e-05, + "loss": 0.5331, + "step": 6091 + }, + { + "epoch": 1.252338369822181, + "grad_norm": 0.1905200332403183, + "learning_rate": 5.897793926153935e-05, + "loss": 0.5687, + "step": 6092 + }, + { + "epoch": 1.2525439407955596, + "grad_norm": 0.16592474281787872, + "learning_rate": 5.89684448829189e-05, + "loss": 0.509, + "step": 6093 + }, + { + "epoch": 1.2527495117689382, + "grad_norm": 0.15698356926441193, + "learning_rate": 5.895894981613845e-05, + "loss": 0.558, + "step": 6094 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 0.19929586350917816, + "learning_rate": 5.89494540616658e-05, + "loss": 0.5595, + "step": 6095 + }, + { + "epoch": 1.2531606537156954, + "grad_norm": 0.19312036037445068, + "learning_rate": 5.893995761996875e-05, + "loss": 0.5577, + "step": 6096 + }, + { + "epoch": 1.2533662246890738, + "grad_norm": 0.19632984697818756, + "learning_rate": 5.8930460491515125e-05, + "loss": 0.5715, + "step": 6097 + }, + { + "epoch": 1.2535717956624524, + "grad_norm": 0.1999562531709671, + "learning_rate": 5.8920962676772836e-05, + "loss": 0.5578, + "step": 6098 + }, + { + "epoch": 1.253777366635831, + "grad_norm": 0.1987222284078598, + "learning_rate": 5.891146417620978e-05, + "loss": 0.5777, + "step": 6099 + }, + { + "epoch": 1.2539829376092095, + "grad_norm": 0.17240692675113678, + "learning_rate": 5.8901964990293894e-05, + "loss": 0.546, + "step": 6100 + }, + { + "epoch": 1.2541885085825881, + "grad_norm": 0.1715145856142044, + "learning_rate": 5.8892465119493184e-05, + "loss": 0.5658, + "step": 6101 + }, + { + "epoch": 1.2543940795559667, + "grad_norm": 0.18989497423171997, + "learning_rate": 5.888296456427565e-05, + "loss": 0.5718, + "step": 6102 + }, + { + "epoch": 1.2545996505293453, + "grad_norm": 0.1893077790737152, + "learning_rate": 5.887346332510934e-05, + "loss": 0.572, + "step": 6103 + }, + { + "epoch": 1.2548052215027239, + "grad_norm": 0.16260646283626556, + "learning_rate": 5.886396140246233e-05, + "loss": 0.5399, + "step": 6104 + }, + { + "epoch": 1.2550107924761025, + "grad_norm": 1.3922818899154663, + "learning_rate": 5.8854458796802744e-05, + "loss": 0.5587, + "step": 6105 + }, + { + "epoch": 1.255216363449481, + "grad_norm": 0.16991350054740906, + "learning_rate": 5.8844955508598745e-05, + "loss": 0.5286, + "step": 6106 + }, + { + "epoch": 1.2554219344228594, + "grad_norm": 0.21412529051303864, + "learning_rate": 5.8835451538318476e-05, + "loss": 0.5637, + "step": 6107 + }, + { + "epoch": 1.255627505396238, + "grad_norm": 0.40624189376831055, + "learning_rate": 5.882594688643019e-05, + "loss": 0.5364, + "step": 6108 + }, + { + "epoch": 1.2558330763696166, + "grad_norm": 0.2089642882347107, + "learning_rate": 5.881644155340213e-05, + "loss": 0.5669, + "step": 6109 + }, + { + "epoch": 1.2560386473429952, + "grad_norm": 0.4316593110561371, + "learning_rate": 5.880693553970256e-05, + "loss": 0.564, + "step": 6110 + }, + { + "epoch": 1.2562442183163738, + "grad_norm": 0.21521629393100739, + "learning_rate": 5.879742884579981e-05, + "loss": 0.5774, + "step": 6111 + }, + { + "epoch": 1.2564497892897522, + "grad_norm": 0.2025582194328308, + "learning_rate": 5.878792147216223e-05, + "loss": 0.5487, + "step": 6112 + }, + { + "epoch": 1.2566553602631307, + "grad_norm": 0.21197755634784698, + "learning_rate": 5.8778413419258204e-05, + "loss": 0.5674, + "step": 6113 + }, + { + "epoch": 1.2568609312365093, + "grad_norm": 0.21161524951457977, + "learning_rate": 5.876890468755614e-05, + "loss": 0.5915, + "step": 6114 + }, + { + "epoch": 1.257066502209888, + "grad_norm": 0.20301292836666107, + "learning_rate": 5.875939527752451e-05, + "loss": 0.5569, + "step": 6115 + }, + { + "epoch": 1.2572720731832665, + "grad_norm": 0.20232078433036804, + "learning_rate": 5.874988518963178e-05, + "loss": 0.5686, + "step": 6116 + }, + { + "epoch": 1.257477644156645, + "grad_norm": 0.19668982923030853, + "learning_rate": 5.8740374424346484e-05, + "loss": 0.5472, + "step": 6117 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 0.19299955666065216, + "learning_rate": 5.8730862982137155e-05, + "loss": 0.554, + "step": 6118 + }, + { + "epoch": 1.2578887861034023, + "grad_norm": 0.16891315579414368, + "learning_rate": 5.872135086347238e-05, + "loss": 0.549, + "step": 6119 + }, + { + "epoch": 1.2580943570767809, + "grad_norm": 0.19991520047187805, + "learning_rate": 5.87118380688208e-05, + "loss": 0.5791, + "step": 6120 + }, + { + "epoch": 1.2582999280501594, + "grad_norm": 0.19644920527935028, + "learning_rate": 5.870232459865102e-05, + "loss": 0.5416, + "step": 6121 + }, + { + "epoch": 1.2585054990235378, + "grad_norm": 0.19781053066253662, + "learning_rate": 5.869281045343177e-05, + "loss": 0.5701, + "step": 6122 + }, + { + "epoch": 1.2587110699969164, + "grad_norm": 0.1692863404750824, + "learning_rate": 5.868329563363175e-05, + "loss": 0.5307, + "step": 6123 + }, + { + "epoch": 1.258916640970295, + "grad_norm": 0.16794486343860626, + "learning_rate": 5.8673780139719697e-05, + "loss": 0.572, + "step": 6124 + }, + { + "epoch": 1.2591222119436736, + "grad_norm": 0.16393691301345825, + "learning_rate": 5.866426397216442e-05, + "loss": 0.5017, + "step": 6125 + }, + { + "epoch": 1.2593277829170522, + "grad_norm": 0.20335790514945984, + "learning_rate": 5.8654747131434714e-05, + "loss": 0.5663, + "step": 6126 + }, + { + "epoch": 1.2595333538904308, + "grad_norm": 0.20092669129371643, + "learning_rate": 5.864522961799944e-05, + "loss": 0.5714, + "step": 6127 + }, + { + "epoch": 1.2597389248638091, + "grad_norm": 0.16403307020664215, + "learning_rate": 5.863571143232748e-05, + "loss": 0.5319, + "step": 6128 + }, + { + "epoch": 1.2599444958371877, + "grad_norm": 0.1622430980205536, + "learning_rate": 5.8626192574887756e-05, + "loss": 0.5429, + "step": 6129 + }, + { + "epoch": 1.2601500668105663, + "grad_norm": 0.19496072828769684, + "learning_rate": 5.861667304614922e-05, + "loss": 0.5497, + "step": 6130 + }, + { + "epoch": 1.2603556377839449, + "grad_norm": 0.18575909733772278, + "learning_rate": 5.860715284658084e-05, + "loss": 0.5494, + "step": 6131 + }, + { + "epoch": 1.2605612087573235, + "grad_norm": 0.19597534835338593, + "learning_rate": 5.8597631976651635e-05, + "loss": 0.5602, + "step": 6132 + }, + { + "epoch": 1.260766779730702, + "grad_norm": 0.1906193345785141, + "learning_rate": 5.858811043683066e-05, + "loss": 0.5495, + "step": 6133 + }, + { + "epoch": 1.2609723507040806, + "grad_norm": 0.16364972293376923, + "learning_rate": 5.8578588227586995e-05, + "loss": 0.5283, + "step": 6134 + }, + { + "epoch": 1.2611779216774592, + "grad_norm": 0.15908394753932953, + "learning_rate": 5.8569065349389746e-05, + "loss": 0.5484, + "step": 6135 + }, + { + "epoch": 1.2613834926508378, + "grad_norm": 0.18748100101947784, + "learning_rate": 5.855954180270808e-05, + "loss": 0.5653, + "step": 6136 + }, + { + "epoch": 1.2615890636242162, + "grad_norm": 0.19369830191135406, + "learning_rate": 5.855001758801116e-05, + "loss": 0.5627, + "step": 6137 + }, + { + "epoch": 1.2617946345975948, + "grad_norm": 0.19096927344799042, + "learning_rate": 5.8540492705768205e-05, + "loss": 0.5464, + "step": 6138 + }, + { + "epoch": 1.2620002055709734, + "grad_norm": 0.19514234364032745, + "learning_rate": 5.853096715644847e-05, + "loss": 0.569, + "step": 6139 + }, + { + "epoch": 1.262205776544352, + "grad_norm": 0.19120776653289795, + "learning_rate": 5.852144094052123e-05, + "loss": 0.5634, + "step": 6140 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 0.19928298890590668, + "learning_rate": 5.851191405845579e-05, + "loss": 0.5745, + "step": 6141 + }, + { + "epoch": 1.2626169184911091, + "grad_norm": 0.1887395977973938, + "learning_rate": 5.850238651072149e-05, + "loss": 0.56, + "step": 6142 + }, + { + "epoch": 1.2628224894644875, + "grad_norm": 0.19872866570949554, + "learning_rate": 5.849285829778772e-05, + "loss": 0.5627, + "step": 6143 + }, + { + "epoch": 1.263028060437866, + "grad_norm": 0.16826018691062927, + "learning_rate": 5.8483329420123906e-05, + "loss": 0.5414, + "step": 6144 + }, + { + "epoch": 1.2632336314112447, + "grad_norm": 0.16626615822315216, + "learning_rate": 5.847379987819944e-05, + "loss": 0.5532, + "step": 6145 + }, + { + "epoch": 1.2634392023846233, + "grad_norm": 0.1921907663345337, + "learning_rate": 5.8464269672483855e-05, + "loss": 0.5543, + "step": 6146 + }, + { + "epoch": 1.2636447733580018, + "grad_norm": 0.191694438457489, + "learning_rate": 5.8454738803446616e-05, + "loss": 0.5442, + "step": 6147 + }, + { + "epoch": 1.2638503443313804, + "grad_norm": 0.19045263528823853, + "learning_rate": 5.8445207271557306e-05, + "loss": 0.5794, + "step": 6148 + }, + { + "epoch": 1.264055915304759, + "grad_norm": 0.19358719885349274, + "learning_rate": 5.843567507728545e-05, + "loss": 0.5692, + "step": 6149 + }, + { + "epoch": 1.2642614862781376, + "grad_norm": 0.19511562585830688, + "learning_rate": 5.8426142221100706e-05, + "loss": 0.5648, + "step": 6150 + }, + { + "epoch": 1.2644670572515162, + "grad_norm": 0.1978984773159027, + "learning_rate": 5.841660870347268e-05, + "loss": 0.5792, + "step": 6151 + }, + { + "epoch": 1.2646726282248946, + "grad_norm": 0.189521923661232, + "learning_rate": 5.840707452487104e-05, + "loss": 0.5421, + "step": 6152 + }, + { + "epoch": 1.2648781991982732, + "grad_norm": 0.1647057980298996, + "learning_rate": 5.8397539685765516e-05, + "loss": 0.5296, + "step": 6153 + }, + { + "epoch": 1.2650837701716517, + "grad_norm": 0.15688472986221313, + "learning_rate": 5.8388004186625836e-05, + "loss": 0.5423, + "step": 6154 + }, + { + "epoch": 1.2652893411450303, + "grad_norm": 0.19488799571990967, + "learning_rate": 5.8378468027921766e-05, + "loss": 0.5396, + "step": 6155 + }, + { + "epoch": 1.265494912118409, + "grad_norm": 0.19577009975910187, + "learning_rate": 5.8368931210123085e-05, + "loss": 0.5487, + "step": 6156 + }, + { + "epoch": 1.2657004830917875, + "grad_norm": 0.19283023476600647, + "learning_rate": 5.835939373369966e-05, + "loss": 0.5554, + "step": 6157 + }, + { + "epoch": 1.2659060540651659, + "grad_norm": 0.19187267124652863, + "learning_rate": 5.834985559912136e-05, + "loss": 0.5572, + "step": 6158 + }, + { + "epoch": 1.2661116250385445, + "grad_norm": 0.19688525795936584, + "learning_rate": 5.834031680685805e-05, + "loss": 0.5667, + "step": 6159 + }, + { + "epoch": 1.266317196011923, + "grad_norm": 0.17647728323936462, + "learning_rate": 5.83307773573797e-05, + "loss": 0.546, + "step": 6160 + }, + { + "epoch": 1.2665227669853016, + "grad_norm": 0.16302068531513214, + "learning_rate": 5.8321237251156254e-05, + "loss": 0.5648, + "step": 6161 + }, + { + "epoch": 1.2667283379586802, + "grad_norm": 0.1963539719581604, + "learning_rate": 5.8311696488657714e-05, + "loss": 0.5584, + "step": 6162 + }, + { + "epoch": 1.2669339089320588, + "grad_norm": 0.19600288569927216, + "learning_rate": 5.8302155070354105e-05, + "loss": 0.5657, + "step": 6163 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 0.17675581574440002, + "learning_rate": 5.829261299671549e-05, + "loss": 0.5394, + "step": 6164 + }, + { + "epoch": 1.267345050878816, + "grad_norm": 0.16274531185626984, + "learning_rate": 5.828307026821196e-05, + "loss": 0.5493, + "step": 6165 + }, + { + "epoch": 1.2675506218521946, + "grad_norm": 0.18789401650428772, + "learning_rate": 5.827352688531365e-05, + "loss": 0.5438, + "step": 6166 + }, + { + "epoch": 1.267756192825573, + "grad_norm": 0.19160960614681244, + "learning_rate": 5.82639828484907e-05, + "loss": 0.558, + "step": 6167 + }, + { + "epoch": 1.2679617637989515, + "grad_norm": 0.1683780699968338, + "learning_rate": 5.8254438158213306e-05, + "loss": 0.5021, + "step": 6168 + }, + { + "epoch": 1.2681673347723301, + "grad_norm": 0.14388030767440796, + "learning_rate": 5.824489281495171e-05, + "loss": 0.5228, + "step": 6169 + }, + { + "epoch": 1.2683729057457087, + "grad_norm": 0.1721310168504715, + "learning_rate": 5.8235346819176135e-05, + "loss": 0.5546, + "step": 6170 + }, + { + "epoch": 1.2685784767190873, + "grad_norm": 0.19721747934818268, + "learning_rate": 5.822580017135691e-05, + "loss": 0.5533, + "step": 6171 + }, + { + "epoch": 1.2687840476924659, + "grad_norm": 0.18930335342884064, + "learning_rate": 5.8216252871964314e-05, + "loss": 0.5671, + "step": 6172 + }, + { + "epoch": 1.2689896186658443, + "grad_norm": 0.1941603124141693, + "learning_rate": 5.8206704921468695e-05, + "loss": 0.5594, + "step": 6173 + }, + { + "epoch": 1.2691951896392228, + "grad_norm": 0.20115360617637634, + "learning_rate": 5.819715632034048e-05, + "loss": 0.5645, + "step": 6174 + }, + { + "epoch": 1.2694007606126014, + "grad_norm": 0.19006428122520447, + "learning_rate": 5.818760706905004e-05, + "loss": 0.5384, + "step": 6175 + }, + { + "epoch": 1.26960633158598, + "grad_norm": 0.18901333212852478, + "learning_rate": 5.8178057168067844e-05, + "loss": 0.5551, + "step": 6176 + }, + { + "epoch": 1.2698119025593586, + "grad_norm": 0.1722274273633957, + "learning_rate": 5.816850661786436e-05, + "loss": 0.529, + "step": 6177 + }, + { + "epoch": 1.2700174735327372, + "grad_norm": 0.16205133497714996, + "learning_rate": 5.815895541891012e-05, + "loss": 0.5608, + "step": 6178 + }, + { + "epoch": 1.2702230445061158, + "grad_norm": 0.20700521767139435, + "learning_rate": 5.814940357167563e-05, + "loss": 0.5537, + "step": 6179 + }, + { + "epoch": 1.2704286154794944, + "grad_norm": 0.19888941943645477, + "learning_rate": 5.8139851076631486e-05, + "loss": 0.5919, + "step": 6180 + }, + { + "epoch": 1.270634186452873, + "grad_norm": 0.18785306811332703, + "learning_rate": 5.813029793424831e-05, + "loss": 0.5355, + "step": 6181 + }, + { + "epoch": 1.2708397574262515, + "grad_norm": 0.1864861100912094, + "learning_rate": 5.812074414499673e-05, + "loss": 0.5585, + "step": 6182 + }, + { + "epoch": 1.27104532839963, + "grad_norm": 0.16200599074363708, + "learning_rate": 5.81111897093474e-05, + "loss": 0.5484, + "step": 6183 + }, + { + "epoch": 1.2712508993730085, + "grad_norm": 0.15543238818645477, + "learning_rate": 5.8101634627771034e-05, + "loss": 0.5398, + "step": 6184 + }, + { + "epoch": 1.271456470346387, + "grad_norm": 0.1934465765953064, + "learning_rate": 5.809207890073837e-05, + "loss": 0.5703, + "step": 6185 + }, + { + "epoch": 1.2716620413197657, + "grad_norm": 0.17177589237689972, + "learning_rate": 5.808252252872018e-05, + "loss": 0.535, + "step": 6186 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 0.1565936654806137, + "learning_rate": 5.807296551218723e-05, + "loss": 0.5704, + "step": 6187 + }, + { + "epoch": 1.2720731832665226, + "grad_norm": 0.1956259161233902, + "learning_rate": 5.80634078516104e-05, + "loss": 0.5477, + "step": 6188 + }, + { + "epoch": 1.2722787542399012, + "grad_norm": 0.19236725568771362, + "learning_rate": 5.80538495474605e-05, + "loss": 0.5691, + "step": 6189 + }, + { + "epoch": 1.2724843252132798, + "grad_norm": 0.16895383596420288, + "learning_rate": 5.804429060020845e-05, + "loss": 0.5185, + "step": 6190 + }, + { + "epoch": 1.2726898961866584, + "grad_norm": 0.15849240124225616, + "learning_rate": 5.8034731010325176e-05, + "loss": 0.5699, + "step": 6191 + }, + { + "epoch": 1.272895467160037, + "grad_norm": 0.1865822672843933, + "learning_rate": 5.802517077828163e-05, + "loss": 0.5255, + "step": 6192 + }, + { + "epoch": 1.2731010381334156, + "grad_norm": 0.16672882437705994, + "learning_rate": 5.80156099045488e-05, + "loss": 0.5399, + "step": 6193 + }, + { + "epoch": 1.2733066091067942, + "grad_norm": 0.1562536656856537, + "learning_rate": 5.8006048389597694e-05, + "loss": 0.55, + "step": 6194 + }, + { + "epoch": 1.2735121800801728, + "grad_norm": 0.19599376618862152, + "learning_rate": 5.7996486233899395e-05, + "loss": 0.5545, + "step": 6195 + }, + { + "epoch": 1.2737177510535513, + "grad_norm": 0.1640097200870514, + "learning_rate": 5.798692343792495e-05, + "loss": 0.5277, + "step": 6196 + }, + { + "epoch": 1.27392332202693, + "grad_norm": 0.17527011036872864, + "learning_rate": 5.797736000214549e-05, + "loss": 0.5735, + "step": 6197 + }, + { + "epoch": 1.2741288930003083, + "grad_norm": 0.19275882840156555, + "learning_rate": 5.7967795927032164e-05, + "loss": 0.5686, + "step": 6198 + }, + { + "epoch": 1.2743344639736869, + "grad_norm": 0.19368760287761688, + "learning_rate": 5.7958231213056144e-05, + "loss": 0.5665, + "step": 6199 + }, + { + "epoch": 1.2745400349470655, + "grad_norm": 0.1672065258026123, + "learning_rate": 5.794866586068862e-05, + "loss": 0.5532, + "step": 6200 + }, + { + "epoch": 1.274745605920444, + "grad_norm": 0.1615796685218811, + "learning_rate": 5.7939099870400865e-05, + "loss": 0.5549, + "step": 6201 + }, + { + "epoch": 1.2749511768938226, + "grad_norm": 0.18721790611743927, + "learning_rate": 5.7929533242664137e-05, + "loss": 0.5476, + "step": 6202 + }, + { + "epoch": 1.275156747867201, + "grad_norm": 0.19924210011959076, + "learning_rate": 5.791996597794975e-05, + "loss": 0.5929, + "step": 6203 + }, + { + "epoch": 1.2753623188405796, + "grad_norm": 0.16278637945652008, + "learning_rate": 5.791039807672901e-05, + "loss": 0.545, + "step": 6204 + }, + { + "epoch": 1.2755678898139582, + "grad_norm": 0.12655942142009735, + "learning_rate": 5.7900829539473304e-05, + "loss": 0.5253, + "step": 6205 + }, + { + "epoch": 1.2757734607873368, + "grad_norm": 0.16198953986167908, + "learning_rate": 5.789126036665403e-05, + "loss": 0.5607, + "step": 6206 + }, + { + "epoch": 1.2759790317607154, + "grad_norm": 0.1700884997844696, + "learning_rate": 5.7881690558742605e-05, + "loss": 0.5321, + "step": 6207 + }, + { + "epoch": 1.276184602734094, + "grad_norm": 0.15518617630004883, + "learning_rate": 5.7872120116210494e-05, + "loss": 0.5518, + "step": 6208 + }, + { + "epoch": 1.2763901737074725, + "grad_norm": 0.18900856375694275, + "learning_rate": 5.7862549039529196e-05, + "loss": 0.5467, + "step": 6209 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.2112400233745575, + "learning_rate": 5.785297732917023e-05, + "loss": 0.5821, + "step": 6210 + }, + { + "epoch": 1.2768013156542297, + "grad_norm": 0.19592179358005524, + "learning_rate": 5.784340498560513e-05, + "loss": 0.5889, + "step": 6211 + }, + { + "epoch": 1.2770068866276083, + "grad_norm": 0.1897910088300705, + "learning_rate": 5.783383200930551e-05, + "loss": 0.5657, + "step": 6212 + }, + { + "epoch": 1.2772124576009867, + "grad_norm": 0.1914108544588089, + "learning_rate": 5.782425840074297e-05, + "loss": 0.5578, + "step": 6213 + }, + { + "epoch": 1.2774180285743653, + "grad_norm": 0.19016936421394348, + "learning_rate": 5.781468416038914e-05, + "loss": 0.5599, + "step": 6214 + }, + { + "epoch": 1.2776235995477438, + "grad_norm": 0.18804775178432465, + "learning_rate": 5.780510928871574e-05, + "loss": 0.5671, + "step": 6215 + }, + { + "epoch": 1.2778291705211224, + "grad_norm": 0.18596555292606354, + "learning_rate": 5.779553378619445e-05, + "loss": 0.5355, + "step": 6216 + }, + { + "epoch": 1.278034741494501, + "grad_norm": 0.19289173185825348, + "learning_rate": 5.778595765329702e-05, + "loss": 0.583, + "step": 6217 + }, + { + "epoch": 1.2782403124678796, + "grad_norm": 0.18467681109905243, + "learning_rate": 5.7776380890495214e-05, + "loss": 0.561, + "step": 6218 + }, + { + "epoch": 1.278445883441258, + "grad_norm": 0.19433990120887756, + "learning_rate": 5.776680349826083e-05, + "loss": 0.5548, + "step": 6219 + }, + { + "epoch": 1.2786514544146366, + "grad_norm": 0.1940041035413742, + "learning_rate": 5.7757225477065725e-05, + "loss": 0.5654, + "step": 6220 + }, + { + "epoch": 1.2788570253880152, + "grad_norm": 0.1894046515226364, + "learning_rate": 5.774764682738174e-05, + "loss": 0.5628, + "step": 6221 + }, + { + "epoch": 1.2790625963613937, + "grad_norm": 0.20354604721069336, + "learning_rate": 5.7738067549680776e-05, + "loss": 0.569, + "step": 6222 + }, + { + "epoch": 1.2792681673347723, + "grad_norm": 0.18965789675712585, + "learning_rate": 5.7728487644434754e-05, + "loss": 0.5458, + "step": 6223 + }, + { + "epoch": 1.279473738308151, + "grad_norm": 0.18858371675014496, + "learning_rate": 5.771890711211566e-05, + "loss": 0.5415, + "step": 6224 + }, + { + "epoch": 1.2796793092815295, + "grad_norm": 0.19351953268051147, + "learning_rate": 5.7709325953195444e-05, + "loss": 0.5504, + "step": 6225 + }, + { + "epoch": 1.279884880254908, + "grad_norm": 0.18949908018112183, + "learning_rate": 5.769974416814615e-05, + "loss": 0.541, + "step": 6226 + }, + { + "epoch": 1.2800904512282867, + "grad_norm": 0.19526349008083344, + "learning_rate": 5.769016175743982e-05, + "loss": 0.5634, + "step": 6227 + }, + { + "epoch": 1.280296022201665, + "grad_norm": 0.17583510279655457, + "learning_rate": 5.7680578721548524e-05, + "loss": 0.5462, + "step": 6228 + }, + { + "epoch": 1.2805015931750436, + "grad_norm": 0.1601148396730423, + "learning_rate": 5.767099506094438e-05, + "loss": 0.5474, + "step": 6229 + }, + { + "epoch": 1.2807071641484222, + "grad_norm": 0.19925040006637573, + "learning_rate": 5.766141077609955e-05, + "loss": 0.5884, + "step": 6230 + }, + { + "epoch": 1.2809127351218008, + "grad_norm": 0.20039363205432892, + "learning_rate": 5.765182586748619e-05, + "loss": 0.5624, + "step": 6231 + }, + { + "epoch": 1.2811183060951794, + "grad_norm": 0.19234807789325714, + "learning_rate": 5.764224033557649e-05, + "loss": 0.5994, + "step": 6232 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 0.19299016892910004, + "learning_rate": 5.76326541808427e-05, + "loss": 0.5786, + "step": 6233 + }, + { + "epoch": 1.2815294480419364, + "grad_norm": 0.2128915637731552, + "learning_rate": 5.762306740375709e-05, + "loss": 0.5763, + "step": 6234 + }, + { + "epoch": 1.281735019015315, + "grad_norm": 0.19753651320934296, + "learning_rate": 5.761348000479194e-05, + "loss": 0.5565, + "step": 6235 + }, + { + "epoch": 1.2819405899886935, + "grad_norm": 0.19530276954174042, + "learning_rate": 5.76038919844196e-05, + "loss": 0.567, + "step": 6236 + }, + { + "epoch": 1.2821461609620721, + "grad_norm": 0.1876569539308548, + "learning_rate": 5.7594303343112406e-05, + "loss": 0.5358, + "step": 6237 + }, + { + "epoch": 1.2823517319354507, + "grad_norm": 0.19202187657356262, + "learning_rate": 5.758471408134276e-05, + "loss": 0.5589, + "step": 6238 + }, + { + "epoch": 1.2825573029088293, + "grad_norm": 0.2080259472131729, + "learning_rate": 5.757512419958305e-05, + "loss": 0.5767, + "step": 6239 + }, + { + "epoch": 1.2827628738822079, + "grad_norm": 0.2008046805858612, + "learning_rate": 5.756553369830577e-05, + "loss": 0.5486, + "step": 6240 + }, + { + "epoch": 1.2829684448555865, + "grad_norm": 0.18698541820049286, + "learning_rate": 5.7555942577983364e-05, + "loss": 0.5471, + "step": 6241 + }, + { + "epoch": 1.283174015828965, + "grad_norm": 0.19184443354606628, + "learning_rate": 5.754635083908835e-05, + "loss": 0.5703, + "step": 6242 + }, + { + "epoch": 1.2833795868023434, + "grad_norm": 0.18551193177700043, + "learning_rate": 5.753675848209329e-05, + "loss": 0.5353, + "step": 6243 + }, + { + "epoch": 1.283585157775722, + "grad_norm": 0.17165902256965637, + "learning_rate": 5.7527165507470705e-05, + "loss": 0.5094, + "step": 6244 + }, + { + "epoch": 1.2837907287491006, + "grad_norm": 0.16080299019813538, + "learning_rate": 5.7517571915693255e-05, + "loss": 0.5797, + "step": 6245 + }, + { + "epoch": 1.2839962997224792, + "grad_norm": 0.16521471738815308, + "learning_rate": 5.750797770723353e-05, + "loss": 0.5199, + "step": 6246 + }, + { + "epoch": 1.2842018706958578, + "grad_norm": 0.12971197068691254, + "learning_rate": 5.749838288256421e-05, + "loss": 0.5376, + "step": 6247 + }, + { + "epoch": 1.2844074416692364, + "grad_norm": 0.13733793795108795, + "learning_rate": 5.748878744215799e-05, + "loss": 0.5266, + "step": 6248 + }, + { + "epoch": 1.2846130126426147, + "grad_norm": 0.1690482795238495, + "learning_rate": 5.747919138648757e-05, + "loss": 0.5737, + "step": 6249 + }, + { + "epoch": 1.2848185836159933, + "grad_norm": 0.19658613204956055, + "learning_rate": 5.746959471602572e-05, + "loss": 0.5531, + "step": 6250 + }, + { + "epoch": 1.285024154589372, + "grad_norm": 0.1984742283821106, + "learning_rate": 5.7459997431245236e-05, + "loss": 0.5877, + "step": 6251 + }, + { + "epoch": 1.2852297255627505, + "grad_norm": 0.1888909637928009, + "learning_rate": 5.74503995326189e-05, + "loss": 0.5391, + "step": 6252 + }, + { + "epoch": 1.285435296536129, + "grad_norm": 0.19062168896198273, + "learning_rate": 5.744080102061958e-05, + "loss": 0.5662, + "step": 6253 + }, + { + "epoch": 1.2856408675095077, + "grad_norm": 0.1896916627883911, + "learning_rate": 5.7431201895720146e-05, + "loss": 0.5658, + "step": 6254 + }, + { + "epoch": 1.2858464384828863, + "grad_norm": 0.19082388281822205, + "learning_rate": 5.742160215839349e-05, + "loss": 0.5624, + "step": 6255 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 0.1924538016319275, + "learning_rate": 5.741200180911255e-05, + "loss": 0.5813, + "step": 6256 + }, + { + "epoch": 1.2862575804296434, + "grad_norm": 0.18487077951431274, + "learning_rate": 5.740240084835031e-05, + "loss": 0.5528, + "step": 6257 + }, + { + "epoch": 1.286463151403022, + "grad_norm": 0.18869616091251373, + "learning_rate": 5.7392799276579745e-05, + "loss": 0.5472, + "step": 6258 + }, + { + "epoch": 1.2866687223764004, + "grad_norm": 0.19108757376670837, + "learning_rate": 5.738319709427386e-05, + "loss": 0.5516, + "step": 6259 + }, + { + "epoch": 1.286874293349779, + "grad_norm": 0.18827085196971893, + "learning_rate": 5.7373594301905764e-05, + "loss": 0.519, + "step": 6260 + }, + { + "epoch": 1.2870798643231576, + "grad_norm": 0.17874634265899658, + "learning_rate": 5.736399089994849e-05, + "loss": 0.5608, + "step": 6261 + }, + { + "epoch": 1.2872854352965362, + "grad_norm": 0.19754135608673096, + "learning_rate": 5.73543868888752e-05, + "loss": 0.5846, + "step": 6262 + }, + { + "epoch": 1.2874910062699148, + "grad_norm": 0.16421428322792053, + "learning_rate": 5.734478226915899e-05, + "loss": 0.5233, + "step": 6263 + }, + { + "epoch": 1.2876965772432931, + "grad_norm": 0.16342876851558685, + "learning_rate": 5.733517704127306e-05, + "loss": 0.5307, + "step": 6264 + }, + { + "epoch": 1.2879021482166717, + "grad_norm": 0.19278982281684875, + "learning_rate": 5.732557120569061e-05, + "loss": 0.5424, + "step": 6265 + }, + { + "epoch": 1.2881077191900503, + "grad_norm": 0.18997056782245636, + "learning_rate": 5.731596476288488e-05, + "loss": 0.5628, + "step": 6266 + }, + { + "epoch": 1.2883132901634289, + "grad_norm": 0.19608962535858154, + "learning_rate": 5.730635771332912e-05, + "loss": 0.546, + "step": 6267 + }, + { + "epoch": 1.2885188611368075, + "grad_norm": 0.18659254908561707, + "learning_rate": 5.729675005749666e-05, + "loss": 0.5634, + "step": 6268 + }, + { + "epoch": 1.288724432110186, + "grad_norm": 0.1904764473438263, + "learning_rate": 5.7287141795860774e-05, + "loss": 0.5523, + "step": 6269 + }, + { + "epoch": 1.2889300030835646, + "grad_norm": 0.685501754283905, + "learning_rate": 5.727753292889485e-05, + "loss": 0.5588, + "step": 6270 + }, + { + "epoch": 1.2891355740569432, + "grad_norm": 0.19180195033550262, + "learning_rate": 5.726792345707227e-05, + "loss": 0.552, + "step": 6271 + }, + { + "epoch": 1.2893411450303218, + "grad_norm": 0.18611235916614532, + "learning_rate": 5.7258313380866436e-05, + "loss": 0.5342, + "step": 6272 + }, + { + "epoch": 1.2895467160037004, + "grad_norm": 0.1877206414937973, + "learning_rate": 5.7248702700750796e-05, + "loss": 0.5512, + "step": 6273 + }, + { + "epoch": 1.2897522869770788, + "grad_norm": 0.19219855964183807, + "learning_rate": 5.723909141719883e-05, + "loss": 0.5525, + "step": 6274 + }, + { + "epoch": 1.2899578579504574, + "grad_norm": 0.1869809925556183, + "learning_rate": 5.722947953068403e-05, + "loss": 0.541, + "step": 6275 + }, + { + "epoch": 1.290163428923836, + "grad_norm": 0.19108881056308746, + "learning_rate": 5.721986704167994e-05, + "loss": 0.5669, + "step": 6276 + }, + { + "epoch": 1.2903689998972145, + "grad_norm": 0.1971481740474701, + "learning_rate": 5.72102539506601e-05, + "loss": 0.5596, + "step": 6277 + }, + { + "epoch": 1.2905745708705931, + "grad_norm": 0.24877598881721497, + "learning_rate": 5.7200640258098134e-05, + "loss": 0.5511, + "step": 6278 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 0.16880907118320465, + "learning_rate": 5.719102596446765e-05, + "loss": 0.5211, + "step": 6279 + }, + { + "epoch": 1.29098571281735, + "grad_norm": 0.16007640957832336, + "learning_rate": 5.718141107024229e-05, + "loss": 0.5402, + "step": 6280 + }, + { + "epoch": 1.2911912837907287, + "grad_norm": 0.1952618956565857, + "learning_rate": 5.717179557589574e-05, + "loss": 0.5729, + "step": 6281 + }, + { + "epoch": 1.2913968547641073, + "grad_norm": 0.16671602427959442, + "learning_rate": 5.7162179481901725e-05, + "loss": 0.5312, + "step": 6282 + }, + { + "epoch": 1.2916024257374858, + "grad_norm": 0.15948770940303802, + "learning_rate": 5.7152562788733975e-05, + "loss": 0.5243, + "step": 6283 + }, + { + "epoch": 1.2918079967108644, + "grad_norm": 0.1951056569814682, + "learning_rate": 5.7142945496866235e-05, + "loss": 0.5665, + "step": 6284 + }, + { + "epoch": 1.292013567684243, + "grad_norm": 0.1952039748430252, + "learning_rate": 5.713332760677234e-05, + "loss": 0.5717, + "step": 6285 + }, + { + "epoch": 1.2922191386576216, + "grad_norm": 0.1987905502319336, + "learning_rate": 5.7123709118926104e-05, + "loss": 0.567, + "step": 6286 + }, + { + "epoch": 1.2924247096310002, + "grad_norm": 0.19743449985980988, + "learning_rate": 5.711409003380138e-05, + "loss": 0.5466, + "step": 6287 + }, + { + "epoch": 1.2926302806043788, + "grad_norm": 0.19229763746261597, + "learning_rate": 5.710447035187206e-05, + "loss": 0.5583, + "step": 6288 + }, + { + "epoch": 1.2928358515777572, + "grad_norm": 0.18883401155471802, + "learning_rate": 5.709485007361208e-05, + "loss": 0.54, + "step": 6289 + }, + { + "epoch": 1.2930414225511357, + "grad_norm": 0.19647282361984253, + "learning_rate": 5.708522919949536e-05, + "loss": 0.583, + "step": 6290 + }, + { + "epoch": 1.2932469935245143, + "grad_norm": 0.18365654349327087, + "learning_rate": 5.707560772999587e-05, + "loss": 0.5476, + "step": 6291 + }, + { + "epoch": 1.293452564497893, + "grad_norm": 0.19475975632667542, + "learning_rate": 5.7065985665587646e-05, + "loss": 0.5476, + "step": 6292 + }, + { + "epoch": 1.2936581354712715, + "grad_norm": 0.18907500803470612, + "learning_rate": 5.70563630067447e-05, + "loss": 0.5483, + "step": 6293 + }, + { + "epoch": 1.29386370644465, + "grad_norm": 0.189442440867424, + "learning_rate": 5.704673975394109e-05, + "loss": 0.5387, + "step": 6294 + }, + { + "epoch": 1.2940692774180285, + "grad_norm": 0.19112446904182434, + "learning_rate": 5.703711590765093e-05, + "loss": 0.5714, + "step": 6295 + }, + { + "epoch": 1.294274848391407, + "grad_norm": 0.19194044172763824, + "learning_rate": 5.7027491468348326e-05, + "loss": 0.5521, + "step": 6296 + }, + { + "epoch": 1.2944804193647856, + "grad_norm": 0.18977665901184082, + "learning_rate": 5.7017866436507434e-05, + "loss": 0.5738, + "step": 6297 + }, + { + "epoch": 1.2946859903381642, + "grad_norm": 0.19306746125221252, + "learning_rate": 5.700824081260243e-05, + "loss": 0.5636, + "step": 6298 + }, + { + "epoch": 1.2948915613115428, + "grad_norm": 0.19150002300739288, + "learning_rate": 5.699861459710753e-05, + "loss": 0.5506, + "step": 6299 + }, + { + "epoch": 1.2950971322849214, + "grad_norm": 0.211594358086586, + "learning_rate": 5.698898779049697e-05, + "loss": 0.5631, + "step": 6300 + }, + { + "epoch": 1.2953027032583, + "grad_norm": 0.19325849413871765, + "learning_rate": 5.697936039324502e-05, + "loss": 0.5571, + "step": 6301 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 0.1876952350139618, + "learning_rate": 5.696973240582597e-05, + "loss": 0.5579, + "step": 6302 + }, + { + "epoch": 1.2957138452050572, + "grad_norm": 0.16953028738498688, + "learning_rate": 5.6960103828714164e-05, + "loss": 0.5279, + "step": 6303 + }, + { + "epoch": 1.2959194161784355, + "grad_norm": 0.16833354532718658, + "learning_rate": 5.695047466238393e-05, + "loss": 0.5394, + "step": 6304 + }, + { + "epoch": 1.2961249871518141, + "grad_norm": 0.16338950395584106, + "learning_rate": 5.694084490730967e-05, + "loss": 0.5196, + "step": 6305 + }, + { + "epoch": 1.2963305581251927, + "grad_norm": 0.16173096001148224, + "learning_rate": 5.6931214563965805e-05, + "loss": 0.5538, + "step": 6306 + }, + { + "epoch": 1.2965361290985713, + "grad_norm": 0.19378416240215302, + "learning_rate": 5.692158363282675e-05, + "loss": 0.5448, + "step": 6307 + }, + { + "epoch": 1.2967417000719499, + "grad_norm": 0.18964388966560364, + "learning_rate": 5.691195211436699e-05, + "loss": 0.5423, + "step": 6308 + }, + { + "epoch": 1.2969472710453285, + "grad_norm": 0.18687476217746735, + "learning_rate": 5.690232000906103e-05, + "loss": 0.5643, + "step": 6309 + }, + { + "epoch": 1.2971528420187068, + "grad_norm": 0.1913549154996872, + "learning_rate": 5.689268731738339e-05, + "loss": 0.554, + "step": 6310 + }, + { + "epoch": 1.2973584129920854, + "grad_norm": 0.19576480984687805, + "learning_rate": 5.688305403980863e-05, + "loss": 0.5846, + "step": 6311 + }, + { + "epoch": 1.297563983965464, + "grad_norm": 0.2015174776315689, + "learning_rate": 5.687342017681135e-05, + "loss": 0.5571, + "step": 6312 + }, + { + "epoch": 1.2977695549388426, + "grad_norm": 0.1950497329235077, + "learning_rate": 5.6863785728866154e-05, + "loss": 0.5471, + "step": 6313 + }, + { + "epoch": 1.2979751259122212, + "grad_norm": 0.19457519054412842, + "learning_rate": 5.6854150696447686e-05, + "loss": 0.5689, + "step": 6314 + }, + { + "epoch": 1.2981806968855998, + "grad_norm": 0.18924319744110107, + "learning_rate": 5.684451508003061e-05, + "loss": 0.5632, + "step": 6315 + }, + { + "epoch": 1.2983862678589784, + "grad_norm": 0.20829612016677856, + "learning_rate": 5.6834878880089635e-05, + "loss": 0.554, + "step": 6316 + }, + { + "epoch": 1.298591838832357, + "grad_norm": 0.19046112895011902, + "learning_rate": 5.6825242097099514e-05, + "loss": 0.5508, + "step": 6317 + }, + { + "epoch": 1.2987974098057355, + "grad_norm": 0.19234079122543335, + "learning_rate": 5.681560473153495e-05, + "loss": 0.5417, + "step": 6318 + }, + { + "epoch": 1.299002980779114, + "grad_norm": 0.19579647481441498, + "learning_rate": 5.68059667838708e-05, + "loss": 0.5449, + "step": 6319 + }, + { + "epoch": 1.2992085517524925, + "grad_norm": 0.19146116077899933, + "learning_rate": 5.679632825458184e-05, + "loss": 0.5603, + "step": 6320 + }, + { + "epoch": 1.299414122725871, + "grad_norm": 0.19622944295406342, + "learning_rate": 5.6786689144142917e-05, + "loss": 0.5568, + "step": 6321 + }, + { + "epoch": 1.2996196936992497, + "grad_norm": 0.19650766253471375, + "learning_rate": 5.6777049453028914e-05, + "loss": 0.5603, + "step": 6322 + }, + { + "epoch": 1.2998252646726283, + "grad_norm": 0.20279136300086975, + "learning_rate": 5.676740918171472e-05, + "loss": 0.5455, + "step": 6323 + }, + { + "epoch": 1.3000308356460069, + "grad_norm": 0.1786477267742157, + "learning_rate": 5.67577683306753e-05, + "loss": 0.5148, + "step": 6324 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 0.15858376026153564, + "learning_rate": 5.674812690038557e-05, + "loss": 0.5217, + "step": 6325 + }, + { + "epoch": 1.3004419775927638, + "grad_norm": 0.16333921253681183, + "learning_rate": 5.673848489132054e-05, + "loss": 0.5504, + "step": 6326 + }, + { + "epoch": 1.3006475485661424, + "grad_norm": 0.20864447951316833, + "learning_rate": 5.672884230395524e-05, + "loss": 0.5664, + "step": 6327 + }, + { + "epoch": 1.300853119539521, + "grad_norm": 0.20059353113174438, + "learning_rate": 5.6719199138764686e-05, + "loss": 0.575, + "step": 6328 + }, + { + "epoch": 1.3010586905128996, + "grad_norm": 0.1858949512243271, + "learning_rate": 5.670955539622396e-05, + "loss": 0.535, + "step": 6329 + }, + { + "epoch": 1.3012642614862782, + "grad_norm": 0.1687631458044052, + "learning_rate": 5.669991107680818e-05, + "loss": 0.54, + "step": 6330 + }, + { + "epoch": 1.3014698324596568, + "grad_norm": 0.16431094706058502, + "learning_rate": 5.6690266180992464e-05, + "loss": 0.5506, + "step": 6331 + }, + { + "epoch": 1.3016754034330353, + "grad_norm": 0.21161231398582458, + "learning_rate": 5.668062070925197e-05, + "loss": 0.5579, + "step": 6332 + }, + { + "epoch": 1.301880974406414, + "grad_norm": 0.20481392741203308, + "learning_rate": 5.66709746620619e-05, + "loss": 0.5693, + "step": 6333 + }, + { + "epoch": 1.3020865453797923, + "grad_norm": 0.2095717191696167, + "learning_rate": 5.6661328039897456e-05, + "loss": 0.5543, + "step": 6334 + }, + { + "epoch": 1.3022921163531709, + "grad_norm": 0.17169706523418427, + "learning_rate": 5.665168084323387e-05, + "loss": 0.513, + "step": 6335 + }, + { + "epoch": 1.3024976873265495, + "grad_norm": 0.184236079454422, + "learning_rate": 5.664203307254644e-05, + "loss": 0.5606, + "step": 6336 + }, + { + "epoch": 1.302703258299928, + "grad_norm": 0.210636168718338, + "learning_rate": 5.6632384728310464e-05, + "loss": 0.5587, + "step": 6337 + }, + { + "epoch": 1.3029088292733066, + "grad_norm": 0.20916485786437988, + "learning_rate": 5.6622735811001255e-05, + "loss": 0.5563, + "step": 6338 + }, + { + "epoch": 1.3031144002466852, + "grad_norm": 0.19716860353946686, + "learning_rate": 5.6613086321094175e-05, + "loss": 0.5461, + "step": 6339 + }, + { + "epoch": 1.3033199712200636, + "grad_norm": 0.20383410155773163, + "learning_rate": 5.660343625906461e-05, + "loss": 0.5711, + "step": 6340 + }, + { + "epoch": 1.3035255421934422, + "grad_norm": 0.19553574919700623, + "learning_rate": 5.6593785625387965e-05, + "loss": 0.5719, + "step": 6341 + }, + { + "epoch": 1.3037311131668208, + "grad_norm": 0.20345737040042877, + "learning_rate": 5.65841344205397e-05, + "loss": 0.5902, + "step": 6342 + }, + { + "epoch": 1.3039366841401994, + "grad_norm": 0.1968560367822647, + "learning_rate": 5.657448264499528e-05, + "loss": 0.5552, + "step": 6343 + }, + { + "epoch": 1.304142255113578, + "grad_norm": 0.19714896380901337, + "learning_rate": 5.6564830299230204e-05, + "loss": 0.5477, + "step": 6344 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.2418747991323471, + "learning_rate": 5.6555177383719986e-05, + "loss": 0.5675, + "step": 6345 + }, + { + "epoch": 1.3045533970603351, + "grad_norm": 0.16260170936584473, + "learning_rate": 5.654552389894019e-05, + "loss": 0.5324, + "step": 6346 + }, + { + "epoch": 1.3047589680337137, + "grad_norm": 0.15336725115776062, + "learning_rate": 5.653586984536639e-05, + "loss": 0.5376, + "step": 6347 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 0.13179324567317963, + "learning_rate": 5.652621522347421e-05, + "loss": 0.5133, + "step": 6348 + }, + { + "epoch": 1.305170109980471, + "grad_norm": 0.16065613925457, + "learning_rate": 5.651656003373927e-05, + "loss": 0.5376, + "step": 6349 + }, + { + "epoch": 1.3053756809538493, + "grad_norm": 0.20791570842266083, + "learning_rate": 5.650690427663725e-05, + "loss": 0.5707, + "step": 6350 + }, + { + "epoch": 1.3055812519272278, + "grad_norm": 0.19432078301906586, + "learning_rate": 5.649724795264384e-05, + "loss": 0.5642, + "step": 6351 + }, + { + "epoch": 1.3057868229006064, + "grad_norm": 0.19507555663585663, + "learning_rate": 5.6487591062234756e-05, + "loss": 0.5484, + "step": 6352 + }, + { + "epoch": 1.305992393873985, + "grad_norm": 0.18937799334526062, + "learning_rate": 5.647793360588575e-05, + "loss": 0.5504, + "step": 6353 + }, + { + "epoch": 1.3061979648473636, + "grad_norm": 0.18545973300933838, + "learning_rate": 5.646827558407261e-05, + "loss": 0.5353, + "step": 6354 + }, + { + "epoch": 1.306403535820742, + "grad_norm": 0.210302472114563, + "learning_rate": 5.645861699727114e-05, + "loss": 0.5373, + "step": 6355 + }, + { + "epoch": 1.3066091067941206, + "grad_norm": 0.20394356548786163, + "learning_rate": 5.644895784595715e-05, + "loss": 0.5707, + "step": 6356 + }, + { + "epoch": 1.3068146777674992, + "grad_norm": 0.20221911370754242, + "learning_rate": 5.6439298130606546e-05, + "loss": 0.5635, + "step": 6357 + }, + { + "epoch": 1.3070202487408777, + "grad_norm": 0.20493952929973602, + "learning_rate": 5.642963785169518e-05, + "loss": 0.5635, + "step": 6358 + }, + { + "epoch": 1.3072258197142563, + "grad_norm": 0.2118876874446869, + "learning_rate": 5.641997700969898e-05, + "loss": 0.5578, + "step": 6359 + }, + { + "epoch": 1.307431390687635, + "grad_norm": 0.1980256587266922, + "learning_rate": 5.6410315605093875e-05, + "loss": 0.5551, + "step": 6360 + }, + { + "epoch": 1.3076369616610135, + "grad_norm": 0.20084832608699799, + "learning_rate": 5.640065363835586e-05, + "loss": 0.569, + "step": 6361 + }, + { + "epoch": 1.307842532634392, + "grad_norm": 0.1686294972896576, + "learning_rate": 5.639099110996092e-05, + "loss": 0.5371, + "step": 6362 + }, + { + "epoch": 1.3080481036077707, + "grad_norm": 0.15857572853565216, + "learning_rate": 5.63813280203851e-05, + "loss": 0.5402, + "step": 6363 + }, + { + "epoch": 1.3082536745811493, + "grad_norm": 0.15745136141777039, + "learning_rate": 5.6371664370104435e-05, + "loss": 0.5196, + "step": 6364 + }, + { + "epoch": 1.3084592455545276, + "grad_norm": 0.15688499808311462, + "learning_rate": 5.6362000159595034e-05, + "loss": 0.5361, + "step": 6365 + }, + { + "epoch": 1.3086648165279062, + "grad_norm": 0.18788595497608185, + "learning_rate": 5.635233538933298e-05, + "loss": 0.551, + "step": 6366 + }, + { + "epoch": 1.3088703875012848, + "grad_norm": 0.19345730543136597, + "learning_rate": 5.634267005979442e-05, + "loss": 0.5762, + "step": 6367 + }, + { + "epoch": 1.3090759584746634, + "grad_norm": 0.1903630942106247, + "learning_rate": 5.633300417145553e-05, + "loss": 0.5489, + "step": 6368 + }, + { + "epoch": 1.309281529448042, + "grad_norm": 0.19679617881774902, + "learning_rate": 5.632333772479249e-05, + "loss": 0.5641, + "step": 6369 + }, + { + "epoch": 1.3094871004214204, + "grad_norm": 0.19722123444080353, + "learning_rate": 5.631367072028152e-05, + "loss": 0.5428, + "step": 6370 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 0.19673387706279755, + "learning_rate": 5.630400315839888e-05, + "loss": 0.5763, + "step": 6371 + }, + { + "epoch": 1.3098982423681775, + "grad_norm": 0.19249959290027618, + "learning_rate": 5.629433503962084e-05, + "loss": 0.5687, + "step": 6372 + }, + { + "epoch": 1.3101038133415561, + "grad_norm": 0.18873926997184753, + "learning_rate": 5.6284666364423695e-05, + "loss": 0.557, + "step": 6373 + }, + { + "epoch": 1.3103093843149347, + "grad_norm": 0.2006826251745224, + "learning_rate": 5.627499713328378e-05, + "loss": 0.549, + "step": 6374 + }, + { + "epoch": 1.3105149552883133, + "grad_norm": 0.15970605611801147, + "learning_rate": 5.6265327346677465e-05, + "loss": 0.5264, + "step": 6375 + }, + { + "epoch": 1.3107205262616919, + "grad_norm": 0.16438056528568268, + "learning_rate": 5.6255657005081134e-05, + "loss": 0.5647, + "step": 6376 + }, + { + "epoch": 1.3109260972350705, + "grad_norm": 0.19391551613807678, + "learning_rate": 5.624598610897117e-05, + "loss": 0.5691, + "step": 6377 + }, + { + "epoch": 1.311131668208449, + "grad_norm": 0.19656315445899963, + "learning_rate": 5.623631465882405e-05, + "loss": 0.5626, + "step": 6378 + }, + { + "epoch": 1.3113372391818277, + "grad_norm": 0.18690890073776245, + "learning_rate": 5.622664265511623e-05, + "loss": 0.5395, + "step": 6379 + }, + { + "epoch": 1.311542810155206, + "grad_norm": 0.19605736434459686, + "learning_rate": 5.621697009832418e-05, + "loss": 0.5796, + "step": 6380 + }, + { + "epoch": 1.3117483811285846, + "grad_norm": 0.19763530790805817, + "learning_rate": 5.620729698892445e-05, + "loss": 0.5447, + "step": 6381 + }, + { + "epoch": 1.3119539521019632, + "grad_norm": 0.18934392929077148, + "learning_rate": 5.6197623327393584e-05, + "loss": 0.575, + "step": 6382 + }, + { + "epoch": 1.3121595230753418, + "grad_norm": 0.19040028750896454, + "learning_rate": 5.6187949114208155e-05, + "loss": 0.5448, + "step": 6383 + }, + { + "epoch": 1.3123650940487204, + "grad_norm": 0.20778769254684448, + "learning_rate": 5.6178274349844766e-05, + "loss": 0.5336, + "step": 6384 + }, + { + "epoch": 1.312570665022099, + "grad_norm": 0.18825723230838776, + "learning_rate": 5.6168599034780034e-05, + "loss": 0.5409, + "step": 6385 + }, + { + "epoch": 1.3127762359954773, + "grad_norm": 0.1885683834552765, + "learning_rate": 5.615892316949064e-05, + "loss": 0.5617, + "step": 6386 + }, + { + "epoch": 1.312981806968856, + "grad_norm": 0.16970692574977875, + "learning_rate": 5.614924675445325e-05, + "loss": 0.5322, + "step": 6387 + }, + { + "epoch": 1.3131873779422345, + "grad_norm": 0.1596226543188095, + "learning_rate": 5.613956979014459e-05, + "loss": 0.5696, + "step": 6388 + }, + { + "epoch": 1.313392948915613, + "grad_norm": 0.18783892691135406, + "learning_rate": 5.61298922770414e-05, + "loss": 0.5507, + "step": 6389 + }, + { + "epoch": 1.3135985198889917, + "grad_norm": 0.2017127424478531, + "learning_rate": 5.612021421562043e-05, + "loss": 0.5858, + "step": 6390 + }, + { + "epoch": 1.3138040908623703, + "grad_norm": 0.1910979151725769, + "learning_rate": 5.611053560635848e-05, + "loss": 0.5607, + "step": 6391 + }, + { + "epoch": 1.3140096618357489, + "grad_norm": 0.2119234949350357, + "learning_rate": 5.6100856449732384e-05, + "loss": 0.5665, + "step": 6392 + }, + { + "epoch": 1.3142152328091274, + "grad_norm": 0.19099730253219604, + "learning_rate": 5.609117674621896e-05, + "loss": 0.5601, + "step": 6393 + }, + { + "epoch": 1.314420803782506, + "grad_norm": 0.18972419202327728, + "learning_rate": 5.60814964962951e-05, + "loss": 0.5419, + "step": 6394 + }, + { + "epoch": 1.3146263747558844, + "grad_norm": 0.15883517265319824, + "learning_rate": 5.6071815700437716e-05, + "loss": 0.5145, + "step": 6395 + }, + { + "epoch": 1.314831945729263, + "grad_norm": 0.1622246950864792, + "learning_rate": 5.606213435912371e-05, + "loss": 0.5542, + "step": 6396 + }, + { + "epoch": 1.3150375167026416, + "grad_norm": 0.20873090624809265, + "learning_rate": 5.605245247283005e-05, + "loss": 0.5812, + "step": 6397 + }, + { + "epoch": 1.3152430876760202, + "grad_norm": 0.1877153068780899, + "learning_rate": 5.604277004203371e-05, + "loss": 0.5479, + "step": 6398 + }, + { + "epoch": 1.3154486586493987, + "grad_norm": 0.19027303159236908, + "learning_rate": 5.6033087067211714e-05, + "loss": 0.5552, + "step": 6399 + }, + { + "epoch": 1.3156542296227773, + "grad_norm": 0.19082914292812347, + "learning_rate": 5.602340354884108e-05, + "loss": 0.5544, + "step": 6400 + }, + { + "epoch": 1.3158598005961557, + "grad_norm": 0.1900823563337326, + "learning_rate": 5.601371948739888e-05, + "loss": 0.5564, + "step": 6401 + }, + { + "epoch": 1.3160653715695343, + "grad_norm": 0.1659982055425644, + "learning_rate": 5.60040348833622e-05, + "loss": 0.5338, + "step": 6402 + }, + { + "epoch": 1.3162709425429129, + "grad_norm": 0.16377677023410797, + "learning_rate": 5.599434973720815e-05, + "loss": 0.5685, + "step": 6403 + }, + { + "epoch": 1.3164765135162915, + "grad_norm": 0.1914215385913849, + "learning_rate": 5.5984664049413884e-05, + "loss": 0.5734, + "step": 6404 + }, + { + "epoch": 1.31668208448967, + "grad_norm": 0.19817842543125153, + "learning_rate": 5.5974977820456546e-05, + "loss": 0.5658, + "step": 6405 + }, + { + "epoch": 1.3168876554630486, + "grad_norm": 0.1932641863822937, + "learning_rate": 5.596529105081336e-05, + "loss": 0.5597, + "step": 6406 + }, + { + "epoch": 1.3170932264364272, + "grad_norm": 0.18866626918315887, + "learning_rate": 5.595560374096154e-05, + "loss": 0.5736, + "step": 6407 + }, + { + "epoch": 1.3172987974098058, + "grad_norm": 0.1907801777124405, + "learning_rate": 5.594591589137831e-05, + "loss": 0.575, + "step": 6408 + }, + { + "epoch": 1.3175043683831844, + "grad_norm": 0.18488825857639313, + "learning_rate": 5.5936227502540984e-05, + "loss": 0.5658, + "step": 6409 + }, + { + "epoch": 1.3177099393565628, + "grad_norm": 0.18911798298358917, + "learning_rate": 5.592653857492684e-05, + "loss": 0.5505, + "step": 6410 + }, + { + "epoch": 1.3179155103299414, + "grad_norm": 0.161835715174675, + "learning_rate": 5.59168491090132e-05, + "loss": 0.5313, + "step": 6411 + }, + { + "epoch": 1.31812108130332, + "grad_norm": 0.15991567075252533, + "learning_rate": 5.590715910527745e-05, + "loss": 0.5707, + "step": 6412 + }, + { + "epoch": 1.3183266522766985, + "grad_norm": 0.1980849802494049, + "learning_rate": 5.589746856419694e-05, + "loss": 0.5339, + "step": 6413 + }, + { + "epoch": 1.3185322232500771, + "grad_norm": 0.1609208732843399, + "learning_rate": 5.58877774862491e-05, + "loss": 0.5264, + "step": 6414 + }, + { + "epoch": 1.3187377942234557, + "grad_norm": 0.16349831223487854, + "learning_rate": 5.587808587191134e-05, + "loss": 0.5642, + "step": 6415 + }, + { + "epoch": 1.318943365196834, + "grad_norm": 0.1919315755367279, + "learning_rate": 5.586839372166113e-05, + "loss": 0.57, + "step": 6416 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 0.19255201518535614, + "learning_rate": 5.585870103597596e-05, + "loss": 0.5692, + "step": 6417 + }, + { + "epoch": 1.3193545071435913, + "grad_norm": 0.1922633796930313, + "learning_rate": 5.584900781533334e-05, + "loss": 0.5675, + "step": 6418 + }, + { + "epoch": 1.3195600781169698, + "grad_norm": 0.19982829689979553, + "learning_rate": 5.5839314060210826e-05, + "loss": 0.5711, + "step": 6419 + }, + { + "epoch": 1.3197656490903484, + "grad_norm": 0.19519644975662231, + "learning_rate": 5.582961977108598e-05, + "loss": 0.5645, + "step": 6420 + }, + { + "epoch": 1.319971220063727, + "grad_norm": 0.19568218290805817, + "learning_rate": 5.5819924948436374e-05, + "loss": 0.5638, + "step": 6421 + }, + { + "epoch": 1.3201767910371056, + "grad_norm": 0.1948254555463791, + "learning_rate": 5.581022959273963e-05, + "loss": 0.5511, + "step": 6422 + }, + { + "epoch": 1.3203823620104842, + "grad_norm": 0.19327300786972046, + "learning_rate": 5.580053370447341e-05, + "loss": 0.523, + "step": 6423 + }, + { + "epoch": 1.3205879329838628, + "grad_norm": 0.19158729910850525, + "learning_rate": 5.5790837284115365e-05, + "loss": 0.5628, + "step": 6424 + }, + { + "epoch": 1.3207935039572412, + "grad_norm": 0.2012944519519806, + "learning_rate": 5.578114033214322e-05, + "loss": 0.5486, + "step": 6425 + }, + { + "epoch": 1.3209990749306197, + "grad_norm": 0.19401337206363678, + "learning_rate": 5.577144284903466e-05, + "loss": 0.569, + "step": 6426 + }, + { + "epoch": 1.3212046459039983, + "grad_norm": 0.19512306153774261, + "learning_rate": 5.576174483526748e-05, + "loss": 0.5581, + "step": 6427 + }, + { + "epoch": 1.321410216877377, + "grad_norm": 0.18876834213733673, + "learning_rate": 5.5752046291319415e-05, + "loss": 0.5591, + "step": 6428 + }, + { + "epoch": 1.3216157878507555, + "grad_norm": 0.19513283669948578, + "learning_rate": 5.574234721766829e-05, + "loss": 0.57, + "step": 6429 + }, + { + "epoch": 1.321821358824134, + "grad_norm": 0.16624127328395844, + "learning_rate": 5.5732647614791933e-05, + "loss": 0.5405, + "step": 6430 + }, + { + "epoch": 1.3220269297975125, + "grad_norm": 0.16485817730426788, + "learning_rate": 5.572294748316818e-05, + "loss": 0.54, + "step": 6431 + }, + { + "epoch": 1.322232500770891, + "grad_norm": 0.16315220296382904, + "learning_rate": 5.571324682327493e-05, + "loss": 0.5326, + "step": 6432 + }, + { + "epoch": 1.3224380717442696, + "grad_norm": 0.17077341675758362, + "learning_rate": 5.570354563559009e-05, + "loss": 0.5464, + "step": 6433 + }, + { + "epoch": 1.3226436427176482, + "grad_norm": 0.19310691952705383, + "learning_rate": 5.569384392059158e-05, + "loss": 0.5544, + "step": 6434 + }, + { + "epoch": 1.3228492136910268, + "grad_norm": 0.19178032875061035, + "learning_rate": 5.568414167875736e-05, + "loss": 0.5595, + "step": 6435 + }, + { + "epoch": 1.3230547846644054, + "grad_norm": 0.19363771378993988, + "learning_rate": 5.567443891056542e-05, + "loss": 0.5565, + "step": 6436 + }, + { + "epoch": 1.323260355637784, + "grad_norm": 0.16950379312038422, + "learning_rate": 5.566473561649376e-05, + "loss": 0.5465, + "step": 6437 + }, + { + "epoch": 1.3234659266111626, + "grad_norm": 0.15700620412826538, + "learning_rate": 5.565503179702043e-05, + "loss": 0.5377, + "step": 6438 + }, + { + "epoch": 1.3236714975845412, + "grad_norm": 0.16397301852703094, + "learning_rate": 5.564532745262348e-05, + "loss": 0.536, + "step": 6439 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 0.20148152112960815, + "learning_rate": 5.5635622583781e-05, + "loss": 0.5598, + "step": 6440 + }, + { + "epoch": 1.3240826395312981, + "grad_norm": 0.16813023388385773, + "learning_rate": 5.562591719097112e-05, + "loss": 0.5117, + "step": 6441 + }, + { + "epoch": 1.3242882105046767, + "grad_norm": 0.15760543942451477, + "learning_rate": 5.5616211274671956e-05, + "loss": 0.5487, + "step": 6442 + }, + { + "epoch": 1.3244937814780553, + "grad_norm": 0.18859198689460754, + "learning_rate": 5.5606504835361675e-05, + "loss": 0.5293, + "step": 6443 + }, + { + "epoch": 1.3246993524514339, + "grad_norm": 0.19250252842903137, + "learning_rate": 5.559679787351849e-05, + "loss": 0.5722, + "step": 6444 + }, + { + "epoch": 1.3249049234248125, + "grad_norm": 0.1938043236732483, + "learning_rate": 5.558709038962061e-05, + "loss": 0.553, + "step": 6445 + }, + { + "epoch": 1.3251104943981908, + "grad_norm": 0.19342714548110962, + "learning_rate": 5.557738238414624e-05, + "loss": 0.5467, + "step": 6446 + }, + { + "epoch": 1.3253160653715694, + "grad_norm": 0.20176750421524048, + "learning_rate": 5.556767385757371e-05, + "loss": 0.5503, + "step": 6447 + }, + { + "epoch": 1.325521636344948, + "grad_norm": 0.19387808442115784, + "learning_rate": 5.555796481038127e-05, + "loss": 0.5651, + "step": 6448 + }, + { + "epoch": 1.3257272073183266, + "grad_norm": 0.17772021889686584, + "learning_rate": 5.5548255243047236e-05, + "loss": 0.506, + "step": 6449 + }, + { + "epoch": 1.3259327782917052, + "grad_norm": 0.1652149111032486, + "learning_rate": 5.553854515604998e-05, + "loss": 0.5591, + "step": 6450 + }, + { + "epoch": 1.3261383492650838, + "grad_norm": 0.19004401564598083, + "learning_rate": 5.552883454986786e-05, + "loss": 0.5616, + "step": 6451 + }, + { + "epoch": 1.3263439202384624, + "grad_norm": 0.1958709955215454, + "learning_rate": 5.551912342497929e-05, + "loss": 0.5523, + "step": 6452 + }, + { + "epoch": 1.326549491211841, + "grad_norm": 0.18773847818374634, + "learning_rate": 5.550941178186265e-05, + "loss": 0.5625, + "step": 6453 + }, + { + "epoch": 1.3267550621852195, + "grad_norm": 0.16042830049991608, + "learning_rate": 5.549969962099643e-05, + "loss": 0.5096, + "step": 6454 + }, + { + "epoch": 1.3269606331585981, + "grad_norm": 0.1585341989994049, + "learning_rate": 5.548998694285908e-05, + "loss": 0.5587, + "step": 6455 + }, + { + "epoch": 1.3271662041319765, + "grad_norm": 0.18803685903549194, + "learning_rate": 5.54802737479291e-05, + "loss": 0.5649, + "step": 6456 + }, + { + "epoch": 1.327371775105355, + "grad_norm": 0.1625043749809265, + "learning_rate": 5.5470560036685025e-05, + "loss": 0.5228, + "step": 6457 + }, + { + "epoch": 1.3275773460787337, + "grad_norm": 0.1575174331665039, + "learning_rate": 5.54608458096054e-05, + "loss": 0.5426, + "step": 6458 + }, + { + "epoch": 1.3277829170521123, + "grad_norm": 0.19953930377960205, + "learning_rate": 5.545113106716877e-05, + "loss": 0.5559, + "step": 6459 + }, + { + "epoch": 1.3279884880254909, + "grad_norm": 0.2004413902759552, + "learning_rate": 5.5441415809853786e-05, + "loss": 0.5624, + "step": 6460 + }, + { + "epoch": 1.3281940589988694, + "grad_norm": 0.18838083744049072, + "learning_rate": 5.543170003813903e-05, + "loss": 0.5626, + "step": 6461 + }, + { + "epoch": 1.3283996299722478, + "grad_norm": 0.1713562160730362, + "learning_rate": 5.542198375250319e-05, + "loss": 0.5454, + "step": 6462 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 0.13531114161014557, + "learning_rate": 5.5412266953424905e-05, + "loss": 0.5289, + "step": 6463 + }, + { + "epoch": 1.328810771919005, + "grad_norm": 0.16264608502388, + "learning_rate": 5.540254964138291e-05, + "loss": 0.5403, + "step": 6464 + }, + { + "epoch": 1.3290163428923836, + "grad_norm": 0.16079317033290863, + "learning_rate": 5.5392831816855915e-05, + "loss": 0.5081, + "step": 6465 + }, + { + "epoch": 1.3292219138657622, + "grad_norm": 0.15615412592887878, + "learning_rate": 5.538311348032266e-05, + "loss": 0.558, + "step": 6466 + }, + { + "epoch": 1.3294274848391407, + "grad_norm": 0.18808799982070923, + "learning_rate": 5.5373394632261934e-05, + "loss": 0.5462, + "step": 6467 + }, + { + "epoch": 1.3296330558125193, + "grad_norm": 0.1914406418800354, + "learning_rate": 5.536367527315255e-05, + "loss": 0.5668, + "step": 6468 + }, + { + "epoch": 1.329838626785898, + "grad_norm": 0.27818214893341064, + "learning_rate": 5.5353955403473325e-05, + "loss": 0.5524, + "step": 6469 + }, + { + "epoch": 1.3300441977592765, + "grad_norm": 0.19103524088859558, + "learning_rate": 5.53442350237031e-05, + "loss": 0.577, + "step": 6470 + }, + { + "epoch": 1.3302497687326549, + "grad_norm": 0.17256119847297668, + "learning_rate": 5.533451413432077e-05, + "loss": 0.5307, + "step": 6471 + }, + { + "epoch": 1.3304553397060335, + "grad_norm": 0.1665564626455307, + "learning_rate": 5.532479273580523e-05, + "loss": 0.5791, + "step": 6472 + }, + { + "epoch": 1.330660910679412, + "grad_norm": 0.16080975532531738, + "learning_rate": 5.531507082863542e-05, + "loss": 0.5073, + "step": 6473 + }, + { + "epoch": 1.3308664816527906, + "grad_norm": 0.16216245293617249, + "learning_rate": 5.5305348413290264e-05, + "loss": 0.5609, + "step": 6474 + }, + { + "epoch": 1.3310720526261692, + "grad_norm": 0.16360749304294586, + "learning_rate": 5.529562549024878e-05, + "loss": 0.5257, + "step": 6475 + }, + { + "epoch": 1.3312776235995478, + "grad_norm": 0.1617291420698166, + "learning_rate": 5.528590205998994e-05, + "loss": 0.5577, + "step": 6476 + }, + { + "epoch": 1.3314831945729262, + "grad_norm": 0.1931338757276535, + "learning_rate": 5.527617812299278e-05, + "loss": 0.5589, + "step": 6477 + }, + { + "epoch": 1.3316887655463048, + "grad_norm": 0.18447865545749664, + "learning_rate": 5.526645367973636e-05, + "loss": 0.5692, + "step": 6478 + }, + { + "epoch": 1.3318943365196834, + "grad_norm": 0.16455183923244476, + "learning_rate": 5.525672873069975e-05, + "loss": 0.5236, + "step": 6479 + }, + { + "epoch": 1.332099907493062, + "grad_norm": 0.15722709894180298, + "learning_rate": 5.524700327636206e-05, + "loss": 0.5514, + "step": 6480 + }, + { + "epoch": 1.3323054784664405, + "grad_norm": 0.18713107705116272, + "learning_rate": 5.5237277317202405e-05, + "loss": 0.5401, + "step": 6481 + }, + { + "epoch": 1.3325110494398191, + "grad_norm": 0.19015434384346008, + "learning_rate": 5.522755085369994e-05, + "loss": 0.5464, + "step": 6482 + }, + { + "epoch": 1.3327166204131977, + "grad_norm": 0.18974623084068298, + "learning_rate": 5.5217823886333854e-05, + "loss": 0.5409, + "step": 6483 + }, + { + "epoch": 1.3329221913865763, + "grad_norm": 0.19141395390033722, + "learning_rate": 5.520809641558334e-05, + "loss": 0.5512, + "step": 6484 + }, + { + "epoch": 1.333127762359955, + "grad_norm": 0.19724808633327484, + "learning_rate": 5.519836844192763e-05, + "loss": 0.5687, + "step": 6485 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.18789160251617432, + "learning_rate": 5.518863996584599e-05, + "loss": 0.5373, + "step": 6486 + }, + { + "epoch": 1.3335389043067118, + "grad_norm": 0.198290154337883, + "learning_rate": 5.517891098781766e-05, + "loss": 0.5726, + "step": 6487 + }, + { + "epoch": 1.3337444752800904, + "grad_norm": 0.19129502773284912, + "learning_rate": 5.516918150832197e-05, + "loss": 0.547, + "step": 6488 + }, + { + "epoch": 1.333950046253469, + "grad_norm": 0.16152769327163696, + "learning_rate": 5.515945152783824e-05, + "loss": 0.5004, + "step": 6489 + }, + { + "epoch": 1.3341556172268476, + "grad_norm": 0.1580476313829422, + "learning_rate": 5.5149721046845824e-05, + "loss": 0.5455, + "step": 6490 + }, + { + "epoch": 1.3343611882002262, + "grad_norm": 0.190731480717659, + "learning_rate": 5.513999006582407e-05, + "loss": 0.566, + "step": 6491 + }, + { + "epoch": 1.3345667591736046, + "grad_norm": 0.1941419392824173, + "learning_rate": 5.513025858525242e-05, + "loss": 0.5748, + "step": 6492 + }, + { + "epoch": 1.3347723301469832, + "grad_norm": 0.20120371878147125, + "learning_rate": 5.512052660561026e-05, + "loss": 0.5662, + "step": 6493 + }, + { + "epoch": 1.3349779011203617, + "grad_norm": 0.2006073296070099, + "learning_rate": 5.511079412737706e-05, + "loss": 0.5741, + "step": 6494 + }, + { + "epoch": 1.3351834720937403, + "grad_norm": 0.18886934220790863, + "learning_rate": 5.510106115103231e-05, + "loss": 0.5534, + "step": 6495 + }, + { + "epoch": 1.335389043067119, + "grad_norm": 0.18579721450805664, + "learning_rate": 5.5091327677055484e-05, + "loss": 0.5403, + "step": 6496 + }, + { + "epoch": 1.3355946140404975, + "grad_norm": 0.1928054839372635, + "learning_rate": 5.50815937059261e-05, + "loss": 0.5666, + "step": 6497 + }, + { + "epoch": 1.335800185013876, + "grad_norm": 0.161499485373497, + "learning_rate": 5.5071859238123714e-05, + "loss": 0.5366, + "step": 6498 + }, + { + "epoch": 1.3360057559872547, + "grad_norm": 0.1295616626739502, + "learning_rate": 5.506212427412791e-05, + "loss": 0.4916, + "step": 6499 + }, + { + "epoch": 1.3362113269606333, + "grad_norm": 0.15952670574188232, + "learning_rate": 5.505238881441827e-05, + "loss": 0.5444, + "step": 6500 + }, + { + "epoch": 1.3364168979340116, + "grad_norm": 0.202559694647789, + "learning_rate": 5.5042652859474414e-05, + "loss": 0.5592, + "step": 6501 + }, + { + "epoch": 1.3366224689073902, + "grad_norm": 0.16196085512638092, + "learning_rate": 5.5032916409776003e-05, + "loss": 0.5164, + "step": 6502 + }, + { + "epoch": 1.3368280398807688, + "grad_norm": 0.1672007143497467, + "learning_rate": 5.502317946580268e-05, + "loss": 0.5319, + "step": 6503 + }, + { + "epoch": 1.3370336108541474, + "grad_norm": 0.16251109540462494, + "learning_rate": 5.501344202803415e-05, + "loss": 0.5215, + "step": 6504 + }, + { + "epoch": 1.337239181827526, + "grad_norm": 0.12841519713401794, + "learning_rate": 5.500370409695014e-05, + "loss": 0.5087, + "step": 6505 + }, + { + "epoch": 1.3374447528009046, + "grad_norm": 0.16203691065311432, + "learning_rate": 5.499396567303039e-05, + "loss": 0.5683, + "step": 6506 + }, + { + "epoch": 1.337650323774283, + "grad_norm": 0.18712860345840454, + "learning_rate": 5.4984226756754664e-05, + "loss": 0.5488, + "step": 6507 + }, + { + "epoch": 1.3378558947476615, + "grad_norm": 0.19168932735919952, + "learning_rate": 5.497448734860274e-05, + "loss": 0.5639, + "step": 6508 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 0.18323485553264618, + "learning_rate": 5.4964747449054464e-05, + "loss": 0.5504, + "step": 6509 + }, + { + "epoch": 1.3382670366944187, + "grad_norm": 0.16930492222309113, + "learning_rate": 5.4955007058589646e-05, + "loss": 0.5296, + "step": 6510 + }, + { + "epoch": 1.3384726076677973, + "grad_norm": 0.16478413343429565, + "learning_rate": 5.494526617768816e-05, + "loss": 0.557, + "step": 6511 + }, + { + "epoch": 1.3386781786411759, + "grad_norm": 0.1620486080646515, + "learning_rate": 5.4935524806829885e-05, + "loss": 0.5328, + "step": 6512 + }, + { + "epoch": 1.3388837496145545, + "grad_norm": 0.15588897466659546, + "learning_rate": 5.4925782946494754e-05, + "loss": 0.5307, + "step": 6513 + }, + { + "epoch": 1.339089320587933, + "grad_norm": 0.16102923452854156, + "learning_rate": 5.4916040597162677e-05, + "loss": 0.5318, + "step": 6514 + }, + { + "epoch": 1.3392948915613117, + "grad_norm": 0.13110311329364777, + "learning_rate": 5.490629775931364e-05, + "loss": 0.515, + "step": 6515 + }, + { + "epoch": 1.3395004625346902, + "grad_norm": 0.1619655340909958, + "learning_rate": 5.4896554433427606e-05, + "loss": 0.5477, + "step": 6516 + }, + { + "epoch": 1.3397060335080686, + "grad_norm": 0.20572912693023682, + "learning_rate": 5.48868106199846e-05, + "loss": 0.5571, + "step": 6517 + }, + { + "epoch": 1.3399116044814472, + "grad_norm": 0.156040221452713, + "learning_rate": 5.487706631946464e-05, + "loss": 0.5231, + "step": 6518 + }, + { + "epoch": 1.3401171754548258, + "grad_norm": 0.16056253015995026, + "learning_rate": 5.486732153234778e-05, + "loss": 0.5529, + "step": 6519 + }, + { + "epoch": 1.3403227464282044, + "grad_norm": 0.19152522087097168, + "learning_rate": 5.485757625911413e-05, + "loss": 0.545, + "step": 6520 + }, + { + "epoch": 1.340528317401583, + "grad_norm": 0.185153067111969, + "learning_rate": 5.484783050024376e-05, + "loss": 0.5545, + "step": 6521 + }, + { + "epoch": 1.3407338883749613, + "grad_norm": 0.18557578325271606, + "learning_rate": 5.4838084256216796e-05, + "loss": 0.5631, + "step": 6522 + }, + { + "epoch": 1.34093945934834, + "grad_norm": 0.1944609433412552, + "learning_rate": 5.482833752751343e-05, + "loss": 0.5673, + "step": 6523 + }, + { + "epoch": 1.3411450303217185, + "grad_norm": 0.1916920244693756, + "learning_rate": 5.4818590314613796e-05, + "loss": 0.5406, + "step": 6524 + }, + { + "epoch": 1.341350601295097, + "grad_norm": 0.199026957154274, + "learning_rate": 5.48088426179981e-05, + "loss": 0.5614, + "step": 6525 + }, + { + "epoch": 1.3415561722684757, + "grad_norm": 0.19180314242839813, + "learning_rate": 5.479909443814658e-05, + "loss": 0.5676, + "step": 6526 + }, + { + "epoch": 1.3417617432418543, + "grad_norm": 0.18850663304328918, + "learning_rate": 5.478934577553949e-05, + "loss": 0.5644, + "step": 6527 + }, + { + "epoch": 1.3419673142152329, + "grad_norm": 0.19104434549808502, + "learning_rate": 5.477959663065709e-05, + "loss": 0.5517, + "step": 6528 + }, + { + "epoch": 1.3421728851886114, + "grad_norm": 0.16571475565433502, + "learning_rate": 5.476984700397966e-05, + "loss": 0.5328, + "step": 6529 + }, + { + "epoch": 1.34237845616199, + "grad_norm": 0.1614765077829361, + "learning_rate": 5.4760096895987535e-05, + "loss": 0.5574, + "step": 6530 + }, + { + "epoch": 1.3425840271353686, + "grad_norm": 0.18632696568965912, + "learning_rate": 5.4750346307161064e-05, + "loss": 0.5605, + "step": 6531 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 0.1589028239250183, + "learning_rate": 5.474059523798059e-05, + "loss": 0.5214, + "step": 6532 + }, + { + "epoch": 1.3429951690821256, + "grad_norm": 0.16524967551231384, + "learning_rate": 5.473084368892653e-05, + "loss": 0.565, + "step": 6533 + }, + { + "epoch": 1.3432007400555042, + "grad_norm": 0.1631617695093155, + "learning_rate": 5.4721091660479276e-05, + "loss": 0.5324, + "step": 6534 + }, + { + "epoch": 1.3434063110288827, + "grad_norm": 0.1608559638261795, + "learning_rate": 5.471133915311927e-05, + "loss": 0.5469, + "step": 6535 + }, + { + "epoch": 1.3436118820022613, + "grad_norm": 0.1971094310283661, + "learning_rate": 5.470158616732698e-05, + "loss": 0.5692, + "step": 6536 + }, + { + "epoch": 1.3438174529756397, + "grad_norm": 0.19706624746322632, + "learning_rate": 5.469183270358288e-05, + "loss": 0.5694, + "step": 6537 + }, + { + "epoch": 1.3440230239490183, + "grad_norm": 0.18402022123336792, + "learning_rate": 5.468207876236748e-05, + "loss": 0.5478, + "step": 6538 + }, + { + "epoch": 1.3442285949223969, + "grad_norm": 0.16580908000469208, + "learning_rate": 5.467232434416132e-05, + "loss": 0.5444, + "step": 6539 + }, + { + "epoch": 1.3444341658957755, + "grad_norm": 0.1564161777496338, + "learning_rate": 5.466256944944494e-05, + "loss": 0.5379, + "step": 6540 + }, + { + "epoch": 1.344639736869154, + "grad_norm": 0.19156378507614136, + "learning_rate": 5.465281407869894e-05, + "loss": 0.5479, + "step": 6541 + }, + { + "epoch": 1.3448453078425326, + "grad_norm": 0.18408456444740295, + "learning_rate": 5.46430582324039e-05, + "loss": 0.5402, + "step": 6542 + }, + { + "epoch": 1.3450508788159112, + "grad_norm": 0.18590892851352692, + "learning_rate": 5.463330191104045e-05, + "loss": 0.5345, + "step": 6543 + }, + { + "epoch": 1.3452564497892898, + "grad_norm": 0.2050226926803589, + "learning_rate": 5.4623545115089246e-05, + "loss": 0.5731, + "step": 6544 + }, + { + "epoch": 1.3454620207626684, + "grad_norm": 0.19850295782089233, + "learning_rate": 5.461378784503095e-05, + "loss": 0.5583, + "step": 6545 + }, + { + "epoch": 1.345667591736047, + "grad_norm": 0.16567668318748474, + "learning_rate": 5.4604030101346255e-05, + "loss": 0.531, + "step": 6546 + }, + { + "epoch": 1.3458731627094254, + "grad_norm": 0.15176017582416534, + "learning_rate": 5.4594271884515884e-05, + "loss": 0.5567, + "step": 6547 + }, + { + "epoch": 1.346078733682804, + "grad_norm": 0.19408267736434937, + "learning_rate": 5.45845131950206e-05, + "loss": 0.5601, + "step": 6548 + }, + { + "epoch": 1.3462843046561825, + "grad_norm": 0.18972966074943542, + "learning_rate": 5.457475403334114e-05, + "loss": 0.574, + "step": 6549 + }, + { + "epoch": 1.3464898756295611, + "grad_norm": 0.19591477513313293, + "learning_rate": 5.456499439995829e-05, + "loss": 0.5559, + "step": 6550 + }, + { + "epoch": 1.3466954466029397, + "grad_norm": 0.18834471702575684, + "learning_rate": 5.455523429535289e-05, + "loss": 0.5537, + "step": 6551 + }, + { + "epoch": 1.3469010175763183, + "grad_norm": 0.1918981820344925, + "learning_rate": 5.454547372000575e-05, + "loss": 0.5594, + "step": 6552 + }, + { + "epoch": 1.3471065885496967, + "grad_norm": 0.16592934727668762, + "learning_rate": 5.453571267439773e-05, + "loss": 0.509, + "step": 6553 + }, + { + "epoch": 1.3473121595230753, + "grad_norm": 0.16087022423744202, + "learning_rate": 5.4525951159009726e-05, + "loss": 0.5429, + "step": 6554 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 0.19623617827892303, + "learning_rate": 5.4516189174322635e-05, + "loss": 0.5526, + "step": 6555 + }, + { + "epoch": 1.3477233014698324, + "grad_norm": 0.19142059981822968, + "learning_rate": 5.450642672081737e-05, + "loss": 0.5726, + "step": 6556 + }, + { + "epoch": 1.347928872443211, + "grad_norm": 0.1905898004770279, + "learning_rate": 5.44966637989749e-05, + "loss": 0.556, + "step": 6557 + }, + { + "epoch": 1.3481344434165896, + "grad_norm": 0.19187632203102112, + "learning_rate": 5.448690040927618e-05, + "loss": 0.5517, + "step": 6558 + }, + { + "epoch": 1.3483400143899682, + "grad_norm": 0.19854268431663513, + "learning_rate": 5.447713655220224e-05, + "loss": 0.5642, + "step": 6559 + }, + { + "epoch": 1.3485455853633468, + "grad_norm": 0.18761958181858063, + "learning_rate": 5.446737222823405e-05, + "loss": 0.5595, + "step": 6560 + }, + { + "epoch": 1.3487511563367254, + "grad_norm": 0.19532154500484467, + "learning_rate": 5.445760743785271e-05, + "loss": 0.5764, + "step": 6561 + }, + { + "epoch": 1.3489567273101037, + "grad_norm": 0.20847441256046295, + "learning_rate": 5.444784218153924e-05, + "loss": 0.5326, + "step": 6562 + }, + { + "epoch": 1.3491622982834823, + "grad_norm": 0.2053038477897644, + "learning_rate": 5.4438076459774746e-05, + "loss": 0.54, + "step": 6563 + }, + { + "epoch": 1.349367869256861, + "grad_norm": 0.1965019851922989, + "learning_rate": 5.4428310273040335e-05, + "loss": 0.5454, + "step": 6564 + }, + { + "epoch": 1.3495734402302395, + "grad_norm": 0.19706155359745026, + "learning_rate": 5.4418543621817165e-05, + "loss": 0.5847, + "step": 6565 + }, + { + "epoch": 1.349779011203618, + "grad_norm": 0.18815022706985474, + "learning_rate": 5.440877650658636e-05, + "loss": 0.5541, + "step": 6566 + }, + { + "epoch": 1.3499845821769967, + "grad_norm": 0.16428446769714355, + "learning_rate": 5.43990089278291e-05, + "loss": 0.5459, + "step": 6567 + }, + { + "epoch": 1.350190153150375, + "grad_norm": 0.16542398929595947, + "learning_rate": 5.438924088602662e-05, + "loss": 0.5646, + "step": 6568 + }, + { + "epoch": 1.3503957241237536, + "grad_norm": 0.15714940428733826, + "learning_rate": 5.437947238166012e-05, + "loss": 0.5173, + "step": 6569 + }, + { + "epoch": 1.3506012950971322, + "grad_norm": 0.15711595118045807, + "learning_rate": 5.436970341521084e-05, + "loss": 0.5552, + "step": 6570 + }, + { + "epoch": 1.3508068660705108, + "grad_norm": 0.1985914558172226, + "learning_rate": 5.4359933987160086e-05, + "loss": 0.5668, + "step": 6571 + }, + { + "epoch": 1.3510124370438894, + "grad_norm": 0.19462761282920837, + "learning_rate": 5.435016409798913e-05, + "loss": 0.5585, + "step": 6572 + }, + { + "epoch": 1.351218008017268, + "grad_norm": 0.19194667041301727, + "learning_rate": 5.434039374817929e-05, + "loss": 0.5631, + "step": 6573 + }, + { + "epoch": 1.3514235789906466, + "grad_norm": 0.19980405271053314, + "learning_rate": 5.43306229382119e-05, + "loss": 0.5535, + "step": 6574 + }, + { + "epoch": 1.3516291499640252, + "grad_norm": 0.193598210811615, + "learning_rate": 5.432085166856834e-05, + "loss": 0.5606, + "step": 6575 + }, + { + "epoch": 1.3518347209374038, + "grad_norm": 0.16227704286575317, + "learning_rate": 5.431107993972999e-05, + "loss": 0.5169, + "step": 6576 + }, + { + "epoch": 1.3520402919107821, + "grad_norm": 0.16246087849140167, + "learning_rate": 5.430130775217823e-05, + "loss": 0.5548, + "step": 6577 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 0.16693639755249023, + "learning_rate": 5.4291535106394524e-05, + "loss": 0.5287, + "step": 6578 + }, + { + "epoch": 1.3524514338575393, + "grad_norm": 0.16185717284679413, + "learning_rate": 5.4281762002860304e-05, + "loss": 0.5556, + "step": 6579 + }, + { + "epoch": 1.3526570048309179, + "grad_norm": 0.19650043547153473, + "learning_rate": 5.427198844205706e-05, + "loss": 0.5632, + "step": 6580 + }, + { + "epoch": 1.3528625758042965, + "grad_norm": 0.16057594120502472, + "learning_rate": 5.426221442446627e-05, + "loss": 0.5163, + "step": 6581 + }, + { + "epoch": 1.353068146777675, + "grad_norm": 0.15515869855880737, + "learning_rate": 5.425243995056949e-05, + "loss": 0.5588, + "step": 6582 + }, + { + "epoch": 1.3532737177510534, + "grad_norm": 0.19516292214393616, + "learning_rate": 5.4242665020848224e-05, + "loss": 0.5814, + "step": 6583 + }, + { + "epoch": 1.353479288724432, + "grad_norm": 0.1625499576330185, + "learning_rate": 5.423288963578405e-05, + "loss": 0.5264, + "step": 6584 + }, + { + "epoch": 1.3536848596978106, + "grad_norm": 0.16830846667289734, + "learning_rate": 5.422311379585857e-05, + "loss": 0.5258, + "step": 6585 + }, + { + "epoch": 1.3538904306711892, + "grad_norm": 0.19009056687355042, + "learning_rate": 5.4213337501553374e-05, + "loss": 0.5549, + "step": 6586 + }, + { + "epoch": 1.3540960016445678, + "grad_norm": 0.18671362102031708, + "learning_rate": 5.4203560753350115e-05, + "loss": 0.5482, + "step": 6587 + }, + { + "epoch": 1.3543015726179464, + "grad_norm": 0.1931658238172531, + "learning_rate": 5.419378355173042e-05, + "loss": 0.5665, + "step": 6588 + }, + { + "epoch": 1.354507143591325, + "grad_norm": 0.1925138682126999, + "learning_rate": 5.4184005897175985e-05, + "loss": 0.5649, + "step": 6589 + }, + { + "epoch": 1.3547127145647035, + "grad_norm": 0.1919427365064621, + "learning_rate": 5.41742277901685e-05, + "loss": 0.5425, + "step": 6590 + }, + { + "epoch": 1.3549182855380821, + "grad_norm": 0.19209784269332886, + "learning_rate": 5.416444923118968e-05, + "loss": 0.5561, + "step": 6591 + }, + { + "epoch": 1.3551238565114605, + "grad_norm": 0.17238673567771912, + "learning_rate": 5.415467022072131e-05, + "loss": 0.5302, + "step": 6592 + }, + { + "epoch": 1.355329427484839, + "grad_norm": 0.1562458574771881, + "learning_rate": 5.414489075924512e-05, + "loss": 0.5435, + "step": 6593 + }, + { + "epoch": 1.3555349984582177, + "grad_norm": 0.19020064175128937, + "learning_rate": 5.41351108472429e-05, + "loss": 0.5327, + "step": 6594 + }, + { + "epoch": 1.3557405694315963, + "grad_norm": 0.20159995555877686, + "learning_rate": 5.412533048519646e-05, + "loss": 0.5489, + "step": 6595 + }, + { + "epoch": 1.3559461404049749, + "grad_norm": 0.19280879199504852, + "learning_rate": 5.411554967358765e-05, + "loss": 0.542, + "step": 6596 + }, + { + "epoch": 1.3561517113783534, + "grad_norm": 0.18953213095664978, + "learning_rate": 5.410576841289831e-05, + "loss": 0.5464, + "step": 6597 + }, + { + "epoch": 1.3563572823517318, + "grad_norm": 0.18897344172000885, + "learning_rate": 5.409598670361032e-05, + "loss": 0.5427, + "step": 6598 + }, + { + "epoch": 1.3565628533251104, + "grad_norm": 0.20002910494804382, + "learning_rate": 5.408620454620558e-05, + "loss": 0.5554, + "step": 6599 + }, + { + "epoch": 1.356768424298489, + "grad_norm": 0.18375547230243683, + "learning_rate": 5.4076421941166016e-05, + "loss": 0.555, + "step": 6600 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 0.17289654910564423, + "learning_rate": 5.406663888897355e-05, + "loss": 0.5342, + "step": 6601 + }, + { + "epoch": 1.3571795662452462, + "grad_norm": 0.16519290208816528, + "learning_rate": 5.405685539011017e-05, + "loss": 0.5506, + "step": 6602 + }, + { + "epoch": 1.3573851372186247, + "grad_norm": 0.19404758512973785, + "learning_rate": 5.404707144505786e-05, + "loss": 0.5703, + "step": 6603 + }, + { + "epoch": 1.3575907081920033, + "grad_norm": 0.1909807026386261, + "learning_rate": 5.403728705429864e-05, + "loss": 0.5762, + "step": 6604 + }, + { + "epoch": 1.357796279165382, + "grad_norm": 0.19107364118099213, + "learning_rate": 5.4027502218314505e-05, + "loss": 0.5411, + "step": 6605 + }, + { + "epoch": 1.3580018501387605, + "grad_norm": 0.18892939388751984, + "learning_rate": 5.401771693758754e-05, + "loss": 0.5456, + "step": 6606 + }, + { + "epoch": 1.358207421112139, + "grad_norm": 0.19617542624473572, + "learning_rate": 5.400793121259981e-05, + "loss": 0.5759, + "step": 6607 + }, + { + "epoch": 1.3584129920855175, + "grad_norm": 0.19577234983444214, + "learning_rate": 5.39981450438334e-05, + "loss": 0.5668, + "step": 6608 + }, + { + "epoch": 1.358618563058896, + "grad_norm": 0.21422545611858368, + "learning_rate": 5.3988358431770455e-05, + "loss": 0.5677, + "step": 6609 + }, + { + "epoch": 1.3588241340322746, + "grad_norm": 0.16092784702777863, + "learning_rate": 5.397857137689311e-05, + "loss": 0.5076, + "step": 6610 + }, + { + "epoch": 1.3590297050056532, + "grad_norm": 0.15695548057556152, + "learning_rate": 5.39687838796835e-05, + "loss": 0.5357, + "step": 6611 + }, + { + "epoch": 1.3592352759790318, + "grad_norm": 0.20313376188278198, + "learning_rate": 5.395899594062383e-05, + "loss": 0.5823, + "step": 6612 + }, + { + "epoch": 1.3594408469524102, + "grad_norm": 0.19227701425552368, + "learning_rate": 5.3949207560196306e-05, + "loss": 0.5674, + "step": 6613 + }, + { + "epoch": 1.3596464179257888, + "grad_norm": 0.190741628408432, + "learning_rate": 5.393941873888316e-05, + "loss": 0.548, + "step": 6614 + }, + { + "epoch": 1.3598519888991674, + "grad_norm": 0.19307512044906616, + "learning_rate": 5.3929629477166624e-05, + "loss": 0.5449, + "step": 6615 + }, + { + "epoch": 1.360057559872546, + "grad_norm": 0.19279111921787262, + "learning_rate": 5.3919839775529e-05, + "loss": 0.5505, + "step": 6616 + }, + { + "epoch": 1.3602631308459245, + "grad_norm": 0.1940283179283142, + "learning_rate": 5.391004963445255e-05, + "loss": 0.5564, + "step": 6617 + }, + { + "epoch": 1.3604687018193031, + "grad_norm": 0.22000883519649506, + "learning_rate": 5.39002590544196e-05, + "loss": 0.5643, + "step": 6618 + }, + { + "epoch": 1.3606742727926817, + "grad_norm": 0.1951514333486557, + "learning_rate": 5.3890468035912484e-05, + "loss": 0.5502, + "step": 6619 + }, + { + "epoch": 1.3608798437660603, + "grad_norm": 0.19694966077804565, + "learning_rate": 5.388067657941357e-05, + "loss": 0.5609, + "step": 6620 + }, + { + "epoch": 1.361085414739439, + "grad_norm": 0.165736585855484, + "learning_rate": 5.387088468540522e-05, + "loss": 0.5275, + "step": 6621 + }, + { + "epoch": 1.3612909857128175, + "grad_norm": 0.1606799215078354, + "learning_rate": 5.3861092354369843e-05, + "loss": 0.5503, + "step": 6622 + }, + { + "epoch": 1.3614965566861958, + "grad_norm": 0.1982721984386444, + "learning_rate": 5.385129958678986e-05, + "loss": 0.5561, + "step": 6623 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.20562221109867096, + "learning_rate": 5.384150638314773e-05, + "loss": 0.5675, + "step": 6624 + }, + { + "epoch": 1.361907698632953, + "grad_norm": 0.19149377942085266, + "learning_rate": 5.3831712743925905e-05, + "loss": 0.5675, + "step": 6625 + }, + { + "epoch": 1.3621132696063316, + "grad_norm": 0.19633962213993073, + "learning_rate": 5.382191866960686e-05, + "loss": 0.5566, + "step": 6626 + }, + { + "epoch": 1.3623188405797102, + "grad_norm": 0.19432850182056427, + "learning_rate": 5.381212416067313e-05, + "loss": 0.5525, + "step": 6627 + }, + { + "epoch": 1.3625244115530886, + "grad_norm": 0.18926875293254852, + "learning_rate": 5.380232921760723e-05, + "loss": 0.5573, + "step": 6628 + }, + { + "epoch": 1.3627299825264672, + "grad_norm": 0.16620329022407532, + "learning_rate": 5.379253384089169e-05, + "loss": 0.5206, + "step": 6629 + }, + { + "epoch": 1.3629355534998457, + "grad_norm": 0.1583135575056076, + "learning_rate": 5.378273803100913e-05, + "loss": 0.5458, + "step": 6630 + }, + { + "epoch": 1.3631411244732243, + "grad_norm": 0.19092857837677002, + "learning_rate": 5.3772941788442106e-05, + "loss": 0.5782, + "step": 6631 + }, + { + "epoch": 1.363346695446603, + "grad_norm": 0.19434650242328644, + "learning_rate": 5.3763145113673234e-05, + "loss": 0.5743, + "step": 6632 + }, + { + "epoch": 1.3635522664199815, + "grad_norm": 0.19643783569335938, + "learning_rate": 5.375334800718518e-05, + "loss": 0.5689, + "step": 6633 + }, + { + "epoch": 1.36375783739336, + "grad_norm": 0.16674213111400604, + "learning_rate": 5.374355046946057e-05, + "loss": 0.5268, + "step": 6634 + }, + { + "epoch": 1.3639634083667387, + "grad_norm": 0.16963227093219757, + "learning_rate": 5.3733752500982095e-05, + "loss": 0.5625, + "step": 6635 + }, + { + "epoch": 1.3641689793401173, + "grad_norm": 0.18819878995418549, + "learning_rate": 5.372395410223246e-05, + "loss": 0.5633, + "step": 6636 + }, + { + "epoch": 1.3643745503134959, + "grad_norm": 0.19265903532505035, + "learning_rate": 5.371415527369439e-05, + "loss": 0.5459, + "step": 6637 + }, + { + "epoch": 1.3645801212868742, + "grad_norm": 0.19311292469501495, + "learning_rate": 5.370435601585061e-05, + "loss": 0.5648, + "step": 6638 + }, + { + "epoch": 1.3647856922602528, + "grad_norm": 0.19344937801361084, + "learning_rate": 5.3694556329183904e-05, + "loss": 0.5701, + "step": 6639 + }, + { + "epoch": 1.3649912632336314, + "grad_norm": 0.24478478729724884, + "learning_rate": 5.368475621417703e-05, + "loss": 0.5532, + "step": 6640 + }, + { + "epoch": 1.36519683420701, + "grad_norm": 0.19150310754776, + "learning_rate": 5.367495567131282e-05, + "loss": 0.5471, + "step": 6641 + }, + { + "epoch": 1.3654024051803886, + "grad_norm": 0.19197209179401398, + "learning_rate": 5.3665154701074097e-05, + "loss": 0.5406, + "step": 6642 + }, + { + "epoch": 1.3656079761537672, + "grad_norm": 0.19130434095859528, + "learning_rate": 5.365535330394368e-05, + "loss": 0.5363, + "step": 6643 + }, + { + "epoch": 1.3658135471271455, + "grad_norm": 0.19257521629333496, + "learning_rate": 5.3645551480404487e-05, + "loss": 0.5547, + "step": 6644 + }, + { + "epoch": 1.3660191181005241, + "grad_norm": 0.18824981153011322, + "learning_rate": 5.363574923093936e-05, + "loss": 0.5723, + "step": 6645 + }, + { + "epoch": 1.3662246890739027, + "grad_norm": 0.19089485704898834, + "learning_rate": 5.362594655603123e-05, + "loss": 0.536, + "step": 6646 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 0.1918558031320572, + "learning_rate": 5.3616143456163055e-05, + "loss": 0.5404, + "step": 6647 + }, + { + "epoch": 1.3666358310206599, + "grad_norm": 0.199978306889534, + "learning_rate": 5.3606339931817756e-05, + "loss": 0.5633, + "step": 6648 + }, + { + "epoch": 1.3668414019940385, + "grad_norm": 0.1935882270336151, + "learning_rate": 5.35965359834783e-05, + "loss": 0.5777, + "step": 6649 + }, + { + "epoch": 1.367046972967417, + "grad_norm": 0.19100281596183777, + "learning_rate": 5.358673161162771e-05, + "loss": 0.547, + "step": 6650 + }, + { + "epoch": 1.3672525439407957, + "grad_norm": 0.19073952734470367, + "learning_rate": 5.357692681674898e-05, + "loss": 0.5613, + "step": 6651 + }, + { + "epoch": 1.3674581149141742, + "grad_norm": 0.16322961449623108, + "learning_rate": 5.356712159932516e-05, + "loss": 0.5327, + "step": 6652 + }, + { + "epoch": 1.3676636858875526, + "grad_norm": 0.1632666438817978, + "learning_rate": 5.35573159598393e-05, + "loss": 0.5418, + "step": 6653 + }, + { + "epoch": 1.3678692568609312, + "grad_norm": 0.1909777820110321, + "learning_rate": 5.3547509898774476e-05, + "loss": 0.5595, + "step": 6654 + }, + { + "epoch": 1.3680748278343098, + "grad_norm": 0.19034305214881897, + "learning_rate": 5.353770341661378e-05, + "loss": 0.5576, + "step": 6655 + }, + { + "epoch": 1.3682803988076884, + "grad_norm": 0.19562803208827972, + "learning_rate": 5.352789651384036e-05, + "loss": 0.5549, + "step": 6656 + }, + { + "epoch": 1.368485969781067, + "grad_norm": 0.2044394165277481, + "learning_rate": 5.351808919093733e-05, + "loss": 0.5686, + "step": 6657 + }, + { + "epoch": 1.3686915407544455, + "grad_norm": 0.19082361459732056, + "learning_rate": 5.350828144838786e-05, + "loss": 0.5626, + "step": 6658 + }, + { + "epoch": 1.368897111727824, + "grad_norm": 0.21942925453186035, + "learning_rate": 5.349847328667514e-05, + "loss": 0.583, + "step": 6659 + }, + { + "epoch": 1.3691026827012025, + "grad_norm": 0.19300974905490875, + "learning_rate": 5.348866470628235e-05, + "loss": 0.5538, + "step": 6660 + }, + { + "epoch": 1.369308253674581, + "grad_norm": 0.1846531480550766, + "learning_rate": 5.347885570769273e-05, + "loss": 0.5331, + "step": 6661 + }, + { + "epoch": 1.3695138246479597, + "grad_norm": 0.19142849743366241, + "learning_rate": 5.346904629138953e-05, + "loss": 0.5606, + "step": 6662 + }, + { + "epoch": 1.3697193956213383, + "grad_norm": 0.19237980246543884, + "learning_rate": 5.3459236457856e-05, + "loss": 0.5426, + "step": 6663 + }, + { + "epoch": 1.3699249665947169, + "grad_norm": 0.20076246559619904, + "learning_rate": 5.344942620757541e-05, + "loss": 0.5676, + "step": 6664 + }, + { + "epoch": 1.3701305375680954, + "grad_norm": 0.193067267537117, + "learning_rate": 5.34396155410311e-05, + "loss": 0.557, + "step": 6665 + }, + { + "epoch": 1.370336108541474, + "grad_norm": 0.19357764720916748, + "learning_rate": 5.342980445870637e-05, + "loss": 0.5676, + "step": 6666 + }, + { + "epoch": 1.3705416795148526, + "grad_norm": 0.16621150076389313, + "learning_rate": 5.341999296108457e-05, + "loss": 0.526, + "step": 6667 + }, + { + "epoch": 1.370747250488231, + "grad_norm": 0.13069656491279602, + "learning_rate": 5.341018104864909e-05, + "loss": 0.5275, + "step": 6668 + }, + { + "epoch": 1.3709528214616096, + "grad_norm": 0.16230368614196777, + "learning_rate": 5.3400368721883284e-05, + "loss": 0.5518, + "step": 6669 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 0.2009955644607544, + "learning_rate": 5.339055598127059e-05, + "loss": 0.5503, + "step": 6670 + }, + { + "epoch": 1.3713639634083667, + "grad_norm": 0.18965907394886017, + "learning_rate": 5.33807428272944e-05, + "loss": 0.566, + "step": 6671 + }, + { + "epoch": 1.3715695343817453, + "grad_norm": 0.1938343495130539, + "learning_rate": 5.3370929260438196e-05, + "loss": 0.5559, + "step": 6672 + }, + { + "epoch": 1.371775105355124, + "grad_norm": 0.17998439073562622, + "learning_rate": 5.336111528118543e-05, + "loss": 0.515, + "step": 6673 + }, + { + "epoch": 1.3719806763285023, + "grad_norm": 0.16804425418376923, + "learning_rate": 5.335130089001958e-05, + "loss": 0.5192, + "step": 6674 + }, + { + "epoch": 1.3721862473018809, + "grad_norm": 0.16128107905387878, + "learning_rate": 5.3341486087424194e-05, + "loss": 0.5566, + "step": 6675 + }, + { + "epoch": 1.3723918182752595, + "grad_norm": 0.1895219087600708, + "learning_rate": 5.333167087388276e-05, + "loss": 0.5678, + "step": 6676 + }, + { + "epoch": 1.372597389248638, + "grad_norm": 0.18738722801208496, + "learning_rate": 5.3321855249878845e-05, + "loss": 0.5647, + "step": 6677 + }, + { + "epoch": 1.3728029602220166, + "grad_norm": 0.19784080982208252, + "learning_rate": 5.331203921589602e-05, + "loss": 0.5661, + "step": 6678 + }, + { + "epoch": 1.3730085311953952, + "grad_norm": 0.17455421388149261, + "learning_rate": 5.3302222772417875e-05, + "loss": 0.5411, + "step": 6679 + }, + { + "epoch": 1.3732141021687738, + "grad_norm": 0.13297952711582184, + "learning_rate": 5.329240591992803e-05, + "loss": 0.532, + "step": 6680 + }, + { + "epoch": 1.3734196731421524, + "grad_norm": 0.16244389116764069, + "learning_rate": 5.328258865891008e-05, + "loss": 0.5617, + "step": 6681 + }, + { + "epoch": 1.373625244115531, + "grad_norm": 0.19854487478733063, + "learning_rate": 5.3272770989847724e-05, + "loss": 0.5724, + "step": 6682 + }, + { + "epoch": 1.3738308150889094, + "grad_norm": 0.19856125116348267, + "learning_rate": 5.32629529132246e-05, + "loss": 0.588, + "step": 6683 + }, + { + "epoch": 1.374036386062288, + "grad_norm": 0.19242699444293976, + "learning_rate": 5.32531344295244e-05, + "loss": 0.5468, + "step": 6684 + }, + { + "epoch": 1.3742419570356665, + "grad_norm": 0.19373014569282532, + "learning_rate": 5.3243315539230844e-05, + "loss": 0.5487, + "step": 6685 + }, + { + "epoch": 1.3744475280090451, + "grad_norm": 0.19233091175556183, + "learning_rate": 5.323349624282766e-05, + "loss": 0.5524, + "step": 6686 + }, + { + "epoch": 1.3746530989824237, + "grad_norm": 0.1918216347694397, + "learning_rate": 5.32236765407986e-05, + "loss": 0.5538, + "step": 6687 + }, + { + "epoch": 1.3748586699558023, + "grad_norm": 0.1914103925228119, + "learning_rate": 5.3213856433627426e-05, + "loss": 0.5608, + "step": 6688 + }, + { + "epoch": 1.3750642409291807, + "grad_norm": 0.19780538976192474, + "learning_rate": 5.320403592179795e-05, + "loss": 0.5701, + "step": 6689 + }, + { + "epoch": 1.3752698119025593, + "grad_norm": 0.19317637383937836, + "learning_rate": 5.3194215005793964e-05, + "loss": 0.551, + "step": 6690 + }, + { + "epoch": 1.3754753828759378, + "grad_norm": 0.17101670801639557, + "learning_rate": 5.31843936860993e-05, + "loss": 0.5369, + "step": 6691 + }, + { + "epoch": 1.3756809538493164, + "grad_norm": 0.1648482233285904, + "learning_rate": 5.317457196319782e-05, + "loss": 0.5706, + "step": 6692 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 0.1978417932987213, + "learning_rate": 5.3164749837573395e-05, + "loss": 0.5429, + "step": 6693 + }, + { + "epoch": 1.3760920957960736, + "grad_norm": 0.19628840684890747, + "learning_rate": 5.31549273097099e-05, + "loss": 0.5649, + "step": 6694 + }, + { + "epoch": 1.3762976667694522, + "grad_norm": 0.1944446712732315, + "learning_rate": 5.314510438009125e-05, + "loss": 0.548, + "step": 6695 + }, + { + "epoch": 1.3765032377428308, + "grad_norm": 0.19895857572555542, + "learning_rate": 5.313528104920138e-05, + "loss": 0.5428, + "step": 6696 + }, + { + "epoch": 1.3767088087162094, + "grad_norm": 0.18742914497852325, + "learning_rate": 5.312545731752423e-05, + "loss": 0.5525, + "step": 6697 + }, + { + "epoch": 1.376914379689588, + "grad_norm": 0.1647169291973114, + "learning_rate": 5.311563318554379e-05, + "loss": 0.5259, + "step": 6698 + }, + { + "epoch": 1.3771199506629663, + "grad_norm": 0.1640775054693222, + "learning_rate": 5.310580865374401e-05, + "loss": 0.5602, + "step": 6699 + }, + { + "epoch": 1.377325521636345, + "grad_norm": 0.19247397780418396, + "learning_rate": 5.309598372260895e-05, + "loss": 0.5539, + "step": 6700 + }, + { + "epoch": 1.3775310926097235, + "grad_norm": 0.17393262684345245, + "learning_rate": 5.3086158392622606e-05, + "loss": 0.5212, + "step": 6701 + }, + { + "epoch": 1.377736663583102, + "grad_norm": 0.17243215441703796, + "learning_rate": 5.307633266426903e-05, + "loss": 0.5667, + "step": 6702 + }, + { + "epoch": 1.3779422345564807, + "grad_norm": 0.19524256885051727, + "learning_rate": 5.3066506538032286e-05, + "loss": 0.5447, + "step": 6703 + }, + { + "epoch": 1.378147805529859, + "grad_norm": 0.19185814261436462, + "learning_rate": 5.305668001439647e-05, + "loss": 0.5564, + "step": 6704 + }, + { + "epoch": 1.3783533765032376, + "grad_norm": 0.19080397486686707, + "learning_rate": 5.3046853093845694e-05, + "loss": 0.5545, + "step": 6705 + }, + { + "epoch": 1.3785589474766162, + "grad_norm": 0.20013724267482758, + "learning_rate": 5.303702577686408e-05, + "loss": 0.5444, + "step": 6706 + }, + { + "epoch": 1.3787645184499948, + "grad_norm": 0.19205878674983978, + "learning_rate": 5.302719806393576e-05, + "loss": 0.5582, + "step": 6707 + }, + { + "epoch": 1.3789700894233734, + "grad_norm": 0.16551436483860016, + "learning_rate": 5.3017369955544915e-05, + "loss": 0.5166, + "step": 6708 + }, + { + "epoch": 1.379175660396752, + "grad_norm": 0.15659868717193604, + "learning_rate": 5.300754145217573e-05, + "loss": 0.5345, + "step": 6709 + }, + { + "epoch": 1.3793812313701306, + "grad_norm": 0.19091999530792236, + "learning_rate": 5.299771255431239e-05, + "loss": 0.5393, + "step": 6710 + }, + { + "epoch": 1.3795868023435092, + "grad_norm": 0.19453977048397064, + "learning_rate": 5.298788326243915e-05, + "loss": 0.5471, + "step": 6711 + }, + { + "epoch": 1.3797923733168878, + "grad_norm": 0.18982084095478058, + "learning_rate": 5.2978053577040225e-05, + "loss": 0.5482, + "step": 6712 + }, + { + "epoch": 1.3799979442902663, + "grad_norm": 0.20918771624565125, + "learning_rate": 5.2968223498599895e-05, + "loss": 0.5698, + "step": 6713 + }, + { + "epoch": 1.3802035152636447, + "grad_norm": 0.20116795599460602, + "learning_rate": 5.2958393027602444e-05, + "loss": 0.5605, + "step": 6714 + }, + { + "epoch": 1.3804090862370233, + "grad_norm": 0.18591387569904327, + "learning_rate": 5.294856216453216e-05, + "loss": 0.5381, + "step": 6715 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 0.19346030056476593, + "learning_rate": 5.293873090987336e-05, + "loss": 0.565, + "step": 6716 + }, + { + "epoch": 1.3808202281837805, + "grad_norm": 0.18695658445358276, + "learning_rate": 5.292889926411041e-05, + "loss": 0.5261, + "step": 6717 + }, + { + "epoch": 1.381025799157159, + "grad_norm": 0.16254091262817383, + "learning_rate": 5.291906722772765e-05, + "loss": 0.5208, + "step": 6718 + }, + { + "epoch": 1.3812313701305377, + "grad_norm": 0.15224479138851166, + "learning_rate": 5.2909234801209445e-05, + "loss": 0.5667, + "step": 6719 + }, + { + "epoch": 1.381436941103916, + "grad_norm": 0.16312278807163239, + "learning_rate": 5.2899401985040215e-05, + "loss": 0.5439, + "step": 6720 + }, + { + "epoch": 1.3816425120772946, + "grad_norm": 0.15921905636787415, + "learning_rate": 5.288956877970438e-05, + "loss": 0.5442, + "step": 6721 + }, + { + "epoch": 1.3818480830506732, + "grad_norm": 0.20192372798919678, + "learning_rate": 5.287973518568635e-05, + "loss": 0.5779, + "step": 6722 + }, + { + "epoch": 1.3820536540240518, + "grad_norm": 0.16968026757240295, + "learning_rate": 5.286990120347061e-05, + "loss": 0.5533, + "step": 6723 + }, + { + "epoch": 1.3822592249974304, + "grad_norm": 0.16610193252563477, + "learning_rate": 5.2860066833541636e-05, + "loss": 0.5593, + "step": 6724 + }, + { + "epoch": 1.382464795970809, + "grad_norm": 0.19108933210372925, + "learning_rate": 5.285023207638389e-05, + "loss": 0.564, + "step": 6725 + }, + { + "epoch": 1.3826703669441875, + "grad_norm": 0.18995323777198792, + "learning_rate": 5.28403969324819e-05, + "loss": 0.5523, + "step": 6726 + }, + { + "epoch": 1.3828759379175661, + "grad_norm": 0.18891942501068115, + "learning_rate": 5.2830561402320215e-05, + "loss": 0.5453, + "step": 6727 + }, + { + "epoch": 1.3830815088909447, + "grad_norm": 0.19134697318077087, + "learning_rate": 5.2820725486383356e-05, + "loss": 0.565, + "step": 6728 + }, + { + "epoch": 1.383287079864323, + "grad_norm": 0.16310301423072815, + "learning_rate": 5.28108891851559e-05, + "loss": 0.549, + "step": 6729 + }, + { + "epoch": 1.3834926508377017, + "grad_norm": 0.13107767701148987, + "learning_rate": 5.280105249912246e-05, + "loss": 0.5449, + "step": 6730 + }, + { + "epoch": 1.3836982218110803, + "grad_norm": 0.12225886434316635, + "learning_rate": 5.279121542876761e-05, + "loss": 0.5211, + "step": 6731 + }, + { + "epoch": 1.3839037927844589, + "grad_norm": 0.16120769083499908, + "learning_rate": 5.2781377974576e-05, + "loss": 0.536, + "step": 6732 + }, + { + "epoch": 1.3841093637578374, + "grad_norm": 0.20347453653812408, + "learning_rate": 5.2771540137032256e-05, + "loss": 0.5692, + "step": 6733 + }, + { + "epoch": 1.384314934731216, + "grad_norm": 0.19555138051509857, + "learning_rate": 5.2761701916621064e-05, + "loss": 0.5155, + "step": 6734 + }, + { + "epoch": 1.3845205057045944, + "grad_norm": 0.2035539448261261, + "learning_rate": 5.27518633138271e-05, + "loss": 0.5697, + "step": 6735 + }, + { + "epoch": 1.384726076677973, + "grad_norm": 0.18798959255218506, + "learning_rate": 5.274202432913505e-05, + "loss": 0.553, + "step": 6736 + }, + { + "epoch": 1.3849316476513516, + "grad_norm": 0.1946985423564911, + "learning_rate": 5.2732184963029663e-05, + "loss": 0.5551, + "step": 6737 + }, + { + "epoch": 1.3851372186247302, + "grad_norm": 0.17025156319141388, + "learning_rate": 5.272234521599565e-05, + "loss": 0.5342, + "step": 6738 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 0.16380397975444794, + "learning_rate": 5.27125050885178e-05, + "loss": 0.5592, + "step": 6739 + }, + { + "epoch": 1.3855483605714873, + "grad_norm": 0.19385696947574615, + "learning_rate": 5.2702664581080845e-05, + "loss": 0.5499, + "step": 6740 + }, + { + "epoch": 1.385753931544866, + "grad_norm": 0.19014237821102142, + "learning_rate": 5.2692823694169624e-05, + "loss": 0.5322, + "step": 6741 + }, + { + "epoch": 1.3859595025182445, + "grad_norm": 0.16555199027061462, + "learning_rate": 5.2682982428268926e-05, + "loss": 0.5253, + "step": 6742 + }, + { + "epoch": 1.386165073491623, + "grad_norm": 0.1773664802312851, + "learning_rate": 5.26731407838636e-05, + "loss": 0.5609, + "step": 6743 + }, + { + "epoch": 1.3863706444650015, + "grad_norm": 0.20064838230609894, + "learning_rate": 5.26632987614385e-05, + "loss": 0.5682, + "step": 6744 + }, + { + "epoch": 1.38657621543838, + "grad_norm": 0.16893361508846283, + "learning_rate": 5.2653456361478486e-05, + "loss": 0.5296, + "step": 6745 + }, + { + "epoch": 1.3867817864117586, + "grad_norm": 0.12028443813323975, + "learning_rate": 5.264361358446845e-05, + "loss": 0.5109, + "step": 6746 + }, + { + "epoch": 1.3869873573851372, + "grad_norm": 0.16433177888393402, + "learning_rate": 5.263377043089329e-05, + "loss": 0.5492, + "step": 6747 + }, + { + "epoch": 1.3871929283585158, + "grad_norm": 0.20058415830135345, + "learning_rate": 5.262392690123795e-05, + "loss": 0.5544, + "step": 6748 + }, + { + "epoch": 1.3873984993318944, + "grad_norm": 0.1890854686498642, + "learning_rate": 5.261408299598737e-05, + "loss": 0.5518, + "step": 6749 + }, + { + "epoch": 1.3876040703052728, + "grad_norm": 0.1862923949956894, + "learning_rate": 5.260423871562648e-05, + "loss": 0.5598, + "step": 6750 + }, + { + "epoch": 1.3878096412786514, + "grad_norm": 0.18234452605247498, + "learning_rate": 5.2594394060640325e-05, + "loss": 0.5486, + "step": 6751 + }, + { + "epoch": 1.38801521225203, + "grad_norm": 0.16496604681015015, + "learning_rate": 5.258454903151385e-05, + "loss": 0.4984, + "step": 6752 + }, + { + "epoch": 1.3882207832254085, + "grad_norm": 0.1623886525630951, + "learning_rate": 5.2574703628732104e-05, + "loss": 0.5521, + "step": 6753 + }, + { + "epoch": 1.3884263541987871, + "grad_norm": 0.20108892023563385, + "learning_rate": 5.25648578527801e-05, + "loss": 0.571, + "step": 6754 + }, + { + "epoch": 1.3886319251721657, + "grad_norm": 0.18858185410499573, + "learning_rate": 5.2555011704142925e-05, + "loss": 0.5343, + "step": 6755 + }, + { + "epoch": 1.3888374961455443, + "grad_norm": 0.18392902612686157, + "learning_rate": 5.2545165183305625e-05, + "loss": 0.5448, + "step": 6756 + }, + { + "epoch": 1.389043067118923, + "grad_norm": 0.19124126434326172, + "learning_rate": 5.253531829075331e-05, + "loss": 0.5493, + "step": 6757 + }, + { + "epoch": 1.3892486380923015, + "grad_norm": 0.19267001748085022, + "learning_rate": 5.252547102697108e-05, + "loss": 0.5504, + "step": 6758 + }, + { + "epoch": 1.3894542090656798, + "grad_norm": 0.19391465187072754, + "learning_rate": 5.251562339244407e-05, + "loss": 0.5503, + "step": 6759 + }, + { + "epoch": 1.3896597800390584, + "grad_norm": 0.16429035365581512, + "learning_rate": 5.250577538765741e-05, + "loss": 0.5135, + "step": 6760 + }, + { + "epoch": 1.389865351012437, + "grad_norm": 0.15530334413051605, + "learning_rate": 5.249592701309629e-05, + "loss": 0.5197, + "step": 6761 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 0.19579361379146576, + "learning_rate": 5.248607826924589e-05, + "loss": 0.5486, + "step": 6762 + }, + { + "epoch": 1.3902764929591942, + "grad_norm": 0.1991192102432251, + "learning_rate": 5.2476229156591384e-05, + "loss": 0.5713, + "step": 6763 + }, + { + "epoch": 1.3904820639325728, + "grad_norm": 0.19221562147140503, + "learning_rate": 5.246637967561802e-05, + "loss": 0.5394, + "step": 6764 + }, + { + "epoch": 1.3906876349059512, + "grad_norm": 0.18756262958049774, + "learning_rate": 5.245652982681102e-05, + "loss": 0.5317, + "step": 6765 + }, + { + "epoch": 1.3908932058793297, + "grad_norm": 0.16349650919437408, + "learning_rate": 5.244667961065567e-05, + "loss": 0.5351, + "step": 6766 + }, + { + "epoch": 1.3910987768527083, + "grad_norm": 0.17436912655830383, + "learning_rate": 5.24368290276372e-05, + "loss": 0.5597, + "step": 6767 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.19610293209552765, + "learning_rate": 5.242697807824093e-05, + "loss": 0.5688, + "step": 6768 + }, + { + "epoch": 1.3915099187994655, + "grad_norm": 0.19287322461605072, + "learning_rate": 5.241712676295217e-05, + "loss": 0.5456, + "step": 6769 + }, + { + "epoch": 1.391715489772844, + "grad_norm": 0.1898210346698761, + "learning_rate": 5.240727508225623e-05, + "loss": 0.5595, + "step": 6770 + }, + { + "epoch": 1.3919210607462227, + "grad_norm": 0.1842799186706543, + "learning_rate": 5.239742303663847e-05, + "loss": 0.5492, + "step": 6771 + }, + { + "epoch": 1.3921266317196013, + "grad_norm": 0.18624331057071686, + "learning_rate": 5.238757062658426e-05, + "loss": 0.5388, + "step": 6772 + }, + { + "epoch": 1.3923322026929799, + "grad_norm": 0.16960440576076508, + "learning_rate": 5.237771785257897e-05, + "loss": 0.5353, + "step": 6773 + }, + { + "epoch": 1.3925377736663584, + "grad_norm": 0.13957920670509338, + "learning_rate": 5.2367864715108005e-05, + "loss": 0.5144, + "step": 6774 + }, + { + "epoch": 1.3927433446397368, + "grad_norm": 0.1618185192346573, + "learning_rate": 5.235801121465677e-05, + "loss": 0.5447, + "step": 6775 + }, + { + "epoch": 1.3929489156131154, + "grad_norm": 0.19508126378059387, + "learning_rate": 5.234815735171073e-05, + "loss": 0.5684, + "step": 6776 + }, + { + "epoch": 1.393154486586494, + "grad_norm": 0.1584571748971939, + "learning_rate": 5.233830312675533e-05, + "loss": 0.4997, + "step": 6777 + }, + { + "epoch": 1.3933600575598726, + "grad_norm": 0.15756317973136902, + "learning_rate": 5.232844854027601e-05, + "loss": 0.5506, + "step": 6778 + }, + { + "epoch": 1.3935656285332512, + "grad_norm": 0.2031278908252716, + "learning_rate": 5.231859359275831e-05, + "loss": 0.5452, + "step": 6779 + }, + { + "epoch": 1.3937711995066295, + "grad_norm": 0.1902448683977127, + "learning_rate": 5.230873828468769e-05, + "loss": 0.5624, + "step": 6780 + }, + { + "epoch": 1.3939767704800081, + "grad_norm": 0.19727613031864166, + "learning_rate": 5.22988826165497e-05, + "loss": 0.5699, + "step": 6781 + }, + { + "epoch": 1.3941823414533867, + "grad_norm": 0.18354666233062744, + "learning_rate": 5.228902658882989e-05, + "loss": 0.5463, + "step": 6782 + }, + { + "epoch": 1.3943879124267653, + "grad_norm": 0.16275332868099213, + "learning_rate": 5.22791702020138e-05, + "loss": 0.5167, + "step": 6783 + }, + { + "epoch": 1.3945934834001439, + "grad_norm": 0.15852688252925873, + "learning_rate": 5.226931345658701e-05, + "loss": 0.5342, + "step": 6784 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 0.16829104721546173, + "learning_rate": 5.2259456353035136e-05, + "loss": 0.5206, + "step": 6785 + }, + { + "epoch": 1.395004625346901, + "grad_norm": 0.15513145923614502, + "learning_rate": 5.2249598891843765e-05, + "loss": 0.5584, + "step": 6786 + }, + { + "epoch": 1.3952101963202796, + "grad_norm": 0.1593499630689621, + "learning_rate": 5.223974107349855e-05, + "loss": 0.5256, + "step": 6787 + }, + { + "epoch": 1.3954157672936582, + "grad_norm": 0.16022507846355438, + "learning_rate": 5.222988289848512e-05, + "loss": 0.5377, + "step": 6788 + }, + { + "epoch": 1.3956213382670368, + "grad_norm": 0.19908879697322845, + "learning_rate": 5.222002436728917e-05, + "loss": 0.5567, + "step": 6789 + }, + { + "epoch": 1.3958269092404152, + "grad_norm": 0.1942145675420761, + "learning_rate": 5.2210165480396364e-05, + "loss": 0.5503, + "step": 6790 + }, + { + "epoch": 1.3960324802137938, + "grad_norm": 0.20177899301052094, + "learning_rate": 5.2200306238292396e-05, + "loss": 0.5572, + "step": 6791 + }, + { + "epoch": 1.3962380511871724, + "grad_norm": 0.20615504682064056, + "learning_rate": 5.219044664146299e-05, + "loss": 0.5572, + "step": 6792 + }, + { + "epoch": 1.396443622160551, + "grad_norm": 0.16137507557868958, + "learning_rate": 5.21805866903939e-05, + "loss": 0.5327, + "step": 6793 + }, + { + "epoch": 1.3966491931339295, + "grad_norm": 0.13222044706344604, + "learning_rate": 5.217072638557086e-05, + "loss": 0.5397, + "step": 6794 + }, + { + "epoch": 1.396854764107308, + "grad_norm": 0.15501753985881805, + "learning_rate": 5.216086572747963e-05, + "loss": 0.5588, + "step": 6795 + }, + { + "epoch": 1.3970603350806865, + "grad_norm": 0.16480109095573425, + "learning_rate": 5.2151004716606035e-05, + "loss": 0.4947, + "step": 6796 + }, + { + "epoch": 1.397265906054065, + "grad_norm": 0.1597471535205841, + "learning_rate": 5.214114335343585e-05, + "loss": 0.5504, + "step": 6797 + }, + { + "epoch": 1.3974714770274437, + "grad_norm": 0.18874730169773102, + "learning_rate": 5.2131281638454914e-05, + "loss": 0.5601, + "step": 6798 + }, + { + "epoch": 1.3976770480008223, + "grad_norm": 0.19088098406791687, + "learning_rate": 5.212141957214907e-05, + "loss": 0.557, + "step": 6799 + }, + { + "epoch": 1.3978826189742009, + "grad_norm": 0.19219143688678741, + "learning_rate": 5.2111557155004156e-05, + "loss": 0.5574, + "step": 6800 + }, + { + "epoch": 1.3980881899475794, + "grad_norm": 0.19509856402873993, + "learning_rate": 5.2101694387506074e-05, + "loss": 0.5609, + "step": 6801 + }, + { + "epoch": 1.398293760920958, + "grad_norm": 0.19519266486167908, + "learning_rate": 5.2091831270140694e-05, + "loss": 0.5598, + "step": 6802 + }, + { + "epoch": 1.3984993318943366, + "grad_norm": 0.16416554152965546, + "learning_rate": 5.208196780339394e-05, + "loss": 0.5073, + "step": 6803 + }, + { + "epoch": 1.3987049028677152, + "grad_norm": 0.16652482748031616, + "learning_rate": 5.207210398775174e-05, + "loss": 0.5577, + "step": 6804 + }, + { + "epoch": 1.3989104738410936, + "grad_norm": 0.1610838919878006, + "learning_rate": 5.206223982370001e-05, + "loss": 0.5424, + "step": 6805 + }, + { + "epoch": 1.3991160448144722, + "grad_norm": 0.12500424683094025, + "learning_rate": 5.2052375311724755e-05, + "loss": 0.5185, + "step": 6806 + }, + { + "epoch": 1.3993216157878507, + "grad_norm": 0.16289743781089783, + "learning_rate": 5.204251045231191e-05, + "loss": 0.548, + "step": 6807 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 0.1971302479505539, + "learning_rate": 5.203264524594751e-05, + "loss": 0.5481, + "step": 6808 + }, + { + "epoch": 1.399732757734608, + "grad_norm": 0.1616830974817276, + "learning_rate": 5.2022779693117535e-05, + "loss": 0.5206, + "step": 6809 + }, + { + "epoch": 1.3999383287079865, + "grad_norm": 0.13564690947532654, + "learning_rate": 5.201291379430804e-05, + "loss": 0.5078, + "step": 6810 + }, + { + "epoch": 1.4001438996813649, + "grad_norm": 0.20377317070960999, + "learning_rate": 5.200304755000506e-05, + "loss": 0.5494, + "step": 6811 + }, + { + "epoch": 1.4003494706547435, + "grad_norm": 0.20373232662677765, + "learning_rate": 5.199318096069465e-05, + "loss": 0.5652, + "step": 6812 + }, + { + "epoch": 1.400555041628122, + "grad_norm": 0.19755113124847412, + "learning_rate": 5.198331402686291e-05, + "loss": 0.5687, + "step": 6813 + }, + { + "epoch": 1.4007606126015006, + "grad_norm": 0.18689025938510895, + "learning_rate": 5.197344674899593e-05, + "loss": 0.5576, + "step": 6814 + }, + { + "epoch": 1.4009661835748792, + "grad_norm": 0.1978052258491516, + "learning_rate": 5.196357912757982e-05, + "loss": 0.5807, + "step": 6815 + }, + { + "epoch": 1.4011717545482578, + "grad_norm": 0.16826669871807098, + "learning_rate": 5.19537111631007e-05, + "loss": 0.4959, + "step": 6816 + }, + { + "epoch": 1.4013773255216364, + "grad_norm": 0.16866251826286316, + "learning_rate": 5.1943842856044745e-05, + "loss": 0.5509, + "step": 6817 + }, + { + "epoch": 1.401582896495015, + "grad_norm": 0.16553765535354614, + "learning_rate": 5.19339742068981e-05, + "loss": 0.5225, + "step": 6818 + }, + { + "epoch": 1.4017884674683936, + "grad_norm": 0.15738850831985474, + "learning_rate": 5.192410521614695e-05, + "loss": 0.5439, + "step": 6819 + }, + { + "epoch": 1.401994038441772, + "grad_norm": 0.1941434144973755, + "learning_rate": 5.1914235884277515e-05, + "loss": 0.5431, + "step": 6820 + }, + { + "epoch": 1.4021996094151505, + "grad_norm": 0.19510993361473083, + "learning_rate": 5.1904366211775995e-05, + "loss": 0.5699, + "step": 6821 + }, + { + "epoch": 1.4024051803885291, + "grad_norm": 0.21199296414852142, + "learning_rate": 5.189449619912862e-05, + "loss": 0.5497, + "step": 6822 + }, + { + "epoch": 1.4026107513619077, + "grad_norm": 0.20840586721897125, + "learning_rate": 5.188462584682163e-05, + "loss": 0.5692, + "step": 6823 + }, + { + "epoch": 1.4028163223352863, + "grad_norm": 0.18796321749687195, + "learning_rate": 5.187475515534132e-05, + "loss": 0.5497, + "step": 6824 + }, + { + "epoch": 1.403021893308665, + "grad_norm": 0.18638098239898682, + "learning_rate": 5.186488412517396e-05, + "loss": 0.556, + "step": 6825 + }, + { + "epoch": 1.4032274642820433, + "grad_norm": 0.18943150341510773, + "learning_rate": 5.185501275680582e-05, + "loss": 0.5451, + "step": 6826 + }, + { + "epoch": 1.4034330352554218, + "grad_norm": 0.19243142008781433, + "learning_rate": 5.184514105072326e-05, + "loss": 0.5348, + "step": 6827 + }, + { + "epoch": 1.4036386062288004, + "grad_norm": 0.19465966522693634, + "learning_rate": 5.1835269007412585e-05, + "loss": 0.5711, + "step": 6828 + }, + { + "epoch": 1.403844177202179, + "grad_norm": 0.1992519199848175, + "learning_rate": 5.1825396627360166e-05, + "loss": 0.5768, + "step": 6829 + }, + { + "epoch": 1.4040497481755576, + "grad_norm": 0.16625314950942993, + "learning_rate": 5.181552391105235e-05, + "loss": 0.5431, + "step": 6830 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 0.16418209671974182, + "learning_rate": 5.180565085897552e-05, + "loss": 0.5585, + "step": 6831 + }, + { + "epoch": 1.4044608901223148, + "grad_norm": 0.19852881133556366, + "learning_rate": 5.17957774716161e-05, + "loss": 0.5298, + "step": 6832 + }, + { + "epoch": 1.4046664610956934, + "grad_norm": 0.18581949174404144, + "learning_rate": 5.178590374946047e-05, + "loss": 0.5466, + "step": 6833 + }, + { + "epoch": 1.404872032069072, + "grad_norm": 0.19243168830871582, + "learning_rate": 5.177602969299509e-05, + "loss": 0.552, + "step": 6834 + }, + { + "epoch": 1.4050776030424503, + "grad_norm": 0.20078270137310028, + "learning_rate": 5.1766155302706397e-05, + "loss": 0.56, + "step": 6835 + }, + { + "epoch": 1.405283174015829, + "grad_norm": 0.18953198194503784, + "learning_rate": 5.175628057908085e-05, + "loss": 0.5404, + "step": 6836 + }, + { + "epoch": 1.4054887449892075, + "grad_norm": 0.19314275681972504, + "learning_rate": 5.174640552260494e-05, + "loss": 0.553, + "step": 6837 + }, + { + "epoch": 1.405694315962586, + "grad_norm": 0.19777776300907135, + "learning_rate": 5.1736530133765175e-05, + "loss": 0.5539, + "step": 6838 + }, + { + "epoch": 1.4058998869359647, + "grad_norm": 0.18886315822601318, + "learning_rate": 5.1726654413048036e-05, + "loss": 0.5508, + "step": 6839 + }, + { + "epoch": 1.4061054579093433, + "grad_norm": 0.16566768288612366, + "learning_rate": 5.171677836094008e-05, + "loss": 0.5384, + "step": 6840 + }, + { + "epoch": 1.4063110288827216, + "grad_norm": 0.12670090794563293, + "learning_rate": 5.170690197792785e-05, + "loss": 0.5064, + "step": 6841 + }, + { + "epoch": 1.4065165998561002, + "grad_norm": 0.16452710330486298, + "learning_rate": 5.1697025264497915e-05, + "loss": 0.5549, + "step": 6842 + }, + { + "epoch": 1.4067221708294788, + "grad_norm": 0.23035211861133575, + "learning_rate": 5.168714822113684e-05, + "loss": 0.533, + "step": 6843 + }, + { + "epoch": 1.4069277418028574, + "grad_norm": 0.1920643001794815, + "learning_rate": 5.167727084833123e-05, + "loss": 0.5667, + "step": 6844 + }, + { + "epoch": 1.407133312776236, + "grad_norm": 0.1763206124305725, + "learning_rate": 5.1667393146567695e-05, + "loss": 0.5285, + "step": 6845 + }, + { + "epoch": 1.4073388837496146, + "grad_norm": 0.17114083468914032, + "learning_rate": 5.1657515116332866e-05, + "loss": 0.5385, + "step": 6846 + }, + { + "epoch": 1.4075444547229932, + "grad_norm": 0.19775407016277313, + "learning_rate": 5.164763675811338e-05, + "loss": 0.569, + "step": 6847 + }, + { + "epoch": 1.4077500256963718, + "grad_norm": 0.18887090682983398, + "learning_rate": 5.163775807239591e-05, + "loss": 0.5487, + "step": 6848 + }, + { + "epoch": 1.4079555966697503, + "grad_norm": 0.1911323517560959, + "learning_rate": 5.162787905966711e-05, + "loss": 0.5632, + "step": 6849 + }, + { + "epoch": 1.4081611676431287, + "grad_norm": 0.19571152329444885, + "learning_rate": 5.16179997204137e-05, + "loss": 0.5655, + "step": 6850 + }, + { + "epoch": 1.4083667386165073, + "grad_norm": 0.1829329878091812, + "learning_rate": 5.160812005512236e-05, + "loss": 0.5319, + "step": 6851 + }, + { + "epoch": 1.4085723095898859, + "grad_norm": 0.19352376461029053, + "learning_rate": 5.1598240064279846e-05, + "loss": 0.5616, + "step": 6852 + }, + { + "epoch": 1.4087778805632645, + "grad_norm": 0.19807998836040497, + "learning_rate": 5.158835974837289e-05, + "loss": 0.5414, + "step": 6853 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 0.1893458068370819, + "learning_rate": 5.157847910788822e-05, + "loss": 0.5426, + "step": 6854 + }, + { + "epoch": 1.4091890225100216, + "grad_norm": 0.1907995045185089, + "learning_rate": 5.1568598143312656e-05, + "loss": 0.5472, + "step": 6855 + }, + { + "epoch": 1.4093945934834, + "grad_norm": 0.17473357915878296, + "learning_rate": 5.1558716855132956e-05, + "loss": 0.4997, + "step": 6856 + }, + { + "epoch": 1.4096001644567786, + "grad_norm": 0.16449564695358276, + "learning_rate": 5.154883524383592e-05, + "loss": 0.5579, + "step": 6857 + }, + { + "epoch": 1.4098057354301572, + "grad_norm": 0.1907692849636078, + "learning_rate": 5.153895330990839e-05, + "loss": 0.5778, + "step": 6858 + }, + { + "epoch": 1.4100113064035358, + "grad_norm": 0.18911254405975342, + "learning_rate": 5.1529071053837206e-05, + "loss": 0.532, + "step": 6859 + }, + { + "epoch": 1.4102168773769144, + "grad_norm": 0.19013933837413788, + "learning_rate": 5.151918847610918e-05, + "loss": 0.5414, + "step": 6860 + }, + { + "epoch": 1.410422448350293, + "grad_norm": 0.1888997107744217, + "learning_rate": 5.150930557721122e-05, + "loss": 0.5472, + "step": 6861 + }, + { + "epoch": 1.4106280193236715, + "grad_norm": 0.18794280290603638, + "learning_rate": 5.14994223576302e-05, + "loss": 0.5541, + "step": 6862 + }, + { + "epoch": 1.4108335902970501, + "grad_norm": 0.19255901873111725, + "learning_rate": 5.1489538817853034e-05, + "loss": 0.5695, + "step": 6863 + }, + { + "epoch": 1.4110391612704287, + "grad_norm": 0.18833082914352417, + "learning_rate": 5.1479654958366594e-05, + "loss": 0.5571, + "step": 6864 + }, + { + "epoch": 1.4112447322438073, + "grad_norm": 0.1937963217496872, + "learning_rate": 5.1469770779657864e-05, + "loss": 0.5531, + "step": 6865 + }, + { + "epoch": 1.4114503032171857, + "grad_norm": 0.16009144484996796, + "learning_rate": 5.145988628221376e-05, + "loss": 0.5195, + "step": 6866 + }, + { + "epoch": 1.4116558741905643, + "grad_norm": 0.15770770609378815, + "learning_rate": 5.145000146652126e-05, + "loss": 0.5767, + "step": 6867 + }, + { + "epoch": 1.4118614451639429, + "grad_norm": 0.18932950496673584, + "learning_rate": 5.1440116333067313e-05, + "loss": 0.5413, + "step": 6868 + }, + { + "epoch": 1.4120670161373214, + "grad_norm": 0.2200823277235031, + "learning_rate": 5.143023088233895e-05, + "loss": 0.5721, + "step": 6869 + }, + { + "epoch": 1.4122725871107, + "grad_norm": 0.19378498196601868, + "learning_rate": 5.142034511482317e-05, + "loss": 0.5732, + "step": 6870 + }, + { + "epoch": 1.4124781580840784, + "grad_norm": 0.20359185338020325, + "learning_rate": 5.141045903100698e-05, + "loss": 0.5555, + "step": 6871 + }, + { + "epoch": 1.412683729057457, + "grad_norm": 0.18266808986663818, + "learning_rate": 5.140057263137744e-05, + "loss": 0.5287, + "step": 6872 + }, + { + "epoch": 1.4128893000308356, + "grad_norm": 0.191037118434906, + "learning_rate": 5.139068591642161e-05, + "loss": 0.5536, + "step": 6873 + }, + { + "epoch": 1.4130948710042142, + "grad_norm": 0.19039712846279144, + "learning_rate": 5.138079888662654e-05, + "loss": 0.5692, + "step": 6874 + }, + { + "epoch": 1.4133004419775927, + "grad_norm": 0.1601129174232483, + "learning_rate": 5.1370911542479354e-05, + "loss": 0.5244, + "step": 6875 + }, + { + "epoch": 1.4135060129509713, + "grad_norm": 0.1585390418767929, + "learning_rate": 5.1361023884467136e-05, + "loss": 0.5695, + "step": 6876 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 0.2022130936384201, + "learning_rate": 5.135113591307699e-05, + "loss": 0.5696, + "step": 6877 + }, + { + "epoch": 1.4139171548977285, + "grad_norm": 0.1920463740825653, + "learning_rate": 5.134124762879606e-05, + "loss": 0.5397, + "step": 6878 + }, + { + "epoch": 1.414122725871107, + "grad_norm": 0.1937701404094696, + "learning_rate": 5.13313590321115e-05, + "loss": 0.5513, + "step": 6879 + }, + { + "epoch": 1.4143282968444857, + "grad_norm": 0.16302789747714996, + "learning_rate": 5.1321470123510486e-05, + "loss": 0.524, + "step": 6880 + }, + { + "epoch": 1.414533867817864, + "grad_norm": 0.1612044721841812, + "learning_rate": 5.131158090348017e-05, + "loss": 0.5558, + "step": 6881 + }, + { + "epoch": 1.4147394387912426, + "grad_norm": 0.18755872547626495, + "learning_rate": 5.130169137250777e-05, + "loss": 0.5448, + "step": 6882 + }, + { + "epoch": 1.4149450097646212, + "grad_norm": 0.16323046386241913, + "learning_rate": 5.1291801531080475e-05, + "loss": 0.5202, + "step": 6883 + }, + { + "epoch": 1.4151505807379998, + "grad_norm": 0.15463986992835999, + "learning_rate": 5.128191137968555e-05, + "loss": 0.5395, + "step": 6884 + }, + { + "epoch": 1.4153561517113784, + "grad_norm": 0.1867363005876541, + "learning_rate": 5.12720209188102e-05, + "loss": 0.5608, + "step": 6885 + }, + { + "epoch": 1.4155617226847568, + "grad_norm": 0.18984296917915344, + "learning_rate": 5.1262130148941705e-05, + "loss": 0.5527, + "step": 6886 + }, + { + "epoch": 1.4157672936581354, + "grad_norm": 0.18599240481853485, + "learning_rate": 5.1252239070567315e-05, + "loss": 0.538, + "step": 6887 + }, + { + "epoch": 1.415972864631514, + "grad_norm": 0.19605940580368042, + "learning_rate": 5.1242347684174327e-05, + "loss": 0.5715, + "step": 6888 + }, + { + "epoch": 1.4161784356048925, + "grad_norm": 0.19661271572113037, + "learning_rate": 5.1232455990250055e-05, + "loss": 0.5538, + "step": 6889 + }, + { + "epoch": 1.4163840065782711, + "grad_norm": 0.1689828336238861, + "learning_rate": 5.12225639892818e-05, + "loss": 0.5337, + "step": 6890 + }, + { + "epoch": 1.4165895775516497, + "grad_norm": 0.16040822863578796, + "learning_rate": 5.1212671681756916e-05, + "loss": 0.5651, + "step": 6891 + }, + { + "epoch": 1.4167951485250283, + "grad_norm": 0.16304267942905426, + "learning_rate": 5.120277906816272e-05, + "loss": 0.5215, + "step": 6892 + }, + { + "epoch": 1.417000719498407, + "grad_norm": 0.1574201136827469, + "learning_rate": 5.119288614898659e-05, + "loss": 0.5349, + "step": 6893 + }, + { + "epoch": 1.4172062904717855, + "grad_norm": 0.20037010312080383, + "learning_rate": 5.118299292471591e-05, + "loss": 0.5484, + "step": 6894 + }, + { + "epoch": 1.417411861445164, + "grad_norm": 0.16355712711811066, + "learning_rate": 5.117309939583806e-05, + "loss": 0.517, + "step": 6895 + }, + { + "epoch": 1.4176174324185424, + "grad_norm": 0.15935970842838287, + "learning_rate": 5.116320556284047e-05, + "loss": 0.5531, + "step": 6896 + }, + { + "epoch": 1.417823003391921, + "grad_norm": 0.20276428759098053, + "learning_rate": 5.115331142621055e-05, + "loss": 0.5586, + "step": 6897 + }, + { + "epoch": 1.4180285743652996, + "grad_norm": 0.1946752518415451, + "learning_rate": 5.114341698643573e-05, + "loss": 0.5415, + "step": 6898 + }, + { + "epoch": 1.4182341453386782, + "grad_norm": 0.1875738501548767, + "learning_rate": 5.113352224400347e-05, + "loss": 0.5354, + "step": 6899 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 0.1904314160346985, + "learning_rate": 5.112362719940123e-05, + "loss": 0.5619, + "step": 6900 + }, + { + "epoch": 1.4186452872854354, + "grad_norm": 0.20147216320037842, + "learning_rate": 5.111373185311651e-05, + "loss": 0.5728, + "step": 6901 + }, + { + "epoch": 1.4188508582588137, + "grad_norm": 0.19195587933063507, + "learning_rate": 5.110383620563679e-05, + "loss": 0.5806, + "step": 6902 + }, + { + "epoch": 1.4190564292321923, + "grad_norm": 0.16246861219406128, + "learning_rate": 5.109394025744959e-05, + "loss": 0.5218, + "step": 6903 + }, + { + "epoch": 1.419262000205571, + "grad_norm": 0.16603510081768036, + "learning_rate": 5.108404400904243e-05, + "loss": 0.5348, + "step": 6904 + }, + { + "epoch": 1.4194675711789495, + "grad_norm": 0.1957361102104187, + "learning_rate": 5.1074147460902876e-05, + "loss": 0.5661, + "step": 6905 + }, + { + "epoch": 1.419673142152328, + "grad_norm": 0.1889890879392624, + "learning_rate": 5.106425061351845e-05, + "loss": 0.5672, + "step": 6906 + }, + { + "epoch": 1.4198787131257067, + "grad_norm": 0.19111685454845428, + "learning_rate": 5.1054353467376756e-05, + "loss": 0.5739, + "step": 6907 + }, + { + "epoch": 1.4200842840990853, + "grad_norm": 0.17033053934574127, + "learning_rate": 5.104445602296536e-05, + "loss": 0.5152, + "step": 6908 + }, + { + "epoch": 1.4202898550724639, + "grad_norm": 0.1564977467060089, + "learning_rate": 5.103455828077186e-05, + "loss": 0.5598, + "step": 6909 + }, + { + "epoch": 1.4204954260458424, + "grad_norm": 0.19049371778964996, + "learning_rate": 5.1024660241283884e-05, + "loss": 0.5463, + "step": 6910 + }, + { + "epoch": 1.4207009970192208, + "grad_norm": 0.19642889499664307, + "learning_rate": 5.101476190498906e-05, + "loss": 0.578, + "step": 6911 + }, + { + "epoch": 1.4209065679925994, + "grad_norm": 0.19157302379608154, + "learning_rate": 5.1004863272375034e-05, + "loss": 0.5386, + "step": 6912 + }, + { + "epoch": 1.421112138965978, + "grad_norm": 0.19283618032932281, + "learning_rate": 5.0994964343929445e-05, + "loss": 0.5429, + "step": 6913 + }, + { + "epoch": 1.4213177099393566, + "grad_norm": 0.19500254094600677, + "learning_rate": 5.0985065120139994e-05, + "loss": 0.54, + "step": 6914 + }, + { + "epoch": 1.4215232809127352, + "grad_norm": 0.18495769798755646, + "learning_rate": 5.097516560149434e-05, + "loss": 0.5359, + "step": 6915 + }, + { + "epoch": 1.4217288518861138, + "grad_norm": 0.18928299844264984, + "learning_rate": 5.0965265788480225e-05, + "loss": 0.5567, + "step": 6916 + }, + { + "epoch": 1.4219344228594921, + "grad_norm": 0.18935348093509674, + "learning_rate": 5.095536568158535e-05, + "loss": 0.5359, + "step": 6917 + }, + { + "epoch": 1.4221399938328707, + "grad_norm": 0.1989513635635376, + "learning_rate": 5.094546528129743e-05, + "loss": 0.5603, + "step": 6918 + }, + { + "epoch": 1.4223455648062493, + "grad_norm": 0.16001847386360168, + "learning_rate": 5.093556458810423e-05, + "loss": 0.5223, + "step": 6919 + }, + { + "epoch": 1.4225511357796279, + "grad_norm": 0.15646837651729584, + "learning_rate": 5.0925663602493503e-05, + "loss": 0.5285, + "step": 6920 + }, + { + "epoch": 1.4227567067530065, + "grad_norm": 0.20338685810565948, + "learning_rate": 5.091576232495304e-05, + "loss": 0.574, + "step": 6921 + }, + { + "epoch": 1.422962277726385, + "grad_norm": 0.1922929286956787, + "learning_rate": 5.090586075597061e-05, + "loss": 0.5376, + "step": 6922 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 0.24350236356258392, + "learning_rate": 5.089595889603401e-05, + "loss": 0.5544, + "step": 6923 + }, + { + "epoch": 1.4233734196731422, + "grad_norm": 0.1872577667236328, + "learning_rate": 5.088605674563109e-05, + "loss": 0.5748, + "step": 6924 + }, + { + "epoch": 1.4235789906465208, + "grad_norm": 0.18415029346942902, + "learning_rate": 5.0876154305249654e-05, + "loss": 0.5457, + "step": 6925 + }, + { + "epoch": 1.4237845616198992, + "grad_norm": 0.1886397749185562, + "learning_rate": 5.086625157537757e-05, + "loss": 0.5477, + "step": 6926 + }, + { + "epoch": 1.4239901325932778, + "grad_norm": 0.19316554069519043, + "learning_rate": 5.085634855650268e-05, + "loss": 0.5608, + "step": 6927 + }, + { + "epoch": 1.4241957035666564, + "grad_norm": 0.1911771446466446, + "learning_rate": 5.084644524911288e-05, + "loss": 0.5427, + "step": 6928 + }, + { + "epoch": 1.424401274540035, + "grad_norm": 0.19828177988529205, + "learning_rate": 5.083654165369604e-05, + "loss": 0.5518, + "step": 6929 + }, + { + "epoch": 1.4246068455134135, + "grad_norm": 0.16796253621578217, + "learning_rate": 5.082663777074008e-05, + "loss": 0.5173, + "step": 6930 + }, + { + "epoch": 1.4248124164867921, + "grad_norm": 0.16129761934280396, + "learning_rate": 5.0816733600732905e-05, + "loss": 0.562, + "step": 6931 + }, + { + "epoch": 1.4250179874601705, + "grad_norm": 0.19917796552181244, + "learning_rate": 5.0806829144162455e-05, + "loss": 0.5394, + "step": 6932 + }, + { + "epoch": 1.425223558433549, + "grad_norm": 0.19599252939224243, + "learning_rate": 5.079692440151668e-05, + "loss": 0.5829, + "step": 6933 + }, + { + "epoch": 1.4254291294069277, + "grad_norm": 0.1711527705192566, + "learning_rate": 5.078701937328352e-05, + "loss": 0.5075, + "step": 6934 + }, + { + "epoch": 1.4256347003803063, + "grad_norm": 0.12597279250621796, + "learning_rate": 5.077711405995098e-05, + "loss": 0.497, + "step": 6935 + }, + { + "epoch": 1.4258402713536849, + "grad_norm": 0.15089215338230133, + "learning_rate": 5.076720846200702e-05, + "loss": 0.5364, + "step": 6936 + }, + { + "epoch": 1.4260458423270634, + "grad_norm": 0.19826306402683258, + "learning_rate": 5.0757302579939656e-05, + "loss": 0.5371, + "step": 6937 + }, + { + "epoch": 1.426251413300442, + "grad_norm": 0.1632860153913498, + "learning_rate": 5.0747396414236906e-05, + "loss": 0.5114, + "step": 6938 + }, + { + "epoch": 1.4264569842738206, + "grad_norm": 0.15971128642559052, + "learning_rate": 5.07374899653868e-05, + "loss": 0.5575, + "step": 6939 + }, + { + "epoch": 1.4266625552471992, + "grad_norm": 0.18618735671043396, + "learning_rate": 5.0727583233877376e-05, + "loss": 0.557, + "step": 6940 + }, + { + "epoch": 1.4268681262205778, + "grad_norm": 0.19377268850803375, + "learning_rate": 5.07176762201967e-05, + "loss": 0.5608, + "step": 6941 + }, + { + "epoch": 1.4270736971939562, + "grad_norm": 0.18944592773914337, + "learning_rate": 5.0707768924832844e-05, + "loss": 0.5356, + "step": 6942 + }, + { + "epoch": 1.4272792681673347, + "grad_norm": 0.1696036458015442, + "learning_rate": 5.06978613482739e-05, + "loss": 0.529, + "step": 6943 + }, + { + "epoch": 1.4274848391407133, + "grad_norm": 0.1654544472694397, + "learning_rate": 5.068795349100794e-05, + "loss": 0.57, + "step": 6944 + }, + { + "epoch": 1.427690410114092, + "grad_norm": 0.19743849337100983, + "learning_rate": 5.067804535352311e-05, + "loss": 0.558, + "step": 6945 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 0.188226580619812, + "learning_rate": 5.066813693630752e-05, + "loss": 0.5425, + "step": 6946 + }, + { + "epoch": 1.4281015520608489, + "grad_norm": 0.1916334182024002, + "learning_rate": 5.065822823984931e-05, + "loss": 0.582, + "step": 6947 + }, + { + "epoch": 1.4283071230342275, + "grad_norm": 0.1938442885875702, + "learning_rate": 5.064831926463664e-05, + "loss": 0.5607, + "step": 6948 + }, + { + "epoch": 1.428512694007606, + "grad_norm": 0.19236359000205994, + "learning_rate": 5.0638410011157694e-05, + "loss": 0.5811, + "step": 6949 + }, + { + "epoch": 1.4287182649809846, + "grad_norm": 0.19282235205173492, + "learning_rate": 5.0628500479900636e-05, + "loss": 0.5456, + "step": 6950 + }, + { + "epoch": 1.4289238359543632, + "grad_norm": 0.19609522819519043, + "learning_rate": 5.0618590671353655e-05, + "loss": 0.5484, + "step": 6951 + }, + { + "epoch": 1.4291294069277418, + "grad_norm": 0.19038927555084229, + "learning_rate": 5.060868058600499e-05, + "loss": 0.538, + "step": 6952 + }, + { + "epoch": 1.4293349779011204, + "grad_norm": 0.15865328907966614, + "learning_rate": 5.0598770224342834e-05, + "loss": 0.5187, + "step": 6953 + }, + { + "epoch": 1.429540548874499, + "grad_norm": 0.1643393188714981, + "learning_rate": 5.0588859586855435e-05, + "loss": 0.561, + "step": 6954 + }, + { + "epoch": 1.4297461198478776, + "grad_norm": 0.18920312821865082, + "learning_rate": 5.057894867403106e-05, + "loss": 0.5582, + "step": 6955 + }, + { + "epoch": 1.4299516908212562, + "grad_norm": 0.20650269091129303, + "learning_rate": 5.0569037486357954e-05, + "loss": 0.5485, + "step": 6956 + }, + { + "epoch": 1.4301572617946345, + "grad_norm": 0.19086134433746338, + "learning_rate": 5.0559126024324394e-05, + "loss": 0.5668, + "step": 6957 + }, + { + "epoch": 1.4303628327680131, + "grad_norm": 0.18574881553649902, + "learning_rate": 5.0549214288418695e-05, + "loss": 0.5305, + "step": 6958 + }, + { + "epoch": 1.4305684037413917, + "grad_norm": 0.16486965119838715, + "learning_rate": 5.053930227912913e-05, + "loss": 0.5394, + "step": 6959 + }, + { + "epoch": 1.4307739747147703, + "grad_norm": 0.1669962853193283, + "learning_rate": 5.052938999694403e-05, + "loss": 0.5604, + "step": 6960 + }, + { + "epoch": 1.430979545688149, + "grad_norm": 0.16902011632919312, + "learning_rate": 5.0519477442351735e-05, + "loss": 0.5269, + "step": 6961 + }, + { + "epoch": 1.4311851166615273, + "grad_norm": 0.1662750244140625, + "learning_rate": 5.0509564615840586e-05, + "loss": 0.5506, + "step": 6962 + }, + { + "epoch": 1.4313906876349058, + "grad_norm": 0.19221939146518707, + "learning_rate": 5.049965151789895e-05, + "loss": 0.5682, + "step": 6963 + }, + { + "epoch": 1.4315962586082844, + "grad_norm": 0.18976832926273346, + "learning_rate": 5.048973814901516e-05, + "loss": 0.5402, + "step": 6964 + }, + { + "epoch": 1.431801829581663, + "grad_norm": 0.18504224717617035, + "learning_rate": 5.047982450967766e-05, + "loss": 0.536, + "step": 6965 + }, + { + "epoch": 1.4320074005550416, + "grad_norm": 0.18513992428779602, + "learning_rate": 5.0469910600374815e-05, + "loss": 0.5433, + "step": 6966 + }, + { + "epoch": 1.4322129715284202, + "grad_norm": 0.1597176045179367, + "learning_rate": 5.045999642159503e-05, + "loss": 0.5006, + "step": 6967 + }, + { + "epoch": 1.4324185425017988, + "grad_norm": 0.13741186261177063, + "learning_rate": 5.045008197382674e-05, + "loss": 0.5147, + "step": 6968 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 0.16074904799461365, + "learning_rate": 5.044016725755838e-05, + "loss": 0.5536, + "step": 6969 + }, + { + "epoch": 1.432829684448556, + "grad_norm": 0.21094325184822083, + "learning_rate": 5.043025227327842e-05, + "loss": 0.5529, + "step": 6970 + }, + { + "epoch": 1.4330352554219346, + "grad_norm": 0.19735904037952423, + "learning_rate": 5.0420337021475304e-05, + "loss": 0.5282, + "step": 6971 + }, + { + "epoch": 1.433240826395313, + "grad_norm": 0.1973976045846939, + "learning_rate": 5.041042150263753e-05, + "loss": 0.5593, + "step": 6972 + }, + { + "epoch": 1.4334463973686915, + "grad_norm": 0.19355326890945435, + "learning_rate": 5.0400505717253575e-05, + "loss": 0.5692, + "step": 6973 + }, + { + "epoch": 1.43365196834207, + "grad_norm": 0.19223208725452423, + "learning_rate": 5.0390589665811944e-05, + "loss": 0.5534, + "step": 6974 + }, + { + "epoch": 1.4338575393154487, + "grad_norm": 0.1662292182445526, + "learning_rate": 5.038067334880113e-05, + "loss": 0.5175, + "step": 6975 + }, + { + "epoch": 1.4340631102888273, + "grad_norm": 0.15810272097587585, + "learning_rate": 5.0370756766709716e-05, + "loss": 0.5404, + "step": 6976 + }, + { + "epoch": 1.4342686812622059, + "grad_norm": 0.19795885682106018, + "learning_rate": 5.0360839920026215e-05, + "loss": 0.5718, + "step": 6977 + }, + { + "epoch": 1.4344742522355842, + "grad_norm": 0.19126173853874207, + "learning_rate": 5.0350922809239184e-05, + "loss": 0.5549, + "step": 6978 + }, + { + "epoch": 1.4346798232089628, + "grad_norm": 0.20567071437835693, + "learning_rate": 5.03410054348372e-05, + "loss": 0.5577, + "step": 6979 + }, + { + "epoch": 1.4348853941823414, + "grad_norm": 0.1884375363588333, + "learning_rate": 5.033108779730883e-05, + "loss": 0.5491, + "step": 6980 + }, + { + "epoch": 1.43509096515572, + "grad_norm": 0.16468265652656555, + "learning_rate": 5.0321169897142695e-05, + "loss": 0.5049, + "step": 6981 + }, + { + "epoch": 1.4352965361290986, + "grad_norm": 0.16884614527225494, + "learning_rate": 5.031125173482738e-05, + "loss": 0.5472, + "step": 6982 + }, + { + "epoch": 1.4355021071024772, + "grad_norm": 0.2028854638338089, + "learning_rate": 5.0301333310851526e-05, + "loss": 0.5737, + "step": 6983 + }, + { + "epoch": 1.4357076780758558, + "grad_norm": 0.19400665163993835, + "learning_rate": 5.029141462570376e-05, + "loss": 0.5492, + "step": 6984 + }, + { + "epoch": 1.4359132490492343, + "grad_norm": 0.19768649339675903, + "learning_rate": 5.028149567987271e-05, + "loss": 0.5461, + "step": 6985 + }, + { + "epoch": 1.436118820022613, + "grad_norm": 0.164305180311203, + "learning_rate": 5.027157647384708e-05, + "loss": 0.5386, + "step": 6986 + }, + { + "epoch": 1.4363243909959913, + "grad_norm": 0.16050846874713898, + "learning_rate": 5.02616570081155e-05, + "loss": 0.5472, + "step": 6987 + }, + { + "epoch": 1.4365299619693699, + "grad_norm": 0.19127194583415985, + "learning_rate": 5.025173728316668e-05, + "loss": 0.5656, + "step": 6988 + }, + { + "epoch": 1.4367355329427485, + "grad_norm": 0.1859859675168991, + "learning_rate": 5.02418172994893e-05, + "loss": 0.5506, + "step": 6989 + }, + { + "epoch": 1.436941103916127, + "grad_norm": 0.16769689321517944, + "learning_rate": 5.0231897057572085e-05, + "loss": 0.5391, + "step": 6990 + }, + { + "epoch": 1.4371466748895056, + "grad_norm": 0.16699868440628052, + "learning_rate": 5.0221976557903755e-05, + "loss": 0.5287, + "step": 6991 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 0.19447840750217438, + "learning_rate": 5.021205580097305e-05, + "loss": 0.5451, + "step": 6992 + }, + { + "epoch": 1.4375578168362626, + "grad_norm": 0.1894395351409912, + "learning_rate": 5.020213478726871e-05, + "loss": 0.546, + "step": 6993 + }, + { + "epoch": 1.4377633878096412, + "grad_norm": 0.20027700066566467, + "learning_rate": 5.0192213517279524e-05, + "loss": 0.5488, + "step": 6994 + }, + { + "epoch": 1.4379689587830198, + "grad_norm": 0.15890729427337646, + "learning_rate": 5.0182291991494224e-05, + "loss": 0.5155, + "step": 6995 + }, + { + "epoch": 1.4381745297563984, + "grad_norm": 0.16410616040229797, + "learning_rate": 5.017237021040163e-05, + "loss": 0.5709, + "step": 6996 + }, + { + "epoch": 1.438380100729777, + "grad_norm": 0.19332385063171387, + "learning_rate": 5.016244817449054e-05, + "loss": 0.5472, + "step": 6997 + }, + { + "epoch": 1.4385856717031555, + "grad_norm": 0.18809527158737183, + "learning_rate": 5.015252588424975e-05, + "loss": 0.5594, + "step": 6998 + }, + { + "epoch": 1.4387912426765341, + "grad_norm": 0.19198375940322876, + "learning_rate": 5.0142603340168084e-05, + "loss": 0.5545, + "step": 6999 + }, + { + "epoch": 1.4389968136499127, + "grad_norm": 0.1915784478187561, + "learning_rate": 5.0132680542734396e-05, + "loss": 0.5627, + "step": 7000 + }, + { + "epoch": 1.4392023846232913, + "grad_norm": 0.19142676889896393, + "learning_rate": 5.012275749243752e-05, + "loss": 0.5473, + "step": 7001 + }, + { + "epoch": 1.4394079555966697, + "grad_norm": 0.18919003009796143, + "learning_rate": 5.011283418976633e-05, + "loss": 0.5513, + "step": 7002 + }, + { + "epoch": 1.4396135265700483, + "grad_norm": 0.16133341193199158, + "learning_rate": 5.010291063520969e-05, + "loss": 0.4986, + "step": 7003 + }, + { + "epoch": 1.4398190975434269, + "grad_norm": 0.15433275699615479, + "learning_rate": 5.009298682925651e-05, + "loss": 0.5429, + "step": 7004 + }, + { + "epoch": 1.4400246685168054, + "grad_norm": 0.17464013397693634, + "learning_rate": 5.008306277239567e-05, + "loss": 0.524, + "step": 7005 + }, + { + "epoch": 1.440230239490184, + "grad_norm": 0.15277941524982452, + "learning_rate": 5.0073138465116075e-05, + "loss": 0.5293, + "step": 7006 + }, + { + "epoch": 1.4404358104635626, + "grad_norm": 0.1988225281238556, + "learning_rate": 5.0063213907906665e-05, + "loss": 0.5324, + "step": 7007 + }, + { + "epoch": 1.440641381436941, + "grad_norm": 0.2008810192346573, + "learning_rate": 5.005328910125638e-05, + "loss": 0.5634, + "step": 7008 + }, + { + "epoch": 1.4408469524103196, + "grad_norm": 0.19552162289619446, + "learning_rate": 5.004336404565415e-05, + "loss": 0.5382, + "step": 7009 + }, + { + "epoch": 1.4410525233836982, + "grad_norm": 0.1576053947210312, + "learning_rate": 5.003343874158895e-05, + "loss": 0.4966, + "step": 7010 + }, + { + "epoch": 1.4412580943570767, + "grad_norm": 0.18060800433158875, + "learning_rate": 5.002351318954975e-05, + "loss": 0.5758, + "step": 7011 + }, + { + "epoch": 1.4414636653304553, + "grad_norm": 0.19537772238254547, + "learning_rate": 5.001358739002553e-05, + "loss": 0.5713, + "step": 7012 + }, + { + "epoch": 1.441669236303834, + "grad_norm": 0.18666040897369385, + "learning_rate": 5.0003661343505284e-05, + "loss": 0.5334, + "step": 7013 + }, + { + "epoch": 1.4418748072772125, + "grad_norm": 0.16254711151123047, + "learning_rate": 4.9993735050478045e-05, + "loss": 0.5159, + "step": 7014 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 0.1602196842432022, + "learning_rate": 4.9983808511432824e-05, + "loss": 0.5267, + "step": 7015 + }, + { + "epoch": 1.4422859492239697, + "grad_norm": 0.1874070167541504, + "learning_rate": 4.9973881726858644e-05, + "loss": 0.5258, + "step": 7016 + }, + { + "epoch": 1.442491520197348, + "grad_norm": 0.19187650084495544, + "learning_rate": 4.996395469724456e-05, + "loss": 0.5574, + "step": 7017 + }, + { + "epoch": 1.4426970911707266, + "grad_norm": 0.1952408105134964, + "learning_rate": 4.995402742307963e-05, + "loss": 0.5735, + "step": 7018 + }, + { + "epoch": 1.4429026621441052, + "grad_norm": 0.20097225904464722, + "learning_rate": 4.9944099904852926e-05, + "loss": 0.572, + "step": 7019 + }, + { + "epoch": 1.4431082331174838, + "grad_norm": 0.16808289289474487, + "learning_rate": 4.993417214305352e-05, + "loss": 0.5367, + "step": 7020 + }, + { + "epoch": 1.4433138040908624, + "grad_norm": 0.16581854224205017, + "learning_rate": 4.992424413817053e-05, + "loss": 0.5764, + "step": 7021 + }, + { + "epoch": 1.443519375064241, + "grad_norm": 0.15527617931365967, + "learning_rate": 4.9914315890693035e-05, + "loss": 0.5166, + "step": 7022 + }, + { + "epoch": 1.4437249460376194, + "grad_norm": 0.15834735333919525, + "learning_rate": 4.990438740111017e-05, + "loss": 0.5397, + "step": 7023 + }, + { + "epoch": 1.443930517010998, + "grad_norm": 0.1944034993648529, + "learning_rate": 4.989445866991105e-05, + "loss": 0.5449, + "step": 7024 + }, + { + "epoch": 1.4441360879843765, + "grad_norm": 0.1605810672044754, + "learning_rate": 4.988452969758485e-05, + "loss": 0.5229, + "step": 7025 + }, + { + "epoch": 1.4443416589577551, + "grad_norm": 0.15166768431663513, + "learning_rate": 4.9874600484620684e-05, + "loss": 0.5337, + "step": 7026 + }, + { + "epoch": 1.4445472299311337, + "grad_norm": 0.19105499982833862, + "learning_rate": 4.9864671031507746e-05, + "loss": 0.5351, + "step": 7027 + }, + { + "epoch": 1.4447528009045123, + "grad_norm": 0.18772821128368378, + "learning_rate": 4.98547413387352e-05, + "loss": 0.5418, + "step": 7028 + }, + { + "epoch": 1.444958371877891, + "grad_norm": 0.1658894121646881, + "learning_rate": 4.984481140679224e-05, + "loss": 0.5272, + "step": 7029 + }, + { + "epoch": 1.4451639428512695, + "grad_norm": 0.17171718180179596, + "learning_rate": 4.983488123616807e-05, + "loss": 0.5593, + "step": 7030 + }, + { + "epoch": 1.445369513824648, + "grad_norm": 0.18422532081604004, + "learning_rate": 4.9824950827351894e-05, + "loss": 0.5262, + "step": 7031 + }, + { + "epoch": 1.4455750847980267, + "grad_norm": 0.19110561907291412, + "learning_rate": 4.981502018083295e-05, + "loss": 0.5546, + "step": 7032 + }, + { + "epoch": 1.445780655771405, + "grad_norm": 0.18570828437805176, + "learning_rate": 4.980508929710045e-05, + "loss": 0.5493, + "step": 7033 + }, + { + "epoch": 1.4459862267447836, + "grad_norm": 0.19072416424751282, + "learning_rate": 4.9795158176643665e-05, + "loss": 0.5656, + "step": 7034 + }, + { + "epoch": 1.4461917977181622, + "grad_norm": 0.18956297636032104, + "learning_rate": 4.978522681995186e-05, + "loss": 0.5594, + "step": 7035 + }, + { + "epoch": 1.4463973686915408, + "grad_norm": 0.1876407116651535, + "learning_rate": 4.977529522751429e-05, + "loss": 0.5668, + "step": 7036 + }, + { + "epoch": 1.4466029396649194, + "grad_norm": 0.1943429410457611, + "learning_rate": 4.976536339982024e-05, + "loss": 0.5389, + "step": 7037 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.19916300475597382, + "learning_rate": 4.975543133735901e-05, + "loss": 0.5564, + "step": 7038 + }, + { + "epoch": 1.4470140816116763, + "grad_norm": 0.19892625510692596, + "learning_rate": 4.974549904061991e-05, + "loss": 0.5782, + "step": 7039 + }, + { + "epoch": 1.447219652585055, + "grad_norm": 0.19441033899784088, + "learning_rate": 4.9735566510092245e-05, + "loss": 0.5703, + "step": 7040 + }, + { + "epoch": 1.4474252235584335, + "grad_norm": 0.1984698474407196, + "learning_rate": 4.972563374626536e-05, + "loss": 0.5614, + "step": 7041 + }, + { + "epoch": 1.447630794531812, + "grad_norm": 0.16778507828712463, + "learning_rate": 4.971570074962859e-05, + "loss": 0.5299, + "step": 7042 + }, + { + "epoch": 1.4478363655051907, + "grad_norm": 0.14573578536510468, + "learning_rate": 4.970576752067128e-05, + "loss": 0.5233, + "step": 7043 + }, + { + "epoch": 1.4480419364785693, + "grad_norm": 0.14844007790088654, + "learning_rate": 4.9695834059882796e-05, + "loss": 0.5304, + "step": 7044 + }, + { + "epoch": 1.4482475074519479, + "grad_norm": 0.19099220633506775, + "learning_rate": 4.968590036775251e-05, + "loss": 0.5603, + "step": 7045 + }, + { + "epoch": 1.4484530784253264, + "grad_norm": 0.16473321616649628, + "learning_rate": 4.967596644476983e-05, + "loss": 0.5134, + "step": 7046 + }, + { + "epoch": 1.448658649398705, + "grad_norm": 0.17135196924209595, + "learning_rate": 4.966603229142412e-05, + "loss": 0.5579, + "step": 7047 + }, + { + "epoch": 1.4488642203720834, + "grad_norm": 0.19533687829971313, + "learning_rate": 4.9656097908204825e-05, + "loss": 0.5617, + "step": 7048 + }, + { + "epoch": 1.449069791345462, + "grad_norm": 0.1876286268234253, + "learning_rate": 4.964616329560136e-05, + "loss": 0.554, + "step": 7049 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.16037873923778534, + "learning_rate": 4.9636228454103126e-05, + "loss": 0.529, + "step": 7050 + }, + { + "epoch": 1.4494809332922192, + "grad_norm": 0.1680610179901123, + "learning_rate": 4.962629338419958e-05, + "loss": 0.5376, + "step": 7051 + }, + { + "epoch": 1.4496865042655978, + "grad_norm": 0.1924995481967926, + "learning_rate": 4.9616358086380196e-05, + "loss": 0.5543, + "step": 7052 + }, + { + "epoch": 1.4498920752389761, + "grad_norm": 0.1638346016407013, + "learning_rate": 4.9606422561134425e-05, + "loss": 0.5091, + "step": 7053 + }, + { + "epoch": 1.4500976462123547, + "grad_norm": 0.16642382740974426, + "learning_rate": 4.9596486808951735e-05, + "loss": 0.5628, + "step": 7054 + }, + { + "epoch": 1.4503032171857333, + "grad_norm": 0.16534394025802612, + "learning_rate": 4.958655083032164e-05, + "loss": 0.5297, + "step": 7055 + }, + { + "epoch": 1.4505087881591119, + "grad_norm": 0.16639864444732666, + "learning_rate": 4.95766146257336e-05, + "loss": 0.5561, + "step": 7056 + }, + { + "epoch": 1.4507143591324905, + "grad_norm": 0.190561905503273, + "learning_rate": 4.956667819567717e-05, + "loss": 0.5604, + "step": 7057 + }, + { + "epoch": 1.450919930105869, + "grad_norm": 0.19295108318328857, + "learning_rate": 4.955674154064182e-05, + "loss": 0.5524, + "step": 7058 + }, + { + "epoch": 1.4511255010792476, + "grad_norm": 0.19699627161026, + "learning_rate": 4.9546804661117146e-05, + "loss": 0.5482, + "step": 7059 + }, + { + "epoch": 1.4513310720526262, + "grad_norm": 0.18727873265743256, + "learning_rate": 4.953686755759265e-05, + "loss": 0.5565, + "step": 7060 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 0.19223269820213318, + "learning_rate": 4.952693023055788e-05, + "loss": 0.5661, + "step": 7061 + }, + { + "epoch": 1.4517422139993834, + "grad_norm": 0.19679668545722961, + "learning_rate": 4.951699268050243e-05, + "loss": 0.5632, + "step": 7062 + }, + { + "epoch": 1.4519477849727618, + "grad_norm": 0.19206634163856506, + "learning_rate": 4.9507054907915866e-05, + "loss": 0.5459, + "step": 7063 + }, + { + "epoch": 1.4521533559461404, + "grad_norm": 0.19624993205070496, + "learning_rate": 4.949711691328777e-05, + "loss": 0.5741, + "step": 7064 + }, + { + "epoch": 1.452358926919519, + "grad_norm": 0.19353879988193512, + "learning_rate": 4.948717869710773e-05, + "loss": 0.5228, + "step": 7065 + }, + { + "epoch": 1.4525644978928975, + "grad_norm": 0.1924706995487213, + "learning_rate": 4.947724025986538e-05, + "loss": 0.5716, + "step": 7066 + }, + { + "epoch": 1.4527700688662761, + "grad_norm": 0.19107024371623993, + "learning_rate": 4.946730160205033e-05, + "loss": 0.555, + "step": 7067 + }, + { + "epoch": 1.4529756398396547, + "grad_norm": 0.18900389969348907, + "learning_rate": 4.94573627241522e-05, + "loss": 0.5505, + "step": 7068 + }, + { + "epoch": 1.453181210813033, + "grad_norm": 0.16496512293815613, + "learning_rate": 4.944742362666065e-05, + "loss": 0.5272, + "step": 7069 + }, + { + "epoch": 1.4533867817864117, + "grad_norm": 0.16446129977703094, + "learning_rate": 4.9437484310065326e-05, + "loss": 0.5483, + "step": 7070 + }, + { + "epoch": 1.4535923527597903, + "grad_norm": 0.1935243159532547, + "learning_rate": 4.942754477485588e-05, + "loss": 0.5516, + "step": 7071 + }, + { + "epoch": 1.4537979237331689, + "grad_norm": 0.1573350727558136, + "learning_rate": 4.9417605021522016e-05, + "loss": 0.5269, + "step": 7072 + }, + { + "epoch": 1.4540034947065474, + "grad_norm": 0.1570722460746765, + "learning_rate": 4.9407665050553395e-05, + "loss": 0.5599, + "step": 7073 + }, + { + "epoch": 1.454209065679926, + "grad_norm": 0.19235976040363312, + "learning_rate": 4.9397724862439726e-05, + "loss": 0.5488, + "step": 7074 + }, + { + "epoch": 1.4544146366533046, + "grad_norm": 0.19353123009204865, + "learning_rate": 4.938778445767069e-05, + "loss": 0.5436, + "step": 7075 + }, + { + "epoch": 1.4546202076266832, + "grad_norm": 0.192392498254776, + "learning_rate": 4.9377843836736026e-05, + "loss": 0.547, + "step": 7076 + }, + { + "epoch": 1.4548257786000618, + "grad_norm": 0.1857522875070572, + "learning_rate": 4.936790300012545e-05, + "loss": 0.5477, + "step": 7077 + }, + { + "epoch": 1.4550313495734402, + "grad_norm": 0.20272956788539886, + "learning_rate": 4.935796194832872e-05, + "loss": 0.5526, + "step": 7078 + }, + { + "epoch": 1.4552369205468187, + "grad_norm": 0.1533660888671875, + "learning_rate": 4.9348020681835573e-05, + "loss": 0.5079, + "step": 7079 + }, + { + "epoch": 1.4554424915201973, + "grad_norm": 0.15885986387729645, + "learning_rate": 4.9338079201135777e-05, + "loss": 0.544, + "step": 7080 + }, + { + "epoch": 1.455648062493576, + "grad_norm": 0.19332925975322723, + "learning_rate": 4.932813750671909e-05, + "loss": 0.5493, + "step": 7081 + }, + { + "epoch": 1.4558536334669545, + "grad_norm": 0.16609343886375427, + "learning_rate": 4.931819559907529e-05, + "loss": 0.5295, + "step": 7082 + }, + { + "epoch": 1.456059204440333, + "grad_norm": 0.12420736253261566, + "learning_rate": 4.930825347869418e-05, + "loss": 0.5104, + "step": 7083 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 0.12772247195243835, + "learning_rate": 4.9298311146065565e-05, + "loss": 0.5214, + "step": 7084 + }, + { + "epoch": 1.45647034638709, + "grad_norm": 0.1771061271429062, + "learning_rate": 4.9288368601679235e-05, + "loss": 0.5358, + "step": 7085 + }, + { + "epoch": 1.4566759173604686, + "grad_norm": 0.20758508145809174, + "learning_rate": 4.9278425846025047e-05, + "loss": 0.5321, + "step": 7086 + }, + { + "epoch": 1.4568814883338472, + "grad_norm": 0.16325919330120087, + "learning_rate": 4.926848287959281e-05, + "loss": 0.5155, + "step": 7087 + }, + { + "epoch": 1.4570870593072258, + "grad_norm": 0.15556760132312775, + "learning_rate": 4.925853970287236e-05, + "loss": 0.5374, + "step": 7088 + }, + { + "epoch": 1.4572926302806044, + "grad_norm": 0.19319914281368256, + "learning_rate": 4.924859631635356e-05, + "loss": 0.5403, + "step": 7089 + }, + { + "epoch": 1.457498201253983, + "grad_norm": 0.19514033198356628, + "learning_rate": 4.9238652720526295e-05, + "loss": 0.5609, + "step": 7090 + }, + { + "epoch": 1.4577037722273616, + "grad_norm": 0.18153122067451477, + "learning_rate": 4.922870891588042e-05, + "loss": 0.5313, + "step": 7091 + }, + { + "epoch": 1.4579093432007402, + "grad_norm": 0.19177407026290894, + "learning_rate": 4.9218764902905814e-05, + "loss": 0.5595, + "step": 7092 + }, + { + "epoch": 1.4581149141741185, + "grad_norm": 0.18836280703544617, + "learning_rate": 4.920882068209238e-05, + "loss": 0.544, + "step": 7093 + }, + { + "epoch": 1.4583204851474971, + "grad_norm": 0.19115997850894928, + "learning_rate": 4.919887625393003e-05, + "loss": 0.5544, + "step": 7094 + }, + { + "epoch": 1.4585260561208757, + "grad_norm": 0.1862732619047165, + "learning_rate": 4.918893161890867e-05, + "loss": 0.5515, + "step": 7095 + }, + { + "epoch": 1.4587316270942543, + "grad_norm": 0.15882770717144012, + "learning_rate": 4.917898677751822e-05, + "loss": 0.5248, + "step": 7096 + }, + { + "epoch": 1.458937198067633, + "grad_norm": 0.16427573561668396, + "learning_rate": 4.9169041730248634e-05, + "loss": 0.5654, + "step": 7097 + }, + { + "epoch": 1.4591427690410115, + "grad_norm": 0.19142089784145355, + "learning_rate": 4.915909647758984e-05, + "loss": 0.5522, + "step": 7098 + }, + { + "epoch": 1.4593483400143898, + "grad_norm": 0.19446474313735962, + "learning_rate": 4.914915102003181e-05, + "loss": 0.5274, + "step": 7099 + }, + { + "epoch": 1.4595539109877684, + "grad_norm": 0.1596178114414215, + "learning_rate": 4.9139205358064495e-05, + "loss": 0.5138, + "step": 7100 + }, + { + "epoch": 1.459759481961147, + "grad_norm": 0.1602422297000885, + "learning_rate": 4.912925949217788e-05, + "loss": 0.5237, + "step": 7101 + }, + { + "epoch": 1.4599650529345256, + "grad_norm": 0.19484317302703857, + "learning_rate": 4.911931342286195e-05, + "loss": 0.5393, + "step": 7102 + }, + { + "epoch": 1.4601706239079042, + "grad_norm": 0.2035979926586151, + "learning_rate": 4.91093671506067e-05, + "loss": 0.5555, + "step": 7103 + }, + { + "epoch": 1.4603761948812828, + "grad_norm": 0.19783945381641388, + "learning_rate": 4.909942067590215e-05, + "loss": 0.5507, + "step": 7104 + }, + { + "epoch": 1.4605817658546614, + "grad_norm": 0.19101816415786743, + "learning_rate": 4.9089473999238294e-05, + "loss": 0.5457, + "step": 7105 + }, + { + "epoch": 1.46078733682804, + "grad_norm": 0.18535058200359344, + "learning_rate": 4.907952712110516e-05, + "loss": 0.5209, + "step": 7106 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 0.1839088499546051, + "learning_rate": 4.906958004199281e-05, + "loss": 0.5424, + "step": 7107 + }, + { + "epoch": 1.461198478774797, + "grad_norm": 0.18688786029815674, + "learning_rate": 4.905963276239127e-05, + "loss": 0.5383, + "step": 7108 + }, + { + "epoch": 1.4614040497481755, + "grad_norm": 0.19204580783843994, + "learning_rate": 4.904968528279058e-05, + "loss": 0.5667, + "step": 7109 + }, + { + "epoch": 1.461609620721554, + "grad_norm": 0.19083940982818604, + "learning_rate": 4.903973760368084e-05, + "loss": 0.5628, + "step": 7110 + }, + { + "epoch": 1.4618151916949327, + "grad_norm": 0.1922621876001358, + "learning_rate": 4.9029789725552105e-05, + "loss": 0.536, + "step": 7111 + }, + { + "epoch": 1.4620207626683113, + "grad_norm": 0.19811585545539856, + "learning_rate": 4.901984164889447e-05, + "loss": 0.571, + "step": 7112 + }, + { + "epoch": 1.4622263336416899, + "grad_norm": 0.1963101178407669, + "learning_rate": 4.9009893374198015e-05, + "loss": 0.568, + "step": 7113 + }, + { + "epoch": 1.4624319046150682, + "grad_norm": 0.19826072454452515, + "learning_rate": 4.899994490195286e-05, + "loss": 0.541, + "step": 7114 + }, + { + "epoch": 1.4626374755884468, + "grad_norm": 0.19222994148731232, + "learning_rate": 4.898999623264913e-05, + "loss": 0.5699, + "step": 7115 + }, + { + "epoch": 1.4628430465618254, + "grad_norm": 0.19945533573627472, + "learning_rate": 4.898004736677692e-05, + "loss": 0.5663, + "step": 7116 + }, + { + "epoch": 1.463048617535204, + "grad_norm": 0.18743856251239777, + "learning_rate": 4.8970098304826384e-05, + "loss": 0.5423, + "step": 7117 + }, + { + "epoch": 1.4632541885085826, + "grad_norm": 0.1742721050977707, + "learning_rate": 4.896014904728766e-05, + "loss": 0.5273, + "step": 7118 + }, + { + "epoch": 1.4634597594819612, + "grad_norm": 0.15842121839523315, + "learning_rate": 4.895019959465091e-05, + "loss": 0.5392, + "step": 7119 + }, + { + "epoch": 1.4636653304553398, + "grad_norm": 0.1904791295528412, + "learning_rate": 4.894024994740627e-05, + "loss": 0.565, + "step": 7120 + }, + { + "epoch": 1.4638709014287183, + "grad_norm": 0.18996872007846832, + "learning_rate": 4.893030010604393e-05, + "loss": 0.5624, + "step": 7121 + }, + { + "epoch": 1.464076472402097, + "grad_norm": 0.18377164006233215, + "learning_rate": 4.89203500710541e-05, + "loss": 0.5628, + "step": 7122 + }, + { + "epoch": 1.4642820433754755, + "grad_norm": 0.19251424074172974, + "learning_rate": 4.891039984292693e-05, + "loss": 0.5489, + "step": 7123 + }, + { + "epoch": 1.4644876143488539, + "grad_norm": 0.1817564070224762, + "learning_rate": 4.890044942215263e-05, + "loss": 0.5592, + "step": 7124 + }, + { + "epoch": 1.4646931853222325, + "grad_norm": 0.1885865181684494, + "learning_rate": 4.8890498809221434e-05, + "loss": 0.5447, + "step": 7125 + }, + { + "epoch": 1.464898756295611, + "grad_norm": 0.19473087787628174, + "learning_rate": 4.8880548004623545e-05, + "loss": 0.5545, + "step": 7126 + }, + { + "epoch": 1.4651043272689896, + "grad_norm": 0.18976017832756042, + "learning_rate": 4.8870597008849175e-05, + "loss": 0.5323, + "step": 7127 + }, + { + "epoch": 1.4653098982423682, + "grad_norm": 0.1930120289325714, + "learning_rate": 4.88606458223886e-05, + "loss": 0.5459, + "step": 7128 + }, + { + "epoch": 1.4655154692157466, + "grad_norm": 0.18661560118198395, + "learning_rate": 4.885069444573205e-05, + "loss": 0.5345, + "step": 7129 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 0.1941232681274414, + "learning_rate": 4.884074287936977e-05, + "loss": 0.5289, + "step": 7130 + }, + { + "epoch": 1.4659266111625038, + "grad_norm": 0.19508835673332214, + "learning_rate": 4.883079112379204e-05, + "loss": 0.5421, + "step": 7131 + }, + { + "epoch": 1.4661321821358824, + "grad_norm": 0.200748473405838, + "learning_rate": 4.882083917948914e-05, + "loss": 0.5602, + "step": 7132 + }, + { + "epoch": 1.466337753109261, + "grad_norm": 0.19630691409111023, + "learning_rate": 4.8810887046951356e-05, + "loss": 0.5469, + "step": 7133 + }, + { + "epoch": 1.4665433240826395, + "grad_norm": 0.18631185591220856, + "learning_rate": 4.880093472666897e-05, + "loss": 0.5349, + "step": 7134 + }, + { + "epoch": 1.4667488950560181, + "grad_norm": 0.20446190237998962, + "learning_rate": 4.879098221913231e-05, + "loss": 0.5395, + "step": 7135 + }, + { + "epoch": 1.4669544660293967, + "grad_norm": 0.19369782507419586, + "learning_rate": 4.8781029524831676e-05, + "loss": 0.548, + "step": 7136 + }, + { + "epoch": 1.4671600370027753, + "grad_norm": 0.19022773206233978, + "learning_rate": 4.8771076644257365e-05, + "loss": 0.5499, + "step": 7137 + }, + { + "epoch": 1.467365607976154, + "grad_norm": 0.19664426147937775, + "learning_rate": 4.876112357789977e-05, + "loss": 0.5629, + "step": 7138 + }, + { + "epoch": 1.4675711789495323, + "grad_norm": 0.19032470881938934, + "learning_rate": 4.875117032624917e-05, + "loss": 0.546, + "step": 7139 + }, + { + "epoch": 1.4677767499229109, + "grad_norm": 0.18640637397766113, + "learning_rate": 4.874121688979595e-05, + "loss": 0.5317, + "step": 7140 + }, + { + "epoch": 1.4679823208962894, + "grad_norm": 0.19098687171936035, + "learning_rate": 4.873126326903045e-05, + "loss": 0.5494, + "step": 7141 + }, + { + "epoch": 1.468187891869668, + "grad_norm": 0.19771692156791687, + "learning_rate": 4.872130946444305e-05, + "loss": 0.5562, + "step": 7142 + }, + { + "epoch": 1.4683934628430466, + "grad_norm": 0.18976187705993652, + "learning_rate": 4.871135547652414e-05, + "loss": 0.5607, + "step": 7143 + }, + { + "epoch": 1.4685990338164252, + "grad_norm": 0.19151365756988525, + "learning_rate": 4.870140130576408e-05, + "loss": 0.5471, + "step": 7144 + }, + { + "epoch": 1.4688046047898036, + "grad_norm": 0.19620567560195923, + "learning_rate": 4.869144695265328e-05, + "loss": 0.562, + "step": 7145 + }, + { + "epoch": 1.4690101757631822, + "grad_norm": 0.19159796833992004, + "learning_rate": 4.8681492417682154e-05, + "loss": 0.5638, + "step": 7146 + }, + { + "epoch": 1.4692157467365607, + "grad_norm": 0.20116734504699707, + "learning_rate": 4.867153770134108e-05, + "loss": 0.5677, + "step": 7147 + }, + { + "epoch": 1.4694213177099393, + "grad_norm": 0.19330163300037384, + "learning_rate": 4.866158280412053e-05, + "loss": 0.5546, + "step": 7148 + }, + { + "epoch": 1.469626888683318, + "grad_norm": 0.18877775967121124, + "learning_rate": 4.86516277265109e-05, + "loss": 0.559, + "step": 7149 + }, + { + "epoch": 1.4698324596566965, + "grad_norm": 0.1901031881570816, + "learning_rate": 4.864167246900265e-05, + "loss": 0.5388, + "step": 7150 + }, + { + "epoch": 1.470038030630075, + "grad_norm": 0.18822161853313446, + "learning_rate": 4.8631717032086195e-05, + "loss": 0.5466, + "step": 7151 + }, + { + "epoch": 1.4702436016034537, + "grad_norm": 0.16988466680049896, + "learning_rate": 4.862176141625203e-05, + "loss": 0.5347, + "step": 7152 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 0.12935671210289001, + "learning_rate": 4.86118056219906e-05, + "loss": 0.5038, + "step": 7153 + }, + { + "epoch": 1.4706547435502106, + "grad_norm": 0.16515877842903137, + "learning_rate": 4.860184964979239e-05, + "loss": 0.5383, + "step": 7154 + }, + { + "epoch": 1.4708603145235892, + "grad_norm": 0.2031169980764389, + "learning_rate": 4.859189350014789e-05, + "loss": 0.558, + "step": 7155 + }, + { + "epoch": 1.4710658854969678, + "grad_norm": 0.1971338540315628, + "learning_rate": 4.858193717354759e-05, + "loss": 0.5552, + "step": 7156 + }, + { + "epoch": 1.4712714564703464, + "grad_norm": 0.18545454740524292, + "learning_rate": 4.857198067048199e-05, + "loss": 0.5499, + "step": 7157 + }, + { + "epoch": 1.471477027443725, + "grad_norm": 0.18908904492855072, + "learning_rate": 4.856202399144157e-05, + "loss": 0.5331, + "step": 7158 + }, + { + "epoch": 1.4716825984171036, + "grad_norm": 0.18228811025619507, + "learning_rate": 4.855206713691691e-05, + "loss": 0.5181, + "step": 7159 + }, + { + "epoch": 1.471888169390482, + "grad_norm": 0.1866607964038849, + "learning_rate": 4.8542110107398483e-05, + "loss": 0.5157, + "step": 7160 + }, + { + "epoch": 1.4720937403638605, + "grad_norm": 0.19502104818820953, + "learning_rate": 4.853215290337685e-05, + "loss": 0.5462, + "step": 7161 + }, + { + "epoch": 1.4722993113372391, + "grad_norm": 0.16694171726703644, + "learning_rate": 4.852219552534256e-05, + "loss": 0.5123, + "step": 7162 + }, + { + "epoch": 1.4725048823106177, + "grad_norm": 0.1643698364496231, + "learning_rate": 4.851223797378614e-05, + "loss": 0.5402, + "step": 7163 + }, + { + "epoch": 1.4727104532839963, + "grad_norm": 0.20267751812934875, + "learning_rate": 4.85022802491982e-05, + "loss": 0.5493, + "step": 7164 + }, + { + "epoch": 1.472916024257375, + "grad_norm": 0.19984979927539825, + "learning_rate": 4.849232235206927e-05, + "loss": 0.5387, + "step": 7165 + }, + { + "epoch": 1.4731215952307535, + "grad_norm": 0.19350376725196838, + "learning_rate": 4.848236428288993e-05, + "loss": 0.5465, + "step": 7166 + }, + { + "epoch": 1.473327166204132, + "grad_norm": 0.2067371904850006, + "learning_rate": 4.84724060421508e-05, + "loss": 0.5688, + "step": 7167 + }, + { + "epoch": 1.4735327371775107, + "grad_norm": 0.20047098398208618, + "learning_rate": 4.846244763034243e-05, + "loss": 0.5426, + "step": 7168 + }, + { + "epoch": 1.473738308150889, + "grad_norm": 0.1930703967809677, + "learning_rate": 4.845248904795547e-05, + "loss": 0.5556, + "step": 7169 + }, + { + "epoch": 1.4739438791242676, + "grad_norm": 0.19122304022312164, + "learning_rate": 4.8442530295480496e-05, + "loss": 0.5323, + "step": 7170 + }, + { + "epoch": 1.4741494500976462, + "grad_norm": 0.1875450760126114, + "learning_rate": 4.843257137340816e-05, + "loss": 0.519, + "step": 7171 + }, + { + "epoch": 1.4743550210710248, + "grad_norm": 0.18695366382598877, + "learning_rate": 4.842261228222906e-05, + "loss": 0.538, + "step": 7172 + }, + { + "epoch": 1.4745605920444034, + "grad_norm": 0.19884580373764038, + "learning_rate": 4.841265302243386e-05, + "loss": 0.5696, + "step": 7173 + }, + { + "epoch": 1.474766163017782, + "grad_norm": 0.19241276383399963, + "learning_rate": 4.840269359451319e-05, + "loss": 0.5595, + "step": 7174 + }, + { + "epoch": 1.4749717339911603, + "grad_norm": 0.16710297763347626, + "learning_rate": 4.839273399895772e-05, + "loss": 0.5195, + "step": 7175 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 0.15979520976543427, + "learning_rate": 4.8382774236258085e-05, + "loss": 0.5616, + "step": 7176 + }, + { + "epoch": 1.4753828759379175, + "grad_norm": 0.2003268003463745, + "learning_rate": 4.8372814306904984e-05, + "loss": 0.5718, + "step": 7177 + }, + { + "epoch": 1.475588446911296, + "grad_norm": 0.18857726454734802, + "learning_rate": 4.83628542113891e-05, + "loss": 0.5305, + "step": 7178 + }, + { + "epoch": 1.4757940178846747, + "grad_norm": 0.15321624279022217, + "learning_rate": 4.8352893950201096e-05, + "loss": 0.5213, + "step": 7179 + }, + { + "epoch": 1.4759995888580533, + "grad_norm": 0.15973275899887085, + "learning_rate": 4.834293352383168e-05, + "loss": 0.5575, + "step": 7180 + }, + { + "epoch": 1.4762051598314319, + "grad_norm": 0.18778233230113983, + "learning_rate": 4.8332972932771556e-05, + "loss": 0.5239, + "step": 7181 + }, + { + "epoch": 1.4764107308048104, + "grad_norm": 0.15525855123996735, + "learning_rate": 4.832301217751142e-05, + "loss": 0.4881, + "step": 7182 + }, + { + "epoch": 1.476616301778189, + "grad_norm": 0.15355351567268372, + "learning_rate": 4.8313051258542024e-05, + "loss": 0.5315, + "step": 7183 + }, + { + "epoch": 1.4768218727515674, + "grad_norm": 0.2030985951423645, + "learning_rate": 4.830309017635407e-05, + "loss": 0.5901, + "step": 7184 + }, + { + "epoch": 1.477027443724946, + "grad_norm": 0.19170239567756653, + "learning_rate": 4.82931289314383e-05, + "loss": 0.5517, + "step": 7185 + }, + { + "epoch": 1.4772330146983246, + "grad_norm": 0.19333000481128693, + "learning_rate": 4.828316752428545e-05, + "loss": 0.5547, + "step": 7186 + }, + { + "epoch": 1.4774385856717032, + "grad_norm": 0.19361145794391632, + "learning_rate": 4.82732059553863e-05, + "loss": 0.5518, + "step": 7187 + }, + { + "epoch": 1.4776441566450818, + "grad_norm": 0.16968531906604767, + "learning_rate": 4.8263244225231586e-05, + "loss": 0.5055, + "step": 7188 + }, + { + "epoch": 1.4778497276184603, + "grad_norm": 0.1647455245256424, + "learning_rate": 4.825328233431207e-05, + "loss": 0.5489, + "step": 7189 + }, + { + "epoch": 1.4780552985918387, + "grad_norm": 0.18998976051807404, + "learning_rate": 4.824332028311856e-05, + "loss": 0.5302, + "step": 7190 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.18618905544281006, + "learning_rate": 4.8233358072141806e-05, + "loss": 0.5217, + "step": 7191 + }, + { + "epoch": 1.4784664405385959, + "grad_norm": 0.19258539378643036, + "learning_rate": 4.822339570187261e-05, + "loss": 0.5551, + "step": 7192 + }, + { + "epoch": 1.4786720115119745, + "grad_norm": 0.1874276101589203, + "learning_rate": 4.821343317280179e-05, + "loss": 0.5409, + "step": 7193 + }, + { + "epoch": 1.478877582485353, + "grad_norm": 0.18570971488952637, + "learning_rate": 4.8203470485420126e-05, + "loss": 0.5524, + "step": 7194 + }, + { + "epoch": 1.4790831534587316, + "grad_norm": 0.19946832954883575, + "learning_rate": 4.819350764021844e-05, + "loss": 0.5618, + "step": 7195 + }, + { + "epoch": 1.4792887244321102, + "grad_norm": 0.1732860952615738, + "learning_rate": 4.818354463768756e-05, + "loss": 0.5354, + "step": 7196 + }, + { + "epoch": 1.4794942954054888, + "grad_norm": 0.16083048284053802, + "learning_rate": 4.817358147831831e-05, + "loss": 0.539, + "step": 7197 + }, + { + "epoch": 1.4796998663788674, + "grad_norm": 0.1897859424352646, + "learning_rate": 4.816361816260155e-05, + "loss": 0.54, + "step": 7198 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 0.1890067458152771, + "learning_rate": 4.815365469102809e-05, + "loss": 0.5339, + "step": 7199 + }, + { + "epoch": 1.4801110083256244, + "grad_norm": 0.19852851331233978, + "learning_rate": 4.8143691064088823e-05, + "loss": 0.555, + "step": 7200 + }, + { + "epoch": 1.480316579299003, + "grad_norm": 0.1849977821111679, + "learning_rate": 4.813372728227459e-05, + "loss": 0.5255, + "step": 7201 + }, + { + "epoch": 1.4805221502723815, + "grad_norm": 0.1914818435907364, + "learning_rate": 4.8123763346076256e-05, + "loss": 0.5525, + "step": 7202 + }, + { + "epoch": 1.4807277212457601, + "grad_norm": 0.2014429122209549, + "learning_rate": 4.811379925598469e-05, + "loss": 0.5693, + "step": 7203 + }, + { + "epoch": 1.4809332922191387, + "grad_norm": 0.1984141618013382, + "learning_rate": 4.81038350124908e-05, + "loss": 0.5566, + "step": 7204 + }, + { + "epoch": 1.481138863192517, + "grad_norm": 0.19716762006282806, + "learning_rate": 4.809387061608548e-05, + "loss": 0.5513, + "step": 7205 + }, + { + "epoch": 1.4813444341658957, + "grad_norm": 0.19718822836875916, + "learning_rate": 4.8083906067259585e-05, + "loss": 0.5376, + "step": 7206 + }, + { + "epoch": 1.4815500051392743, + "grad_norm": 0.1910613626241684, + "learning_rate": 4.807394136650406e-05, + "loss": 0.5604, + "step": 7207 + }, + { + "epoch": 1.4817555761126529, + "grad_norm": 0.19918161630630493, + "learning_rate": 4.806397651430983e-05, + "loss": 0.549, + "step": 7208 + }, + { + "epoch": 1.4819611470860314, + "grad_norm": 0.18760617077350616, + "learning_rate": 4.805401151116778e-05, + "loss": 0.5507, + "step": 7209 + }, + { + "epoch": 1.48216671805941, + "grad_norm": 0.15669982135295868, + "learning_rate": 4.804404635756886e-05, + "loss": 0.5268, + "step": 7210 + }, + { + "epoch": 1.4823722890327886, + "grad_norm": 0.16258768737316132, + "learning_rate": 4.803408105400401e-05, + "loss": 0.5557, + "step": 7211 + }, + { + "epoch": 1.4825778600061672, + "grad_norm": 0.200164794921875, + "learning_rate": 4.802411560096418e-05, + "loss": 0.5652, + "step": 7212 + }, + { + "epoch": 1.4827834309795458, + "grad_norm": 0.1986524760723114, + "learning_rate": 4.801414999894028e-05, + "loss": 0.5608, + "step": 7213 + }, + { + "epoch": 1.4829890019529244, + "grad_norm": 0.15464936196804047, + "learning_rate": 4.8004184248423325e-05, + "loss": 0.519, + "step": 7214 + }, + { + "epoch": 1.4831945729263027, + "grad_norm": 0.15096427500247955, + "learning_rate": 4.799421834990424e-05, + "loss": 0.5417, + "step": 7215 + }, + { + "epoch": 1.4834001438996813, + "grad_norm": 0.15722674131393433, + "learning_rate": 4.798425230387402e-05, + "loss": 0.5158, + "step": 7216 + }, + { + "epoch": 1.48360571487306, + "grad_norm": 0.15923316776752472, + "learning_rate": 4.797428611082362e-05, + "loss": 0.5495, + "step": 7217 + }, + { + "epoch": 1.4838112858464385, + "grad_norm": 0.16226224601268768, + "learning_rate": 4.796431977124405e-05, + "loss": 0.5213, + "step": 7218 + }, + { + "epoch": 1.484016856819817, + "grad_norm": 0.16145376861095428, + "learning_rate": 4.7954353285626314e-05, + "loss": 0.568, + "step": 7219 + }, + { + "epoch": 1.4842224277931955, + "grad_norm": 0.15974651277065277, + "learning_rate": 4.7944386654461385e-05, + "loss": 0.512, + "step": 7220 + }, + { + "epoch": 1.484427998766574, + "grad_norm": 0.15350697934627533, + "learning_rate": 4.7934419878240296e-05, + "loss": 0.5473, + "step": 7221 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 0.19197656214237213, + "learning_rate": 4.792445295745406e-05, + "loss": 0.5461, + "step": 7222 + }, + { + "epoch": 1.4848391407133312, + "grad_norm": 0.19040462374687195, + "learning_rate": 4.7914485892593686e-05, + "loss": 0.5372, + "step": 7223 + }, + { + "epoch": 1.4850447116867098, + "grad_norm": 0.1572524458169937, + "learning_rate": 4.790451868415021e-05, + "loss": 0.5145, + "step": 7224 + }, + { + "epoch": 1.4852502826600884, + "grad_norm": 0.15703527629375458, + "learning_rate": 4.7894551332614686e-05, + "loss": 0.5627, + "step": 7225 + }, + { + "epoch": 1.485455853633467, + "grad_norm": 0.16500575840473175, + "learning_rate": 4.788458383847816e-05, + "loss": 0.5229, + "step": 7226 + }, + { + "epoch": 1.4856614246068456, + "grad_norm": 0.16244147717952728, + "learning_rate": 4.787461620223164e-05, + "loss": 0.5392, + "step": 7227 + }, + { + "epoch": 1.4858669955802242, + "grad_norm": 0.19701159000396729, + "learning_rate": 4.786464842436623e-05, + "loss": 0.5197, + "step": 7228 + }, + { + "epoch": 1.4860725665536028, + "grad_norm": 0.18858790397644043, + "learning_rate": 4.785468050537298e-05, + "loss": 0.5707, + "step": 7229 + }, + { + "epoch": 1.4862781375269811, + "grad_norm": 0.1888207048177719, + "learning_rate": 4.784471244574295e-05, + "loss": 0.5432, + "step": 7230 + }, + { + "epoch": 1.4864837085003597, + "grad_norm": 0.19446338713169098, + "learning_rate": 4.783474424596726e-05, + "loss": 0.5676, + "step": 7231 + }, + { + "epoch": 1.4866892794737383, + "grad_norm": 0.19412629306316376, + "learning_rate": 4.782477590653696e-05, + "loss": 0.5435, + "step": 7232 + }, + { + "epoch": 1.486894850447117, + "grad_norm": 0.18198393285274506, + "learning_rate": 4.781480742794316e-05, + "loss": 0.5172, + "step": 7233 + }, + { + "epoch": 1.4871004214204955, + "grad_norm": 0.2016136646270752, + "learning_rate": 4.7804838810676935e-05, + "loss": 0.5872, + "step": 7234 + }, + { + "epoch": 1.487305992393874, + "grad_norm": 0.17606668174266815, + "learning_rate": 4.779487005522943e-05, + "loss": 0.5324, + "step": 7235 + }, + { + "epoch": 1.4875115633672524, + "grad_norm": 0.16043418645858765, + "learning_rate": 4.778490116209174e-05, + "loss": 0.5447, + "step": 7236 + }, + { + "epoch": 1.487717134340631, + "grad_norm": 0.19674460589885712, + "learning_rate": 4.7774932131754975e-05, + "loss": 0.5595, + "step": 7237 + }, + { + "epoch": 1.4879227053140096, + "grad_norm": 0.2002599984407425, + "learning_rate": 4.776496296471029e-05, + "loss": 0.5289, + "step": 7238 + }, + { + "epoch": 1.4881282762873882, + "grad_norm": 0.18798843026161194, + "learning_rate": 4.775499366144878e-05, + "loss": 0.5465, + "step": 7239 + }, + { + "epoch": 1.4883338472607668, + "grad_norm": 0.18151499330997467, + "learning_rate": 4.7745024222461626e-05, + "loss": 0.5398, + "step": 7240 + }, + { + "epoch": 1.4885394182341454, + "grad_norm": 0.16490262746810913, + "learning_rate": 4.773505464823995e-05, + "loss": 0.5314, + "step": 7241 + }, + { + "epoch": 1.488744989207524, + "grad_norm": 0.16644752025604248, + "learning_rate": 4.772508493927492e-05, + "loss": 0.5573, + "step": 7242 + }, + { + "epoch": 1.4889505601809025, + "grad_norm": 0.1932040899991989, + "learning_rate": 4.77151150960577e-05, + "loss": 0.5464, + "step": 7243 + }, + { + "epoch": 1.4891561311542811, + "grad_norm": 0.19342085719108582, + "learning_rate": 4.770514511907943e-05, + "loss": 0.5528, + "step": 7244 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 0.1693827509880066, + "learning_rate": 4.7695175008831317e-05, + "loss": 0.5318, + "step": 7245 + }, + { + "epoch": 1.489567273101038, + "grad_norm": 0.15933051705360413, + "learning_rate": 4.768520476580454e-05, + "loss": 0.5436, + "step": 7246 + }, + { + "epoch": 1.4897728440744167, + "grad_norm": 0.19581708312034607, + "learning_rate": 4.767523439049026e-05, + "loss": 0.5502, + "step": 7247 + }, + { + "epoch": 1.4899784150477953, + "grad_norm": 0.1909896582365036, + "learning_rate": 4.7665263883379685e-05, + "loss": 0.5415, + "step": 7248 + }, + { + "epoch": 1.4901839860211739, + "grad_norm": 0.1643315702676773, + "learning_rate": 4.765529324496402e-05, + "loss": 0.5078, + "step": 7249 + }, + { + "epoch": 1.4903895569945524, + "grad_norm": 0.15782994031906128, + "learning_rate": 4.764532247573446e-05, + "loss": 0.5169, + "step": 7250 + }, + { + "epoch": 1.4905951279679308, + "grad_norm": 0.16611091792583466, + "learning_rate": 4.763535157618222e-05, + "loss": 0.5207, + "step": 7251 + }, + { + "epoch": 1.4908006989413094, + "grad_norm": 0.1263076364994049, + "learning_rate": 4.7625380546798546e-05, + "loss": 0.5362, + "step": 7252 + }, + { + "epoch": 1.491006269914688, + "grad_norm": 0.16741037368774414, + "learning_rate": 4.761540938807464e-05, + "loss": 0.5364, + "step": 7253 + }, + { + "epoch": 1.4912118408880666, + "grad_norm": 0.19533216953277588, + "learning_rate": 4.760543810050174e-05, + "loss": 0.5505, + "step": 7254 + }, + { + "epoch": 1.4914174118614452, + "grad_norm": 0.19828902184963226, + "learning_rate": 4.759546668457107e-05, + "loss": 0.5722, + "step": 7255 + }, + { + "epoch": 1.4916229828348238, + "grad_norm": 0.19037294387817383, + "learning_rate": 4.7585495140773894e-05, + "loss": 0.557, + "step": 7256 + }, + { + "epoch": 1.4918285538082023, + "grad_norm": 0.1699882447719574, + "learning_rate": 4.7575523469601464e-05, + "loss": 0.5252, + "step": 7257 + }, + { + "epoch": 1.492034124781581, + "grad_norm": 0.16292616724967957, + "learning_rate": 4.7565551671545003e-05, + "loss": 0.5557, + "step": 7258 + }, + { + "epoch": 1.4922396957549595, + "grad_norm": 0.1898314654827118, + "learning_rate": 4.755557974709584e-05, + "loss": 0.5341, + "step": 7259 + }, + { + "epoch": 1.4924452667283379, + "grad_norm": 0.18847902119159698, + "learning_rate": 4.7545607696745186e-05, + "loss": 0.557, + "step": 7260 + }, + { + "epoch": 1.4926508377017165, + "grad_norm": 0.16627360880374908, + "learning_rate": 4.753563552098433e-05, + "loss": 0.5283, + "step": 7261 + }, + { + "epoch": 1.492856408675095, + "grad_norm": 0.16107410192489624, + "learning_rate": 4.752566322030457e-05, + "loss": 0.5447, + "step": 7262 + }, + { + "epoch": 1.4930619796484736, + "grad_norm": 0.16727054119110107, + "learning_rate": 4.751569079519721e-05, + "loss": 0.5214, + "step": 7263 + }, + { + "epoch": 1.4932675506218522, + "grad_norm": 0.1620626598596573, + "learning_rate": 4.75057182461535e-05, + "loss": 0.5117, + "step": 7264 + }, + { + "epoch": 1.4934731215952308, + "grad_norm": 0.1607995629310608, + "learning_rate": 4.749574557366477e-05, + "loss": 0.5112, + "step": 7265 + }, + { + "epoch": 1.4936786925686092, + "grad_norm": 0.16359218955039978, + "learning_rate": 4.748577277822232e-05, + "loss": 0.5528, + "step": 7266 + }, + { + "epoch": 1.4938842635419878, + "grad_norm": 0.19799359142780304, + "learning_rate": 4.747579986031747e-05, + "loss": 0.5505, + "step": 7267 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 0.1984180063009262, + "learning_rate": 4.746582682044153e-05, + "loss": 0.5721, + "step": 7268 + }, + { + "epoch": 1.494295405488745, + "grad_norm": 0.196151003241539, + "learning_rate": 4.745585365908582e-05, + "loss": 0.5405, + "step": 7269 + }, + { + "epoch": 1.4945009764621235, + "grad_norm": 0.16846486926078796, + "learning_rate": 4.744588037674169e-05, + "loss": 0.5246, + "step": 7270 + }, + { + "epoch": 1.4947065474355021, + "grad_norm": 0.16317616403102875, + "learning_rate": 4.743590697390045e-05, + "loss": 0.5584, + "step": 7271 + }, + { + "epoch": 1.4949121184088807, + "grad_norm": 0.18906491994857788, + "learning_rate": 4.7425933451053474e-05, + "loss": 0.5638, + "step": 7272 + }, + { + "epoch": 1.4951176893822593, + "grad_norm": 0.16003085672855377, + "learning_rate": 4.7415959808692085e-05, + "loss": 0.5194, + "step": 7273 + }, + { + "epoch": 1.495323260355638, + "grad_norm": 0.12602053582668304, + "learning_rate": 4.740598604730766e-05, + "loss": 0.5273, + "step": 7274 + }, + { + "epoch": 1.4955288313290163, + "grad_norm": 0.15797413885593414, + "learning_rate": 4.7396012167391536e-05, + "loss": 0.537, + "step": 7275 + }, + { + "epoch": 1.4957344023023948, + "grad_norm": 0.19751828908920288, + "learning_rate": 4.73860381694351e-05, + "loss": 0.5497, + "step": 7276 + }, + { + "epoch": 1.4959399732757734, + "grad_norm": 0.1944907009601593, + "learning_rate": 4.7376064053929724e-05, + "loss": 0.5645, + "step": 7277 + }, + { + "epoch": 1.496145544249152, + "grad_norm": 0.18657876551151276, + "learning_rate": 4.736608982136676e-05, + "loss": 0.5405, + "step": 7278 + }, + { + "epoch": 1.4963511152225306, + "grad_norm": 0.19843631982803345, + "learning_rate": 4.735611547223761e-05, + "loss": 0.5417, + "step": 7279 + }, + { + "epoch": 1.4965566861959092, + "grad_norm": 0.19256174564361572, + "learning_rate": 4.7346141007033676e-05, + "loss": 0.541, + "step": 7280 + }, + { + "epoch": 1.4967622571692876, + "grad_norm": 0.18709734082221985, + "learning_rate": 4.733616642624634e-05, + "loss": 0.539, + "step": 7281 + }, + { + "epoch": 1.4969678281426662, + "grad_norm": 0.1940479278564453, + "learning_rate": 4.732619173036699e-05, + "loss": 0.5556, + "step": 7282 + }, + { + "epoch": 1.4971733991160447, + "grad_norm": 0.2690550684928894, + "learning_rate": 4.731621691988705e-05, + "loss": 0.5554, + "step": 7283 + }, + { + "epoch": 1.4973789700894233, + "grad_norm": 0.19769832491874695, + "learning_rate": 4.730624199529793e-05, + "loss": 0.5355, + "step": 7284 + }, + { + "epoch": 1.497584541062802, + "grad_norm": 0.23557159304618835, + "learning_rate": 4.729626695709105e-05, + "loss": 0.5675, + "step": 7285 + }, + { + "epoch": 1.4977901120361805, + "grad_norm": 0.19808165729045868, + "learning_rate": 4.728629180575783e-05, + "loss": 0.5494, + "step": 7286 + }, + { + "epoch": 1.497995683009559, + "grad_norm": 0.19333380460739136, + "learning_rate": 4.7276316541789694e-05, + "loss": 0.5534, + "step": 7287 + }, + { + "epoch": 1.4982012539829377, + "grad_norm": 0.185968816280365, + "learning_rate": 4.726634116567809e-05, + "loss": 0.5273, + "step": 7288 + }, + { + "epoch": 1.4984068249563163, + "grad_norm": 0.19186194241046906, + "learning_rate": 4.725636567791443e-05, + "loss": 0.5485, + "step": 7289 + }, + { + "epoch": 1.4986123959296949, + "grad_norm": 0.16538295149803162, + "learning_rate": 4.7246390078990195e-05, + "loss": 0.5292, + "step": 7290 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 0.16167549788951874, + "learning_rate": 4.723641436939683e-05, + "loss": 0.5493, + "step": 7291 + }, + { + "epoch": 1.4990235378764518, + "grad_norm": 0.19126403331756592, + "learning_rate": 4.722643854962577e-05, + "loss": 0.531, + "step": 7292 + }, + { + "epoch": 1.4992291088498304, + "grad_norm": 0.19075235724449158, + "learning_rate": 4.721646262016849e-05, + "loss": 0.5507, + "step": 7293 + }, + { + "epoch": 1.499434679823209, + "grad_norm": 0.19539231061935425, + "learning_rate": 4.720648658151645e-05, + "loss": 0.5525, + "step": 7294 + }, + { + "epoch": 1.4996402507965876, + "grad_norm": 0.19356007874011993, + "learning_rate": 4.719651043416114e-05, + "loss": 0.5398, + "step": 7295 + }, + { + "epoch": 1.499845821769966, + "grad_norm": 0.19021181762218475, + "learning_rate": 4.7186534178594016e-05, + "loss": 0.5507, + "step": 7296 + }, + { + "epoch": 1.5000513927433445, + "grad_norm": 0.1926860511302948, + "learning_rate": 4.717655781530658e-05, + "loss": 0.5716, + "step": 7297 + }, + { + "epoch": 1.5002569637167231, + "grad_norm": 0.16523759067058563, + "learning_rate": 4.716658134479031e-05, + "loss": 0.4999, + "step": 7298 + }, + { + "epoch": 1.5004625346901017, + "grad_norm": 0.15917497873306274, + "learning_rate": 4.7156604767536716e-05, + "loss": 0.5651, + "step": 7299 + }, + { + "epoch": 1.5006681056634803, + "grad_norm": 0.16436144709587097, + "learning_rate": 4.714662808403727e-05, + "loss": 0.5352, + "step": 7300 + }, + { + "epoch": 1.5008736766368589, + "grad_norm": 0.15653958916664124, + "learning_rate": 4.71366512947835e-05, + "loss": 0.5314, + "step": 7301 + }, + { + "epoch": 1.5010792476102375, + "grad_norm": 0.1997946798801422, + "learning_rate": 4.71266744002669e-05, + "loss": 0.5585, + "step": 7302 + }, + { + "epoch": 1.501284818583616, + "grad_norm": 0.1864425539970398, + "learning_rate": 4.7116697400979e-05, + "loss": 0.5312, + "step": 7303 + }, + { + "epoch": 1.5014903895569947, + "grad_norm": 0.19595369696617126, + "learning_rate": 4.710672029741131e-05, + "loss": 0.5518, + "step": 7304 + }, + { + "epoch": 1.5016959605303732, + "grad_norm": 0.16003580391407013, + "learning_rate": 4.7096743090055354e-05, + "loss": 0.5241, + "step": 7305 + }, + { + "epoch": 1.5019015315037518, + "grad_norm": 0.16186951100826263, + "learning_rate": 4.708676577940266e-05, + "loss": 0.5391, + "step": 7306 + }, + { + "epoch": 1.5021071024771302, + "grad_norm": 0.16420379281044006, + "learning_rate": 4.707678836594478e-05, + "loss": 0.4949, + "step": 7307 + }, + { + "epoch": 1.5023126734505088, + "grad_norm": 0.15808852016925812, + "learning_rate": 4.706681085017325e-05, + "loss": 0.5566, + "step": 7308 + }, + { + "epoch": 1.5025182444238874, + "grad_norm": 0.18840067088603973, + "learning_rate": 4.7056833232579604e-05, + "loss": 0.5491, + "step": 7309 + }, + { + "epoch": 1.502723815397266, + "grad_norm": 0.16313523054122925, + "learning_rate": 4.70468555136554e-05, + "loss": 0.5258, + "step": 7310 + }, + { + "epoch": 1.5029293863706443, + "grad_norm": 0.15801596641540527, + "learning_rate": 4.703687769389219e-05, + "loss": 0.5443, + "step": 7311 + }, + { + "epoch": 1.503134957344023, + "grad_norm": 0.19635756313800812, + "learning_rate": 4.702689977378154e-05, + "loss": 0.5529, + "step": 7312 + }, + { + "epoch": 1.5033405283174015, + "grad_norm": 0.1938237100839615, + "learning_rate": 4.7016921753815e-05, + "loss": 0.564, + "step": 7313 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 0.22758108377456665, + "learning_rate": 4.7006943634484154e-05, + "loss": 0.5604, + "step": 7314 + }, + { + "epoch": 1.5037516702641587, + "grad_norm": 0.2014021873474121, + "learning_rate": 4.699696541628058e-05, + "loss": 0.5574, + "step": 7315 + }, + { + "epoch": 1.5039572412375373, + "grad_norm": 0.1863914430141449, + "learning_rate": 4.698698709969585e-05, + "loss": 0.5268, + "step": 7316 + }, + { + "epoch": 1.5041628122109159, + "grad_norm": 0.19100484251976013, + "learning_rate": 4.6977008685221556e-05, + "loss": 0.5515, + "step": 7317 + }, + { + "epoch": 1.5043683831842944, + "grad_norm": 0.1965937614440918, + "learning_rate": 4.6967030173349285e-05, + "loss": 0.557, + "step": 7318 + }, + { + "epoch": 1.504573954157673, + "grad_norm": 0.16544751822948456, + "learning_rate": 4.695705156457064e-05, + "loss": 0.5139, + "step": 7319 + }, + { + "epoch": 1.5047795251310516, + "grad_norm": 0.1309744417667389, + "learning_rate": 4.69470728593772e-05, + "loss": 0.5451, + "step": 7320 + }, + { + "epoch": 1.5049850961044302, + "grad_norm": 0.16225290298461914, + "learning_rate": 4.6937094058260585e-05, + "loss": 0.5624, + "step": 7321 + }, + { + "epoch": 1.5051906670778086, + "grad_norm": 0.19539402425289154, + "learning_rate": 4.69271151617124e-05, + "loss": 0.5485, + "step": 7322 + }, + { + "epoch": 1.5053962380511872, + "grad_norm": 0.19238321483135223, + "learning_rate": 4.691713617022427e-05, + "loss": 0.537, + "step": 7323 + }, + { + "epoch": 1.5056018090245658, + "grad_norm": 0.19159597158432007, + "learning_rate": 4.6907157084287774e-05, + "loss": 0.5662, + "step": 7324 + }, + { + "epoch": 1.5058073799979443, + "grad_norm": 0.16289053857326508, + "learning_rate": 4.689717790439459e-05, + "loss": 0.5125, + "step": 7325 + }, + { + "epoch": 1.5060129509713227, + "grad_norm": 0.16851918399333954, + "learning_rate": 4.6887198631036295e-05, + "loss": 0.5272, + "step": 7326 + }, + { + "epoch": 1.5062185219447013, + "grad_norm": 0.1957252323627472, + "learning_rate": 4.687721926470455e-05, + "loss": 0.5669, + "step": 7327 + }, + { + "epoch": 1.5064240929180799, + "grad_norm": 0.1756441295146942, + "learning_rate": 4.686723980589099e-05, + "loss": 0.5055, + "step": 7328 + }, + { + "epoch": 1.5066296638914585, + "grad_norm": 0.16394411027431488, + "learning_rate": 4.685726025508726e-05, + "loss": 0.5624, + "step": 7329 + }, + { + "epoch": 1.506835234864837, + "grad_norm": 0.16520611941814423, + "learning_rate": 4.684728061278499e-05, + "loss": 0.5223, + "step": 7330 + }, + { + "epoch": 1.5070408058382156, + "grad_norm": 0.12648457288742065, + "learning_rate": 4.683730087947584e-05, + "loss": 0.5101, + "step": 7331 + }, + { + "epoch": 1.5072463768115942, + "grad_norm": 0.11723072826862335, + "learning_rate": 4.682732105565146e-05, + "loss": 0.5237, + "step": 7332 + }, + { + "epoch": 1.5074519477849728, + "grad_norm": 0.12541693449020386, + "learning_rate": 4.681734114180352e-05, + "loss": 0.5185, + "step": 7333 + }, + { + "epoch": 1.5076575187583514, + "grad_norm": 0.15850338339805603, + "learning_rate": 4.6807361138423664e-05, + "loss": 0.5335, + "step": 7334 + }, + { + "epoch": 1.50786308973173, + "grad_norm": 0.1691320687532425, + "learning_rate": 4.679738104600359e-05, + "loss": 0.5226, + "step": 7335 + }, + { + "epoch": 1.5080686607051086, + "grad_norm": 0.16223326325416565, + "learning_rate": 4.678740086503494e-05, + "loss": 0.5376, + "step": 7336 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 0.18564042448997498, + "learning_rate": 4.6777420596009406e-05, + "loss": 0.5129, + "step": 7337 + }, + { + "epoch": 1.5084798026518655, + "grad_norm": 0.1631714105606079, + "learning_rate": 4.676744023941866e-05, + "loss": 0.5274, + "step": 7338 + }, + { + "epoch": 1.5086853736252441, + "grad_norm": 0.15576131641864777, + "learning_rate": 4.67574597957544e-05, + "loss": 0.5431, + "step": 7339 + }, + { + "epoch": 1.5088909445986227, + "grad_norm": 0.19135643541812897, + "learning_rate": 4.6747479265508314e-05, + "loss": 0.5605, + "step": 7340 + }, + { + "epoch": 1.509096515572001, + "grad_norm": 0.2023853212594986, + "learning_rate": 4.673749864917209e-05, + "loss": 0.5562, + "step": 7341 + }, + { + "epoch": 1.5093020865453797, + "grad_norm": 0.1936071515083313, + "learning_rate": 4.672751794723743e-05, + "loss": 0.5556, + "step": 7342 + }, + { + "epoch": 1.5095076575187583, + "grad_norm": 0.18370911478996277, + "learning_rate": 4.671753716019604e-05, + "loss": 0.5524, + "step": 7343 + }, + { + "epoch": 1.5097132284921368, + "grad_norm": 0.15776073932647705, + "learning_rate": 4.6707556288539605e-05, + "loss": 0.4955, + "step": 7344 + }, + { + "epoch": 1.5099187994655154, + "grad_norm": 0.15899749100208282, + "learning_rate": 4.6697575332759865e-05, + "loss": 0.5609, + "step": 7345 + }, + { + "epoch": 1.510124370438894, + "grad_norm": 0.19149565696716309, + "learning_rate": 4.668759429334852e-05, + "loss": 0.5453, + "step": 7346 + }, + { + "epoch": 1.5103299414122726, + "grad_norm": 0.20891959965229034, + "learning_rate": 4.667761317079729e-05, + "loss": 0.5634, + "step": 7347 + }, + { + "epoch": 1.5105355123856512, + "grad_norm": 0.18865418434143066, + "learning_rate": 4.666763196559791e-05, + "loss": 0.5462, + "step": 7348 + }, + { + "epoch": 1.5107410833590298, + "grad_norm": 0.18833813071250916, + "learning_rate": 4.6657650678242085e-05, + "loss": 0.5334, + "step": 7349 + }, + { + "epoch": 1.5109466543324084, + "grad_norm": 0.18930873274803162, + "learning_rate": 4.664766930922157e-05, + "loss": 0.5332, + "step": 7350 + }, + { + "epoch": 1.511152225305787, + "grad_norm": 0.19529637694358826, + "learning_rate": 4.663768785902807e-05, + "loss": 0.5644, + "step": 7351 + }, + { + "epoch": 1.5113577962791653, + "grad_norm": 0.1973542720079422, + "learning_rate": 4.662770632815337e-05, + "loss": 0.5617, + "step": 7352 + }, + { + "epoch": 1.511563367252544, + "grad_norm": 0.18992508947849274, + "learning_rate": 4.6617724717089174e-05, + "loss": 0.5536, + "step": 7353 + }, + { + "epoch": 1.5117689382259225, + "grad_norm": 0.16945968568325043, + "learning_rate": 4.660774302632724e-05, + "loss": 0.5303, + "step": 7354 + }, + { + "epoch": 1.511974509199301, + "grad_norm": 0.15689992904663086, + "learning_rate": 4.659776125635932e-05, + "loss": 0.5519, + "step": 7355 + }, + { + "epoch": 1.5121800801726795, + "grad_norm": 0.8934375643730164, + "learning_rate": 4.6587779407677185e-05, + "loss": 0.579, + "step": 7356 + }, + { + "epoch": 1.512385651146058, + "grad_norm": 0.1862555593252182, + "learning_rate": 4.657779748077257e-05, + "loss": 0.5403, + "step": 7357 + }, + { + "epoch": 1.5125912221194366, + "grad_norm": 0.19881917536258698, + "learning_rate": 4.656781547613724e-05, + "loss": 0.5623, + "step": 7358 + }, + { + "epoch": 1.5127967930928152, + "grad_norm": 0.16885186731815338, + "learning_rate": 4.655783339426297e-05, + "loss": 0.5123, + "step": 7359 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 0.1638081818819046, + "learning_rate": 4.654785123564155e-05, + "loss": 0.5536, + "step": 7360 + }, + { + "epoch": 1.5132079350395724, + "grad_norm": 0.19882342219352722, + "learning_rate": 4.653786900076472e-05, + "loss": 0.5512, + "step": 7361 + }, + { + "epoch": 1.513413506012951, + "grad_norm": 0.20189371705055237, + "learning_rate": 4.652788669012427e-05, + "loss": 0.5612, + "step": 7362 + }, + { + "epoch": 1.5136190769863296, + "grad_norm": 0.1760426163673401, + "learning_rate": 4.651790430421199e-05, + "loss": 0.5255, + "step": 7363 + }, + { + "epoch": 1.5138246479597082, + "grad_norm": 0.16108988225460052, + "learning_rate": 4.6507921843519664e-05, + "loss": 0.5382, + "step": 7364 + }, + { + "epoch": 1.5140302189330868, + "grad_norm": 0.19698002934455872, + "learning_rate": 4.649793930853907e-05, + "loss": 0.5369, + "step": 7365 + }, + { + "epoch": 1.5142357899064653, + "grad_norm": 0.20208927989006042, + "learning_rate": 4.6487956699762004e-05, + "loss": 0.5455, + "step": 7366 + }, + { + "epoch": 1.5144413608798437, + "grad_norm": 0.1949499249458313, + "learning_rate": 4.6477974017680275e-05, + "loss": 0.5547, + "step": 7367 + }, + { + "epoch": 1.5146469318532223, + "grad_norm": 0.19195735454559326, + "learning_rate": 4.646799126278567e-05, + "loss": 0.5309, + "step": 7368 + }, + { + "epoch": 1.5148525028266009, + "grad_norm": 0.16471721231937408, + "learning_rate": 4.645800843556999e-05, + "loss": 0.5248, + "step": 7369 + }, + { + "epoch": 1.5150580737999795, + "grad_norm": 0.16040369868278503, + "learning_rate": 4.644802553652505e-05, + "loss": 0.5192, + "step": 7370 + }, + { + "epoch": 1.5152636447733578, + "grad_norm": 0.16164757311344147, + "learning_rate": 4.643804256614267e-05, + "loss": 0.5253, + "step": 7371 + }, + { + "epoch": 1.5154692157467364, + "grad_norm": 0.15787971019744873, + "learning_rate": 4.6428059524914643e-05, + "loss": 0.5589, + "step": 7372 + }, + { + "epoch": 1.515674786720115, + "grad_norm": 0.19109466671943665, + "learning_rate": 4.641807641333281e-05, + "loss": 0.5557, + "step": 7373 + }, + { + "epoch": 1.5158803576934936, + "grad_norm": 0.19500547647476196, + "learning_rate": 4.640809323188897e-05, + "loss": 0.5407, + "step": 7374 + }, + { + "epoch": 1.5160859286668722, + "grad_norm": 0.1970156580209732, + "learning_rate": 4.639810998107497e-05, + "loss": 0.5453, + "step": 7375 + }, + { + "epoch": 1.5162914996402508, + "grad_norm": 0.20001158118247986, + "learning_rate": 4.638812666138261e-05, + "loss": 0.5552, + "step": 7376 + }, + { + "epoch": 1.5164970706136294, + "grad_norm": 0.16510051488876343, + "learning_rate": 4.637814327330376e-05, + "loss": 0.5262, + "step": 7377 + }, + { + "epoch": 1.516702641587008, + "grad_norm": 0.161884605884552, + "learning_rate": 4.636815981733022e-05, + "loss": 0.5346, + "step": 7378 + }, + { + "epoch": 1.5169082125603865, + "grad_norm": 0.17277652025222778, + "learning_rate": 4.635817629395383e-05, + "loss": 0.5142, + "step": 7379 + }, + { + "epoch": 1.5171137835337651, + "grad_norm": 0.15767474472522736, + "learning_rate": 4.6348192703666444e-05, + "loss": 0.529, + "step": 7380 + }, + { + "epoch": 1.5173193545071437, + "grad_norm": 0.19689583778381348, + "learning_rate": 4.633820904695992e-05, + "loss": 0.5467, + "step": 7381 + }, + { + "epoch": 1.5175249254805223, + "grad_norm": 0.19332459568977356, + "learning_rate": 4.6328225324326066e-05, + "loss": 0.5505, + "step": 7382 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 0.19339875876903534, + "learning_rate": 4.631824153625679e-05, + "loss": 0.5504, + "step": 7383 + }, + { + "epoch": 1.5179360674272793, + "grad_norm": 0.19665616750717163, + "learning_rate": 4.63082576832439e-05, + "loss": 0.5474, + "step": 7384 + }, + { + "epoch": 1.5181416384006579, + "grad_norm": 0.19962632656097412, + "learning_rate": 4.629827376577927e-05, + "loss": 0.5514, + "step": 7385 + }, + { + "epoch": 1.5183472093740362, + "grad_norm": 0.19536101818084717, + "learning_rate": 4.628828978435475e-05, + "loss": 0.55, + "step": 7386 + }, + { + "epoch": 1.5185527803474148, + "grad_norm": 0.19217143952846527, + "learning_rate": 4.627830573946223e-05, + "loss": 0.5404, + "step": 7387 + }, + { + "epoch": 1.5187583513207934, + "grad_norm": 0.20492962002754211, + "learning_rate": 4.6268321631593556e-05, + "loss": 0.5701, + "step": 7388 + }, + { + "epoch": 1.518963922294172, + "grad_norm": 0.16076092422008514, + "learning_rate": 4.6258337461240595e-05, + "loss": 0.5199, + "step": 7389 + }, + { + "epoch": 1.5191694932675506, + "grad_norm": 0.16766008734703064, + "learning_rate": 4.624835322889524e-05, + "loss": 0.544, + "step": 7390 + }, + { + "epoch": 1.5193750642409292, + "grad_norm": 0.19400693476200104, + "learning_rate": 4.623836893504934e-05, + "loss": 0.526, + "step": 7391 + }, + { + "epoch": 1.5195806352143078, + "grad_norm": 0.19015835225582123, + "learning_rate": 4.62283845801948e-05, + "loss": 0.5383, + "step": 7392 + }, + { + "epoch": 1.5197862061876863, + "grad_norm": 0.19058318436145782, + "learning_rate": 4.6218400164823495e-05, + "loss": 0.5406, + "step": 7393 + }, + { + "epoch": 1.519991777161065, + "grad_norm": 0.1955268830060959, + "learning_rate": 4.620841568942731e-05, + "loss": 0.5357, + "step": 7394 + }, + { + "epoch": 1.5201973481344435, + "grad_norm": 0.16312715411186218, + "learning_rate": 4.619843115449814e-05, + "loss": 0.5241, + "step": 7395 + }, + { + "epoch": 1.520402919107822, + "grad_norm": 0.16432897746562958, + "learning_rate": 4.6188446560527846e-05, + "loss": 0.5364, + "step": 7396 + }, + { + "epoch": 1.5206084900812007, + "grad_norm": 0.1991865038871765, + "learning_rate": 4.617846190800837e-05, + "loss": 0.5332, + "step": 7397 + }, + { + "epoch": 1.520814061054579, + "grad_norm": 0.19771799445152283, + "learning_rate": 4.616847719743157e-05, + "loss": 0.5473, + "step": 7398 + }, + { + "epoch": 1.5210196320279576, + "grad_norm": 0.21633638441562653, + "learning_rate": 4.615849242928936e-05, + "loss": 0.582, + "step": 7399 + }, + { + "epoch": 1.5212252030013362, + "grad_norm": 0.19637715816497803, + "learning_rate": 4.614850760407364e-05, + "loss": 0.5619, + "step": 7400 + }, + { + "epoch": 1.5214307739747148, + "grad_norm": 0.1928258240222931, + "learning_rate": 4.613852272227633e-05, + "loss": 0.5578, + "step": 7401 + }, + { + "epoch": 1.5216363449480932, + "grad_norm": 0.19066447019577026, + "learning_rate": 4.612853778438931e-05, + "loss": 0.5507, + "step": 7402 + }, + { + "epoch": 1.5218419159214718, + "grad_norm": 0.19168606400489807, + "learning_rate": 4.611855279090452e-05, + "loss": 0.5625, + "step": 7403 + }, + { + "epoch": 1.5220474868948504, + "grad_norm": 0.18386611342430115, + "learning_rate": 4.610856774231386e-05, + "loss": 0.5484, + "step": 7404 + }, + { + "epoch": 1.522253057868229, + "grad_norm": 0.1938936412334442, + "learning_rate": 4.609858263910925e-05, + "loss": 0.5629, + "step": 7405 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 0.1900719851255417, + "learning_rate": 4.6088597481782606e-05, + "loss": 0.5491, + "step": 7406 + }, + { + "epoch": 1.5226641998149861, + "grad_norm": 0.18934617936611176, + "learning_rate": 4.607861227082585e-05, + "loss": 0.5377, + "step": 7407 + }, + { + "epoch": 1.5228697707883647, + "grad_norm": 0.20040073990821838, + "learning_rate": 4.606862700673091e-05, + "loss": 0.5384, + "step": 7408 + }, + { + "epoch": 1.5230753417617433, + "grad_norm": 0.19345182180404663, + "learning_rate": 4.6058641689989724e-05, + "loss": 0.5519, + "step": 7409 + }, + { + "epoch": 1.523280912735122, + "grad_norm": 0.19998955726623535, + "learning_rate": 4.6048656321094196e-05, + "loss": 0.5499, + "step": 7410 + }, + { + "epoch": 1.5234864837085005, + "grad_norm": 0.2003701776266098, + "learning_rate": 4.603867090053627e-05, + "loss": 0.5471, + "step": 7411 + }, + { + "epoch": 1.523692054681879, + "grad_norm": 0.1997435837984085, + "learning_rate": 4.6028685428807896e-05, + "loss": 0.5349, + "step": 7412 + }, + { + "epoch": 1.5238976256552574, + "grad_norm": 0.19210022687911987, + "learning_rate": 4.6018699906400996e-05, + "loss": 0.5452, + "step": 7413 + }, + { + "epoch": 1.524103196628636, + "grad_norm": 0.19292627274990082, + "learning_rate": 4.6008714333807496e-05, + "loss": 0.5605, + "step": 7414 + }, + { + "epoch": 1.5243087676020146, + "grad_norm": 0.18850092589855194, + "learning_rate": 4.599872871151937e-05, + "loss": 0.5521, + "step": 7415 + }, + { + "epoch": 1.5245143385753932, + "grad_norm": 0.19602644443511963, + "learning_rate": 4.5988743040028554e-05, + "loss": 0.55, + "step": 7416 + }, + { + "epoch": 1.5247199095487716, + "grad_norm": 0.19302399456501007, + "learning_rate": 4.597875731982697e-05, + "loss": 0.5361, + "step": 7417 + }, + { + "epoch": 1.5249254805221502, + "grad_norm": 0.16675427556037903, + "learning_rate": 4.596877155140661e-05, + "loss": 0.5136, + "step": 7418 + }, + { + "epoch": 1.5251310514955287, + "grad_norm": 0.15877321362495422, + "learning_rate": 4.59587857352594e-05, + "loss": 0.5591, + "step": 7419 + }, + { + "epoch": 1.5253366224689073, + "grad_norm": 0.16738201677799225, + "learning_rate": 4.594879987187729e-05, + "loss": 0.5191, + "step": 7420 + }, + { + "epoch": 1.525542193442286, + "grad_norm": 0.16919690370559692, + "learning_rate": 4.5938813961752254e-05, + "loss": 0.5439, + "step": 7421 + }, + { + "epoch": 1.5257477644156645, + "grad_norm": 0.15980926156044006, + "learning_rate": 4.592882800537624e-05, + "loss": 0.5099, + "step": 7422 + }, + { + "epoch": 1.525953335389043, + "grad_norm": 0.1241704598069191, + "learning_rate": 4.5918842003241195e-05, + "loss": 0.5069, + "step": 7423 + }, + { + "epoch": 1.5261589063624217, + "grad_norm": 0.1193804070353508, + "learning_rate": 4.59088559558391e-05, + "loss": 0.5091, + "step": 7424 + }, + { + "epoch": 1.5263644773358003, + "grad_norm": 0.12635476887226105, + "learning_rate": 4.589886986366194e-05, + "loss": 0.5111, + "step": 7425 + }, + { + "epoch": 1.5265700483091789, + "grad_norm": 0.11729497462511063, + "learning_rate": 4.5888883727201665e-05, + "loss": 0.5215, + "step": 7426 + }, + { + "epoch": 1.5267756192825575, + "grad_norm": 0.16425076127052307, + "learning_rate": 4.5878897546950225e-05, + "loss": 0.5357, + "step": 7427 + }, + { + "epoch": 1.5269811902559358, + "grad_norm": 0.20362845063209534, + "learning_rate": 4.586891132339962e-05, + "loss": 0.5392, + "step": 7428 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 0.1934981644153595, + "learning_rate": 4.585892505704182e-05, + "loss": 0.5484, + "step": 7429 + }, + { + "epoch": 1.527392332202693, + "grad_norm": 0.19643427431583405, + "learning_rate": 4.584893874836879e-05, + "loss": 0.5564, + "step": 7430 + }, + { + "epoch": 1.5275979031760716, + "grad_norm": 0.1882271021604538, + "learning_rate": 4.583895239787251e-05, + "loss": 0.5667, + "step": 7431 + }, + { + "epoch": 1.52780347414945, + "grad_norm": 0.15838836133480072, + "learning_rate": 4.5828966006044974e-05, + "loss": 0.5059, + "step": 7432 + }, + { + "epoch": 1.5280090451228285, + "grad_norm": 0.16002227365970612, + "learning_rate": 4.581897957337817e-05, + "loss": 0.5405, + "step": 7433 + }, + { + "epoch": 1.5282146160962071, + "grad_norm": 0.19433261454105377, + "learning_rate": 4.5808993100364055e-05, + "loss": 0.5678, + "step": 7434 + }, + { + "epoch": 1.5284201870695857, + "grad_norm": 0.16582860052585602, + "learning_rate": 4.579900658749462e-05, + "loss": 0.5538, + "step": 7435 + }, + { + "epoch": 1.5286257580429643, + "grad_norm": 0.1574729084968567, + "learning_rate": 4.5789020035261886e-05, + "loss": 0.5472, + "step": 7436 + }, + { + "epoch": 1.5288313290163429, + "grad_norm": 0.20113399624824524, + "learning_rate": 4.577903344415781e-05, + "loss": 0.568, + "step": 7437 + }, + { + "epoch": 1.5290368999897215, + "grad_norm": 0.19250795245170593, + "learning_rate": 4.57690468146744e-05, + "loss": 0.548, + "step": 7438 + }, + { + "epoch": 1.5292424709631, + "grad_norm": 0.1601334810256958, + "learning_rate": 4.5759060147303655e-05, + "loss": 0.4955, + "step": 7439 + }, + { + "epoch": 1.5294480419364787, + "grad_norm": 0.16352780163288116, + "learning_rate": 4.5749073442537566e-05, + "loss": 0.5445, + "step": 7440 + }, + { + "epoch": 1.5296536129098572, + "grad_norm": 0.1970401108264923, + "learning_rate": 4.573908670086812e-05, + "loss": 0.5818, + "step": 7441 + }, + { + "epoch": 1.5298591838832358, + "grad_norm": 0.19766905903816223, + "learning_rate": 4.572909992278734e-05, + "loss": 0.5515, + "step": 7442 + }, + { + "epoch": 1.5300647548566142, + "grad_norm": 0.19481036067008972, + "learning_rate": 4.57191131087872e-05, + "loss": 0.5512, + "step": 7443 + }, + { + "epoch": 1.5302703258299928, + "grad_norm": 0.20617318153381348, + "learning_rate": 4.570912625935972e-05, + "loss": 0.5534, + "step": 7444 + }, + { + "epoch": 1.5304758968033714, + "grad_norm": 0.20254306495189667, + "learning_rate": 4.5699139374996906e-05, + "loss": 0.5534, + "step": 7445 + }, + { + "epoch": 1.53068146777675, + "grad_norm": 0.1929122805595398, + "learning_rate": 4.568915245619076e-05, + "loss": 0.5436, + "step": 7446 + }, + { + "epoch": 1.5308870387501283, + "grad_norm": 0.19024674594402313, + "learning_rate": 4.5679165503433306e-05, + "loss": 0.5508, + "step": 7447 + }, + { + "epoch": 1.531092609723507, + "grad_norm": 0.19227847456932068, + "learning_rate": 4.5669178517216525e-05, + "loss": 0.5456, + "step": 7448 + }, + { + "epoch": 1.5312981806968855, + "grad_norm": 0.1958528608083725, + "learning_rate": 4.5659191498032456e-05, + "loss": 0.5482, + "step": 7449 + }, + { + "epoch": 1.531503751670264, + "grad_norm": 0.19175393879413605, + "learning_rate": 4.564920444637311e-05, + "loss": 0.5557, + "step": 7450 + }, + { + "epoch": 1.5317093226436427, + "grad_norm": 0.19114267826080322, + "learning_rate": 4.5639217362730484e-05, + "loss": 0.5439, + "step": 7451 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.16341425478458405, + "learning_rate": 4.56292302475966e-05, + "loss": 0.507, + "step": 7452 + }, + { + "epoch": 1.5321204645903999, + "grad_norm": 0.15693975985050201, + "learning_rate": 4.56192431014635e-05, + "loss": 0.5558, + "step": 7453 + }, + { + "epoch": 1.5323260355637784, + "grad_norm": 0.21227800846099854, + "learning_rate": 4.560925592482319e-05, + "loss": 0.5398, + "step": 7454 + }, + { + "epoch": 1.532531606537157, + "grad_norm": 0.19406823813915253, + "learning_rate": 4.559926871816767e-05, + "loss": 0.5334, + "step": 7455 + }, + { + "epoch": 1.5327371775105356, + "grad_norm": 0.19032882153987885, + "learning_rate": 4.558928148198898e-05, + "loss": 0.5247, + "step": 7456 + }, + { + "epoch": 1.5329427484839142, + "grad_norm": 0.19708728790283203, + "learning_rate": 4.557929421677916e-05, + "loss": 0.5549, + "step": 7457 + }, + { + "epoch": 1.5331483194572928, + "grad_norm": 0.1929347962141037, + "learning_rate": 4.556930692303021e-05, + "loss": 0.5586, + "step": 7458 + }, + { + "epoch": 1.5333538904306712, + "grad_norm": 0.19860495626926422, + "learning_rate": 4.555931960123418e-05, + "loss": 0.5539, + "step": 7459 + }, + { + "epoch": 1.5335594614040498, + "grad_norm": 0.1928236037492752, + "learning_rate": 4.554933225188308e-05, + "loss": 0.5639, + "step": 7460 + }, + { + "epoch": 1.5337650323774283, + "grad_norm": 0.19600355625152588, + "learning_rate": 4.553934487546895e-05, + "loss": 0.5587, + "step": 7461 + }, + { + "epoch": 1.5339706033508067, + "grad_norm": 0.1872026026248932, + "learning_rate": 4.5529357472483815e-05, + "loss": 0.5292, + "step": 7462 + }, + { + "epoch": 1.5341761743241853, + "grad_norm": 0.19457010924816132, + "learning_rate": 4.551937004341971e-05, + "loss": 0.5526, + "step": 7463 + }, + { + "epoch": 1.5343817452975639, + "grad_norm": 0.19338703155517578, + "learning_rate": 4.5509382588768684e-05, + "loss": 0.5475, + "step": 7464 + }, + { + "epoch": 1.5345873162709425, + "grad_norm": 0.16978971660137177, + "learning_rate": 4.549939510902274e-05, + "loss": 0.5315, + "step": 7465 + }, + { + "epoch": 1.534792887244321, + "grad_norm": 0.16673077642917633, + "learning_rate": 4.548940760467395e-05, + "loss": 0.5475, + "step": 7466 + }, + { + "epoch": 1.5349984582176996, + "grad_norm": 0.195562481880188, + "learning_rate": 4.5479420076214315e-05, + "loss": 0.5599, + "step": 7467 + }, + { + "epoch": 1.5352040291910782, + "grad_norm": 0.1955966353416443, + "learning_rate": 4.5469432524135913e-05, + "loss": 0.5538, + "step": 7468 + }, + { + "epoch": 1.5354096001644568, + "grad_norm": 0.20345093309879303, + "learning_rate": 4.5459444948930754e-05, + "loss": 0.5529, + "step": 7469 + }, + { + "epoch": 1.5356151711378354, + "grad_norm": 0.16392046213150024, + "learning_rate": 4.5449457351090896e-05, + "loss": 0.53, + "step": 7470 + }, + { + "epoch": 1.535820742111214, + "grad_norm": 0.1566355973482132, + "learning_rate": 4.5439469731108383e-05, + "loss": 0.5523, + "step": 7471 + }, + { + "epoch": 1.5360263130845926, + "grad_norm": 0.1888071596622467, + "learning_rate": 4.542948208947523e-05, + "loss": 0.5527, + "step": 7472 + }, + { + "epoch": 1.5362318840579712, + "grad_norm": 0.19896787405014038, + "learning_rate": 4.5419494426683514e-05, + "loss": 0.5568, + "step": 7473 + }, + { + "epoch": 1.5364374550313495, + "grad_norm": 0.1599314957857132, + "learning_rate": 4.5409506743225274e-05, + "loss": 0.5418, + "step": 7474 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 0.15871824324131012, + "learning_rate": 4.5399519039592546e-05, + "loss": 0.5393, + "step": 7475 + }, + { + "epoch": 1.5368485969781067, + "grad_norm": 0.18515051901340485, + "learning_rate": 4.538953131627737e-05, + "loss": 0.5383, + "step": 7476 + }, + { + "epoch": 1.537054167951485, + "grad_norm": 0.1832568496465683, + "learning_rate": 4.5379543573771823e-05, + "loss": 0.5393, + "step": 7477 + }, + { + "epoch": 1.5372597389248637, + "grad_norm": 0.188548281788826, + "learning_rate": 4.5369555812567926e-05, + "loss": 0.5413, + "step": 7478 + }, + { + "epoch": 1.5374653098982423, + "grad_norm": 0.16678757965564728, + "learning_rate": 4.535956803315774e-05, + "loss": 0.5216, + "step": 7479 + }, + { + "epoch": 1.5376708808716208, + "grad_norm": 0.12842969596385956, + "learning_rate": 4.534958023603333e-05, + "loss": 0.5017, + "step": 7480 + }, + { + "epoch": 1.5378764518449994, + "grad_norm": 0.16010682284832, + "learning_rate": 4.5339592421686734e-05, + "loss": 0.5213, + "step": 7481 + }, + { + "epoch": 1.538082022818378, + "grad_norm": 0.20323491096496582, + "learning_rate": 4.5329604590610004e-05, + "loss": 0.5543, + "step": 7482 + }, + { + "epoch": 1.5382875937917566, + "grad_norm": 0.19236190617084503, + "learning_rate": 4.531961674329519e-05, + "loss": 0.5641, + "step": 7483 + }, + { + "epoch": 1.5384931647651352, + "grad_norm": 0.19376271963119507, + "learning_rate": 4.5309628880234356e-05, + "loss": 0.542, + "step": 7484 + }, + { + "epoch": 1.5386987357385138, + "grad_norm": 0.18914787471294403, + "learning_rate": 4.529964100191957e-05, + "loss": 0.5481, + "step": 7485 + }, + { + "epoch": 1.5389043067118924, + "grad_norm": 0.19532737135887146, + "learning_rate": 4.5289653108842845e-05, + "loss": 0.5634, + "step": 7486 + }, + { + "epoch": 1.539109877685271, + "grad_norm": 0.1869991570711136, + "learning_rate": 4.527966520149629e-05, + "loss": 0.5536, + "step": 7487 + }, + { + "epoch": 1.5393154486586496, + "grad_norm": 0.18661408126354218, + "learning_rate": 4.526967728037191e-05, + "loss": 0.5466, + "step": 7488 + }, + { + "epoch": 1.539521019632028, + "grad_norm": 0.2640432119369507, + "learning_rate": 4.525968934596181e-05, + "loss": 0.5553, + "step": 7489 + }, + { + "epoch": 1.5397265906054065, + "grad_norm": 0.20137301087379456, + "learning_rate": 4.524970139875803e-05, + "loss": 0.5563, + "step": 7490 + }, + { + "epoch": 1.539932161578785, + "grad_norm": 0.17082248628139496, + "learning_rate": 4.523971343925263e-05, + "loss": 0.5198, + "step": 7491 + }, + { + "epoch": 1.5401377325521637, + "grad_norm": 0.13131971657276154, + "learning_rate": 4.5229725467937666e-05, + "loss": 0.5375, + "step": 7492 + }, + { + "epoch": 1.540343303525542, + "grad_norm": 0.16236910223960876, + "learning_rate": 4.5219737485305194e-05, + "loss": 0.5435, + "step": 7493 + }, + { + "epoch": 1.5405488744989206, + "grad_norm": 0.19899526238441467, + "learning_rate": 4.5209749491847295e-05, + "loss": 0.5685, + "step": 7494 + }, + { + "epoch": 1.5407544454722992, + "grad_norm": 0.19995881617069244, + "learning_rate": 4.519976148805602e-05, + "loss": 0.5646, + "step": 7495 + }, + { + "epoch": 1.5409600164456778, + "grad_norm": 0.20216208696365356, + "learning_rate": 4.518977347442341e-05, + "loss": 0.5596, + "step": 7496 + }, + { + "epoch": 1.5411655874190564, + "grad_norm": 0.17260567843914032, + "learning_rate": 4.5179785451441574e-05, + "loss": 0.5084, + "step": 7497 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 0.15725255012512207, + "learning_rate": 4.516979741960254e-05, + "loss": 0.5399, + "step": 7498 + }, + { + "epoch": 1.5415767293658136, + "grad_norm": 0.1909477263689041, + "learning_rate": 4.515980937939837e-05, + "loss": 0.5416, + "step": 7499 + }, + { + "epoch": 1.5417823003391922, + "grad_norm": 0.1896287351846695, + "learning_rate": 4.514982133132114e-05, + "loss": 0.5395, + "step": 7500 + }, + { + "epoch": 1.5419878713125708, + "grad_norm": 0.188772514462471, + "learning_rate": 4.5139833275862925e-05, + "loss": 0.5456, + "step": 7501 + }, + { + "epoch": 1.5421934422859493, + "grad_norm": 0.18162913620471954, + "learning_rate": 4.5129845213515775e-05, + "loss": 0.543, + "step": 7502 + }, + { + "epoch": 1.542399013259328, + "grad_norm": 0.19076716899871826, + "learning_rate": 4.511985714477175e-05, + "loss": 0.5502, + "step": 7503 + }, + { + "epoch": 1.5426045842327063, + "grad_norm": 0.20053020119667053, + "learning_rate": 4.5109869070122946e-05, + "loss": 0.5675, + "step": 7504 + }, + { + "epoch": 1.5428101552060849, + "grad_norm": 0.19717735052108765, + "learning_rate": 4.509988099006138e-05, + "loss": 0.5525, + "step": 7505 + }, + { + "epoch": 1.5430157261794635, + "grad_norm": 0.1972462683916092, + "learning_rate": 4.5089892905079175e-05, + "loss": 0.561, + "step": 7506 + }, + { + "epoch": 1.543221297152842, + "grad_norm": 0.1987045705318451, + "learning_rate": 4.507990481566833e-05, + "loss": 0.5333, + "step": 7507 + }, + { + "epoch": 1.5434268681262204, + "grad_norm": 0.18806061148643494, + "learning_rate": 4.506991672232097e-05, + "loss": 0.5213, + "step": 7508 + }, + { + "epoch": 1.543632439099599, + "grad_norm": 0.19716767966747284, + "learning_rate": 4.505992862552913e-05, + "loss": 0.5605, + "step": 7509 + }, + { + "epoch": 1.5438380100729776, + "grad_norm": 0.18911804258823395, + "learning_rate": 4.50499405257849e-05, + "loss": 0.559, + "step": 7510 + }, + { + "epoch": 1.5440435810463562, + "grad_norm": 0.18609070777893066, + "learning_rate": 4.5039952423580324e-05, + "loss": 0.5176, + "step": 7511 + }, + { + "epoch": 1.5442491520197348, + "grad_norm": 0.19210830330848694, + "learning_rate": 4.502996431940748e-05, + "loss": 0.5397, + "step": 7512 + }, + { + "epoch": 1.5444547229931134, + "grad_norm": 0.1905742585659027, + "learning_rate": 4.5019976213758434e-05, + "loss": 0.5585, + "step": 7513 + }, + { + "epoch": 1.544660293966492, + "grad_norm": 0.16525664925575256, + "learning_rate": 4.500998810712525e-05, + "loss": 0.5138, + "step": 7514 + }, + { + "epoch": 1.5448658649398705, + "grad_norm": 0.16021090745925903, + "learning_rate": 4.5e-05, + "loss": 0.5536, + "step": 7515 + }, + { + "epoch": 1.5450714359132491, + "grad_norm": 0.1621478945016861, + "learning_rate": 4.499001189287476e-05, + "loss": 0.5065, + "step": 7516 + }, + { + "epoch": 1.5452770068866277, + "grad_norm": 0.19542866945266724, + "learning_rate": 4.4980023786241585e-05, + "loss": 0.5389, + "step": 7517 + }, + { + "epoch": 1.5454825778600063, + "grad_norm": 0.18569281697273254, + "learning_rate": 4.497003568059254e-05, + "loss": 0.5289, + "step": 7518 + }, + { + "epoch": 1.5456881488333847, + "grad_norm": 0.19323447346687317, + "learning_rate": 4.496004757641968e-05, + "loss": 0.5605, + "step": 7519 + }, + { + "epoch": 1.5458937198067633, + "grad_norm": 0.18728816509246826, + "learning_rate": 4.495005947421511e-05, + "loss": 0.5522, + "step": 7520 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 0.19524379074573517, + "learning_rate": 4.4940071374470875e-05, + "loss": 0.5501, + "step": 7521 + }, + { + "epoch": 1.5463048617535204, + "grad_norm": 0.19686923921108246, + "learning_rate": 4.4930083277679036e-05, + "loss": 0.5574, + "step": 7522 + }, + { + "epoch": 1.5465104327268988, + "grad_norm": 0.19316346943378448, + "learning_rate": 4.492009518433167e-05, + "loss": 0.5493, + "step": 7523 + }, + { + "epoch": 1.5467160037002774, + "grad_norm": 0.19701054692268372, + "learning_rate": 4.491010709492085e-05, + "loss": 0.5269, + "step": 7524 + }, + { + "epoch": 1.546921574673656, + "grad_norm": 0.1707211434841156, + "learning_rate": 4.490011900993863e-05, + "loss": 0.5326, + "step": 7525 + }, + { + "epoch": 1.5471271456470346, + "grad_norm": 0.16687439382076263, + "learning_rate": 4.489013092987706e-05, + "loss": 0.5514, + "step": 7526 + }, + { + "epoch": 1.5473327166204132, + "grad_norm": 0.1970919817686081, + "learning_rate": 4.488014285522825e-05, + "loss": 0.5512, + "step": 7527 + }, + { + "epoch": 1.5475382875937918, + "grad_norm": 0.20226997137069702, + "learning_rate": 4.487015478648423e-05, + "loss": 0.5549, + "step": 7528 + }, + { + "epoch": 1.5477438585671703, + "grad_norm": 0.1875869780778885, + "learning_rate": 4.486016672413708e-05, + "loss": 0.5532, + "step": 7529 + }, + { + "epoch": 1.547949429540549, + "grad_norm": 0.19215047359466553, + "learning_rate": 4.4850178668678864e-05, + "loss": 0.5533, + "step": 7530 + }, + { + "epoch": 1.5481550005139275, + "grad_norm": 0.18497878313064575, + "learning_rate": 4.484019062060164e-05, + "loss": 0.5389, + "step": 7531 + }, + { + "epoch": 1.548360571487306, + "grad_norm": 0.18966837227344513, + "learning_rate": 4.483020258039748e-05, + "loss": 0.5352, + "step": 7532 + }, + { + "epoch": 1.5485661424606847, + "grad_norm": 0.19131658971309662, + "learning_rate": 4.482021454855844e-05, + "loss": 0.5429, + "step": 7533 + }, + { + "epoch": 1.548771713434063, + "grad_norm": 0.18846401572227478, + "learning_rate": 4.481022652557658e-05, + "loss": 0.5442, + "step": 7534 + }, + { + "epoch": 1.5489772844074416, + "grad_norm": 0.16239413619041443, + "learning_rate": 4.480023851194399e-05, + "loss": 0.5047, + "step": 7535 + }, + { + "epoch": 1.5491828553808202, + "grad_norm": 0.13217657804489136, + "learning_rate": 4.479025050815272e-05, + "loss": 0.4997, + "step": 7536 + }, + { + "epoch": 1.5493884263541988, + "grad_norm": 0.12488622963428497, + "learning_rate": 4.478026251469482e-05, + "loss": 0.5081, + "step": 7537 + }, + { + "epoch": 1.5495939973275772, + "grad_norm": 0.1763962060213089, + "learning_rate": 4.477027453206236e-05, + "loss": 0.5517, + "step": 7538 + }, + { + "epoch": 1.5497995683009558, + "grad_norm": 0.20494931936264038, + "learning_rate": 4.476028656074739e-05, + "loss": 0.5535, + "step": 7539 + }, + { + "epoch": 1.5500051392743344, + "grad_norm": 0.2072146087884903, + "learning_rate": 4.4750298601241976e-05, + "loss": 0.5409, + "step": 7540 + }, + { + "epoch": 1.550210710247713, + "grad_norm": 0.1965474635362625, + "learning_rate": 4.4740310654038194e-05, + "loss": 0.5307, + "step": 7541 + }, + { + "epoch": 1.5504162812210915, + "grad_norm": 0.16837544739246368, + "learning_rate": 4.47303227196281e-05, + "loss": 0.5289, + "step": 7542 + }, + { + "epoch": 1.5506218521944701, + "grad_norm": 0.16805261373519897, + "learning_rate": 4.4720334798503725e-05, + "loss": 0.5413, + "step": 7543 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 0.203588604927063, + "learning_rate": 4.471034689115717e-05, + "loss": 0.5474, + "step": 7544 + }, + { + "epoch": 1.5510329941412273, + "grad_norm": 0.20456770062446594, + "learning_rate": 4.470035899808046e-05, + "loss": 0.5409, + "step": 7545 + }, + { + "epoch": 1.551238565114606, + "grad_norm": 0.18718034029006958, + "learning_rate": 4.469037111976566e-05, + "loss": 0.537, + "step": 7546 + }, + { + "epoch": 1.5514441360879845, + "grad_norm": 0.19375449419021606, + "learning_rate": 4.4680383256704814e-05, + "loss": 0.5322, + "step": 7547 + }, + { + "epoch": 1.551649707061363, + "grad_norm": 0.23705141246318817, + "learning_rate": 4.467039540939001e-05, + "loss": 0.5616, + "step": 7548 + }, + { + "epoch": 1.5518552780347417, + "grad_norm": 0.16841238737106323, + "learning_rate": 4.466040757831328e-05, + "loss": 0.4964, + "step": 7549 + }, + { + "epoch": 1.55206084900812, + "grad_norm": 0.16423995792865753, + "learning_rate": 4.465041976396668e-05, + "loss": 0.5511, + "step": 7550 + }, + { + "epoch": 1.5522664199814986, + "grad_norm": 0.1915719360113144, + "learning_rate": 4.464043196684227e-05, + "loss": 0.5412, + "step": 7551 + }, + { + "epoch": 1.5524719909548772, + "grad_norm": 0.19022904336452484, + "learning_rate": 4.463044418743209e-05, + "loss": 0.5372, + "step": 7552 + }, + { + "epoch": 1.5526775619282556, + "grad_norm": 0.19907855987548828, + "learning_rate": 4.4620456426228196e-05, + "loss": 0.5657, + "step": 7553 + }, + { + "epoch": 1.5528831329016342, + "grad_norm": 0.1949799507856369, + "learning_rate": 4.461046868372264e-05, + "loss": 0.5452, + "step": 7554 + }, + { + "epoch": 1.5530887038750127, + "grad_norm": 0.1677858829498291, + "learning_rate": 4.4600480960407467e-05, + "loss": 0.5087, + "step": 7555 + }, + { + "epoch": 1.5532942748483913, + "grad_norm": 0.1660327911376953, + "learning_rate": 4.459049325677474e-05, + "loss": 0.5361, + "step": 7556 + }, + { + "epoch": 1.55349984582177, + "grad_norm": 0.16196422278881073, + "learning_rate": 4.45805055733165e-05, + "loss": 0.5322, + "step": 7557 + }, + { + "epoch": 1.5537054167951485, + "grad_norm": 0.1612974852323532, + "learning_rate": 4.457051791052478e-05, + "loss": 0.5549, + "step": 7558 + }, + { + "epoch": 1.553910987768527, + "grad_norm": 0.19015921652317047, + "learning_rate": 4.456053026889164e-05, + "loss": 0.5375, + "step": 7559 + }, + { + "epoch": 1.5541165587419057, + "grad_norm": 0.19856490194797516, + "learning_rate": 4.45505426489091e-05, + "loss": 0.5626, + "step": 7560 + }, + { + "epoch": 1.5543221297152843, + "grad_norm": 0.18954843282699585, + "learning_rate": 4.454055505106925e-05, + "loss": 0.5461, + "step": 7561 + }, + { + "epoch": 1.5545277006886629, + "grad_norm": 0.16355063021183014, + "learning_rate": 4.45305674758641e-05, + "loss": 0.5308, + "step": 7562 + }, + { + "epoch": 1.5547332716620414, + "grad_norm": 0.16068147122859955, + "learning_rate": 4.452057992378569e-05, + "loss": 0.5596, + "step": 7563 + }, + { + "epoch": 1.55493884263542, + "grad_norm": 0.18733803927898407, + "learning_rate": 4.4510592395326064e-05, + "loss": 0.5618, + "step": 7564 + }, + { + "epoch": 1.5551444136087984, + "grad_norm": 0.16565637290477753, + "learning_rate": 4.4500604890977264e-05, + "loss": 0.533, + "step": 7565 + }, + { + "epoch": 1.555349984582177, + "grad_norm": 0.1654541790485382, + "learning_rate": 4.449061741123134e-05, + "loss": 0.5562, + "step": 7566 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.20242147147655487, + "learning_rate": 4.448062995658028e-05, + "loss": 0.5494, + "step": 7567 + }, + { + "epoch": 1.5557611265289342, + "grad_norm": 0.19619537889957428, + "learning_rate": 4.447064252751619e-05, + "loss": 0.5455, + "step": 7568 + }, + { + "epoch": 1.5559666975023125, + "grad_norm": 0.16296258568763733, + "learning_rate": 4.446065512453106e-05, + "loss": 0.5202, + "step": 7569 + }, + { + "epoch": 1.5561722684756911, + "grad_norm": 0.15891185402870178, + "learning_rate": 4.4450667748116935e-05, + "loss": 0.5455, + "step": 7570 + }, + { + "epoch": 1.5563778394490697, + "grad_norm": 0.19792260229587555, + "learning_rate": 4.444068039876584e-05, + "loss": 0.5495, + "step": 7571 + }, + { + "epoch": 1.5565834104224483, + "grad_norm": 0.19216637313365936, + "learning_rate": 4.4430693076969805e-05, + "loss": 0.5576, + "step": 7572 + }, + { + "epoch": 1.5567889813958269, + "grad_norm": 0.18915432691574097, + "learning_rate": 4.442070578322086e-05, + "loss": 0.5269, + "step": 7573 + }, + { + "epoch": 1.5569945523692055, + "grad_norm": 0.19710315763950348, + "learning_rate": 4.441071851801102e-05, + "loss": 0.589, + "step": 7574 + }, + { + "epoch": 1.557200123342584, + "grad_norm": 0.19663040339946747, + "learning_rate": 4.4400731281832346e-05, + "loss": 0.5445, + "step": 7575 + }, + { + "epoch": 1.5574056943159627, + "grad_norm": 0.16456833481788635, + "learning_rate": 4.4390744075176826e-05, + "loss": 0.5084, + "step": 7576 + }, + { + "epoch": 1.5576112652893412, + "grad_norm": 0.16168387234210968, + "learning_rate": 4.438075689853651e-05, + "loss": 0.5335, + "step": 7577 + }, + { + "epoch": 1.5578168362627198, + "grad_norm": 0.19194790720939636, + "learning_rate": 4.43707697524034e-05, + "loss": 0.5517, + "step": 7578 + }, + { + "epoch": 1.5580224072360984, + "grad_norm": 0.19601012766361237, + "learning_rate": 4.4360782637269535e-05, + "loss": 0.5568, + "step": 7579 + }, + { + "epoch": 1.5582279782094768, + "grad_norm": 0.18594755232334137, + "learning_rate": 4.435079555362691e-05, + "loss": 0.5313, + "step": 7580 + }, + { + "epoch": 1.5584335491828554, + "grad_norm": 0.16498349606990814, + "learning_rate": 4.434080850196754e-05, + "loss": 0.5261, + "step": 7581 + }, + { + "epoch": 1.558639120156234, + "grad_norm": 0.15921123325824738, + "learning_rate": 4.433082148278348e-05, + "loss": 0.5481, + "step": 7582 + }, + { + "epoch": 1.5588446911296125, + "grad_norm": 0.19702661037445068, + "learning_rate": 4.4320834496566706e-05, + "loss": 0.565, + "step": 7583 + }, + { + "epoch": 1.559050262102991, + "grad_norm": 0.19030775129795074, + "learning_rate": 4.431084754380925e-05, + "loss": 0.5561, + "step": 7584 + }, + { + "epoch": 1.5592558330763695, + "grad_norm": 0.19048479199409485, + "learning_rate": 4.43008606250031e-05, + "loss": 0.5367, + "step": 7585 + }, + { + "epoch": 1.559461404049748, + "grad_norm": 0.189329594373703, + "learning_rate": 4.429087374064029e-05, + "loss": 0.5271, + "step": 7586 + }, + { + "epoch": 1.5596669750231267, + "grad_norm": 0.1947106570005417, + "learning_rate": 4.428088689121282e-05, + "loss": 0.5415, + "step": 7587 + }, + { + "epoch": 1.5598725459965053, + "grad_norm": 0.19340308010578156, + "learning_rate": 4.427090007721267e-05, + "loss": 0.5465, + "step": 7588 + }, + { + "epoch": 1.5600781169698839, + "grad_norm": 0.19165843725204468, + "learning_rate": 4.4260913299131885e-05, + "loss": 0.5478, + "step": 7589 + }, + { + "epoch": 1.5602836879432624, + "grad_norm": 0.20227845013141632, + "learning_rate": 4.425092655746244e-05, + "loss": 0.5432, + "step": 7590 + }, + { + "epoch": 1.560489258916641, + "grad_norm": 0.20343764126300812, + "learning_rate": 4.424093985269635e-05, + "loss": 0.5508, + "step": 7591 + }, + { + "epoch": 1.5606948298900196, + "grad_norm": 0.19420337677001953, + "learning_rate": 4.423095318532561e-05, + "loss": 0.5483, + "step": 7592 + }, + { + "epoch": 1.5609004008633982, + "grad_norm": 0.19176806509494781, + "learning_rate": 4.42209665558422e-05, + "loss": 0.5431, + "step": 7593 + }, + { + "epoch": 1.5611059718367768, + "grad_norm": 0.1622324138879776, + "learning_rate": 4.421097996473813e-05, + "loss": 0.5213, + "step": 7594 + }, + { + "epoch": 1.5613115428101552, + "grad_norm": 0.1601867824792862, + "learning_rate": 4.420099341250538e-05, + "loss": 0.5538, + "step": 7595 + }, + { + "epoch": 1.5615171137835338, + "grad_norm": 0.1894841194152832, + "learning_rate": 4.4191006899635964e-05, + "loss": 0.5515, + "step": 7596 + }, + { + "epoch": 1.5617226847569123, + "grad_norm": 0.15804892778396606, + "learning_rate": 4.418102042662184e-05, + "loss": 0.493, + "step": 7597 + }, + { + "epoch": 1.561928255730291, + "grad_norm": 0.15905854105949402, + "learning_rate": 4.417103399395503e-05, + "loss": 0.5405, + "step": 7598 + }, + { + "epoch": 1.5621338267036693, + "grad_norm": 0.19244399666786194, + "learning_rate": 4.4161047602127494e-05, + "loss": 0.5372, + "step": 7599 + }, + { + "epoch": 1.5623393976770479, + "grad_norm": 0.18696913123130798, + "learning_rate": 4.415106125163123e-05, + "loss": 0.534, + "step": 7600 + }, + { + "epoch": 1.5625449686504265, + "grad_norm": 0.19538486003875732, + "learning_rate": 4.41410749429582e-05, + "loss": 0.5348, + "step": 7601 + }, + { + "epoch": 1.562750539623805, + "grad_norm": 0.19690623879432678, + "learning_rate": 4.4131088676600386e-05, + "loss": 0.5461, + "step": 7602 + }, + { + "epoch": 1.5629561105971836, + "grad_norm": 0.19831502437591553, + "learning_rate": 4.412110245304978e-05, + "loss": 0.5541, + "step": 7603 + }, + { + "epoch": 1.5631616815705622, + "grad_norm": 0.20122960209846497, + "learning_rate": 4.411111627279835e-05, + "loss": 0.5473, + "step": 7604 + }, + { + "epoch": 1.5633672525439408, + "grad_norm": 0.1640729159116745, + "learning_rate": 4.410113013633807e-05, + "loss": 0.5054, + "step": 7605 + }, + { + "epoch": 1.5635728235173194, + "grad_norm": 0.16052688658237457, + "learning_rate": 4.4091144044160905e-05, + "loss": 0.5322, + "step": 7606 + }, + { + "epoch": 1.563778394490698, + "grad_norm": 0.19739840924739838, + "learning_rate": 4.408115799675881e-05, + "loss": 0.5606, + "step": 7607 + }, + { + "epoch": 1.5639839654640766, + "grad_norm": 0.19876334071159363, + "learning_rate": 4.407117199462378e-05, + "loss": 0.5147, + "step": 7608 + }, + { + "epoch": 1.5641895364374552, + "grad_norm": 0.19272910058498383, + "learning_rate": 4.406118603824775e-05, + "loss": 0.5433, + "step": 7609 + }, + { + "epoch": 1.5643951074108335, + "grad_norm": 0.1927374005317688, + "learning_rate": 4.4051200128122715e-05, + "loss": 0.5351, + "step": 7610 + }, + { + "epoch": 1.5646006783842121, + "grad_norm": 0.19942370057106018, + "learning_rate": 4.404121426474061e-05, + "loss": 0.543, + "step": 7611 + }, + { + "epoch": 1.5648062493575907, + "grad_norm": 0.15870188176631927, + "learning_rate": 4.4031228448593395e-05, + "loss": 0.5113, + "step": 7612 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 0.1612454354763031, + "learning_rate": 4.402124268017303e-05, + "loss": 0.54, + "step": 7613 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.19843849539756775, + "learning_rate": 4.4011256959971465e-05, + "loss": 0.5468, + "step": 7614 + }, + { + "epoch": 1.5654229622777263, + "grad_norm": 0.1602935492992401, + "learning_rate": 4.400127128848065e-05, + "loss": 0.5168, + "step": 7615 + }, + { + "epoch": 1.5656285332511048, + "grad_norm": 0.18167522549629211, + "learning_rate": 4.39912856661925e-05, + "loss": 0.5568, + "step": 7616 + }, + { + "epoch": 1.5658341042244834, + "grad_norm": 0.16602426767349243, + "learning_rate": 4.398130009359902e-05, + "loss": 0.5254, + "step": 7617 + }, + { + "epoch": 1.566039675197862, + "grad_norm": 0.16260112822055817, + "learning_rate": 4.397131457119212e-05, + "loss": 0.5646, + "step": 7618 + }, + { + "epoch": 1.5662452461712406, + "grad_norm": 0.19944046437740326, + "learning_rate": 4.396132909946373e-05, + "loss": 0.5459, + "step": 7619 + }, + { + "epoch": 1.5664508171446192, + "grad_norm": 0.19292668998241425, + "learning_rate": 4.3951343678905816e-05, + "loss": 0.5421, + "step": 7620 + }, + { + "epoch": 1.5666563881179978, + "grad_norm": 0.19421285390853882, + "learning_rate": 4.3941358310010295e-05, + "loss": 0.5649, + "step": 7621 + }, + { + "epoch": 1.5668619590913764, + "grad_norm": 0.1894664317369461, + "learning_rate": 4.393137299326911e-05, + "loss": 0.5683, + "step": 7622 + }, + { + "epoch": 1.567067530064755, + "grad_norm": 0.18972072005271912, + "learning_rate": 4.392138772917415e-05, + "loss": 0.5459, + "step": 7623 + }, + { + "epoch": 1.5672731010381336, + "grad_norm": 0.16586807370185852, + "learning_rate": 4.39114025182174e-05, + "loss": 0.5409, + "step": 7624 + }, + { + "epoch": 1.567478672011512, + "grad_norm": 0.13293050229549408, + "learning_rate": 4.390141736089076e-05, + "loss": 0.5069, + "step": 7625 + }, + { + "epoch": 1.5676842429848905, + "grad_norm": 0.15764681994915009, + "learning_rate": 4.389143225768616e-05, + "loss": 0.5475, + "step": 7626 + }, + { + "epoch": 1.567889813958269, + "grad_norm": 0.1995992809534073, + "learning_rate": 4.3881447209095495e-05, + "loss": 0.5426, + "step": 7627 + }, + { + "epoch": 1.5680953849316477, + "grad_norm": 0.1619638353586197, + "learning_rate": 4.3871462215610696e-05, + "loss": 0.5103, + "step": 7628 + }, + { + "epoch": 1.568300955905026, + "grad_norm": 0.16626045107841492, + "learning_rate": 4.386147727772369e-05, + "loss": 0.5347, + "step": 7629 + }, + { + "epoch": 1.5685065268784046, + "grad_norm": 0.20278498530387878, + "learning_rate": 4.3851492395926364e-05, + "loss": 0.5572, + "step": 7630 + }, + { + "epoch": 1.5687120978517832, + "grad_norm": 0.2107708603143692, + "learning_rate": 4.384150757071064e-05, + "loss": 0.5623, + "step": 7631 + }, + { + "epoch": 1.5689176688251618, + "grad_norm": 0.19431017339229584, + "learning_rate": 4.383152280256844e-05, + "loss": 0.5589, + "step": 7632 + }, + { + "epoch": 1.5691232397985404, + "grad_norm": 0.1882307529449463, + "learning_rate": 4.3821538091991645e-05, + "loss": 0.5481, + "step": 7633 + }, + { + "epoch": 1.569328810771919, + "grad_norm": 0.19112688302993774, + "learning_rate": 4.3811553439472166e-05, + "loss": 0.5419, + "step": 7634 + }, + { + "epoch": 1.5695343817452976, + "grad_norm": 0.19997398555278778, + "learning_rate": 4.380156884550188e-05, + "loss": 0.5692, + "step": 7635 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 0.19339673221111298, + "learning_rate": 4.3791584310572686e-05, + "loss": 0.5366, + "step": 7636 + }, + { + "epoch": 1.5699455236920548, + "grad_norm": 0.18707948923110962, + "learning_rate": 4.3781599835176504e-05, + "loss": 0.5303, + "step": 7637 + }, + { + "epoch": 1.5701510946654333, + "grad_norm": 0.1914735585451126, + "learning_rate": 4.37716154198052e-05, + "loss": 0.5569, + "step": 7638 + }, + { + "epoch": 1.570356665638812, + "grad_norm": 0.19773781299591064, + "learning_rate": 4.376163106495067e-05, + "loss": 0.5482, + "step": 7639 + }, + { + "epoch": 1.5705622366121905, + "grad_norm": 0.17177283763885498, + "learning_rate": 4.3751646771104774e-05, + "loss": 0.5203, + "step": 7640 + }, + { + "epoch": 1.5707678075855689, + "grad_norm": 0.16656096279621124, + "learning_rate": 4.374166253875942e-05, + "loss": 0.5528, + "step": 7641 + }, + { + "epoch": 1.5709733785589475, + "grad_norm": 0.19667677581310272, + "learning_rate": 4.3731678368406464e-05, + "loss": 0.5588, + "step": 7642 + }, + { + "epoch": 1.571178949532326, + "grad_norm": 0.15893961489200592, + "learning_rate": 4.372169426053777e-05, + "loss": 0.5165, + "step": 7643 + }, + { + "epoch": 1.5713845205057044, + "grad_norm": 0.15546555817127228, + "learning_rate": 4.371171021564525e-05, + "loss": 0.5631, + "step": 7644 + }, + { + "epoch": 1.571590091479083, + "grad_norm": 0.16072389483451843, + "learning_rate": 4.3701726234220744e-05, + "loss": 0.5273, + "step": 7645 + }, + { + "epoch": 1.5717956624524616, + "grad_norm": 0.15544024109840393, + "learning_rate": 4.369174231675611e-05, + "loss": 0.5508, + "step": 7646 + }, + { + "epoch": 1.5720012334258402, + "grad_norm": 0.15451103448867798, + "learning_rate": 4.3681758463743225e-05, + "loss": 0.5066, + "step": 7647 + }, + { + "epoch": 1.5722068043992188, + "grad_norm": 0.15433375537395477, + "learning_rate": 4.367177467567394e-05, + "loss": 0.5444, + "step": 7648 + }, + { + "epoch": 1.5724123753725974, + "grad_norm": 0.16077595949172974, + "learning_rate": 4.36617909530401e-05, + "loss": 0.5234, + "step": 7649 + }, + { + "epoch": 1.572617946345976, + "grad_norm": 0.15683984756469727, + "learning_rate": 4.3651807296333555e-05, + "loss": 0.5316, + "step": 7650 + }, + { + "epoch": 1.5728235173193545, + "grad_norm": 0.1868003010749817, + "learning_rate": 4.3641823706046186e-05, + "loss": 0.5313, + "step": 7651 + }, + { + "epoch": 1.5730290882927331, + "grad_norm": 0.1609300971031189, + "learning_rate": 4.363184018266979e-05, + "loss": 0.5225, + "step": 7652 + }, + { + "epoch": 1.5732346592661117, + "grad_norm": 0.15994325280189514, + "learning_rate": 4.362185672669626e-05, + "loss": 0.5298, + "step": 7653 + }, + { + "epoch": 1.5734402302394903, + "grad_norm": 0.1932908594608307, + "learning_rate": 4.3611873338617393e-05, + "loss": 0.5419, + "step": 7654 + }, + { + "epoch": 1.573645801212869, + "grad_norm": 0.1590869426727295, + "learning_rate": 4.3601890018925046e-05, + "loss": 0.5014, + "step": 7655 + }, + { + "epoch": 1.5738513721862473, + "grad_norm": 0.16261689364910126, + "learning_rate": 4.359190676811104e-05, + "loss": 0.5592, + "step": 7656 + }, + { + "epoch": 1.5740569431596259, + "grad_norm": 0.20458675920963287, + "learning_rate": 4.3581923586667196e-05, + "loss": 0.5704, + "step": 7657 + }, + { + "epoch": 1.5742625141330044, + "grad_norm": 0.189193993806839, + "learning_rate": 4.3571940475085355e-05, + "loss": 0.5508, + "step": 7658 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 0.18907295167446136, + "learning_rate": 4.356195743385734e-05, + "loss": 0.5312, + "step": 7659 + }, + { + "epoch": 1.5746736560797614, + "grad_norm": 0.18863658607006073, + "learning_rate": 4.3551974463474956e-05, + "loss": 0.5668, + "step": 7660 + }, + { + "epoch": 1.57487922705314, + "grad_norm": 0.1917717009782791, + "learning_rate": 4.354199156443002e-05, + "loss": 0.5327, + "step": 7661 + }, + { + "epoch": 1.5750847980265186, + "grad_norm": 0.19521358609199524, + "learning_rate": 4.353200873721435e-05, + "loss": 0.5242, + "step": 7662 + }, + { + "epoch": 1.5752903689998972, + "grad_norm": 0.19762447476387024, + "learning_rate": 4.352202598231975e-05, + "loss": 0.5609, + "step": 7663 + }, + { + "epoch": 1.5754959399732757, + "grad_norm": 0.19108183681964874, + "learning_rate": 4.3512043300237994e-05, + "loss": 0.5453, + "step": 7664 + }, + { + "epoch": 1.5757015109466543, + "grad_norm": 0.19864460825920105, + "learning_rate": 4.3502060691460935e-05, + "loss": 0.5685, + "step": 7665 + }, + { + "epoch": 1.575907081920033, + "grad_norm": 0.1909678727388382, + "learning_rate": 4.349207815648035e-05, + "loss": 0.5466, + "step": 7666 + }, + { + "epoch": 1.5761126528934115, + "grad_norm": 0.19452133774757385, + "learning_rate": 4.348209569578802e-05, + "loss": 0.5522, + "step": 7667 + }, + { + "epoch": 1.57631822386679, + "grad_norm": 0.1838688850402832, + "learning_rate": 4.3472113309875744e-05, + "loss": 0.5366, + "step": 7668 + }, + { + "epoch": 1.5765237948401687, + "grad_norm": 0.18900097906589508, + "learning_rate": 4.3462130999235295e-05, + "loss": 0.5301, + "step": 7669 + }, + { + "epoch": 1.5767293658135473, + "grad_norm": 0.19407951831817627, + "learning_rate": 4.345214876435847e-05, + "loss": 0.544, + "step": 7670 + }, + { + "epoch": 1.5769349367869256, + "grad_norm": 0.19032980501651764, + "learning_rate": 4.344216660573703e-05, + "loss": 0.5525, + "step": 7671 + }, + { + "epoch": 1.5771405077603042, + "grad_norm": 0.19637268781661987, + "learning_rate": 4.343218452386277e-05, + "loss": 0.5492, + "step": 7672 + }, + { + "epoch": 1.5773460787336828, + "grad_norm": 0.18958862125873566, + "learning_rate": 4.342220251922744e-05, + "loss": 0.5393, + "step": 7673 + }, + { + "epoch": 1.5775516497070614, + "grad_norm": 0.1648726463317871, + "learning_rate": 4.341222059232283e-05, + "loss": 0.4955, + "step": 7674 + }, + { + "epoch": 1.5777572206804398, + "grad_norm": 0.16251088678836823, + "learning_rate": 4.340223874364069e-05, + "loss": 0.5312, + "step": 7675 + }, + { + "epoch": 1.5779627916538184, + "grad_norm": 0.19399689137935638, + "learning_rate": 4.3392256973672776e-05, + "loss": 0.5527, + "step": 7676 + }, + { + "epoch": 1.578168362627197, + "grad_norm": 0.1864946484565735, + "learning_rate": 4.338227528291085e-05, + "loss": 0.5352, + "step": 7677 + }, + { + "epoch": 1.5783739336005755, + "grad_norm": 0.19393518567085266, + "learning_rate": 4.337229367184664e-05, + "loss": 0.5451, + "step": 7678 + }, + { + "epoch": 1.5785795045739541, + "grad_norm": 0.19147159159183502, + "learning_rate": 4.3362312140971927e-05, + "loss": 0.5515, + "step": 7679 + }, + { + "epoch": 1.5787850755473327, + "grad_norm": 0.19576434791088104, + "learning_rate": 4.3352330690778445e-05, + "loss": 0.5504, + "step": 7680 + }, + { + "epoch": 1.5789906465207113, + "grad_norm": 0.19198796153068542, + "learning_rate": 4.3342349321757934e-05, + "loss": 0.5452, + "step": 7681 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 0.19014614820480347, + "learning_rate": 4.3332368034402105e-05, + "loss": 0.5615, + "step": 7682 + }, + { + "epoch": 1.5794017884674685, + "grad_norm": 0.1940838247537613, + "learning_rate": 4.332238682920272e-05, + "loss": 0.5369, + "step": 7683 + }, + { + "epoch": 1.579607359440847, + "grad_norm": 0.1929844617843628, + "learning_rate": 4.3312405706651496e-05, + "loss": 0.5502, + "step": 7684 + }, + { + "epoch": 1.5798129304142257, + "grad_norm": 0.1682363599538803, + "learning_rate": 4.330242466724014e-05, + "loss": 0.5245, + "step": 7685 + }, + { + "epoch": 1.580018501387604, + "grad_norm": 0.14466369152069092, + "learning_rate": 4.32924437114604e-05, + "loss": 0.5162, + "step": 7686 + }, + { + "epoch": 1.5802240723609826, + "grad_norm": 0.16307014226913452, + "learning_rate": 4.3282462839803976e-05, + "loss": 0.5432, + "step": 7687 + }, + { + "epoch": 1.5804296433343612, + "grad_norm": 0.1943131685256958, + "learning_rate": 4.3272482052762584e-05, + "loss": 0.5377, + "step": 7688 + }, + { + "epoch": 1.5806352143077398, + "grad_norm": 0.1945241242647171, + "learning_rate": 4.3262501350827925e-05, + "loss": 0.5425, + "step": 7689 + }, + { + "epoch": 1.5808407852811182, + "grad_norm": 0.1876905858516693, + "learning_rate": 4.3252520734491706e-05, + "loss": 0.5435, + "step": 7690 + }, + { + "epoch": 1.5810463562544967, + "grad_norm": 0.18484771251678467, + "learning_rate": 4.3242540204245625e-05, + "loss": 0.5292, + "step": 7691 + }, + { + "epoch": 1.5812519272278753, + "grad_norm": 0.19600705802440643, + "learning_rate": 4.323255976058135e-05, + "loss": 0.5593, + "step": 7692 + }, + { + "epoch": 1.581457498201254, + "grad_norm": 0.18599645793437958, + "learning_rate": 4.3222579403990614e-05, + "loss": 0.5226, + "step": 7693 + }, + { + "epoch": 1.5816630691746325, + "grad_norm": 0.18677088618278503, + "learning_rate": 4.321259913496508e-05, + "loss": 0.517, + "step": 7694 + }, + { + "epoch": 1.581868640148011, + "grad_norm": 0.19142663478851318, + "learning_rate": 4.3202618953996425e-05, + "loss": 0.5486, + "step": 7695 + }, + { + "epoch": 1.5820742111213897, + "grad_norm": 0.19013050198554993, + "learning_rate": 4.319263886157634e-05, + "loss": 0.5584, + "step": 7696 + }, + { + "epoch": 1.5822797820947683, + "grad_norm": 0.1859898418188095, + "learning_rate": 4.31826588581965e-05, + "loss": 0.5319, + "step": 7697 + }, + { + "epoch": 1.5824853530681469, + "grad_norm": 0.19170920550823212, + "learning_rate": 4.3172678944348556e-05, + "loss": 0.5519, + "step": 7698 + }, + { + "epoch": 1.5826909240415254, + "grad_norm": 0.21785251796245575, + "learning_rate": 4.3162699120524165e-05, + "loss": 0.5545, + "step": 7699 + }, + { + "epoch": 1.582896495014904, + "grad_norm": 0.19362372159957886, + "learning_rate": 4.3152719387215016e-05, + "loss": 0.5692, + "step": 7700 + }, + { + "epoch": 1.5831020659882824, + "grad_norm": 0.19510303437709808, + "learning_rate": 4.3142739744912754e-05, + "loss": 0.53, + "step": 7701 + }, + { + "epoch": 1.583307636961661, + "grad_norm": 0.18352247774600983, + "learning_rate": 4.3132760194109017e-05, + "loss": 0.5042, + "step": 7702 + }, + { + "epoch": 1.5835132079350396, + "grad_norm": 0.17170487344264984, + "learning_rate": 4.312278073529546e-05, + "loss": 0.5543, + "step": 7703 + }, + { + "epoch": 1.5837187789084182, + "grad_norm": 0.19856008887290955, + "learning_rate": 4.311280136896372e-05, + "loss": 0.5696, + "step": 7704 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 0.19567757844924927, + "learning_rate": 4.310282209560543e-05, + "loss": 0.5493, + "step": 7705 + }, + { + "epoch": 1.5841299208551751, + "grad_norm": 0.20032745599746704, + "learning_rate": 4.309284291571223e-05, + "loss": 0.5603, + "step": 7706 + }, + { + "epoch": 1.5843354918285537, + "grad_norm": 0.19758538901805878, + "learning_rate": 4.308286382977575e-05, + "loss": 0.5574, + "step": 7707 + }, + { + "epoch": 1.5845410628019323, + "grad_norm": 0.1984431892633438, + "learning_rate": 4.3072884838287605e-05, + "loss": 0.5502, + "step": 7708 + }, + { + "epoch": 1.5847466337753109, + "grad_norm": 0.18602418899536133, + "learning_rate": 4.306290594173942e-05, + "loss": 0.5592, + "step": 7709 + }, + { + "epoch": 1.5849522047486895, + "grad_norm": 0.19030845165252686, + "learning_rate": 4.3052927140622814e-05, + "loss": 0.5444, + "step": 7710 + }, + { + "epoch": 1.585157775722068, + "grad_norm": 0.1725304126739502, + "learning_rate": 4.304294843542938e-05, + "loss": 0.5359, + "step": 7711 + }, + { + "epoch": 1.5853633466954467, + "grad_norm": 0.16047422587871552, + "learning_rate": 4.3032969826650714e-05, + "loss": 0.5433, + "step": 7712 + }, + { + "epoch": 1.5855689176688252, + "grad_norm": 0.19161836802959442, + "learning_rate": 4.302299131477844e-05, + "loss": 0.5271, + "step": 7713 + }, + { + "epoch": 1.5857744886422038, + "grad_norm": 0.15936709940433502, + "learning_rate": 4.301301290030415e-05, + "loss": 0.542, + "step": 7714 + }, + { + "epoch": 1.5859800596155824, + "grad_norm": 0.16099698841571808, + "learning_rate": 4.3003034583719435e-05, + "loss": 0.5483, + "step": 7715 + }, + { + "epoch": 1.586185630588961, + "grad_norm": 0.19221562147140503, + "learning_rate": 4.299305636551585e-05, + "loss": 0.5501, + "step": 7716 + }, + { + "epoch": 1.5863912015623394, + "grad_norm": 0.15940634906291962, + "learning_rate": 4.2983078246185015e-05, + "loss": 0.5228, + "step": 7717 + }, + { + "epoch": 1.586596772535718, + "grad_norm": 0.15413826704025269, + "learning_rate": 4.297310022621849e-05, + "loss": 0.5451, + "step": 7718 + }, + { + "epoch": 1.5868023435090965, + "grad_norm": 0.18997204303741455, + "learning_rate": 4.2963122306107816e-05, + "loss": 0.5558, + "step": 7719 + }, + { + "epoch": 1.587007914482475, + "grad_norm": 0.18921561539173126, + "learning_rate": 4.295314448634461e-05, + "loss": 0.5325, + "step": 7720 + }, + { + "epoch": 1.5872134854558535, + "grad_norm": 0.19362856447696686, + "learning_rate": 4.29431667674204e-05, + "loss": 0.5475, + "step": 7721 + }, + { + "epoch": 1.587419056429232, + "grad_norm": 0.1657908409833908, + "learning_rate": 4.293318914982676e-05, + "loss": 0.4907, + "step": 7722 + }, + { + "epoch": 1.5876246274026107, + "grad_norm": 0.16281838715076447, + "learning_rate": 4.2923211634055226e-05, + "loss": 0.5385, + "step": 7723 + }, + { + "epoch": 1.5878301983759893, + "grad_norm": 0.19449788331985474, + "learning_rate": 4.291323422059735e-05, + "loss": 0.5253, + "step": 7724 + }, + { + "epoch": 1.5880357693493679, + "grad_norm": 0.19563239812850952, + "learning_rate": 4.2903256909944665e-05, + "loss": 0.5349, + "step": 7725 + }, + { + "epoch": 1.5882413403227464, + "grad_norm": 0.19291435182094574, + "learning_rate": 4.28932797025887e-05, + "loss": 0.5294, + "step": 7726 + }, + { + "epoch": 1.588446911296125, + "grad_norm": 0.21474219858646393, + "learning_rate": 4.288330259902101e-05, + "loss": 0.5389, + "step": 7727 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 0.19437165558338165, + "learning_rate": 4.28733255997331e-05, + "loss": 0.5459, + "step": 7728 + }, + { + "epoch": 1.5888580532428822, + "grad_norm": 0.18734323978424072, + "learning_rate": 4.2863348705216516e-05, + "loss": 0.5381, + "step": 7729 + }, + { + "epoch": 1.5890636242162608, + "grad_norm": 0.19250360131263733, + "learning_rate": 4.285337191596274e-05, + "loss": 0.5357, + "step": 7730 + }, + { + "epoch": 1.5892691951896394, + "grad_norm": 0.19198143482208252, + "learning_rate": 4.284339523246331e-05, + "loss": 0.5375, + "step": 7731 + }, + { + "epoch": 1.5894747661630177, + "grad_norm": 0.18675874173641205, + "learning_rate": 4.2833418655209703e-05, + "loss": 0.5385, + "step": 7732 + }, + { + "epoch": 1.5896803371363963, + "grad_norm": 0.19431853294372559, + "learning_rate": 4.282344218469342e-05, + "loss": 0.5468, + "step": 7733 + }, + { + "epoch": 1.589885908109775, + "grad_norm": 0.16220088303089142, + "learning_rate": 4.281346582140599e-05, + "loss": 0.5035, + "step": 7734 + }, + { + "epoch": 1.5900914790831535, + "grad_norm": 0.1646573841571808, + "learning_rate": 4.2803489565838874e-05, + "loss": 0.5567, + "step": 7735 + }, + { + "epoch": 1.5902970500565319, + "grad_norm": 0.16230642795562744, + "learning_rate": 4.2793513418483565e-05, + "loss": 0.5234, + "step": 7736 + }, + { + "epoch": 1.5905026210299105, + "grad_norm": 0.15772514045238495, + "learning_rate": 4.2783537379831524e-05, + "loss": 0.5393, + "step": 7737 + }, + { + "epoch": 1.590708192003289, + "grad_norm": 0.18681733310222626, + "learning_rate": 4.277356145037425e-05, + "loss": 0.5332, + "step": 7738 + }, + { + "epoch": 1.5909137629766676, + "grad_norm": 0.19121171534061432, + "learning_rate": 4.276358563060319e-05, + "loss": 0.5351, + "step": 7739 + }, + { + "epoch": 1.5911193339500462, + "grad_norm": 0.2163754552602768, + "learning_rate": 4.27536099210098e-05, + "loss": 0.5121, + "step": 7740 + }, + { + "epoch": 1.5913249049234248, + "grad_norm": 0.17165131866931915, + "learning_rate": 4.274363432208556e-05, + "loss": 0.5342, + "step": 7741 + }, + { + "epoch": 1.5915304758968034, + "grad_norm": 0.17426596581935883, + "learning_rate": 4.273365883432192e-05, + "loss": 0.5432, + "step": 7742 + }, + { + "epoch": 1.591736046870182, + "grad_norm": 0.16686050593852997, + "learning_rate": 4.272368345821031e-05, + "loss": 0.5046, + "step": 7743 + }, + { + "epoch": 1.5919416178435606, + "grad_norm": 0.1610487550497055, + "learning_rate": 4.2713708194242184e-05, + "loss": 0.5472, + "step": 7744 + }, + { + "epoch": 1.5921471888169392, + "grad_norm": 0.1983231157064438, + "learning_rate": 4.270373304290897e-05, + "loss": 0.5526, + "step": 7745 + }, + { + "epoch": 1.5923527597903178, + "grad_norm": 0.19409148395061493, + "learning_rate": 4.2693758004702076e-05, + "loss": 0.5521, + "step": 7746 + }, + { + "epoch": 1.5925583307636961, + "grad_norm": 0.18833288550376892, + "learning_rate": 4.268378308011296e-05, + "loss": 0.5263, + "step": 7747 + }, + { + "epoch": 1.5927639017370747, + "grad_norm": 0.18587639927864075, + "learning_rate": 4.2673808269633016e-05, + "loss": 0.5297, + "step": 7748 + }, + { + "epoch": 1.5929694727104533, + "grad_norm": 0.18532033264636993, + "learning_rate": 4.266383357375367e-05, + "loss": 0.5309, + "step": 7749 + }, + { + "epoch": 1.593175043683832, + "grad_norm": 0.1910453587770462, + "learning_rate": 4.2653858992966336e-05, + "loss": 0.5683, + "step": 7750 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 0.19505764544010162, + "learning_rate": 4.26438845277624e-05, + "loss": 0.5421, + "step": 7751 + }, + { + "epoch": 1.5935861856305888, + "grad_norm": 0.19671325385570526, + "learning_rate": 4.263391017863326e-05, + "loss": 0.5408, + "step": 7752 + }, + { + "epoch": 1.5937917566039674, + "grad_norm": 0.1978052705526352, + "learning_rate": 4.26239359460703e-05, + "loss": 0.5512, + "step": 7753 + }, + { + "epoch": 1.593997327577346, + "grad_norm": 0.1925462931394577, + "learning_rate": 4.26139618305649e-05, + "loss": 0.5285, + "step": 7754 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.1875825971364975, + "learning_rate": 4.260398783260846e-05, + "loss": 0.5481, + "step": 7755 + }, + { + "epoch": 1.5944084695241032, + "grad_norm": 0.1970067173242569, + "learning_rate": 4.2594013952692353e-05, + "loss": 0.528, + "step": 7756 + }, + { + "epoch": 1.5946140404974818, + "grad_norm": 0.19316576421260834, + "learning_rate": 4.258404019130792e-05, + "loss": 0.5348, + "step": 7757 + }, + { + "epoch": 1.5948196114708604, + "grad_norm": 0.19398510456085205, + "learning_rate": 4.257406654894653e-05, + "loss": 0.5404, + "step": 7758 + }, + { + "epoch": 1.595025182444239, + "grad_norm": 0.19227631390094757, + "learning_rate": 4.256409302609956e-05, + "loss": 0.5298, + "step": 7759 + }, + { + "epoch": 1.5952307534176176, + "grad_norm": 0.16509932279586792, + "learning_rate": 4.255411962325833e-05, + "loss": 0.5097, + "step": 7760 + }, + { + "epoch": 1.5954363243909961, + "grad_norm": 0.16759321093559265, + "learning_rate": 4.254414634091418e-05, + "loss": 0.5725, + "step": 7761 + }, + { + "epoch": 1.5956418953643745, + "grad_norm": 0.19898711144924164, + "learning_rate": 4.253417317955848e-05, + "loss": 0.5409, + "step": 7762 + }, + { + "epoch": 1.595847466337753, + "grad_norm": 0.19673512876033783, + "learning_rate": 4.252420013968254e-05, + "loss": 0.5403, + "step": 7763 + }, + { + "epoch": 1.5960530373111317, + "grad_norm": 0.19727066159248352, + "learning_rate": 4.251422722177769e-05, + "loss": 0.5597, + "step": 7764 + }, + { + "epoch": 1.5962586082845103, + "grad_norm": 0.16265854239463806, + "learning_rate": 4.250425442633524e-05, + "loss": 0.5227, + "step": 7765 + }, + { + "epoch": 1.5964641792578886, + "grad_norm": 0.15699994564056396, + "learning_rate": 4.2494281753846515e-05, + "loss": 0.5637, + "step": 7766 + }, + { + "epoch": 1.5966697502312672, + "grad_norm": 0.1968710571527481, + "learning_rate": 4.2484309204802816e-05, + "loss": 0.5566, + "step": 7767 + }, + { + "epoch": 1.5968753212046458, + "grad_norm": 0.19877804815769196, + "learning_rate": 4.2474336779695427e-05, + "loss": 0.5463, + "step": 7768 + }, + { + "epoch": 1.5970808921780244, + "grad_norm": 0.18838095664978027, + "learning_rate": 4.246436447901567e-05, + "loss": 0.5483, + "step": 7769 + }, + { + "epoch": 1.597286463151403, + "grad_norm": 0.18883812427520752, + "learning_rate": 4.245439230325483e-05, + "loss": 0.5465, + "step": 7770 + }, + { + "epoch": 1.5974920341247816, + "grad_norm": 0.20267321169376373, + "learning_rate": 4.244442025290418e-05, + "loss": 0.5651, + "step": 7771 + }, + { + "epoch": 1.5976976050981602, + "grad_norm": 0.19783546030521393, + "learning_rate": 4.2434448328455e-05, + "loss": 0.5623, + "step": 7772 + }, + { + "epoch": 1.5979031760715388, + "grad_norm": 0.20209753513336182, + "learning_rate": 4.242447653039856e-05, + "loss": 0.5378, + "step": 7773 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 0.16521279513835907, + "learning_rate": 4.2414504859226125e-05, + "loss": 0.4948, + "step": 7774 + }, + { + "epoch": 1.598314318018296, + "grad_norm": 0.15857960283756256, + "learning_rate": 4.240453331542894e-05, + "loss": 0.5269, + "step": 7775 + }, + { + "epoch": 1.5985198889916745, + "grad_norm": 0.16982486844062805, + "learning_rate": 4.239456189949828e-05, + "loss": 0.5311, + "step": 7776 + }, + { + "epoch": 1.5987254599650529, + "grad_norm": 0.15772342681884766, + "learning_rate": 4.238459061192537e-05, + "loss": 0.5586, + "step": 7777 + }, + { + "epoch": 1.5989310309384315, + "grad_norm": 0.18286247551441193, + "learning_rate": 4.2374619453201466e-05, + "loss": 0.527, + "step": 7778 + }, + { + "epoch": 1.59913660191181, + "grad_norm": 0.19069987535476685, + "learning_rate": 4.236464842381778e-05, + "loss": 0.5576, + "step": 7779 + }, + { + "epoch": 1.5993421728851887, + "grad_norm": 0.19216850399971008, + "learning_rate": 4.235467752426555e-05, + "loss": 0.5289, + "step": 7780 + }, + { + "epoch": 1.599547743858567, + "grad_norm": 0.1922430843114853, + "learning_rate": 4.2344706755036e-05, + "loss": 0.568, + "step": 7781 + }, + { + "epoch": 1.5997533148319456, + "grad_norm": 0.18228840827941895, + "learning_rate": 4.2334736116620314e-05, + "loss": 0.531, + "step": 7782 + }, + { + "epoch": 1.5999588858053242, + "grad_norm": 0.18847499787807465, + "learning_rate": 4.2324765609509746e-05, + "loss": 0.5421, + "step": 7783 + }, + { + "epoch": 1.6001644567787028, + "grad_norm": 0.1916157454252243, + "learning_rate": 4.231479523419547e-05, + "loss": 0.5423, + "step": 7784 + }, + { + "epoch": 1.6003700277520814, + "grad_norm": 0.19695116579532623, + "learning_rate": 4.230482499116869e-05, + "loss": 0.5403, + "step": 7785 + }, + { + "epoch": 1.60057559872546, + "grad_norm": 0.20415250957012177, + "learning_rate": 4.2294854880920575e-05, + "loss": 0.5381, + "step": 7786 + }, + { + "epoch": 1.6007811696988385, + "grad_norm": 0.20049957931041718, + "learning_rate": 4.228488490394232e-05, + "loss": 0.5711, + "step": 7787 + }, + { + "epoch": 1.6009867406722171, + "grad_norm": 0.16379691660404205, + "learning_rate": 4.227491506072508e-05, + "loss": 0.5004, + "step": 7788 + }, + { + "epoch": 1.6011923116455957, + "grad_norm": 0.16042593121528625, + "learning_rate": 4.226494535176005e-05, + "loss": 0.5595, + "step": 7789 + }, + { + "epoch": 1.6013978826189743, + "grad_norm": 0.18765395879745483, + "learning_rate": 4.2254975777538386e-05, + "loss": 0.5608, + "step": 7790 + }, + { + "epoch": 1.601603453592353, + "grad_norm": 0.16303540766239166, + "learning_rate": 4.224500633855123e-05, + "loss": 0.5161, + "step": 7791 + }, + { + "epoch": 1.6018090245657313, + "grad_norm": 0.16182848811149597, + "learning_rate": 4.223503703528973e-05, + "loss": 0.5586, + "step": 7792 + }, + { + "epoch": 1.6020145955391099, + "grad_norm": 0.1916949301958084, + "learning_rate": 4.222506786824504e-05, + "loss": 0.5563, + "step": 7793 + }, + { + "epoch": 1.6022201665124884, + "grad_norm": 0.18221786618232727, + "learning_rate": 4.221509883790828e-05, + "loss": 0.5474, + "step": 7794 + }, + { + "epoch": 1.602425737485867, + "grad_norm": 0.1872803419828415, + "learning_rate": 4.2205129944770574e-05, + "loss": 0.5405, + "step": 7795 + }, + { + "epoch": 1.6026313084592454, + "grad_norm": 0.1901916116476059, + "learning_rate": 4.2195161189323064e-05, + "loss": 0.5595, + "step": 7796 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 0.48569947481155396, + "learning_rate": 4.2185192572056856e-05, + "loss": 0.5194, + "step": 7797 + }, + { + "epoch": 1.6030424504060026, + "grad_norm": 0.1648416817188263, + "learning_rate": 4.217522409346305e-05, + "loss": 0.5493, + "step": 7798 + }, + { + "epoch": 1.6032480213793812, + "grad_norm": 0.2003573477268219, + "learning_rate": 4.216525575403275e-05, + "loss": 0.5516, + "step": 7799 + }, + { + "epoch": 1.6034535923527597, + "grad_norm": 0.16360460221767426, + "learning_rate": 4.2155287554257056e-05, + "loss": 0.5167, + "step": 7800 + }, + { + "epoch": 1.6036591633261383, + "grad_norm": 0.12889930605888367, + "learning_rate": 4.2145319494627034e-05, + "loss": 0.4986, + "step": 7801 + }, + { + "epoch": 1.603864734299517, + "grad_norm": 0.16201470792293549, + "learning_rate": 4.213535157563378e-05, + "loss": 0.5439, + "step": 7802 + }, + { + "epoch": 1.6040703052728955, + "grad_norm": 0.19083839654922485, + "learning_rate": 4.212538379776837e-05, + "loss": 0.5315, + "step": 7803 + }, + { + "epoch": 1.604275876246274, + "grad_norm": 0.19757793843746185, + "learning_rate": 4.211541616152186e-05, + "loss": 0.5288, + "step": 7804 + }, + { + "epoch": 1.6044814472196527, + "grad_norm": 0.21021337807178497, + "learning_rate": 4.210544866738532e-05, + "loss": 0.5579, + "step": 7805 + }, + { + "epoch": 1.6046870181930313, + "grad_norm": 0.18950164318084717, + "learning_rate": 4.2095481315849796e-05, + "loss": 0.5447, + "step": 7806 + }, + { + "epoch": 1.6048925891664099, + "grad_norm": 0.18903128802776337, + "learning_rate": 4.2085514107406326e-05, + "loss": 0.5478, + "step": 7807 + }, + { + "epoch": 1.6050981601397882, + "grad_norm": 0.1963806450366974, + "learning_rate": 4.207554704254596e-05, + "loss": 0.5411, + "step": 7808 + }, + { + "epoch": 1.6053037311131668, + "grad_norm": 0.19509243965148926, + "learning_rate": 4.20655801217597e-05, + "loss": 0.5447, + "step": 7809 + }, + { + "epoch": 1.6055093020865454, + "grad_norm": 0.18859466910362244, + "learning_rate": 4.205561334553862e-05, + "loss": 0.5434, + "step": 7810 + }, + { + "epoch": 1.6057148730599238, + "grad_norm": 0.1625402718782425, + "learning_rate": 4.20456467143737e-05, + "loss": 0.5226, + "step": 7811 + }, + { + "epoch": 1.6059204440333024, + "grad_norm": 0.16071906685829163, + "learning_rate": 4.203568022875596e-05, + "loss": 0.5362, + "step": 7812 + }, + { + "epoch": 1.606126015006681, + "grad_norm": 0.19820047914981842, + "learning_rate": 4.202571388917638e-05, + "loss": 0.5452, + "step": 7813 + }, + { + "epoch": 1.6063315859800595, + "grad_norm": 0.1983959972858429, + "learning_rate": 4.2015747696126e-05, + "loss": 0.526, + "step": 7814 + }, + { + "epoch": 1.6065371569534381, + "grad_norm": 0.19241683185100555, + "learning_rate": 4.200578165009578e-05, + "loss": 0.5378, + "step": 7815 + }, + { + "epoch": 1.6067427279268167, + "grad_norm": 0.19365909695625305, + "learning_rate": 4.199581575157668e-05, + "loss": 0.5589, + "step": 7816 + }, + { + "epoch": 1.6069482989001953, + "grad_norm": 0.1934269517660141, + "learning_rate": 4.198585000105971e-05, + "loss": 0.5438, + "step": 7817 + }, + { + "epoch": 1.607153869873574, + "grad_norm": 0.19813553988933563, + "learning_rate": 4.1975884399035834e-05, + "loss": 0.5569, + "step": 7818 + }, + { + "epoch": 1.6073594408469525, + "grad_norm": 0.1831195056438446, + "learning_rate": 4.1965918945995994e-05, + "loss": 0.5217, + "step": 7819 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 0.16004779934883118, + "learning_rate": 4.1955953642431144e-05, + "loss": 0.5526, + "step": 7820 + }, + { + "epoch": 1.6077705827937097, + "grad_norm": 0.19440321624279022, + "learning_rate": 4.1945988488832236e-05, + "loss": 0.5287, + "step": 7821 + }, + { + "epoch": 1.6079761537670882, + "grad_norm": 0.18852464854717255, + "learning_rate": 4.1936023485690185e-05, + "loss": 0.526, + "step": 7822 + }, + { + "epoch": 1.6081817247404666, + "grad_norm": 0.18994298577308655, + "learning_rate": 4.192605863349594e-05, + "loss": 0.5729, + "step": 7823 + }, + { + "epoch": 1.6083872957138452, + "grad_norm": 0.18983709812164307, + "learning_rate": 4.191609393274042e-05, + "loss": 0.5418, + "step": 7824 + }, + { + "epoch": 1.6085928666872238, + "grad_norm": 0.19144746661186218, + "learning_rate": 4.190612938391454e-05, + "loss": 0.5502, + "step": 7825 + }, + { + "epoch": 1.6087984376606024, + "grad_norm": 0.18976972997188568, + "learning_rate": 4.18961649875092e-05, + "loss": 0.5363, + "step": 7826 + }, + { + "epoch": 1.6090040086339807, + "grad_norm": 0.19141483306884766, + "learning_rate": 4.188620074401532e-05, + "loss": 0.5285, + "step": 7827 + }, + { + "epoch": 1.6092095796073593, + "grad_norm": 0.19065243005752563, + "learning_rate": 4.187623665392377e-05, + "loss": 0.5374, + "step": 7828 + }, + { + "epoch": 1.609415150580738, + "grad_norm": 0.19287769496440887, + "learning_rate": 4.186627271772544e-05, + "loss": 0.5363, + "step": 7829 + }, + { + "epoch": 1.6096207215541165, + "grad_norm": 0.19527852535247803, + "learning_rate": 4.1856308935911175e-05, + "loss": 0.562, + "step": 7830 + }, + { + "epoch": 1.609826292527495, + "grad_norm": 0.16113971173763275, + "learning_rate": 4.184634530897191e-05, + "loss": 0.5236, + "step": 7831 + }, + { + "epoch": 1.6100318635008737, + "grad_norm": 0.16417936980724335, + "learning_rate": 4.183638183739846e-05, + "loss": 0.5406, + "step": 7832 + }, + { + "epoch": 1.6102374344742523, + "grad_norm": 0.19486621022224426, + "learning_rate": 4.1826418521681696e-05, + "loss": 0.5471, + "step": 7833 + }, + { + "epoch": 1.6104430054476309, + "grad_norm": 0.19623447954654694, + "learning_rate": 4.181645536231245e-05, + "loss": 0.5402, + "step": 7834 + }, + { + "epoch": 1.6106485764210094, + "grad_norm": 0.19563686847686768, + "learning_rate": 4.180649235978158e-05, + "loss": 0.559, + "step": 7835 + }, + { + "epoch": 1.610854147394388, + "grad_norm": 0.19006025791168213, + "learning_rate": 4.17965295145799e-05, + "loss": 0.5595, + "step": 7836 + }, + { + "epoch": 1.6110597183677666, + "grad_norm": 0.1941699981689453, + "learning_rate": 4.178656682719822e-05, + "loss": 0.5391, + "step": 7837 + }, + { + "epoch": 1.611265289341145, + "grad_norm": 0.20085136592388153, + "learning_rate": 4.177660429812739e-05, + "loss": 0.5546, + "step": 7838 + }, + { + "epoch": 1.6114708603145236, + "grad_norm": 0.19179563224315643, + "learning_rate": 4.1766641927858206e-05, + "loss": 0.5336, + "step": 7839 + }, + { + "epoch": 1.6116764312879022, + "grad_norm": 0.19993935525417328, + "learning_rate": 4.175667971688145e-05, + "loss": 0.5577, + "step": 7840 + }, + { + "epoch": 1.6118820022612808, + "grad_norm": 0.19750361144542694, + "learning_rate": 4.1746717665687934e-05, + "loss": 0.5378, + "step": 7841 + }, + { + "epoch": 1.6120875732346591, + "grad_norm": 0.1938353031873703, + "learning_rate": 4.173675577476843e-05, + "loss": 0.5184, + "step": 7842 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 0.16070544719696045, + "learning_rate": 4.172679404461371e-05, + "loss": 0.521, + "step": 7843 + }, + { + "epoch": 1.6124987151814163, + "grad_norm": 0.17982181906700134, + "learning_rate": 4.171683247571455e-05, + "loss": 0.546, + "step": 7844 + }, + { + "epoch": 1.6127042861547949, + "grad_norm": 0.19503210484981537, + "learning_rate": 4.170687106856171e-05, + "loss": 0.557, + "step": 7845 + }, + { + "epoch": 1.6129098571281735, + "grad_norm": 0.19316428899765015, + "learning_rate": 4.1696909823645936e-05, + "loss": 0.5496, + "step": 7846 + }, + { + "epoch": 1.613115428101552, + "grad_norm": 0.19165056943893433, + "learning_rate": 4.168694874145799e-05, + "loss": 0.5376, + "step": 7847 + }, + { + "epoch": 1.6133209990749307, + "grad_norm": 0.1679886281490326, + "learning_rate": 4.167698782248859e-05, + "loss": 0.5215, + "step": 7848 + }, + { + "epoch": 1.6135265700483092, + "grad_norm": 0.16243119537830353, + "learning_rate": 4.166702706722847e-05, + "loss": 0.5333, + "step": 7849 + }, + { + "epoch": 1.6137321410216878, + "grad_norm": 0.19812798500061035, + "learning_rate": 4.1657066476168345e-05, + "loss": 0.542, + "step": 7850 + }, + { + "epoch": 1.6139377119950664, + "grad_norm": 0.20092356204986572, + "learning_rate": 4.164710604979891e-05, + "loss": 0.5599, + "step": 7851 + }, + { + "epoch": 1.614143282968445, + "grad_norm": 0.1670868694782257, + "learning_rate": 4.1637145788610914e-05, + "loss": 0.5134, + "step": 7852 + }, + { + "epoch": 1.6143488539418234, + "grad_norm": 0.1713995337486267, + "learning_rate": 4.162718569309502e-05, + "loss": 0.5345, + "step": 7853 + }, + { + "epoch": 1.614554424915202, + "grad_norm": 0.1977371871471405, + "learning_rate": 4.161722576374192e-05, + "loss": 0.5363, + "step": 7854 + }, + { + "epoch": 1.6147599958885805, + "grad_norm": 0.16193437576293945, + "learning_rate": 4.1607266001042295e-05, + "loss": 0.4881, + "step": 7855 + }, + { + "epoch": 1.6149655668619591, + "grad_norm": 0.16229775547981262, + "learning_rate": 4.159730640548683e-05, + "loss": 0.5395, + "step": 7856 + }, + { + "epoch": 1.6151711378353375, + "grad_norm": 0.1984437257051468, + "learning_rate": 4.158734697756616e-05, + "loss": 0.5634, + "step": 7857 + }, + { + "epoch": 1.615376708808716, + "grad_norm": 0.1987016797065735, + "learning_rate": 4.157738771777094e-05, + "loss": 0.5676, + "step": 7858 + }, + { + "epoch": 1.6155822797820947, + "grad_norm": 0.15975748002529144, + "learning_rate": 4.156742862659185e-05, + "loss": 0.5129, + "step": 7859 + }, + { + "epoch": 1.6157878507554733, + "grad_norm": 0.15861687064170837, + "learning_rate": 4.155746970451951e-05, + "loss": 0.5272, + "step": 7860 + }, + { + "epoch": 1.6159934217288519, + "grad_norm": 0.18841111660003662, + "learning_rate": 4.154751095204455e-05, + "loss": 0.54, + "step": 7861 + }, + { + "epoch": 1.6161989927022304, + "grad_norm": 0.19690489768981934, + "learning_rate": 4.153755236965758e-05, + "loss": 0.5461, + "step": 7862 + }, + { + "epoch": 1.616404563675609, + "grad_norm": 0.1665157824754715, + "learning_rate": 4.1527593957849224e-05, + "loss": 0.5081, + "step": 7863 + }, + { + "epoch": 1.6166101346489876, + "grad_norm": 0.15810109674930573, + "learning_rate": 4.1517635717110087e-05, + "loss": 0.5394, + "step": 7864 + }, + { + "epoch": 1.6168157056223662, + "grad_norm": 0.1974000781774521, + "learning_rate": 4.150767764793074e-05, + "loss": 0.5227, + "step": 7865 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.19814777374267578, + "learning_rate": 4.149771975080181e-05, + "loss": 0.5464, + "step": 7866 + }, + { + "epoch": 1.6172268475691234, + "grad_norm": 0.1915402114391327, + "learning_rate": 4.148776202621386e-05, + "loss": 0.541, + "step": 7867 + }, + { + "epoch": 1.6174324185425017, + "grad_norm": 0.19537873566150665, + "learning_rate": 4.147780447465745e-05, + "loss": 0.5508, + "step": 7868 + }, + { + "epoch": 1.6176379895158803, + "grad_norm": 0.1904834657907486, + "learning_rate": 4.146784709662316e-05, + "loss": 0.4971, + "step": 7869 + }, + { + "epoch": 1.617843560489259, + "grad_norm": 0.19342583417892456, + "learning_rate": 4.1457889892601536e-05, + "loss": 0.5577, + "step": 7870 + }, + { + "epoch": 1.6180491314626375, + "grad_norm": 0.19713959097862244, + "learning_rate": 4.14479328630831e-05, + "loss": 0.5568, + "step": 7871 + }, + { + "epoch": 1.6182547024360159, + "grad_norm": 0.19126051664352417, + "learning_rate": 4.143797600855843e-05, + "loss": 0.5551, + "step": 7872 + }, + { + "epoch": 1.6184602734093945, + "grad_norm": 0.16810829937458038, + "learning_rate": 4.142801932951803e-05, + "loss": 0.5213, + "step": 7873 + }, + { + "epoch": 1.618665844382773, + "grad_norm": 0.15974818170070648, + "learning_rate": 4.1418062826452424e-05, + "loss": 0.5456, + "step": 7874 + }, + { + "epoch": 1.6188714153561516, + "grad_norm": 0.19179581105709076, + "learning_rate": 4.140810649985212e-05, + "loss": 0.5298, + "step": 7875 + }, + { + "epoch": 1.6190769863295302, + "grad_norm": 0.19233964383602142, + "learning_rate": 4.139815035020762e-05, + "loss": 0.5471, + "step": 7876 + }, + { + "epoch": 1.6192825573029088, + "grad_norm": 0.18875513970851898, + "learning_rate": 4.1388194378009406e-05, + "loss": 0.5382, + "step": 7877 + }, + { + "epoch": 1.6194881282762874, + "grad_norm": 0.18729184567928314, + "learning_rate": 4.1378238583747975e-05, + "loss": 0.5342, + "step": 7878 + }, + { + "epoch": 1.619693699249666, + "grad_norm": 0.19150425493717194, + "learning_rate": 4.136828296791382e-05, + "loss": 0.565, + "step": 7879 + }, + { + "epoch": 1.6198992702230446, + "grad_norm": 0.18844369053840637, + "learning_rate": 4.1358327530997366e-05, + "loss": 0.5405, + "step": 7880 + }, + { + "epoch": 1.6201048411964232, + "grad_norm": 0.19033032655715942, + "learning_rate": 4.1348372273489106e-05, + "loss": 0.5274, + "step": 7881 + }, + { + "epoch": 1.6203104121698018, + "grad_norm": 0.16202832758426666, + "learning_rate": 4.133841719587948e-05, + "loss": 0.4988, + "step": 7882 + }, + { + "epoch": 1.6205159831431801, + "grad_norm": 0.16193822026252747, + "learning_rate": 4.132846229865892e-05, + "loss": 0.542, + "step": 7883 + }, + { + "epoch": 1.6207215541165587, + "grad_norm": 0.1977519690990448, + "learning_rate": 4.131850758231787e-05, + "loss": 0.5588, + "step": 7884 + }, + { + "epoch": 1.6209271250899373, + "grad_norm": 0.20576632022857666, + "learning_rate": 4.1308553047346713e-05, + "loss": 0.5583, + "step": 7885 + }, + { + "epoch": 1.621132696063316, + "grad_norm": 0.1919194608926773, + "learning_rate": 4.129859869423592e-05, + "loss": 0.5519, + "step": 7886 + }, + { + "epoch": 1.6213382670366943, + "grad_norm": 0.19272786378860474, + "learning_rate": 4.128864452347587e-05, + "loss": 0.5368, + "step": 7887 + }, + { + "epoch": 1.6215438380100728, + "grad_norm": 0.19439461827278137, + "learning_rate": 4.127869053555696e-05, + "loss": 0.5374, + "step": 7888 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 0.19123432040214539, + "learning_rate": 4.126873673096956e-05, + "loss": 0.5392, + "step": 7889 + }, + { + "epoch": 1.62195497995683, + "grad_norm": 0.18603573739528656, + "learning_rate": 4.1258783110204074e-05, + "loss": 0.5217, + "step": 7890 + }, + { + "epoch": 1.6221605509302086, + "grad_norm": 0.1992233395576477, + "learning_rate": 4.1248829673750846e-05, + "loss": 0.5625, + "step": 7891 + }, + { + "epoch": 1.6223661219035872, + "grad_norm": 0.18787723779678345, + "learning_rate": 4.123887642210024e-05, + "loss": 0.554, + "step": 7892 + }, + { + "epoch": 1.6225716928769658, + "grad_norm": 0.18760953843593597, + "learning_rate": 4.122892335574263e-05, + "loss": 0.5411, + "step": 7893 + }, + { + "epoch": 1.6227772638503444, + "grad_norm": 0.19207806885242462, + "learning_rate": 4.121897047516834e-05, + "loss": 0.5274, + "step": 7894 + }, + { + "epoch": 1.622982834823723, + "grad_norm": 0.1640760600566864, + "learning_rate": 4.12090177808677e-05, + "loss": 0.5044, + "step": 7895 + }, + { + "epoch": 1.6231884057971016, + "grad_norm": 0.1595536321401596, + "learning_rate": 4.1199065273331035e-05, + "loss": 0.5382, + "step": 7896 + }, + { + "epoch": 1.6233939767704801, + "grad_norm": 0.7902474403381348, + "learning_rate": 4.118911295304866e-05, + "loss": 0.5773, + "step": 7897 + }, + { + "epoch": 1.6235995477438587, + "grad_norm": 0.15795102715492249, + "learning_rate": 4.1179160820510866e-05, + "loss": 0.5388, + "step": 7898 + }, + { + "epoch": 1.623805118717237, + "grad_norm": 0.2210693508386612, + "learning_rate": 4.116920887620797e-05, + "loss": 0.5724, + "step": 7899 + }, + { + "epoch": 1.6240106896906157, + "grad_norm": 0.16837280988693237, + "learning_rate": 4.1159257120630244e-05, + "loss": 0.5361, + "step": 7900 + }, + { + "epoch": 1.6242162606639943, + "grad_norm": 0.16610947251319885, + "learning_rate": 4.1149305554267965e-05, + "loss": 0.5441, + "step": 7901 + }, + { + "epoch": 1.6244218316373726, + "grad_norm": 0.19282789528369904, + "learning_rate": 4.1139354177611413e-05, + "loss": 0.5416, + "step": 7902 + }, + { + "epoch": 1.6246274026107512, + "grad_norm": 0.19123776257038116, + "learning_rate": 4.112940299115083e-05, + "loss": 0.5602, + "step": 7903 + }, + { + "epoch": 1.6248329735841298, + "grad_norm": 0.19537465274333954, + "learning_rate": 4.111945199537648e-05, + "loss": 0.5568, + "step": 7904 + }, + { + "epoch": 1.6250385445575084, + "grad_norm": 0.1960020512342453, + "learning_rate": 4.1109501190778585e-05, + "loss": 0.5366, + "step": 7905 + }, + { + "epoch": 1.625244115530887, + "grad_norm": 0.16584603488445282, + "learning_rate": 4.109955057784737e-05, + "loss": 0.5022, + "step": 7906 + }, + { + "epoch": 1.6254496865042656, + "grad_norm": 0.14005246758460999, + "learning_rate": 4.108960015707308e-05, + "loss": 0.5147, + "step": 7907 + }, + { + "epoch": 1.6256552574776442, + "grad_norm": 0.16588489711284637, + "learning_rate": 4.107964992894592e-05, + "loss": 0.5522, + "step": 7908 + }, + { + "epoch": 1.6258608284510228, + "grad_norm": 0.19331607222557068, + "learning_rate": 4.1069699893956074e-05, + "loss": 0.5099, + "step": 7909 + }, + { + "epoch": 1.6260663994244013, + "grad_norm": 0.1920442134141922, + "learning_rate": 4.105975005259374e-05, + "loss": 0.543, + "step": 7910 + }, + { + "epoch": 1.62627197039778, + "grad_norm": 0.19395653903484344, + "learning_rate": 4.1049800405349116e-05, + "loss": 0.5589, + "step": 7911 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 0.19541949033737183, + "learning_rate": 4.103985095271236e-05, + "loss": 0.5376, + "step": 7912 + }, + { + "epoch": 1.626683112344537, + "grad_norm": 0.16967599093914032, + "learning_rate": 4.102990169517362e-05, + "loss": 0.5135, + "step": 7913 + }, + { + "epoch": 1.6268886833179155, + "grad_norm": 0.16106168925762177, + "learning_rate": 4.101995263322308e-05, + "loss": 0.5548, + "step": 7914 + }, + { + "epoch": 1.627094254291294, + "grad_norm": 0.20895619690418243, + "learning_rate": 4.101000376735088e-05, + "loss": 0.536, + "step": 7915 + }, + { + "epoch": 1.6272998252646727, + "grad_norm": 0.1945531964302063, + "learning_rate": 4.1000055098047144e-05, + "loss": 0.5196, + "step": 7916 + }, + { + "epoch": 1.6275053962380512, + "grad_norm": 0.19166290760040283, + "learning_rate": 4.099010662580199e-05, + "loss": 0.5329, + "step": 7917 + }, + { + "epoch": 1.6277109672114296, + "grad_norm": 0.1970268040895462, + "learning_rate": 4.0980158351105554e-05, + "loss": 0.5334, + "step": 7918 + }, + { + "epoch": 1.6279165381848082, + "grad_norm": 0.19781675934791565, + "learning_rate": 4.097021027444791e-05, + "loss": 0.557, + "step": 7919 + }, + { + "epoch": 1.6281221091581868, + "grad_norm": 0.2081199437379837, + "learning_rate": 4.0960262396319165e-05, + "loss": 0.5316, + "step": 7920 + }, + { + "epoch": 1.6283276801315654, + "grad_norm": 0.19772003591060638, + "learning_rate": 4.0950314717209425e-05, + "loss": 0.5286, + "step": 7921 + }, + { + "epoch": 1.628533251104944, + "grad_norm": 0.1967727690935135, + "learning_rate": 4.094036723760875e-05, + "loss": 0.55, + "step": 7922 + }, + { + "epoch": 1.6287388220783225, + "grad_norm": 0.1625976264476776, + "learning_rate": 4.09304199580072e-05, + "loss": 0.5024, + "step": 7923 + }, + { + "epoch": 1.6289443930517011, + "grad_norm": 0.16001035273075104, + "learning_rate": 4.092047287889484e-05, + "loss": 0.5347, + "step": 7924 + }, + { + "epoch": 1.6291499640250797, + "grad_norm": 0.20354917645454407, + "learning_rate": 4.0910526000761725e-05, + "loss": 0.5271, + "step": 7925 + }, + { + "epoch": 1.6293555349984583, + "grad_norm": 0.20167338848114014, + "learning_rate": 4.0900579324097874e-05, + "loss": 0.547, + "step": 7926 + }, + { + "epoch": 1.629561105971837, + "grad_norm": 0.1940862089395523, + "learning_rate": 4.08906328493933e-05, + "loss": 0.5388, + "step": 7927 + }, + { + "epoch": 1.6297666769452155, + "grad_norm": 0.20124763250350952, + "learning_rate": 4.088068657713805e-05, + "loss": 0.5325, + "step": 7928 + }, + { + "epoch": 1.6299722479185939, + "grad_norm": 0.1647825688123703, + "learning_rate": 4.087074050782213e-05, + "loss": 0.5181, + "step": 7929 + }, + { + "epoch": 1.6301778188919724, + "grad_norm": 0.13776123523712158, + "learning_rate": 4.0860794641935524e-05, + "loss": 0.5102, + "step": 7930 + }, + { + "epoch": 1.630383389865351, + "grad_norm": 0.1585695892572403, + "learning_rate": 4.0850848979968205e-05, + "loss": 0.5194, + "step": 7931 + }, + { + "epoch": 1.6305889608387296, + "grad_norm": 0.19522860646247864, + "learning_rate": 4.084090352241017e-05, + "loss": 0.5335, + "step": 7932 + }, + { + "epoch": 1.630794531812108, + "grad_norm": 0.200296089053154, + "learning_rate": 4.0830958269751385e-05, + "loss": 0.5442, + "step": 7933 + }, + { + "epoch": 1.6310001027854866, + "grad_norm": 0.19578911364078522, + "learning_rate": 4.0821013222481786e-05, + "loss": 0.5368, + "step": 7934 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 0.19223348796367645, + "learning_rate": 4.0811068381091336e-05, + "loss": 0.5389, + "step": 7935 + }, + { + "epoch": 1.6314112447322437, + "grad_norm": 0.19662773609161377, + "learning_rate": 4.080112374606998e-05, + "loss": 0.5493, + "step": 7936 + }, + { + "epoch": 1.6316168157056223, + "grad_norm": 0.1632963865995407, + "learning_rate": 4.0791179317907626e-05, + "loss": 0.4949, + "step": 7937 + }, + { + "epoch": 1.631822386679001, + "grad_norm": 0.16675293445587158, + "learning_rate": 4.0781235097094205e-05, + "loss": 0.5681, + "step": 7938 + }, + { + "epoch": 1.6320279576523795, + "grad_norm": 0.2007942795753479, + "learning_rate": 4.0771291084119603e-05, + "loss": 0.551, + "step": 7939 + }, + { + "epoch": 1.632233528625758, + "grad_norm": 0.1977294385433197, + "learning_rate": 4.076134727947373e-05, + "loss": 0.5417, + "step": 7940 + }, + { + "epoch": 1.6324390995991367, + "grad_norm": 0.20973463356494904, + "learning_rate": 4.075140368364644e-05, + "loss": 0.5623, + "step": 7941 + }, + { + "epoch": 1.6326446705725153, + "grad_norm": 0.2024088203907013, + "learning_rate": 4.074146029712765e-05, + "loss": 0.5461, + "step": 7942 + }, + { + "epoch": 1.6328502415458939, + "grad_norm": 0.18343862891197205, + "learning_rate": 4.0731517120407205e-05, + "loss": 0.5329, + "step": 7943 + }, + { + "epoch": 1.6330558125192722, + "grad_norm": 0.19498711824417114, + "learning_rate": 4.0721574153974966e-05, + "loss": 0.5226, + "step": 7944 + }, + { + "epoch": 1.6332613834926508, + "grad_norm": 0.1982509046792984, + "learning_rate": 4.071163139832077e-05, + "loss": 0.5745, + "step": 7945 + }, + { + "epoch": 1.6334669544660294, + "grad_norm": 0.19435621798038483, + "learning_rate": 4.0701688853934454e-05, + "loss": 0.5448, + "step": 7946 + }, + { + "epoch": 1.633672525439408, + "grad_norm": 0.19986435770988464, + "learning_rate": 4.069174652130582e-05, + "loss": 0.535, + "step": 7947 + }, + { + "epoch": 1.6338780964127864, + "grad_norm": 0.2016473263502121, + "learning_rate": 4.068180440092471e-05, + "loss": 0.5354, + "step": 7948 + }, + { + "epoch": 1.634083667386165, + "grad_norm": 0.1967112123966217, + "learning_rate": 4.067186249328092e-05, + "loss": 0.5405, + "step": 7949 + }, + { + "epoch": 1.6342892383595435, + "grad_norm": 0.1958150565624237, + "learning_rate": 4.0661920798864236e-05, + "loss": 0.5235, + "step": 7950 + }, + { + "epoch": 1.6344948093329221, + "grad_norm": 0.19553299248218536, + "learning_rate": 4.065197931816444e-05, + "loss": 0.5356, + "step": 7951 + }, + { + "epoch": 1.6347003803063007, + "grad_norm": 0.19405850768089294, + "learning_rate": 4.064203805167129e-05, + "loss": 0.536, + "step": 7952 + }, + { + "epoch": 1.6349059512796793, + "grad_norm": 0.20262351632118225, + "learning_rate": 4.0632096999874556e-05, + "loss": 0.546, + "step": 7953 + }, + { + "epoch": 1.635111522253058, + "grad_norm": 0.1994638741016388, + "learning_rate": 4.0622156163263986e-05, + "loss": 0.5446, + "step": 7954 + }, + { + "epoch": 1.6353170932264365, + "grad_norm": 0.19563588500022888, + "learning_rate": 4.0612215542329316e-05, + "loss": 0.5533, + "step": 7955 + }, + { + "epoch": 1.635522664199815, + "grad_norm": 0.19695055484771729, + "learning_rate": 4.060227513756029e-05, + "loss": 0.522, + "step": 7956 + }, + { + "epoch": 1.6357282351731937, + "grad_norm": 0.1933106780052185, + "learning_rate": 4.059233494944662e-05, + "loss": 0.5295, + "step": 7957 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 0.1970299780368805, + "learning_rate": 4.0582394978477997e-05, + "loss": 0.5533, + "step": 7958 + }, + { + "epoch": 1.6361393771199506, + "grad_norm": 0.19385181367397308, + "learning_rate": 4.0572455225144124e-05, + "loss": 0.5575, + "step": 7959 + }, + { + "epoch": 1.6363449480933292, + "grad_norm": 0.19552960991859436, + "learning_rate": 4.056251568993469e-05, + "loss": 0.5427, + "step": 7960 + }, + { + "epoch": 1.6365505190667078, + "grad_norm": 0.20119963586330414, + "learning_rate": 4.055257637333935e-05, + "loss": 0.5481, + "step": 7961 + }, + { + "epoch": 1.6367560900400864, + "grad_norm": 0.19214770197868347, + "learning_rate": 4.05426372758478e-05, + "loss": 0.5258, + "step": 7962 + }, + { + "epoch": 1.6369616610134647, + "grad_norm": 0.19121824204921722, + "learning_rate": 4.0532698397949686e-05, + "loss": 0.5093, + "step": 7963 + }, + { + "epoch": 1.6371672319868433, + "grad_norm": 0.18680913746356964, + "learning_rate": 4.052275974013464e-05, + "loss": 0.54, + "step": 7964 + }, + { + "epoch": 1.637372802960222, + "grad_norm": 0.1893320232629776, + "learning_rate": 4.051282130289228e-05, + "loss": 0.5448, + "step": 7965 + }, + { + "epoch": 1.6375783739336005, + "grad_norm": 0.1885337233543396, + "learning_rate": 4.050288308671225e-05, + "loss": 0.5424, + "step": 7966 + }, + { + "epoch": 1.637783944906979, + "grad_norm": 0.1980556845664978, + "learning_rate": 4.049294509208415e-05, + "loss": 0.5693, + "step": 7967 + }, + { + "epoch": 1.6379895158803577, + "grad_norm": 0.194559246301651, + "learning_rate": 4.0483007319497566e-05, + "loss": 0.5378, + "step": 7968 + }, + { + "epoch": 1.6381950868537363, + "grad_norm": 0.1900004744529724, + "learning_rate": 4.047306976944211e-05, + "loss": 0.5471, + "step": 7969 + }, + { + "epoch": 1.6384006578271149, + "grad_norm": 0.19014038145542145, + "learning_rate": 4.0463132442407365e-05, + "loss": 0.5482, + "step": 7970 + }, + { + "epoch": 1.6386062288004934, + "grad_norm": 0.17057844996452332, + "learning_rate": 4.0453195338882867e-05, + "loss": 0.5261, + "step": 7971 + }, + { + "epoch": 1.638811799773872, + "grad_norm": 0.1279505342245102, + "learning_rate": 4.044325845935818e-05, + "loss": 0.5028, + "step": 7972 + }, + { + "epoch": 1.6390173707472506, + "grad_norm": 0.16361773014068604, + "learning_rate": 4.043332180432286e-05, + "loss": 0.5524, + "step": 7973 + }, + { + "epoch": 1.6392229417206292, + "grad_norm": 0.20223143696784973, + "learning_rate": 4.042338537426641e-05, + "loss": 0.5304, + "step": 7974 + }, + { + "epoch": 1.6394285126940076, + "grad_norm": 0.20231173932552338, + "learning_rate": 4.041344916967838e-05, + "loss": 0.5483, + "step": 7975 + }, + { + "epoch": 1.6396340836673862, + "grad_norm": 0.20245333015918732, + "learning_rate": 4.040351319104828e-05, + "loss": 0.5334, + "step": 7976 + }, + { + "epoch": 1.6398396546407648, + "grad_norm": 0.15880252420902252, + "learning_rate": 4.039357743886559e-05, + "loss": 0.5144, + "step": 7977 + }, + { + "epoch": 1.6400452256141431, + "grad_norm": 0.1283801794052124, + "learning_rate": 4.0383641913619816e-05, + "loss": 0.5137, + "step": 7978 + }, + { + "epoch": 1.6402507965875217, + "grad_norm": 0.1789664328098297, + "learning_rate": 4.0373706615800426e-05, + "loss": 0.5355, + "step": 7979 + }, + { + "epoch": 1.6404563675609003, + "grad_norm": 0.196334108710289, + "learning_rate": 4.0363771545896894e-05, + "loss": 0.5392, + "step": 7980 + }, + { + "epoch": 1.6406619385342789, + "grad_norm": 0.19602881371974945, + "learning_rate": 4.035383670439867e-05, + "loss": 0.5369, + "step": 7981 + }, + { + "epoch": 1.6408675095076575, + "grad_norm": 0.19509628415107727, + "learning_rate": 4.0343902091795174e-05, + "loss": 0.5494, + "step": 7982 + }, + { + "epoch": 1.641073080481036, + "grad_norm": 0.19635361433029175, + "learning_rate": 4.033396770857588e-05, + "loss": 0.5527, + "step": 7983 + }, + { + "epoch": 1.6412786514544146, + "grad_norm": 0.19803519546985626, + "learning_rate": 4.0324033555230184e-05, + "loss": 0.537, + "step": 7984 + }, + { + "epoch": 1.6414842224277932, + "grad_norm": 0.20085453987121582, + "learning_rate": 4.03140996322475e-05, + "loss": 0.5394, + "step": 7985 + }, + { + "epoch": 1.6416897934011718, + "grad_norm": 0.18997138738632202, + "learning_rate": 4.030416594011722e-05, + "loss": 0.5145, + "step": 7986 + }, + { + "epoch": 1.6418953643745504, + "grad_norm": 0.16585613787174225, + "learning_rate": 4.029423247932874e-05, + "loss": 0.511, + "step": 7987 + }, + { + "epoch": 1.642100935347929, + "grad_norm": 0.1544012725353241, + "learning_rate": 4.028429925037143e-05, + "loss": 0.5345, + "step": 7988 + }, + { + "epoch": 1.6423065063213076, + "grad_norm": 0.2430618703365326, + "learning_rate": 4.0274366253734644e-05, + "loss": 0.5486, + "step": 7989 + }, + { + "epoch": 1.642512077294686, + "grad_norm": 0.19470450282096863, + "learning_rate": 4.0264433489907753e-05, + "loss": 0.5318, + "step": 7990 + }, + { + "epoch": 1.6427176482680645, + "grad_norm": 0.196413055062294, + "learning_rate": 4.0254500959380096e-05, + "loss": 0.5383, + "step": 7991 + }, + { + "epoch": 1.6429232192414431, + "grad_norm": 0.19302628934383392, + "learning_rate": 4.0244568662641e-05, + "loss": 0.5368, + "step": 7992 + }, + { + "epoch": 1.6431287902148217, + "grad_norm": 0.19250887632369995, + "learning_rate": 4.023463660017978e-05, + "loss": 0.5225, + "step": 7993 + }, + { + "epoch": 1.6433343611882, + "grad_norm": 0.18870443105697632, + "learning_rate": 4.022470477248573e-05, + "loss": 0.5322, + "step": 7994 + }, + { + "epoch": 1.6435399321615787, + "grad_norm": 0.19748498499393463, + "learning_rate": 4.0214773180048155e-05, + "loss": 0.5266, + "step": 7995 + }, + { + "epoch": 1.6437455031349573, + "grad_norm": 0.19181567430496216, + "learning_rate": 4.020484182335634e-05, + "loss": 0.5553, + "step": 7996 + }, + { + "epoch": 1.6439510741083359, + "grad_norm": 0.18883375823497772, + "learning_rate": 4.019491070289956e-05, + "loss": 0.536, + "step": 7997 + }, + { + "epoch": 1.6441566450817144, + "grad_norm": 0.19764509797096252, + "learning_rate": 4.0184979819167066e-05, + "loss": 0.5231, + "step": 7998 + }, + { + "epoch": 1.644362216055093, + "grad_norm": 0.1661233752965927, + "learning_rate": 4.017504917264812e-05, + "loss": 0.5223, + "step": 7999 + }, + { + "epoch": 1.6445677870284716, + "grad_norm": 0.1361915022134781, + "learning_rate": 4.016511876383195e-05, + "loss": 0.5278, + "step": 8000 + }, + { + "epoch": 1.6447733580018502, + "grad_norm": 0.16932383179664612, + "learning_rate": 4.015518859320778e-05, + "loss": 0.5341, + "step": 8001 + }, + { + "epoch": 1.6449789289752288, + "grad_norm": 0.19935861229896545, + "learning_rate": 4.014525866126482e-05, + "loss": 0.5358, + "step": 8002 + }, + { + "epoch": 1.6451844999486074, + "grad_norm": 0.20055261254310608, + "learning_rate": 4.013532896849226e-05, + "loss": 0.5611, + "step": 8003 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 0.1982363760471344, + "learning_rate": 4.012539951537932e-05, + "loss": 0.5271, + "step": 8004 + }, + { + "epoch": 1.6455956418953643, + "grad_norm": 0.16576005518436432, + "learning_rate": 4.011547030241516e-05, + "loss": 0.5156, + "step": 8005 + }, + { + "epoch": 1.645801212868743, + "grad_norm": 0.13087031245231628, + "learning_rate": 4.010554133008895e-05, + "loss": 0.5298, + "step": 8006 + }, + { + "epoch": 1.6460067838421215, + "grad_norm": 0.16294503211975098, + "learning_rate": 4.0095612598889837e-05, + "loss": 0.5526, + "step": 8007 + }, + { + "epoch": 1.6462123548155, + "grad_norm": 0.20266200602054596, + "learning_rate": 4.008568410930698e-05, + "loss": 0.5262, + "step": 8008 + }, + { + "epoch": 1.6464179257888785, + "grad_norm": 0.16137059032917023, + "learning_rate": 4.007575586182949e-05, + "loss": 0.5215, + "step": 8009 + }, + { + "epoch": 1.646623496762257, + "grad_norm": 0.16377897560596466, + "learning_rate": 4.006582785694648e-05, + "loss": 0.5292, + "step": 8010 + }, + { + "epoch": 1.6468290677356356, + "grad_norm": 0.19530196487903595, + "learning_rate": 4.005590009514708e-05, + "loss": 0.5454, + "step": 8011 + }, + { + "epoch": 1.6470346387090142, + "grad_norm": 0.19677075743675232, + "learning_rate": 4.0045972576920374e-05, + "loss": 0.5499, + "step": 8012 + }, + { + "epoch": 1.6472402096823928, + "grad_norm": 0.19411884248256683, + "learning_rate": 4.003604530275545e-05, + "loss": 0.5623, + "step": 8013 + }, + { + "epoch": 1.6474457806557714, + "grad_norm": 0.15870682895183563, + "learning_rate": 4.002611827314137e-05, + "loss": 0.5135, + "step": 8014 + }, + { + "epoch": 1.64765135162915, + "grad_norm": 0.1609289050102234, + "learning_rate": 4.0016191488567195e-05, + "loss": 0.5723, + "step": 8015 + }, + { + "epoch": 1.6478569226025286, + "grad_norm": 0.19486412405967712, + "learning_rate": 4.000626494952196e-05, + "loss": 0.5615, + "step": 8016 + }, + { + "epoch": 1.6480624935759072, + "grad_norm": 0.20491555333137512, + "learning_rate": 3.9996338656494715e-05, + "loss": 0.5451, + "step": 8017 + }, + { + "epoch": 1.6482680645492858, + "grad_norm": 0.19133470952510834, + "learning_rate": 3.998641260997449e-05, + "loss": 0.5251, + "step": 8018 + }, + { + "epoch": 1.6484736355226643, + "grad_norm": 0.1599549949169159, + "learning_rate": 3.997648681045026e-05, + "loss": 0.5172, + "step": 8019 + }, + { + "epoch": 1.6486792064960427, + "grad_norm": 0.1676701456308365, + "learning_rate": 3.996656125841106e-05, + "loss": 0.5404, + "step": 8020 + }, + { + "epoch": 1.6488847774694213, + "grad_norm": 0.1984013020992279, + "learning_rate": 3.995663595434587e-05, + "loss": 0.5757, + "step": 8021 + }, + { + "epoch": 1.6490903484428, + "grad_norm": 0.1664489060640335, + "learning_rate": 3.994671089874364e-05, + "loss": 0.5177, + "step": 8022 + }, + { + "epoch": 1.6492959194161785, + "grad_norm": 0.15646716952323914, + "learning_rate": 3.993678609209333e-05, + "loss": 0.548, + "step": 8023 + }, + { + "epoch": 1.6495014903895568, + "grad_norm": 0.1926644891500473, + "learning_rate": 3.9926861534883924e-05, + "loss": 0.5528, + "step": 8024 + }, + { + "epoch": 1.6497070613629354, + "grad_norm": 0.20535780489444733, + "learning_rate": 3.991693722760434e-05, + "loss": 0.5611, + "step": 8025 + }, + { + "epoch": 1.649912632336314, + "grad_norm": 0.19756321609020233, + "learning_rate": 3.9907013170743504e-05, + "loss": 0.5244, + "step": 8026 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 0.16199225187301636, + "learning_rate": 3.9897089364790315e-05, + "loss": 0.5097, + "step": 8027 + }, + { + "epoch": 1.6503237742830712, + "grad_norm": 0.1658937931060791, + "learning_rate": 3.988716581023368e-05, + "loss": 0.556, + "step": 8028 + }, + { + "epoch": 1.6505293452564498, + "grad_norm": 0.1961878091096878, + "learning_rate": 3.98772425075625e-05, + "loss": 0.5337, + "step": 8029 + }, + { + "epoch": 1.6507349162298284, + "grad_norm": 0.1957957148551941, + "learning_rate": 3.9867319457265616e-05, + "loss": 0.5322, + "step": 8030 + }, + { + "epoch": 1.650940487203207, + "grad_norm": 0.19228583574295044, + "learning_rate": 3.985739665983192e-05, + "loss": 0.5356, + "step": 8031 + }, + { + "epoch": 1.6511460581765856, + "grad_norm": 0.20034292340278625, + "learning_rate": 3.984747411575027e-05, + "loss": 0.557, + "step": 8032 + }, + { + "epoch": 1.6513516291499641, + "grad_norm": 0.1647980958223343, + "learning_rate": 3.983755182550948e-05, + "loss": 0.5102, + "step": 8033 + }, + { + "epoch": 1.6515572001233427, + "grad_norm": 0.16599765419960022, + "learning_rate": 3.982762978959838e-05, + "loss": 0.5501, + "step": 8034 + }, + { + "epoch": 1.651762771096721, + "grad_norm": 0.193580761551857, + "learning_rate": 3.981770800850579e-05, + "loss": 0.5507, + "step": 8035 + }, + { + "epoch": 1.6519683420700997, + "grad_norm": 0.1937427669763565, + "learning_rate": 3.98077864827205e-05, + "loss": 0.5389, + "step": 8036 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.1987418383359909, + "learning_rate": 3.9797865212731286e-05, + "loss": 0.5353, + "step": 8037 + }, + { + "epoch": 1.6523794840168569, + "grad_norm": 0.1963115632534027, + "learning_rate": 3.978794419902696e-05, + "loss": 0.5466, + "step": 8038 + }, + { + "epoch": 1.6525850549902352, + "grad_norm": 0.16360154747962952, + "learning_rate": 3.977802344209626e-05, + "loss": 0.4988, + "step": 8039 + }, + { + "epoch": 1.6527906259636138, + "grad_norm": 0.18510453402996063, + "learning_rate": 3.976810294242792e-05, + "loss": 0.5791, + "step": 8040 + }, + { + "epoch": 1.6529961969369924, + "grad_norm": 0.2012956142425537, + "learning_rate": 3.9758182700510714e-05, + "loss": 0.5532, + "step": 8041 + }, + { + "epoch": 1.653201767910371, + "grad_norm": 0.19226433336734772, + "learning_rate": 3.974826271683334e-05, + "loss": 0.5284, + "step": 8042 + }, + { + "epoch": 1.6534073388837496, + "grad_norm": 0.19937126338481903, + "learning_rate": 3.973834299188452e-05, + "loss": 0.5377, + "step": 8043 + }, + { + "epoch": 1.6536129098571282, + "grad_norm": 0.19442661106586456, + "learning_rate": 3.9728423526152927e-05, + "loss": 0.5465, + "step": 8044 + }, + { + "epoch": 1.6538184808305068, + "grad_norm": 0.19324155151844025, + "learning_rate": 3.971850432012729e-05, + "loss": 0.5376, + "step": 8045 + }, + { + "epoch": 1.6540240518038853, + "grad_norm": 0.16883355379104614, + "learning_rate": 3.970858537429625e-05, + "loss": 0.5119, + "step": 8046 + }, + { + "epoch": 1.654229622777264, + "grad_norm": 0.16271071135997772, + "learning_rate": 3.969866668914848e-05, + "loss": 0.5545, + "step": 8047 + }, + { + "epoch": 1.6544351937506425, + "grad_norm": 0.18823817372322083, + "learning_rate": 3.9688748265172625e-05, + "loss": 0.5345, + "step": 8048 + }, + { + "epoch": 1.654640764724021, + "grad_norm": 0.19708384573459625, + "learning_rate": 3.9678830102857324e-05, + "loss": 0.5677, + "step": 8049 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 0.2034367471933365, + "learning_rate": 3.966891220269118e-05, + "loss": 0.5178, + "step": 8050 + }, + { + "epoch": 1.655051906670778, + "grad_norm": 0.1986580491065979, + "learning_rate": 3.9658994565162816e-05, + "loss": 0.5545, + "step": 8051 + }, + { + "epoch": 1.6552574776441566, + "grad_norm": 0.20059730112552643, + "learning_rate": 3.964907719076083e-05, + "loss": 0.5635, + "step": 8052 + }, + { + "epoch": 1.6554630486175352, + "grad_norm": 0.19054940342903137, + "learning_rate": 3.963916007997379e-05, + "loss": 0.5542, + "step": 8053 + }, + { + "epoch": 1.6556686195909136, + "grad_norm": 0.19373731315135956, + "learning_rate": 3.962924323329029e-05, + "loss": 0.5499, + "step": 8054 + }, + { + "epoch": 1.6558741905642922, + "grad_norm": 0.19648055732250214, + "learning_rate": 3.9619326651198875e-05, + "loss": 0.5399, + "step": 8055 + }, + { + "epoch": 1.6560797615376708, + "grad_norm": 0.21236325800418854, + "learning_rate": 3.960941033418808e-05, + "loss": 0.5433, + "step": 8056 + }, + { + "epoch": 1.6562853325110494, + "grad_norm": 0.18751861155033112, + "learning_rate": 3.959949428274645e-05, + "loss": 0.521, + "step": 8057 + }, + { + "epoch": 1.656490903484428, + "grad_norm": 0.1983969807624817, + "learning_rate": 3.958957849736247e-05, + "loss": 0.5316, + "step": 8058 + }, + { + "epoch": 1.6566964744578065, + "grad_norm": 0.6342448592185974, + "learning_rate": 3.9579662978524695e-05, + "loss": 0.5639, + "step": 8059 + }, + { + "epoch": 1.6569020454311851, + "grad_norm": 0.19162461161613464, + "learning_rate": 3.9569747726721584e-05, + "loss": 0.5219, + "step": 8060 + }, + { + "epoch": 1.6571076164045637, + "grad_norm": 0.152262344956398, + "learning_rate": 3.9559832742441625e-05, + "loss": 0.4952, + "step": 8061 + }, + { + "epoch": 1.6573131873779423, + "grad_norm": 0.16122353076934814, + "learning_rate": 3.9549918026173265e-05, + "loss": 0.548, + "step": 8062 + }, + { + "epoch": 1.657518758351321, + "grad_norm": 0.19267982244491577, + "learning_rate": 3.9540003578404985e-05, + "loss": 0.5452, + "step": 8063 + }, + { + "epoch": 1.6577243293246995, + "grad_norm": 0.17134782671928406, + "learning_rate": 3.953008939962521e-05, + "loss": 0.5009, + "step": 8064 + }, + { + "epoch": 1.657929900298078, + "grad_norm": 0.16193920373916626, + "learning_rate": 3.952017549032234e-05, + "loss": 0.5392, + "step": 8065 + }, + { + "epoch": 1.6581354712714564, + "grad_norm": 0.1981363743543625, + "learning_rate": 3.951026185098483e-05, + "loss": 0.5403, + "step": 8066 + }, + { + "epoch": 1.658341042244835, + "grad_norm": 0.19924452900886536, + "learning_rate": 3.950034848210107e-05, + "loss": 0.5773, + "step": 8067 + }, + { + "epoch": 1.6585466132182136, + "grad_norm": 0.16161105036735535, + "learning_rate": 3.949043538415942e-05, + "loss": 0.5061, + "step": 8068 + }, + { + "epoch": 1.658752184191592, + "grad_norm": 0.16612055897712708, + "learning_rate": 3.948052255764828e-05, + "loss": 0.5527, + "step": 8069 + }, + { + "epoch": 1.6589577551649706, + "grad_norm": 0.20378176867961884, + "learning_rate": 3.947061000305599e-05, + "loss": 0.5445, + "step": 8070 + }, + { + "epoch": 1.6591633261383492, + "grad_norm": 0.21650046110153198, + "learning_rate": 3.946069772087089e-05, + "loss": 0.5334, + "step": 8071 + }, + { + "epoch": 1.6593688971117277, + "grad_norm": 0.1963663250207901, + "learning_rate": 3.9450785711581324e-05, + "loss": 0.5466, + "step": 8072 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 0.19677862524986267, + "learning_rate": 3.944087397567561e-05, + "loss": 0.5542, + "step": 8073 + }, + { + "epoch": 1.659780039058485, + "grad_norm": 0.19894835352897644, + "learning_rate": 3.943096251364205e-05, + "loss": 0.5259, + "step": 8074 + }, + { + "epoch": 1.6599856100318635, + "grad_norm": 0.20943677425384521, + "learning_rate": 3.942105132596895e-05, + "loss": 0.5323, + "step": 8075 + }, + { + "epoch": 1.660191181005242, + "grad_norm": 0.20376256108283997, + "learning_rate": 3.941114041314458e-05, + "loss": 0.5369, + "step": 8076 + }, + { + "epoch": 1.6603967519786207, + "grad_norm": 0.1930057853460312, + "learning_rate": 3.9401229775657185e-05, + "loss": 0.5351, + "step": 8077 + }, + { + "epoch": 1.6606023229519993, + "grad_norm": 0.19255690276622772, + "learning_rate": 3.939131941399504e-05, + "loss": 0.534, + "step": 8078 + }, + { + "epoch": 1.6608078939253779, + "grad_norm": 0.18883344531059265, + "learning_rate": 3.938140932864635e-05, + "loss": 0.5353, + "step": 8079 + }, + { + "epoch": 1.6610134648987565, + "grad_norm": 0.20242716372013092, + "learning_rate": 3.937149952009938e-05, + "loss": 0.5459, + "step": 8080 + }, + { + "epoch": 1.6612190358721348, + "grad_norm": 0.19481943547725677, + "learning_rate": 3.9361589988842325e-05, + "loss": 0.5526, + "step": 8081 + }, + { + "epoch": 1.6614246068455134, + "grad_norm": 0.19463589787483215, + "learning_rate": 3.935168073536337e-05, + "loss": 0.5376, + "step": 8082 + }, + { + "epoch": 1.661630177818892, + "grad_norm": 0.1837586909532547, + "learning_rate": 3.93417717601507e-05, + "loss": 0.5, + "step": 8083 + }, + { + "epoch": 1.6618357487922706, + "grad_norm": 0.19010527431964874, + "learning_rate": 3.9331863063692494e-05, + "loss": 0.5356, + "step": 8084 + }, + { + "epoch": 1.662041319765649, + "grad_norm": 0.19221745431423187, + "learning_rate": 3.932195464647691e-05, + "loss": 0.5369, + "step": 8085 + }, + { + "epoch": 1.6622468907390275, + "grad_norm": 0.19402176141738892, + "learning_rate": 3.9312046508992064e-05, + "loss": 0.5403, + "step": 8086 + }, + { + "epoch": 1.6624524617124061, + "grad_norm": 0.19770248234272003, + "learning_rate": 3.930213865172611e-05, + "loss": 0.5719, + "step": 8087 + }, + { + "epoch": 1.6626580326857847, + "grad_norm": 0.16622693836688995, + "learning_rate": 3.929223107516716e-05, + "loss": 0.5024, + "step": 8088 + }, + { + "epoch": 1.6628636036591633, + "grad_norm": 0.15783652663230896, + "learning_rate": 3.92823237798033e-05, + "loss": 0.5163, + "step": 8089 + }, + { + "epoch": 1.663069174632542, + "grad_norm": 0.19830361008644104, + "learning_rate": 3.927241676612263e-05, + "loss": 0.5191, + "step": 8090 + }, + { + "epoch": 1.6632747456059205, + "grad_norm": 0.20356783270835876, + "learning_rate": 3.9262510034613215e-05, + "loss": 0.5717, + "step": 8091 + }, + { + "epoch": 1.663480316579299, + "grad_norm": 0.17174309492111206, + "learning_rate": 3.92526035857631e-05, + "loss": 0.5261, + "step": 8092 + }, + { + "epoch": 1.6636858875526777, + "grad_norm": 0.1699124574661255, + "learning_rate": 3.924269742006035e-05, + "loss": 0.5411, + "step": 8093 + }, + { + "epoch": 1.6638914585260562, + "grad_norm": 0.19385066628456116, + "learning_rate": 3.923279153799299e-05, + "loss": 0.5143, + "step": 8094 + }, + { + "epoch": 1.6640970294994348, + "grad_norm": 0.1945018619298935, + "learning_rate": 3.922288594004903e-05, + "loss": 0.5342, + "step": 8095 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 0.19037404656410217, + "learning_rate": 3.921298062671649e-05, + "loss": 0.5635, + "step": 8096 + }, + { + "epoch": 1.6645081714461918, + "grad_norm": 0.1975833922624588, + "learning_rate": 3.9203075598483335e-05, + "loss": 0.5451, + "step": 8097 + }, + { + "epoch": 1.6647137424195704, + "grad_norm": 0.1882157325744629, + "learning_rate": 3.9193170855837564e-05, + "loss": 0.5283, + "step": 8098 + }, + { + "epoch": 1.664919313392949, + "grad_norm": 0.19174973666667938, + "learning_rate": 3.9183266399267094e-05, + "loss": 0.5513, + "step": 8099 + }, + { + "epoch": 1.6651248843663273, + "grad_norm": 0.19739782810211182, + "learning_rate": 3.9173362229259926e-05, + "loss": 0.5301, + "step": 8100 + }, + { + "epoch": 1.665330455339706, + "grad_norm": 0.16633886098861694, + "learning_rate": 3.916345834630396e-05, + "loss": 0.541, + "step": 8101 + }, + { + "epoch": 1.6655360263130845, + "grad_norm": 0.16310401260852814, + "learning_rate": 3.915355475088714e-05, + "loss": 0.5662, + "step": 8102 + }, + { + "epoch": 1.665741597286463, + "grad_norm": 0.18664813041687012, + "learning_rate": 3.914365144349733e-05, + "loss": 0.5332, + "step": 8103 + }, + { + "epoch": 1.6659471682598417, + "grad_norm": 0.19100825488567352, + "learning_rate": 3.913374842462244e-05, + "loss": 0.5315, + "step": 8104 + }, + { + "epoch": 1.6661527392332203, + "grad_norm": 0.20404808223247528, + "learning_rate": 3.912384569475036e-05, + "loss": 0.5542, + "step": 8105 + }, + { + "epoch": 1.6663583102065989, + "grad_norm": 0.1687227189540863, + "learning_rate": 3.9113943254368916e-05, + "loss": 0.5423, + "step": 8106 + }, + { + "epoch": 1.6665638811799774, + "grad_norm": 0.1573527455329895, + "learning_rate": 3.9104041103965985e-05, + "loss": 0.5323, + "step": 8107 + }, + { + "epoch": 1.666769452153356, + "grad_norm": 0.16023261845111847, + "learning_rate": 3.90941392440294e-05, + "loss": 0.5108, + "step": 8108 + }, + { + "epoch": 1.6669750231267346, + "grad_norm": 0.15852369368076324, + "learning_rate": 3.9084237675046975e-05, + "loss": 0.5464, + "step": 8109 + }, + { + "epoch": 1.6671805941001132, + "grad_norm": 0.19316738843917847, + "learning_rate": 3.90743363975065e-05, + "loss": 0.5283, + "step": 8110 + }, + { + "epoch": 1.6673861650734916, + "grad_norm": 0.1973247081041336, + "learning_rate": 3.906443541189578e-05, + "loss": 0.5398, + "step": 8111 + }, + { + "epoch": 1.6675917360468702, + "grad_norm": 0.1693935990333557, + "learning_rate": 3.905453471870259e-05, + "loss": 0.509, + "step": 8112 + }, + { + "epoch": 1.6677973070202488, + "grad_norm": 0.1599174290895462, + "learning_rate": 3.9044634318414656e-05, + "loss": 0.5468, + "step": 8113 + }, + { + "epoch": 1.6680028779936273, + "grad_norm": 0.18429811298847198, + "learning_rate": 3.903473421151978e-05, + "loss": 0.5272, + "step": 8114 + }, + { + "epoch": 1.6682084489670057, + "grad_norm": 0.19133618474006653, + "learning_rate": 3.902483439850566e-05, + "loss": 0.538, + "step": 8115 + }, + { + "epoch": 1.6684140199403843, + "grad_norm": 0.19194607436656952, + "learning_rate": 3.901493487986002e-05, + "loss": 0.5341, + "step": 8116 + }, + { + "epoch": 1.6686195909137629, + "grad_norm": 0.16348059475421906, + "learning_rate": 3.900503565607057e-05, + "loss": 0.5021, + "step": 8117 + }, + { + "epoch": 1.6688251618871415, + "grad_norm": 0.16237923502922058, + "learning_rate": 3.899513672762499e-05, + "loss": 0.5647, + "step": 8118 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 0.19955293834209442, + "learning_rate": 3.8985238095010965e-05, + "loss": 0.5687, + "step": 8119 + }, + { + "epoch": 1.6692363038338986, + "grad_norm": 0.16089332103729248, + "learning_rate": 3.897533975871612e-05, + "loss": 0.51, + "step": 8120 + }, + { + "epoch": 1.6694418748072772, + "grad_norm": 0.161229208111763, + "learning_rate": 3.896544171922815e-05, + "loss": 0.5291, + "step": 8121 + }, + { + "epoch": 1.6696474457806558, + "grad_norm": 0.19278062880039215, + "learning_rate": 3.895554397703466e-05, + "loss": 0.5307, + "step": 8122 + }, + { + "epoch": 1.6698530167540344, + "grad_norm": 0.16215354204177856, + "learning_rate": 3.8945646532623256e-05, + "loss": 0.5, + "step": 8123 + }, + { + "epoch": 1.670058587727413, + "grad_norm": 0.16377978026866913, + "learning_rate": 3.893574938648156e-05, + "loss": 0.5362, + "step": 8124 + }, + { + "epoch": 1.6702641587007916, + "grad_norm": 0.19552935659885406, + "learning_rate": 3.892585253909714e-05, + "loss": 0.5255, + "step": 8125 + }, + { + "epoch": 1.67046972967417, + "grad_norm": 0.164475217461586, + "learning_rate": 3.8915955990957575e-05, + "loss": 0.4989, + "step": 8126 + }, + { + "epoch": 1.6706753006475485, + "grad_norm": 0.15810781717300415, + "learning_rate": 3.890605974255042e-05, + "loss": 0.5276, + "step": 8127 + }, + { + "epoch": 1.6708808716209271, + "grad_norm": 0.1982525885105133, + "learning_rate": 3.889616379436321e-05, + "loss": 0.5349, + "step": 8128 + }, + { + "epoch": 1.6710864425943057, + "grad_norm": 0.15992006659507751, + "learning_rate": 3.88862681468835e-05, + "loss": 0.5116, + "step": 8129 + }, + { + "epoch": 1.671292013567684, + "grad_norm": 0.15967024862766266, + "learning_rate": 3.887637280059878e-05, + "loss": 0.5657, + "step": 8130 + }, + { + "epoch": 1.6714975845410627, + "grad_norm": 0.1937428116798401, + "learning_rate": 3.886647775599655e-05, + "loss": 0.5581, + "step": 8131 + }, + { + "epoch": 1.6717031555144413, + "grad_norm": 0.18604367971420288, + "learning_rate": 3.885658301356429e-05, + "loss": 0.5246, + "step": 8132 + }, + { + "epoch": 1.6719087264878199, + "grad_norm": 0.18401312828063965, + "learning_rate": 3.884668857378947e-05, + "loss": 0.5059, + "step": 8133 + }, + { + "epoch": 1.6721142974611984, + "grad_norm": 0.20079663395881653, + "learning_rate": 3.883679443715953e-05, + "loss": 0.539, + "step": 8134 + }, + { + "epoch": 1.672319868434577, + "grad_norm": 0.18646441400051117, + "learning_rate": 3.882690060416194e-05, + "loss": 0.548, + "step": 8135 + }, + { + "epoch": 1.6725254394079556, + "grad_norm": 0.19753128290176392, + "learning_rate": 3.88170070752841e-05, + "loss": 0.5599, + "step": 8136 + }, + { + "epoch": 1.6727310103813342, + "grad_norm": 0.19681565463542938, + "learning_rate": 3.8807113851013425e-05, + "loss": 0.543, + "step": 8137 + }, + { + "epoch": 1.6729365813547128, + "grad_norm": 0.16268804669380188, + "learning_rate": 3.879722093183729e-05, + "loss": 0.5131, + "step": 8138 + }, + { + "epoch": 1.6731421523280914, + "grad_norm": 0.16209396719932556, + "learning_rate": 3.87873283182431e-05, + "loss": 0.5467, + "step": 8139 + }, + { + "epoch": 1.67334772330147, + "grad_norm": 0.19812874495983124, + "learning_rate": 3.877743601071821e-05, + "loss": 0.5457, + "step": 8140 + }, + { + "epoch": 1.6735532942748486, + "grad_norm": 0.15637758374214172, + "learning_rate": 3.8767544009749944e-05, + "loss": 0.5099, + "step": 8141 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 0.15744930505752563, + "learning_rate": 3.875765231582568e-05, + "loss": 0.5549, + "step": 8142 + }, + { + "epoch": 1.6739644362216055, + "grad_norm": 0.19686995446681976, + "learning_rate": 3.874776092943269e-05, + "loss": 0.5183, + "step": 8143 + }, + { + "epoch": 1.674170007194984, + "grad_norm": 0.1597413569688797, + "learning_rate": 3.8737869851058315e-05, + "loss": 0.5043, + "step": 8144 + }, + { + "epoch": 1.6743755781683625, + "grad_norm": 0.1251799464225769, + "learning_rate": 3.872797908118982e-05, + "loss": 0.5108, + "step": 8145 + }, + { + "epoch": 1.674581149141741, + "grad_norm": 0.16012680530548096, + "learning_rate": 3.8718088620314474e-05, + "loss": 0.5168, + "step": 8146 + }, + { + "epoch": 1.6747867201151196, + "grad_norm": 0.19369451701641083, + "learning_rate": 3.870819846891953e-05, + "loss": 0.5266, + "step": 8147 + }, + { + "epoch": 1.6749922910884982, + "grad_norm": 0.19420257210731506, + "learning_rate": 3.869830862749224e-05, + "loss": 0.5269, + "step": 8148 + }, + { + "epoch": 1.6751978620618768, + "grad_norm": 0.16983704268932343, + "learning_rate": 3.8688419096519844e-05, + "loss": 0.5027, + "step": 8149 + }, + { + "epoch": 1.6754034330352554, + "grad_norm": 0.16026097536087036, + "learning_rate": 3.8678529876489526e-05, + "loss": 0.5368, + "step": 8150 + }, + { + "epoch": 1.675609004008634, + "grad_norm": 0.1565508395433426, + "learning_rate": 3.86686409678885e-05, + "loss": 0.5045, + "step": 8151 + }, + { + "epoch": 1.6758145749820126, + "grad_norm": 0.15121006965637207, + "learning_rate": 3.865875237120395e-05, + "loss": 0.5409, + "step": 8152 + }, + { + "epoch": 1.6760201459553912, + "grad_norm": 0.19622927904129028, + "learning_rate": 3.864886408692303e-05, + "loss": 0.5297, + "step": 8153 + }, + { + "epoch": 1.6762257169287698, + "grad_norm": 0.20082417130470276, + "learning_rate": 3.863897611553289e-05, + "loss": 0.5516, + "step": 8154 + }, + { + "epoch": 1.6764312879021483, + "grad_norm": 0.19279861450195312, + "learning_rate": 3.8629088457520645e-05, + "loss": 0.5286, + "step": 8155 + }, + { + "epoch": 1.676636858875527, + "grad_norm": 0.18971529603004456, + "learning_rate": 3.861920111337345e-05, + "loss": 0.5381, + "step": 8156 + }, + { + "epoch": 1.6768424298489053, + "grad_norm": 0.18667519092559814, + "learning_rate": 3.8609314083578396e-05, + "loss": 0.529, + "step": 8157 + }, + { + "epoch": 1.677048000822284, + "grad_norm": 0.18965506553649902, + "learning_rate": 3.859942736862257e-05, + "loss": 0.5504, + "step": 8158 + }, + { + "epoch": 1.6772535717956625, + "grad_norm": 0.1879250854253769, + "learning_rate": 3.858954096899303e-05, + "loss": 0.521, + "step": 8159 + }, + { + "epoch": 1.6774591427690408, + "grad_norm": 0.16116970777511597, + "learning_rate": 3.8579654885176854e-05, + "loss": 0.5171, + "step": 8160 + }, + { + "epoch": 1.6776647137424194, + "grad_norm": 0.16163001954555511, + "learning_rate": 3.856976911766107e-05, + "loss": 0.5526, + "step": 8161 + }, + { + "epoch": 1.677870284715798, + "grad_norm": 0.19858844578266144, + "learning_rate": 3.855988366693269e-05, + "loss": 0.5105, + "step": 8162 + }, + { + "epoch": 1.6780758556891766, + "grad_norm": 0.19145843386650085, + "learning_rate": 3.854999853347876e-05, + "loss": 0.5701, + "step": 8163 + }, + { + "epoch": 1.6782814266625552, + "grad_norm": 0.19304659962654114, + "learning_rate": 3.854011371778625e-05, + "loss": 0.5276, + "step": 8164 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 0.19083738327026367, + "learning_rate": 3.853022922034215e-05, + "loss": 0.5204, + "step": 8165 + }, + { + "epoch": 1.6786925686093124, + "grad_norm": 0.18819309771060944, + "learning_rate": 3.852034504163341e-05, + "loss": 0.5283, + "step": 8166 + }, + { + "epoch": 1.678898139582691, + "grad_norm": 0.19191038608551025, + "learning_rate": 3.851046118214699e-05, + "loss": 0.5261, + "step": 8167 + }, + { + "epoch": 1.6791037105560696, + "grad_norm": 0.19225665926933289, + "learning_rate": 3.850057764236981e-05, + "loss": 0.5282, + "step": 8168 + }, + { + "epoch": 1.6793092815294481, + "grad_norm": 0.19503363966941833, + "learning_rate": 3.849069442278878e-05, + "loss": 0.5355, + "step": 8169 + }, + { + "epoch": 1.6795148525028267, + "grad_norm": 0.19625093042850494, + "learning_rate": 3.848081152389083e-05, + "loss": 0.5557, + "step": 8170 + }, + { + "epoch": 1.6797204234762053, + "grad_norm": 0.19365637004375458, + "learning_rate": 3.8470928946162813e-05, + "loss": 0.5369, + "step": 8171 + }, + { + "epoch": 1.6799259944495837, + "grad_norm": 0.19885706901550293, + "learning_rate": 3.8461046690091616e-05, + "loss": 0.5276, + "step": 8172 + }, + { + "epoch": 1.6801315654229623, + "grad_norm": 0.19316908717155457, + "learning_rate": 3.845116475616409e-05, + "loss": 0.5332, + "step": 8173 + }, + { + "epoch": 1.6803371363963409, + "grad_norm": 0.1912158727645874, + "learning_rate": 3.844128314486706e-05, + "loss": 0.542, + "step": 8174 + }, + { + "epoch": 1.6805427073697194, + "grad_norm": 0.1975051760673523, + "learning_rate": 3.843140185668737e-05, + "loss": 0.5467, + "step": 8175 + }, + { + "epoch": 1.6807482783430978, + "grad_norm": 0.16475236415863037, + "learning_rate": 3.8421520892111776e-05, + "loss": 0.5106, + "step": 8176 + }, + { + "epoch": 1.6809538493164764, + "grad_norm": 0.16820210218429565, + "learning_rate": 3.841164025162713e-05, + "loss": 0.5522, + "step": 8177 + }, + { + "epoch": 1.681159420289855, + "grad_norm": 0.19619794189929962, + "learning_rate": 3.840175993572016e-05, + "loss": 0.5367, + "step": 8178 + }, + { + "epoch": 1.6813649912632336, + "grad_norm": 0.19805863499641418, + "learning_rate": 3.839187994487765e-05, + "loss": 0.5383, + "step": 8179 + }, + { + "epoch": 1.6815705622366122, + "grad_norm": 0.18975287675857544, + "learning_rate": 3.838200027958632e-05, + "loss": 0.5476, + "step": 8180 + }, + { + "epoch": 1.6817761332099908, + "grad_norm": 0.18960921466350555, + "learning_rate": 3.837212094033291e-05, + "loss": 0.5452, + "step": 8181 + }, + { + "epoch": 1.6819817041833693, + "grad_norm": 0.1594635397195816, + "learning_rate": 3.8362241927604106e-05, + "loss": 0.5045, + "step": 8182 + }, + { + "epoch": 1.682187275156748, + "grad_norm": 0.1598910242319107, + "learning_rate": 3.835236324188662e-05, + "loss": 0.5456, + "step": 8183 + }, + { + "epoch": 1.6823928461301265, + "grad_norm": 0.19848540425300598, + "learning_rate": 3.834248488366714e-05, + "loss": 0.5193, + "step": 8184 + }, + { + "epoch": 1.682598417103505, + "grad_norm": 0.2017425149679184, + "learning_rate": 3.833260685343231e-05, + "loss": 0.5427, + "step": 8185 + }, + { + "epoch": 1.6828039880768837, + "grad_norm": 0.19509434700012207, + "learning_rate": 3.832272915166878e-05, + "loss": 0.5208, + "step": 8186 + }, + { + "epoch": 1.683009559050262, + "grad_norm": 0.19122706353664398, + "learning_rate": 3.8312851778863176e-05, + "loss": 0.5213, + "step": 8187 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 0.18763068318367004, + "learning_rate": 3.8302974735502104e-05, + "loss": 0.5363, + "step": 8188 + }, + { + "epoch": 1.6834207009970192, + "grad_norm": 0.2000308781862259, + "learning_rate": 3.829309802207215e-05, + "loss": 0.5397, + "step": 8189 + }, + { + "epoch": 1.6836262719703978, + "grad_norm": 0.19013464450836182, + "learning_rate": 3.828322163905993e-05, + "loss": 0.5073, + "step": 8190 + }, + { + "epoch": 1.6838318429437762, + "grad_norm": 0.19034752249717712, + "learning_rate": 3.827334558695198e-05, + "loss": 0.5318, + "step": 8191 + }, + { + "epoch": 1.6840374139171548, + "grad_norm": 0.16001807153224945, + "learning_rate": 3.8263469866234844e-05, + "loss": 0.4987, + "step": 8192 + }, + { + "epoch": 1.6842429848905334, + "grad_norm": 0.15920346975326538, + "learning_rate": 3.825359447739507e-05, + "loss": 0.5404, + "step": 8193 + }, + { + "epoch": 1.684448555863912, + "grad_norm": 0.19532343745231628, + "learning_rate": 3.8243719420919165e-05, + "loss": 0.5134, + "step": 8194 + }, + { + "epoch": 1.6846541268372905, + "grad_norm": 0.19484242796897888, + "learning_rate": 3.823384469729363e-05, + "loss": 0.5334, + "step": 8195 + }, + { + "epoch": 1.6848596978106691, + "grad_norm": 0.20333658158779144, + "learning_rate": 3.822397030700491e-05, + "loss": 0.5491, + "step": 8196 + }, + { + "epoch": 1.6850652687840477, + "grad_norm": 0.20554953813552856, + "learning_rate": 3.821409625053953e-05, + "loss": 0.5479, + "step": 8197 + }, + { + "epoch": 1.6852708397574263, + "grad_norm": 0.19656214118003845, + "learning_rate": 3.820422252838391e-05, + "loss": 0.5334, + "step": 8198 + }, + { + "epoch": 1.685476410730805, + "grad_norm": 0.19906407594680786, + "learning_rate": 3.819434914102448e-05, + "loss": 0.5302, + "step": 8199 + }, + { + "epoch": 1.6856819817041835, + "grad_norm": 0.16761255264282227, + "learning_rate": 3.818447608894767e-05, + "loss": 0.5145, + "step": 8200 + }, + { + "epoch": 1.685887552677562, + "grad_norm": 0.16284339129924774, + "learning_rate": 3.8174603372639846e-05, + "loss": 0.5399, + "step": 8201 + }, + { + "epoch": 1.6860931236509404, + "grad_norm": 0.19720837473869324, + "learning_rate": 3.816473099258742e-05, + "loss": 0.5452, + "step": 8202 + }, + { + "epoch": 1.686298694624319, + "grad_norm": 0.19352254271507263, + "learning_rate": 3.8154858949276744e-05, + "loss": 0.5399, + "step": 8203 + }, + { + "epoch": 1.6865042655976976, + "grad_norm": 0.16425921022891998, + "learning_rate": 3.814498724319418e-05, + "loss": 0.5016, + "step": 8204 + }, + { + "epoch": 1.6867098365710762, + "grad_norm": 0.15797263383865356, + "learning_rate": 3.813511587482606e-05, + "loss": 0.5325, + "step": 8205 + }, + { + "epoch": 1.6869154075444546, + "grad_norm": 0.16672199964523315, + "learning_rate": 3.812524484465869e-05, + "loss": 0.4982, + "step": 8206 + }, + { + "epoch": 1.6871209785178332, + "grad_norm": 0.2091359794139862, + "learning_rate": 3.811537415317837e-05, + "loss": 0.5206, + "step": 8207 + }, + { + "epoch": 1.6873265494912117, + "grad_norm": 0.19015903770923615, + "learning_rate": 3.81055038008714e-05, + "loss": 0.5234, + "step": 8208 + }, + { + "epoch": 1.6875321204645903, + "grad_norm": 0.20703433454036713, + "learning_rate": 3.8095633788224024e-05, + "loss": 0.5743, + "step": 8209 + }, + { + "epoch": 1.687737691437969, + "grad_norm": 0.19770927727222443, + "learning_rate": 3.8085764115722484e-05, + "loss": 0.5482, + "step": 8210 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 0.15969951450824738, + "learning_rate": 3.8075894783853054e-05, + "loss": 0.4893, + "step": 8211 + }, + { + "epoch": 1.688148833384726, + "grad_norm": 0.15302079916000366, + "learning_rate": 3.806602579310191e-05, + "loss": 0.5153, + "step": 8212 + }, + { + "epoch": 1.6883544043581047, + "grad_norm": 0.19498853385448456, + "learning_rate": 3.805615714395527e-05, + "loss": 0.546, + "step": 8213 + }, + { + "epoch": 1.6885599753314833, + "grad_norm": 0.1922113597393036, + "learning_rate": 3.804628883689931e-05, + "loss": 0.5351, + "step": 8214 + }, + { + "epoch": 1.6887655463048619, + "grad_norm": 0.19428758323192596, + "learning_rate": 3.803642087242021e-05, + "loss": 0.5452, + "step": 8215 + }, + { + "epoch": 1.6889711172782405, + "grad_norm": 0.19760240614414215, + "learning_rate": 3.8026553251004096e-05, + "loss": 0.5356, + "step": 8216 + }, + { + "epoch": 1.6891766882516188, + "grad_norm": 0.16687412559986115, + "learning_rate": 3.8016685973137095e-05, + "loss": 0.5093, + "step": 8217 + }, + { + "epoch": 1.6893822592249974, + "grad_norm": 0.1642359495162964, + "learning_rate": 3.800681903930535e-05, + "loss": 0.5485, + "step": 8218 + }, + { + "epoch": 1.689587830198376, + "grad_norm": 0.1901901662349701, + "learning_rate": 3.799695244999495e-05, + "loss": 0.5102, + "step": 8219 + }, + { + "epoch": 1.6897934011717546, + "grad_norm": 0.19654683768749237, + "learning_rate": 3.798708620569197e-05, + "loss": 0.5338, + "step": 8220 + }, + { + "epoch": 1.689998972145133, + "grad_norm": 0.1945556253194809, + "learning_rate": 3.797722030688248e-05, + "loss": 0.5369, + "step": 8221 + }, + { + "epoch": 1.6902045431185115, + "grad_norm": 0.19918568432331085, + "learning_rate": 3.7967354754052514e-05, + "loss": 0.5409, + "step": 8222 + }, + { + "epoch": 1.6904101140918901, + "grad_norm": 0.19335860013961792, + "learning_rate": 3.7957489547688096e-05, + "loss": 0.5451, + "step": 8223 + }, + { + "epoch": 1.6906156850652687, + "grad_norm": 0.19655676186084747, + "learning_rate": 3.794762468827526e-05, + "loss": 0.5484, + "step": 8224 + }, + { + "epoch": 1.6908212560386473, + "grad_norm": 0.20534905791282654, + "learning_rate": 3.79377601763e-05, + "loss": 0.5321, + "step": 8225 + }, + { + "epoch": 1.691026827012026, + "grad_norm": 0.16838058829307556, + "learning_rate": 3.7927896012248275e-05, + "loss": 0.4978, + "step": 8226 + }, + { + "epoch": 1.6912323979854045, + "grad_norm": 0.16190923750400543, + "learning_rate": 3.7918032196606064e-05, + "loss": 0.5627, + "step": 8227 + }, + { + "epoch": 1.691437968958783, + "grad_norm": 0.197221040725708, + "learning_rate": 3.790816872985931e-05, + "loss": 0.5287, + "step": 8228 + }, + { + "epoch": 1.6916435399321617, + "grad_norm": 0.19407358765602112, + "learning_rate": 3.789830561249394e-05, + "loss": 0.5409, + "step": 8229 + }, + { + "epoch": 1.6918491109055402, + "grad_norm": 0.19359079003334045, + "learning_rate": 3.7888442844995856e-05, + "loss": 0.5378, + "step": 8230 + }, + { + "epoch": 1.6920546818789188, + "grad_norm": 0.21710537374019623, + "learning_rate": 3.7878580427850937e-05, + "loss": 0.5545, + "step": 8231 + }, + { + "epoch": 1.6922602528522974, + "grad_norm": 0.19026683270931244, + "learning_rate": 3.786871836154509e-05, + "loss": 0.549, + "step": 8232 + }, + { + "epoch": 1.6924658238256758, + "grad_norm": 0.19044183194637299, + "learning_rate": 3.785885664656415e-05, + "loss": 0.5286, + "step": 8233 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 0.20085959136486053, + "learning_rate": 3.7848995283393984e-05, + "loss": 0.5414, + "step": 8234 + }, + { + "epoch": 1.692876965772433, + "grad_norm": 0.16187427937984467, + "learning_rate": 3.783913427252038e-05, + "loss": 0.5116, + "step": 8235 + }, + { + "epoch": 1.6930825367458113, + "grad_norm": 0.16329748928546906, + "learning_rate": 3.782927361442916e-05, + "loss": 0.5387, + "step": 8236 + }, + { + "epoch": 1.69328810771919, + "grad_norm": 0.1952928751707077, + "learning_rate": 3.781941330960612e-05, + "loss": 0.569, + "step": 8237 + }, + { + "epoch": 1.6934936786925685, + "grad_norm": 0.19381776452064514, + "learning_rate": 3.780955335853701e-05, + "loss": 0.5336, + "step": 8238 + }, + { + "epoch": 1.693699249665947, + "grad_norm": 0.2035483717918396, + "learning_rate": 3.779969376170761e-05, + "loss": 0.5385, + "step": 8239 + }, + { + "epoch": 1.6939048206393257, + "grad_norm": 0.19148887693881989, + "learning_rate": 3.778983451960365e-05, + "loss": 0.5156, + "step": 8240 + }, + { + "epoch": 1.6941103916127043, + "grad_norm": 0.17306075990200043, + "learning_rate": 3.7779975632710836e-05, + "loss": 0.5245, + "step": 8241 + }, + { + "epoch": 1.6943159625860829, + "grad_norm": 0.12030526250600815, + "learning_rate": 3.7770117101514885e-05, + "loss": 0.5117, + "step": 8242 + }, + { + "epoch": 1.6945215335594614, + "grad_norm": 0.15814997255802155, + "learning_rate": 3.776025892650147e-05, + "loss": 0.55, + "step": 8243 + }, + { + "epoch": 1.69472710453284, + "grad_norm": 0.18655110895633698, + "learning_rate": 3.775040110815624e-05, + "loss": 0.4924, + "step": 8244 + }, + { + "epoch": 1.6949326755062186, + "grad_norm": 0.19482672214508057, + "learning_rate": 3.7740543646964876e-05, + "loss": 0.5431, + "step": 8245 + }, + { + "epoch": 1.6951382464795972, + "grad_norm": 0.1627287119626999, + "learning_rate": 3.7730686543412994e-05, + "loss": 0.5191, + "step": 8246 + }, + { + "epoch": 1.6953438174529758, + "grad_norm": 0.17188504338264465, + "learning_rate": 3.772082979798621e-05, + "loss": 0.5403, + "step": 8247 + }, + { + "epoch": 1.6955493884263542, + "grad_norm": 0.21016332507133484, + "learning_rate": 3.7710973411170126e-05, + "loss": 0.5456, + "step": 8248 + }, + { + "epoch": 1.6957549593997328, + "grad_norm": 0.1925675868988037, + "learning_rate": 3.770111738345031e-05, + "loss": 0.5214, + "step": 8249 + }, + { + "epoch": 1.6959605303731113, + "grad_norm": 0.19163696467876434, + "learning_rate": 3.769126171531232e-05, + "loss": 0.5354, + "step": 8250 + }, + { + "epoch": 1.69616610134649, + "grad_norm": 0.16222819685935974, + "learning_rate": 3.7681406407241716e-05, + "loss": 0.5241, + "step": 8251 + }, + { + "epoch": 1.6963716723198683, + "grad_norm": 0.16099952161312103, + "learning_rate": 3.767155145972399e-05, + "loss": 0.5321, + "step": 8252 + }, + { + "epoch": 1.6965772432932469, + "grad_norm": 0.1959654986858368, + "learning_rate": 3.766169687324468e-05, + "loss": 0.54, + "step": 8253 + }, + { + "epoch": 1.6967828142666255, + "grad_norm": 0.19316841661930084, + "learning_rate": 3.7651842648289276e-05, + "loss": 0.5356, + "step": 8254 + }, + { + "epoch": 1.696988385240004, + "grad_norm": 0.202810600399971, + "learning_rate": 3.7641988785343236e-05, + "loss": 0.5506, + "step": 8255 + }, + { + "epoch": 1.6971939562133826, + "grad_norm": 0.1944981962442398, + "learning_rate": 3.763213528489201e-05, + "loss": 0.5019, + "step": 8256 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 0.16003461182117462, + "learning_rate": 3.762228214742105e-05, + "loss": 0.504, + "step": 8257 + }, + { + "epoch": 1.6976050981601398, + "grad_norm": 0.15627720952033997, + "learning_rate": 3.7612429373415754e-05, + "loss": 0.5165, + "step": 8258 + }, + { + "epoch": 1.6978106691335184, + "grad_norm": 0.19209109246730804, + "learning_rate": 3.760257696336154e-05, + "loss": 0.5114, + "step": 8259 + }, + { + "epoch": 1.698016240106897, + "grad_norm": 0.16496042907238007, + "learning_rate": 3.759272491774378e-05, + "loss": 0.5113, + "step": 8260 + }, + { + "epoch": 1.6982218110802756, + "grad_norm": 0.16691668331623077, + "learning_rate": 3.758287323704785e-05, + "loss": 0.5469, + "step": 8261 + }, + { + "epoch": 1.6984273820536542, + "grad_norm": 0.20020011067390442, + "learning_rate": 3.757302192175909e-05, + "loss": 0.5397, + "step": 8262 + }, + { + "epoch": 1.6986329530270325, + "grad_norm": 0.19349105656147003, + "learning_rate": 3.756317097236282e-05, + "loss": 0.5422, + "step": 8263 + }, + { + "epoch": 1.6988385240004111, + "grad_norm": 0.16651464998722076, + "learning_rate": 3.755332038934436e-05, + "loss": 0.4836, + "step": 8264 + }, + { + "epoch": 1.6990440949737897, + "grad_norm": 0.12502692639827728, + "learning_rate": 3.754347017318897e-05, + "loss": 0.5132, + "step": 8265 + }, + { + "epoch": 1.6992496659471683, + "grad_norm": 0.12334790080785751, + "learning_rate": 3.7533620324381984e-05, + "loss": 0.5108, + "step": 8266 + }, + { + "epoch": 1.6994552369205467, + "grad_norm": 0.12631775438785553, + "learning_rate": 3.752377084340863e-05, + "loss": 0.5167, + "step": 8267 + }, + { + "epoch": 1.6996608078939253, + "grad_norm": 0.16306687891483307, + "learning_rate": 3.7513921730754125e-05, + "loss": 0.5369, + "step": 8268 + }, + { + "epoch": 1.6998663788673039, + "grad_norm": 0.19654233753681183, + "learning_rate": 3.750407298690372e-05, + "loss": 0.52, + "step": 8269 + }, + { + "epoch": 1.7000719498406824, + "grad_norm": 0.1925351619720459, + "learning_rate": 3.74942246123426e-05, + "loss": 0.5356, + "step": 8270 + }, + { + "epoch": 1.700277520814061, + "grad_norm": 0.165648952126503, + "learning_rate": 3.7484376607555954e-05, + "loss": 0.5244, + "step": 8271 + }, + { + "epoch": 1.7004830917874396, + "grad_norm": 0.1643042266368866, + "learning_rate": 3.747452897302892e-05, + "loss": 0.5356, + "step": 8272 + }, + { + "epoch": 1.7006886627608182, + "grad_norm": 0.1931808739900589, + "learning_rate": 3.7464681709246696e-05, + "loss": 0.5371, + "step": 8273 + }, + { + "epoch": 1.7008942337341968, + "grad_norm": 0.19738541543483734, + "learning_rate": 3.745483481669438e-05, + "loss": 0.5506, + "step": 8274 + }, + { + "epoch": 1.7010998047075754, + "grad_norm": 0.1938776969909668, + "learning_rate": 3.744498829585709e-05, + "loss": 0.5548, + "step": 8275 + }, + { + "epoch": 1.701305375680954, + "grad_norm": 0.192849263548851, + "learning_rate": 3.743514214721991e-05, + "loss": 0.5506, + "step": 8276 + }, + { + "epoch": 1.7015109466543326, + "grad_norm": 0.19882553815841675, + "learning_rate": 3.742529637126791e-05, + "loss": 0.5341, + "step": 8277 + }, + { + "epoch": 1.701716517627711, + "grad_norm": 0.1962941288948059, + "learning_rate": 3.741545096848617e-05, + "loss": 0.5582, + "step": 8278 + }, + { + "epoch": 1.7019220886010895, + "grad_norm": 0.1944989264011383, + "learning_rate": 3.7405605939359694e-05, + "loss": 0.5621, + "step": 8279 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.19153885543346405, + "learning_rate": 3.7395761284373516e-05, + "loss": 0.5256, + "step": 8280 + }, + { + "epoch": 1.7023332305478467, + "grad_norm": 0.1736639142036438, + "learning_rate": 3.738591700401265e-05, + "loss": 0.5124, + "step": 8281 + }, + { + "epoch": 1.702538801521225, + "grad_norm": 0.160393625497818, + "learning_rate": 3.7376073098762065e-05, + "loss": 0.5521, + "step": 8282 + }, + { + "epoch": 1.7027443724946036, + "grad_norm": 0.17553849518299103, + "learning_rate": 3.736622956910673e-05, + "loss": 0.5196, + "step": 8283 + }, + { + "epoch": 1.7029499434679822, + "grad_norm": 0.16473659873008728, + "learning_rate": 3.735638641553157e-05, + "loss": 0.5314, + "step": 8284 + }, + { + "epoch": 1.7031555144413608, + "grad_norm": 0.15864580869674683, + "learning_rate": 3.734654363852153e-05, + "loss": 0.4975, + "step": 8285 + }, + { + "epoch": 1.7033610854147394, + "grad_norm": 0.15445110201835632, + "learning_rate": 3.7336701238561504e-05, + "loss": 0.5165, + "step": 8286 + }, + { + "epoch": 1.703566656388118, + "grad_norm": 0.19712099432945251, + "learning_rate": 3.73268592161364e-05, + "loss": 0.5323, + "step": 8287 + }, + { + "epoch": 1.7037722273614966, + "grad_norm": 0.9865581393241882, + "learning_rate": 3.731701757173108e-05, + "loss": 0.5455, + "step": 8288 + }, + { + "epoch": 1.7039777983348752, + "grad_norm": 0.1986110508441925, + "learning_rate": 3.730717630583038e-05, + "loss": 0.5355, + "step": 8289 + }, + { + "epoch": 1.7041833693082538, + "grad_norm": 0.1951410174369812, + "learning_rate": 3.729733541891917e-05, + "loss": 0.5557, + "step": 8290 + }, + { + "epoch": 1.7043889402816323, + "grad_norm": 0.19467894732952118, + "learning_rate": 3.728749491148223e-05, + "loss": 0.523, + "step": 8291 + }, + { + "epoch": 1.704594511255011, + "grad_norm": 0.2036120444536209, + "learning_rate": 3.727765478400437e-05, + "loss": 0.5361, + "step": 8292 + }, + { + "epoch": 1.7048000822283893, + "grad_norm": 0.17244306206703186, + "learning_rate": 3.726781503697034e-05, + "loss": 0.5073, + "step": 8293 + }, + { + "epoch": 1.705005653201768, + "grad_norm": 0.18532809615135193, + "learning_rate": 3.7257975670864954e-05, + "loss": 0.5654, + "step": 8294 + }, + { + "epoch": 1.7052112241751465, + "grad_norm": 0.1989675760269165, + "learning_rate": 3.724813668617292e-05, + "loss": 0.5094, + "step": 8295 + }, + { + "epoch": 1.705416795148525, + "grad_norm": 0.15224121510982513, + "learning_rate": 3.723829808337895e-05, + "loss": 0.5202, + "step": 8296 + }, + { + "epoch": 1.7056223661219034, + "grad_norm": 0.1623305380344391, + "learning_rate": 3.722845986296776e-05, + "loss": 0.5323, + "step": 8297 + }, + { + "epoch": 1.705827937095282, + "grad_norm": 0.20320528745651245, + "learning_rate": 3.721862202542403e-05, + "loss": 0.5185, + "step": 8298 + }, + { + "epoch": 1.7060335080686606, + "grad_norm": 0.17598193883895874, + "learning_rate": 3.7208784571232404e-05, + "loss": 0.5198, + "step": 8299 + }, + { + "epoch": 1.7062390790420392, + "grad_norm": 0.1603892594575882, + "learning_rate": 3.7198947500877554e-05, + "loss": 0.5437, + "step": 8300 + }, + { + "epoch": 1.7064446500154178, + "grad_norm": 0.16339556872844696, + "learning_rate": 3.71891108148441e-05, + "loss": 0.507, + "step": 8301 + }, + { + "epoch": 1.7066502209887964, + "grad_norm": 0.12459365278482437, + "learning_rate": 3.717927451361665e-05, + "loss": 0.5091, + "step": 8302 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 0.15585310757160187, + "learning_rate": 3.7169438597679804e-05, + "loss": 0.5252, + "step": 8303 + }, + { + "epoch": 1.7070613629355536, + "grad_norm": 0.19386261701583862, + "learning_rate": 3.7159603067518105e-05, + "loss": 0.552, + "step": 8304 + }, + { + "epoch": 1.7072669339089321, + "grad_norm": 0.19203509390354156, + "learning_rate": 3.714976792361612e-05, + "loss": 0.5265, + "step": 8305 + }, + { + "epoch": 1.7074725048823107, + "grad_norm": 0.1666734516620636, + "learning_rate": 3.713993316645839e-05, + "loss": 0.5117, + "step": 8306 + }, + { + "epoch": 1.7076780758556893, + "grad_norm": 0.1642848700284958, + "learning_rate": 3.713009879652938e-05, + "loss": 0.5366, + "step": 8307 + }, + { + "epoch": 1.7078836468290677, + "grad_norm": 0.19008807837963104, + "learning_rate": 3.712026481431364e-05, + "loss": 0.5266, + "step": 8308 + }, + { + "epoch": 1.7080892178024463, + "grad_norm": 0.19728736579418182, + "learning_rate": 3.711043122029563e-05, + "loss": 0.5425, + "step": 8309 + }, + { + "epoch": 1.7082947887758249, + "grad_norm": 0.1897844821214676, + "learning_rate": 3.71005980149598e-05, + "loss": 0.5437, + "step": 8310 + }, + { + "epoch": 1.7085003597492034, + "grad_norm": 0.19176128506660461, + "learning_rate": 3.709076519879057e-05, + "loss": 0.5434, + "step": 8311 + }, + { + "epoch": 1.7087059307225818, + "grad_norm": 0.1628829389810562, + "learning_rate": 3.7080932772272376e-05, + "loss": 0.503, + "step": 8312 + }, + { + "epoch": 1.7089115016959604, + "grad_norm": 0.16427487134933472, + "learning_rate": 3.707110073588962e-05, + "loss": 0.5303, + "step": 8313 + }, + { + "epoch": 1.709117072669339, + "grad_norm": 0.19906549155712128, + "learning_rate": 3.706126909012664e-05, + "loss": 0.5186, + "step": 8314 + }, + { + "epoch": 1.7093226436427176, + "grad_norm": 0.19773396849632263, + "learning_rate": 3.7051437835467854e-05, + "loss": 0.544, + "step": 8315 + }, + { + "epoch": 1.7095282146160962, + "grad_norm": 0.19623300433158875, + "learning_rate": 3.7041606972397575e-05, + "loss": 0.5391, + "step": 8316 + }, + { + "epoch": 1.7097337855894748, + "grad_norm": 0.1944045126438141, + "learning_rate": 3.703177650140011e-05, + "loss": 0.5314, + "step": 8317 + }, + { + "epoch": 1.7099393565628533, + "grad_norm": 0.187159925699234, + "learning_rate": 3.702194642295979e-05, + "loss": 0.5218, + "step": 8318 + }, + { + "epoch": 1.710144927536232, + "grad_norm": 0.19343869388103485, + "learning_rate": 3.701211673756087e-05, + "loss": 0.5396, + "step": 8319 + }, + { + "epoch": 1.7103504985096105, + "grad_norm": 0.1905796229839325, + "learning_rate": 3.700228744568762e-05, + "loss": 0.5374, + "step": 8320 + }, + { + "epoch": 1.710556069482989, + "grad_norm": 0.19070343673229218, + "learning_rate": 3.6992458547824285e-05, + "loss": 0.5484, + "step": 8321 + }, + { + "epoch": 1.7107616404563677, + "grad_norm": 0.1975802779197693, + "learning_rate": 3.69826300444551e-05, + "loss": 0.5266, + "step": 8322 + }, + { + "epoch": 1.7109672114297463, + "grad_norm": 0.18827404081821442, + "learning_rate": 3.6972801936064244e-05, + "loss": 0.5176, + "step": 8323 + }, + { + "epoch": 1.7111727824031246, + "grad_norm": 0.21654710173606873, + "learning_rate": 3.6962974223135936e-05, + "loss": 0.5777, + "step": 8324 + }, + { + "epoch": 1.7113783533765032, + "grad_norm": 0.19230619072914124, + "learning_rate": 3.695314690615432e-05, + "loss": 0.5154, + "step": 8325 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 0.16557452082633972, + "learning_rate": 3.694331998560354e-05, + "loss": 0.5188, + "step": 8326 + }, + { + "epoch": 1.7117894953232602, + "grad_norm": 0.15893855690956116, + "learning_rate": 3.693349346196773e-05, + "loss": 0.5331, + "step": 8327 + }, + { + "epoch": 1.7119950662966388, + "grad_norm": 0.19809909164905548, + "learning_rate": 3.692366733573098e-05, + "loss": 0.5245, + "step": 8328 + }, + { + "epoch": 1.7122006372700174, + "grad_norm": 0.19923657178878784, + "learning_rate": 3.691384160737741e-05, + "loss": 0.5283, + "step": 8329 + }, + { + "epoch": 1.712406208243396, + "grad_norm": 0.2039818912744522, + "learning_rate": 3.690401627739107e-05, + "loss": 0.5497, + "step": 8330 + }, + { + "epoch": 1.7126117792167745, + "grad_norm": 0.194504514336586, + "learning_rate": 3.6894191346255995e-05, + "loss": 0.5249, + "step": 8331 + }, + { + "epoch": 1.7128173501901531, + "grad_norm": 0.535821795463562, + "learning_rate": 3.688436681445623e-05, + "loss": 0.5977, + "step": 8332 + }, + { + "epoch": 1.7130229211635317, + "grad_norm": 0.19687367975711823, + "learning_rate": 3.687454268247578e-05, + "loss": 0.5581, + "step": 8333 + }, + { + "epoch": 1.7132284921369103, + "grad_norm": 0.1983097642660141, + "learning_rate": 3.686471895079863e-05, + "loss": 0.5311, + "step": 8334 + }, + { + "epoch": 1.713434063110289, + "grad_norm": 0.19524888694286346, + "learning_rate": 3.685489561990875e-05, + "loss": 0.558, + "step": 8335 + }, + { + "epoch": 1.7136396340836675, + "grad_norm": 0.19910888373851776, + "learning_rate": 3.684507269029011e-05, + "loss": 0.547, + "step": 8336 + }, + { + "epoch": 1.713845205057046, + "grad_norm": 0.1981588751077652, + "learning_rate": 3.683525016242662e-05, + "loss": 0.5375, + "step": 8337 + }, + { + "epoch": 1.7140507760304247, + "grad_norm": 0.20191727578639984, + "learning_rate": 3.6825428036802184e-05, + "loss": 0.5574, + "step": 8338 + }, + { + "epoch": 1.714256347003803, + "grad_norm": 0.16322053968906403, + "learning_rate": 3.681560631390071e-05, + "loss": 0.5131, + "step": 8339 + }, + { + "epoch": 1.7144619179771816, + "grad_norm": 0.19834211468696594, + "learning_rate": 3.6805784994206056e-05, + "loss": 0.5583, + "step": 8340 + }, + { + "epoch": 1.7146674889505602, + "grad_norm": 0.20397832989692688, + "learning_rate": 3.679596407820205e-05, + "loss": 0.5192, + "step": 8341 + }, + { + "epoch": 1.7148730599239388, + "grad_norm": 0.19556482136249542, + "learning_rate": 3.678614356637258e-05, + "loss": 0.528, + "step": 8342 + }, + { + "epoch": 1.7150786308973172, + "grad_norm": 0.18456198275089264, + "learning_rate": 3.6776323459201415e-05, + "loss": 0.4952, + "step": 8343 + }, + { + "epoch": 1.7152842018706957, + "grad_norm": 0.19880907237529755, + "learning_rate": 3.676650375717235e-05, + "loss": 0.5528, + "step": 8344 + }, + { + "epoch": 1.7154897728440743, + "grad_norm": 0.19653092324733734, + "learning_rate": 3.6756684460769175e-05, + "loss": 0.5341, + "step": 8345 + }, + { + "epoch": 1.715695343817453, + "grad_norm": 0.5179283022880554, + "learning_rate": 3.674686557047562e-05, + "loss": 0.5772, + "step": 8346 + }, + { + "epoch": 1.7159009147908315, + "grad_norm": 0.1959078460931778, + "learning_rate": 3.673704708677543e-05, + "loss": 0.5262, + "step": 8347 + }, + { + "epoch": 1.71610648576421, + "grad_norm": 0.19857066869735718, + "learning_rate": 3.672722901015228e-05, + "loss": 0.5572, + "step": 8348 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 0.19778084754943848, + "learning_rate": 3.6717411341089914e-05, + "loss": 0.5264, + "step": 8349 + }, + { + "epoch": 1.7165176277109673, + "grad_norm": 0.2006417065858841, + "learning_rate": 3.670759408007199e-05, + "loss": 0.5192, + "step": 8350 + }, + { + "epoch": 1.7167231986843459, + "grad_norm": 0.19210219383239746, + "learning_rate": 3.669777722758213e-05, + "loss": 0.5253, + "step": 8351 + }, + { + "epoch": 1.7169287696577245, + "grad_norm": 0.19173528254032135, + "learning_rate": 3.668796078410399e-05, + "loss": 0.5447, + "step": 8352 + }, + { + "epoch": 1.717134340631103, + "grad_norm": 0.19798819720745087, + "learning_rate": 3.667814475012116e-05, + "loss": 0.5544, + "step": 8353 + }, + { + "epoch": 1.7173399116044814, + "grad_norm": 0.19367478787899017, + "learning_rate": 3.666832912611725e-05, + "loss": 0.546, + "step": 8354 + }, + { + "epoch": 1.71754548257786, + "grad_norm": 0.19712290167808533, + "learning_rate": 3.665851391257582e-05, + "loss": 0.5339, + "step": 8355 + }, + { + "epoch": 1.7177510535512386, + "grad_norm": 0.19337862730026245, + "learning_rate": 3.6648699109980416e-05, + "loss": 0.5559, + "step": 8356 + }, + { + "epoch": 1.7179566245246172, + "grad_norm": 0.19475507736206055, + "learning_rate": 3.6638884718814584e-05, + "loss": 0.5432, + "step": 8357 + }, + { + "epoch": 1.7181621954979955, + "grad_norm": 0.18744108080863953, + "learning_rate": 3.6629070739561816e-05, + "loss": 0.5275, + "step": 8358 + }, + { + "epoch": 1.7183677664713741, + "grad_norm": 0.18683594465255737, + "learning_rate": 3.661925717270561e-05, + "loss": 0.5345, + "step": 8359 + }, + { + "epoch": 1.7185733374447527, + "grad_norm": 0.1923644095659256, + "learning_rate": 3.660944401872944e-05, + "loss": 0.534, + "step": 8360 + }, + { + "epoch": 1.7187789084181313, + "grad_norm": 0.19515560567378998, + "learning_rate": 3.6599631278116735e-05, + "loss": 0.5591, + "step": 8361 + }, + { + "epoch": 1.71898447939151, + "grad_norm": 0.19667771458625793, + "learning_rate": 3.658981895135092e-05, + "loss": 0.5374, + "step": 8362 + }, + { + "epoch": 1.7191900503648885, + "grad_norm": 0.19800591468811035, + "learning_rate": 3.6580007038915436e-05, + "loss": 0.5309, + "step": 8363 + }, + { + "epoch": 1.719395621338267, + "grad_norm": 0.1907908171415329, + "learning_rate": 3.657019554129365e-05, + "loss": 0.5261, + "step": 8364 + }, + { + "epoch": 1.7196011923116457, + "grad_norm": 0.195295050740242, + "learning_rate": 3.656038445896891e-05, + "loss": 0.5586, + "step": 8365 + }, + { + "epoch": 1.7198067632850242, + "grad_norm": 0.1947198510169983, + "learning_rate": 3.6550573792424606e-05, + "loss": 0.5474, + "step": 8366 + }, + { + "epoch": 1.7200123342584028, + "grad_norm": 0.1944238245487213, + "learning_rate": 3.654076354214403e-05, + "loss": 0.543, + "step": 8367 + }, + { + "epoch": 1.7202179052317814, + "grad_norm": 0.18714429438114166, + "learning_rate": 3.6530953708610496e-05, + "loss": 0.5199, + "step": 8368 + }, + { + "epoch": 1.7204234762051598, + "grad_norm": 0.16641157865524292, + "learning_rate": 3.652114429230727e-05, + "loss": 0.5063, + "step": 8369 + }, + { + "epoch": 1.7206290471785384, + "grad_norm": 0.1247912049293518, + "learning_rate": 3.651133529371765e-05, + "loss": 0.5055, + "step": 8370 + }, + { + "epoch": 1.720834618151917, + "grad_norm": 0.12845945358276367, + "learning_rate": 3.650152671332487e-05, + "loss": 0.5052, + "step": 8371 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 0.15474404394626617, + "learning_rate": 3.6491718551612146e-05, + "loss": 0.5453, + "step": 8372 + }, + { + "epoch": 1.721245760098674, + "grad_norm": 0.19407010078430176, + "learning_rate": 3.648191080906268e-05, + "loss": 0.5121, + "step": 8373 + }, + { + "epoch": 1.7214513310720525, + "grad_norm": 0.16241449117660522, + "learning_rate": 3.647210348615964e-05, + "loss": 0.5118, + "step": 8374 + }, + { + "epoch": 1.721656902045431, + "grad_norm": 0.1552891731262207, + "learning_rate": 3.6462296583386225e-05, + "loss": 0.5206, + "step": 8375 + }, + { + "epoch": 1.7218624730188097, + "grad_norm": 0.194035142660141, + "learning_rate": 3.6452490101225536e-05, + "loss": 0.5528, + "step": 8376 + }, + { + "epoch": 1.7220680439921883, + "grad_norm": 0.2007959634065628, + "learning_rate": 3.6442684040160704e-05, + "loss": 0.5216, + "step": 8377 + }, + { + "epoch": 1.7222736149655669, + "grad_norm": 0.17652183771133423, + "learning_rate": 3.643287840067485e-05, + "loss": 0.5151, + "step": 8378 + }, + { + "epoch": 1.7224791859389454, + "grad_norm": 0.15610848367214203, + "learning_rate": 3.6423073183251024e-05, + "loss": 0.5385, + "step": 8379 + }, + { + "epoch": 1.722684756912324, + "grad_norm": 0.19610700011253357, + "learning_rate": 3.641326838837231e-05, + "loss": 0.5363, + "step": 8380 + }, + { + "epoch": 1.7228903278857026, + "grad_norm": 0.1605963259935379, + "learning_rate": 3.6403464016521716e-05, + "loss": 0.5102, + "step": 8381 + }, + { + "epoch": 1.7230958988590812, + "grad_norm": 0.15825892984867096, + "learning_rate": 3.639366006818227e-05, + "loss": 0.5264, + "step": 8382 + }, + { + "epoch": 1.7233014698324598, + "grad_norm": 0.156993106007576, + "learning_rate": 3.638385654383695e-05, + "loss": 0.4904, + "step": 8383 + }, + { + "epoch": 1.7235070408058382, + "grad_norm": 0.1612616777420044, + "learning_rate": 3.637405344396877e-05, + "loss": 0.5488, + "step": 8384 + }, + { + "epoch": 1.7237126117792168, + "grad_norm": 0.19856800138950348, + "learning_rate": 3.6364250769060654e-05, + "loss": 0.5246, + "step": 8385 + }, + { + "epoch": 1.7239181827525953, + "grad_norm": 0.19383041560649872, + "learning_rate": 3.6354448519595526e-05, + "loss": 0.5251, + "step": 8386 + }, + { + "epoch": 1.724123753725974, + "grad_norm": 0.15535280108451843, + "learning_rate": 3.634464669605633e-05, + "loss": 0.4938, + "step": 8387 + }, + { + "epoch": 1.7243293246993523, + "grad_norm": 0.1630435734987259, + "learning_rate": 3.633484529892593e-05, + "loss": 0.5444, + "step": 8388 + }, + { + "epoch": 1.7245348956727309, + "grad_norm": 0.2020839899778366, + "learning_rate": 3.6325044328687194e-05, + "loss": 0.5377, + "step": 8389 + }, + { + "epoch": 1.7247404666461095, + "grad_norm": 0.16391253471374512, + "learning_rate": 3.631524378582297e-05, + "loss": 0.4937, + "step": 8390 + }, + { + "epoch": 1.724946037619488, + "grad_norm": 0.12773092091083527, + "learning_rate": 3.630544367081611e-05, + "loss": 0.5292, + "step": 8391 + }, + { + "epoch": 1.7251516085928666, + "grad_norm": 0.16146111488342285, + "learning_rate": 3.62956439841494e-05, + "loss": 0.5185, + "step": 8392 + }, + { + "epoch": 1.7253571795662452, + "grad_norm": 0.19887956976890564, + "learning_rate": 3.6285844726305624e-05, + "loss": 0.5424, + "step": 8393 + }, + { + "epoch": 1.7255627505396238, + "grad_norm": 0.19647051393985748, + "learning_rate": 3.627604589776755e-05, + "loss": 0.5365, + "step": 8394 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 0.16327311098575592, + "learning_rate": 3.626624749901792e-05, + "loss": 0.523, + "step": 8395 + }, + { + "epoch": 1.725973892486381, + "grad_norm": 0.16111861169338226, + "learning_rate": 3.625644953053945e-05, + "loss": 0.5296, + "step": 8396 + }, + { + "epoch": 1.7261794634597596, + "grad_norm": 0.20019720494747162, + "learning_rate": 3.624665199281483e-05, + "loss": 0.5494, + "step": 8397 + }, + { + "epoch": 1.7263850344331382, + "grad_norm": 0.1936234086751938, + "learning_rate": 3.623685488632678e-05, + "loss": 0.5291, + "step": 8398 + }, + { + "epoch": 1.7265906054065168, + "grad_norm": 0.20377790927886963, + "learning_rate": 3.6227058211557906e-05, + "loss": 0.5492, + "step": 8399 + }, + { + "epoch": 1.7267961763798951, + "grad_norm": 0.16904407739639282, + "learning_rate": 3.621726196899089e-05, + "loss": 0.5124, + "step": 8400 + }, + { + "epoch": 1.7270017473532737, + "grad_norm": 0.1632084995508194, + "learning_rate": 3.620746615910832e-05, + "loss": 0.5514, + "step": 8401 + }, + { + "epoch": 1.7272073183266523, + "grad_norm": 0.16341902315616608, + "learning_rate": 3.61976707823928e-05, + "loss": 0.5038, + "step": 8402 + }, + { + "epoch": 1.7274128893000307, + "grad_norm": 0.1626911461353302, + "learning_rate": 3.61878758393269e-05, + "loss": 0.5417, + "step": 8403 + }, + { + "epoch": 1.7276184602734093, + "grad_norm": 0.2055915892124176, + "learning_rate": 3.617808133039314e-05, + "loss": 0.531, + "step": 8404 + }, + { + "epoch": 1.7278240312467879, + "grad_norm": 0.1969294250011444, + "learning_rate": 3.616828725607411e-05, + "loss": 0.5347, + "step": 8405 + }, + { + "epoch": 1.7280296022201664, + "grad_norm": 0.1655907779932022, + "learning_rate": 3.6158493616852276e-05, + "loss": 0.5059, + "step": 8406 + }, + { + "epoch": 1.728235173193545, + "grad_norm": 0.1626054346561432, + "learning_rate": 3.6148700413210144e-05, + "loss": 0.5243, + "step": 8407 + }, + { + "epoch": 1.7284407441669236, + "grad_norm": 0.19298885762691498, + "learning_rate": 3.613890764563016e-05, + "loss": 0.5355, + "step": 8408 + }, + { + "epoch": 1.7286463151403022, + "grad_norm": 0.20283274352550507, + "learning_rate": 3.61291153145948e-05, + "loss": 0.5398, + "step": 8409 + }, + { + "epoch": 1.7288518861136808, + "grad_norm": 0.19936981797218323, + "learning_rate": 3.6119323420586446e-05, + "loss": 0.5374, + "step": 8410 + }, + { + "epoch": 1.7290574570870594, + "grad_norm": 0.16159012913703918, + "learning_rate": 3.610953196408752e-05, + "loss": 0.4839, + "step": 8411 + }, + { + "epoch": 1.729263028060438, + "grad_norm": 0.16305240988731384, + "learning_rate": 3.609974094558041e-05, + "loss": 0.5284, + "step": 8412 + }, + { + "epoch": 1.7294685990338166, + "grad_norm": 0.1939508616924286, + "learning_rate": 3.608995036554746e-05, + "loss": 0.5127, + "step": 8413 + }, + { + "epoch": 1.7296741700071951, + "grad_norm": 0.1960534304380417, + "learning_rate": 3.608016022447102e-05, + "loss": 0.5506, + "step": 8414 + }, + { + "epoch": 1.7298797409805735, + "grad_norm": 0.18489257991313934, + "learning_rate": 3.607037052283339e-05, + "loss": 0.5321, + "step": 8415 + }, + { + "epoch": 1.730085311953952, + "grad_norm": 0.1943347156047821, + "learning_rate": 3.606058126111686e-05, + "loss": 0.5447, + "step": 8416 + }, + { + "epoch": 1.7302908829273307, + "grad_norm": 0.199358269572258, + "learning_rate": 3.60507924398037e-05, + "loss": 0.553, + "step": 8417 + }, + { + "epoch": 1.7304964539007093, + "grad_norm": 0.16631248593330383, + "learning_rate": 3.6041004059376176e-05, + "loss": 0.4963, + "step": 8418 + }, + { + "epoch": 1.7307020248740876, + "grad_norm": 0.210128515958786, + "learning_rate": 3.603121612031652e-05, + "loss": 0.5068, + "step": 8419 + }, + { + "epoch": 1.7309075958474662, + "grad_norm": 0.16205939650535583, + "learning_rate": 3.602142862310691e-05, + "loss": 0.5304, + "step": 8420 + }, + { + "epoch": 1.7311131668208448, + "grad_norm": 0.1637234389781952, + "learning_rate": 3.601164156822956e-05, + "loss": 0.498, + "step": 8421 + }, + { + "epoch": 1.7313187377942234, + "grad_norm": 0.12157563865184784, + "learning_rate": 3.600185495616661e-05, + "loss": 0.5283, + "step": 8422 + }, + { + "epoch": 1.731524308767602, + "grad_norm": 0.1593407392501831, + "learning_rate": 3.599206878740021e-05, + "loss": 0.5318, + "step": 8423 + }, + { + "epoch": 1.7317298797409806, + "grad_norm": 0.16835933923721313, + "learning_rate": 3.598228306241247e-05, + "loss": 0.5268, + "step": 8424 + }, + { + "epoch": 1.7319354507143592, + "grad_norm": 0.12342957407236099, + "learning_rate": 3.59724977816855e-05, + "loss": 0.5118, + "step": 8425 + }, + { + "epoch": 1.7321410216877378, + "grad_norm": 0.15400569140911102, + "learning_rate": 3.596271294570138e-05, + "loss": 0.535, + "step": 8426 + }, + { + "epoch": 1.7323465926611163, + "grad_norm": 0.19436071813106537, + "learning_rate": 3.595292855494215e-05, + "loss": 0.5485, + "step": 8427 + }, + { + "epoch": 1.732552163634495, + "grad_norm": 0.16538384556770325, + "learning_rate": 3.594314460988984e-05, + "loss": 0.4909, + "step": 8428 + }, + { + "epoch": 1.7327577346078735, + "grad_norm": 0.15564298629760742, + "learning_rate": 3.5933361111026453e-05, + "loss": 0.5438, + "step": 8429 + }, + { + "epoch": 1.732963305581252, + "grad_norm": 0.19588908553123474, + "learning_rate": 3.5923578058834e-05, + "loss": 0.5485, + "step": 8430 + }, + { + "epoch": 1.7331688765546305, + "grad_norm": 0.19124017655849457, + "learning_rate": 3.5913795453794427e-05, + "loss": 0.5295, + "step": 8431 + }, + { + "epoch": 1.733374447528009, + "grad_norm": 0.15818458795547485, + "learning_rate": 3.5904013296389686e-05, + "loss": 0.5142, + "step": 8432 + }, + { + "epoch": 1.7335800185013877, + "grad_norm": 0.15775617957115173, + "learning_rate": 3.5894231587101694e-05, + "loss": 0.5282, + "step": 8433 + }, + { + "epoch": 1.733785589474766, + "grad_norm": 0.16275940835475922, + "learning_rate": 3.588445032641236e-05, + "loss": 0.5012, + "step": 8434 + }, + { + "epoch": 1.7339911604481446, + "grad_norm": 0.15710069239139557, + "learning_rate": 3.5874669514803545e-05, + "loss": 0.5309, + "step": 8435 + }, + { + "epoch": 1.7341967314215232, + "grad_norm": 0.19356967508792877, + "learning_rate": 3.586488915275711e-05, + "loss": 0.5344, + "step": 8436 + }, + { + "epoch": 1.7344023023949018, + "grad_norm": 0.19396322965621948, + "learning_rate": 3.58551092407549e-05, + "loss": 0.5279, + "step": 8437 + }, + { + "epoch": 1.7346078733682804, + "grad_norm": 0.20493246614933014, + "learning_rate": 3.5845329779278694e-05, + "loss": 0.5537, + "step": 8438 + }, + { + "epoch": 1.734813444341659, + "grad_norm": 0.18893173336982727, + "learning_rate": 3.583555076881031e-05, + "loss": 0.5145, + "step": 8439 + }, + { + "epoch": 1.7350190153150375, + "grad_norm": 0.19152796268463135, + "learning_rate": 3.5825772209831517e-05, + "loss": 0.514, + "step": 8440 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 0.1870860904455185, + "learning_rate": 3.581599410282403e-05, + "loss": 0.5234, + "step": 8441 + }, + { + "epoch": 1.7354301572617947, + "grad_norm": 0.1898457258939743, + "learning_rate": 3.58062164482696e-05, + "loss": 0.5324, + "step": 8442 + }, + { + "epoch": 1.7356357282351733, + "grad_norm": 0.19367991387844086, + "learning_rate": 3.579643924664991e-05, + "loss": 0.5443, + "step": 8443 + }, + { + "epoch": 1.735841299208552, + "grad_norm": 0.1994738131761551, + "learning_rate": 3.5786662498446645e-05, + "loss": 0.5449, + "step": 8444 + }, + { + "epoch": 1.7360468701819303, + "grad_norm": 0.16127611696720123, + "learning_rate": 3.577688620414143e-05, + "loss": 0.5126, + "step": 8445 + }, + { + "epoch": 1.7362524411553089, + "grad_norm": 0.16468468308448792, + "learning_rate": 3.5767110364215954e-05, + "loss": 0.5467, + "step": 8446 + }, + { + "epoch": 1.7364580121286874, + "grad_norm": 0.20566272735595703, + "learning_rate": 3.575733497915179e-05, + "loss": 0.5503, + "step": 8447 + }, + { + "epoch": 1.736663583102066, + "grad_norm": 0.2016957551240921, + "learning_rate": 3.5747560049430526e-05, + "loss": 0.534, + "step": 8448 + }, + { + "epoch": 1.7368691540754444, + "grad_norm": 0.6397230625152588, + "learning_rate": 3.573778557553374e-05, + "loss": 0.5599, + "step": 8449 + }, + { + "epoch": 1.737074725048823, + "grad_norm": 0.191917285323143, + "learning_rate": 3.572801155794295e-05, + "loss": 0.5208, + "step": 8450 + }, + { + "epoch": 1.7372802960222016, + "grad_norm": 0.20242702960968018, + "learning_rate": 3.571823799713971e-05, + "loss": 0.5409, + "step": 8451 + }, + { + "epoch": 1.7374858669955802, + "grad_norm": 0.16706420481204987, + "learning_rate": 3.570846489360549e-05, + "loss": 0.5102, + "step": 8452 + }, + { + "epoch": 1.7376914379689588, + "grad_norm": 0.1416233628988266, + "learning_rate": 3.569869224782177e-05, + "loss": 0.5132, + "step": 8453 + }, + { + "epoch": 1.7378970089423373, + "grad_norm": 0.16062797605991364, + "learning_rate": 3.568892006027003e-05, + "loss": 0.5522, + "step": 8454 + }, + { + "epoch": 1.738102579915716, + "grad_norm": 0.16195809841156006, + "learning_rate": 3.5679148331431666e-05, + "loss": 0.4988, + "step": 8455 + }, + { + "epoch": 1.7383081508890945, + "grad_norm": 0.13933859765529633, + "learning_rate": 3.5669377061788104e-05, + "loss": 0.5216, + "step": 8456 + }, + { + "epoch": 1.738513721862473, + "grad_norm": 0.16723297536373138, + "learning_rate": 3.565960625182073e-05, + "loss": 0.5381, + "step": 8457 + }, + { + "epoch": 1.7387192928358517, + "grad_norm": 0.20578205585479736, + "learning_rate": 3.564983590201089e-05, + "loss": 0.5171, + "step": 8458 + }, + { + "epoch": 1.7389248638092303, + "grad_norm": 0.20782290399074554, + "learning_rate": 3.564006601283992e-05, + "loss": 0.5442, + "step": 8459 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.19257017970085144, + "learning_rate": 3.563029658478916e-05, + "loss": 0.5502, + "step": 8460 + }, + { + "epoch": 1.7393360057559872, + "grad_norm": 0.19143828749656677, + "learning_rate": 3.56205276183399e-05, + "loss": 0.5325, + "step": 8461 + }, + { + "epoch": 1.7395415767293658, + "grad_norm": 0.19385689496994019, + "learning_rate": 3.5610759113973395e-05, + "loss": 0.5194, + "step": 8462 + }, + { + "epoch": 1.7397471477027444, + "grad_norm": 0.1967114955186844, + "learning_rate": 3.560099107217091e-05, + "loss": 0.5313, + "step": 8463 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 0.16215933859348297, + "learning_rate": 3.559122349341366e-05, + "loss": 0.5018, + "step": 8464 + }, + { + "epoch": 1.7401582896495014, + "grad_norm": 0.1343732327222824, + "learning_rate": 3.558145637818286e-05, + "loss": 0.519, + "step": 8465 + }, + { + "epoch": 1.74036386062288, + "grad_norm": 0.15892748534679413, + "learning_rate": 3.557168972695966e-05, + "loss": 0.5512, + "step": 8466 + }, + { + "epoch": 1.7405694315962585, + "grad_norm": 0.2068302482366562, + "learning_rate": 3.556192354022525e-05, + "loss": 0.5618, + "step": 8467 + }, + { + "epoch": 1.7407750025696371, + "grad_norm": 0.20231375098228455, + "learning_rate": 3.555215781846077e-05, + "loss": 0.5403, + "step": 8468 + }, + { + "epoch": 1.7409805735430157, + "grad_norm": 0.18931826949119568, + "learning_rate": 3.5542392562147305e-05, + "loss": 0.5406, + "step": 8469 + }, + { + "epoch": 1.7411861445163943, + "grad_norm": 0.1967364251613617, + "learning_rate": 3.553262777176596e-05, + "loss": 0.5488, + "step": 8470 + }, + { + "epoch": 1.741391715489773, + "grad_norm": 0.16582554578781128, + "learning_rate": 3.552286344779779e-05, + "loss": 0.5162, + "step": 8471 + }, + { + "epoch": 1.7415972864631515, + "grad_norm": 0.16116267442703247, + "learning_rate": 3.551309959072383e-05, + "loss": 0.5275, + "step": 8472 + }, + { + "epoch": 1.74180285743653, + "grad_norm": 0.19118881225585938, + "learning_rate": 3.550333620102512e-05, + "loss": 0.5363, + "step": 8473 + }, + { + "epoch": 1.7420084284099087, + "grad_norm": 0.1949569284915924, + "learning_rate": 3.549357327918264e-05, + "loss": 0.5546, + "step": 8474 + }, + { + "epoch": 1.742213999383287, + "grad_norm": 0.2024715095758438, + "learning_rate": 3.548381082567738e-05, + "loss": 0.5318, + "step": 8475 + }, + { + "epoch": 1.7424195703566656, + "grad_norm": 0.21180285513401031, + "learning_rate": 3.5474048840990286e-05, + "loss": 0.5362, + "step": 8476 + }, + { + "epoch": 1.7426251413300442, + "grad_norm": 0.1865611970424652, + "learning_rate": 3.546428732560228e-05, + "loss": 0.4995, + "step": 8477 + }, + { + "epoch": 1.7428307123034228, + "grad_norm": 0.17171883583068848, + "learning_rate": 3.545452627999427e-05, + "loss": 0.5554, + "step": 8478 + }, + { + "epoch": 1.7430362832768012, + "grad_norm": 0.2119421362876892, + "learning_rate": 3.544476570464713e-05, + "loss": 0.5312, + "step": 8479 + }, + { + "epoch": 1.7432418542501797, + "grad_norm": 0.1979297697544098, + "learning_rate": 3.543500560004171e-05, + "loss": 0.5277, + "step": 8480 + }, + { + "epoch": 1.7434474252235583, + "grad_norm": 0.19431854784488678, + "learning_rate": 3.542524596665887e-05, + "loss": 0.5307, + "step": 8481 + }, + { + "epoch": 1.743652996196937, + "grad_norm": 0.18718986213207245, + "learning_rate": 3.5415486804979417e-05, + "loss": 0.5164, + "step": 8482 + }, + { + "epoch": 1.7438585671703155, + "grad_norm": 0.19408833980560303, + "learning_rate": 3.540572811548412e-05, + "loss": 0.5484, + "step": 8483 + }, + { + "epoch": 1.744064138143694, + "grad_norm": 0.19318553805351257, + "learning_rate": 3.539596989865375e-05, + "loss": 0.523, + "step": 8484 + }, + { + "epoch": 1.7442697091170727, + "grad_norm": 0.16727426648139954, + "learning_rate": 3.538621215496907e-05, + "loss": 0.5504, + "step": 8485 + }, + { + "epoch": 1.7444752800904513, + "grad_norm": 0.16280822455883026, + "learning_rate": 3.537645488491078e-05, + "loss": 0.538, + "step": 8486 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 0.19524060189723969, + "learning_rate": 3.5366698088959557e-05, + "loss": 0.5333, + "step": 8487 + }, + { + "epoch": 1.7448864220372085, + "grad_norm": 0.1601538062095642, + "learning_rate": 3.535694176759611e-05, + "loss": 0.4854, + "step": 8488 + }, + { + "epoch": 1.745091993010587, + "grad_norm": 0.16524933278560638, + "learning_rate": 3.534718592130107e-05, + "loss": 0.5261, + "step": 8489 + }, + { + "epoch": 1.7452975639839656, + "grad_norm": 0.20658870041370392, + "learning_rate": 3.5337430550555065e-05, + "loss": 0.5592, + "step": 8490 + }, + { + "epoch": 1.745503134957344, + "grad_norm": 0.20213808119297028, + "learning_rate": 3.5327675655838694e-05, + "loss": 0.5413, + "step": 8491 + }, + { + "epoch": 1.7457087059307226, + "grad_norm": 0.20643405616283417, + "learning_rate": 3.531792123763253e-05, + "loss": 0.5504, + "step": 8492 + }, + { + "epoch": 1.7459142769041012, + "grad_norm": 0.1972249150276184, + "learning_rate": 3.5308167296417125e-05, + "loss": 0.5359, + "step": 8493 + }, + { + "epoch": 1.7461198478774795, + "grad_norm": 0.19785918295383453, + "learning_rate": 3.529841383267303e-05, + "loss": 0.5605, + "step": 8494 + }, + { + "epoch": 1.7463254188508581, + "grad_norm": 0.3983357846736908, + "learning_rate": 3.528866084688074e-05, + "loss": 0.5958, + "step": 8495 + }, + { + "epoch": 1.7465309898242367, + "grad_norm": 0.21267639100551605, + "learning_rate": 3.527890833952073e-05, + "loss": 0.5268, + "step": 8496 + }, + { + "epoch": 1.7467365607976153, + "grad_norm": 0.16559986770153046, + "learning_rate": 3.5269156311073484e-05, + "loss": 0.5056, + "step": 8497 + }, + { + "epoch": 1.746942131770994, + "grad_norm": 0.15838290750980377, + "learning_rate": 3.5259404762019416e-05, + "loss": 0.552, + "step": 8498 + }, + { + "epoch": 1.7471477027443725, + "grad_norm": 0.19080090522766113, + "learning_rate": 3.524965369283896e-05, + "loss": 0.5234, + "step": 8499 + }, + { + "epoch": 1.747353273717751, + "grad_norm": 0.166726753115654, + "learning_rate": 3.5239903104012464e-05, + "loss": 0.5099, + "step": 8500 + }, + { + "epoch": 1.7475588446911297, + "grad_norm": 0.15904779732227325, + "learning_rate": 3.5230152996020346e-05, + "loss": 0.5136, + "step": 8501 + }, + { + "epoch": 1.7477644156645082, + "grad_norm": 0.19941627979278564, + "learning_rate": 3.522040336934293e-05, + "loss": 0.5499, + "step": 8502 + }, + { + "epoch": 1.7479699866378868, + "grad_norm": 0.20110583305358887, + "learning_rate": 3.521065422446052e-05, + "loss": 0.5503, + "step": 8503 + }, + { + "epoch": 1.7481755576112654, + "grad_norm": 0.20107027888298035, + "learning_rate": 3.520090556185343e-05, + "loss": 0.5605, + "step": 8504 + }, + { + "epoch": 1.748381128584644, + "grad_norm": 0.18705639243125916, + "learning_rate": 3.51911573820019e-05, + "loss": 0.5163, + "step": 8505 + }, + { + "epoch": 1.7485866995580224, + "grad_norm": 0.19800741970539093, + "learning_rate": 3.518140968538622e-05, + "loss": 0.4896, + "step": 8506 + }, + { + "epoch": 1.748792270531401, + "grad_norm": 0.19224296510219574, + "learning_rate": 3.517166247248659e-05, + "loss": 0.5034, + "step": 8507 + }, + { + "epoch": 1.7489978415047795, + "grad_norm": 0.20960035920143127, + "learning_rate": 3.51619157437832e-05, + "loss": 0.5245, + "step": 8508 + }, + { + "epoch": 1.7492034124781581, + "grad_norm": 0.19137395918369293, + "learning_rate": 3.5152169499756256e-05, + "loss": 0.5284, + "step": 8509 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 0.19862139225006104, + "learning_rate": 3.514242374088588e-05, + "loss": 0.5506, + "step": 8510 + }, + { + "epoch": 1.749614554424915, + "grad_norm": 0.19606275856494904, + "learning_rate": 3.5132678467652226e-05, + "loss": 0.5297, + "step": 8511 + }, + { + "epoch": 1.7498201253982937, + "grad_norm": 0.1958342045545578, + "learning_rate": 3.512293368053537e-05, + "loss": 0.5255, + "step": 8512 + }, + { + "epoch": 1.7500256963716723, + "grad_norm": 0.16824734210968018, + "learning_rate": 3.511318938001542e-05, + "loss": 0.5178, + "step": 8513 + }, + { + "epoch": 1.7502312673450509, + "grad_norm": 0.162201389670372, + "learning_rate": 3.510344556657239e-05, + "loss": 0.5169, + "step": 8514 + }, + { + "epoch": 1.7504368383184294, + "grad_norm": 0.20394515991210938, + "learning_rate": 3.509370224068637e-05, + "loss": 0.5215, + "step": 8515 + }, + { + "epoch": 1.750642409291808, + "grad_norm": 0.2038257122039795, + "learning_rate": 3.508395940283733e-05, + "loss": 0.5277, + "step": 8516 + }, + { + "epoch": 1.7508479802651866, + "grad_norm": 0.19794686138629913, + "learning_rate": 3.507421705350526e-05, + "loss": 0.5379, + "step": 8517 + }, + { + "epoch": 1.7510535512385652, + "grad_norm": 0.19401569664478302, + "learning_rate": 3.506447519317012e-05, + "loss": 0.5313, + "step": 8518 + }, + { + "epoch": 1.7512591222119438, + "grad_norm": 0.1934097856283188, + "learning_rate": 3.5054733822311856e-05, + "loss": 0.5291, + "step": 8519 + }, + { + "epoch": 1.7514646931853224, + "grad_norm": 0.19061771035194397, + "learning_rate": 3.5044992941410374e-05, + "loss": 0.5239, + "step": 8520 + }, + { + "epoch": 1.7516702641587008, + "grad_norm": 0.19829559326171875, + "learning_rate": 3.503525255094554e-05, + "loss": 0.5532, + "step": 8521 + }, + { + "epoch": 1.7518758351320793, + "grad_norm": 0.1951601654291153, + "learning_rate": 3.502551265139726e-05, + "loss": 0.5366, + "step": 8522 + }, + { + "epoch": 1.752081406105458, + "grad_norm": 0.18550780415534973, + "learning_rate": 3.501577324324535e-05, + "loss": 0.5199, + "step": 8523 + }, + { + "epoch": 1.7522869770788365, + "grad_norm": 0.19197461009025574, + "learning_rate": 3.500603432696962e-05, + "loss": 0.5048, + "step": 8524 + }, + { + "epoch": 1.7524925480522149, + "grad_norm": 0.1984768956899643, + "learning_rate": 3.4996295903049874e-05, + "loss": 0.5334, + "step": 8525 + }, + { + "epoch": 1.7526981190255935, + "grad_norm": 0.1615784913301468, + "learning_rate": 3.498655797196586e-05, + "loss": 0.5212, + "step": 8526 + }, + { + "epoch": 1.752903689998972, + "grad_norm": 0.16125060617923737, + "learning_rate": 3.4976820534197335e-05, + "loss": 0.5676, + "step": 8527 + }, + { + "epoch": 1.7531092609723506, + "grad_norm": 0.19413301348686218, + "learning_rate": 3.4967083590224016e-05, + "loss": 0.5531, + "step": 8528 + }, + { + "epoch": 1.7533148319457292, + "grad_norm": 0.19663669168949127, + "learning_rate": 3.4957347140525585e-05, + "loss": 0.5442, + "step": 8529 + }, + { + "epoch": 1.7535204029191078, + "grad_norm": 0.16446875035762787, + "learning_rate": 3.4947611185581735e-05, + "loss": 0.5127, + "step": 8530 + }, + { + "epoch": 1.7537259738924864, + "grad_norm": 0.15640254318714142, + "learning_rate": 3.4937875725872095e-05, + "loss": 0.5259, + "step": 8531 + }, + { + "epoch": 1.753931544865865, + "grad_norm": 0.19805364310741425, + "learning_rate": 3.492814076187629e-05, + "loss": 0.535, + "step": 8532 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 0.16201485693454742, + "learning_rate": 3.491840629407391e-05, + "loss": 0.4878, + "step": 8533 + }, + { + "epoch": 1.7543426868126222, + "grad_norm": 0.1226087361574173, + "learning_rate": 3.490867232294454e-05, + "loss": 0.5218, + "step": 8534 + }, + { + "epoch": 1.7545482577860008, + "grad_norm": 0.16910824179649353, + "learning_rate": 3.4898938848967695e-05, + "loss": 0.5276, + "step": 8535 + }, + { + "epoch": 1.7547538287593791, + "grad_norm": 0.19596606492996216, + "learning_rate": 3.4889205872622936e-05, + "loss": 0.5526, + "step": 8536 + }, + { + "epoch": 1.7549593997327577, + "grad_norm": 0.20501984655857086, + "learning_rate": 3.4879473394389745e-05, + "loss": 0.5593, + "step": 8537 + }, + { + "epoch": 1.7551649707061363, + "grad_norm": 0.1966264247894287, + "learning_rate": 3.486974141474759e-05, + "loss": 0.5296, + "step": 8538 + }, + { + "epoch": 1.755370541679515, + "grad_norm": 0.18841052055358887, + "learning_rate": 3.4860009934175934e-05, + "loss": 0.5222, + "step": 8539 + }, + { + "epoch": 1.7555761126528933, + "grad_norm": 0.19071705639362335, + "learning_rate": 3.48502789531542e-05, + "loss": 0.5494, + "step": 8540 + }, + { + "epoch": 1.7557816836262718, + "grad_norm": 0.20280326902866364, + "learning_rate": 3.4840548472161777e-05, + "loss": 0.547, + "step": 8541 + }, + { + "epoch": 1.7559872545996504, + "grad_norm": 0.18994936347007751, + "learning_rate": 3.483081849167803e-05, + "loss": 0.5271, + "step": 8542 + }, + { + "epoch": 1.756192825573029, + "grad_norm": 0.19104993343353271, + "learning_rate": 3.482108901218234e-05, + "loss": 0.5228, + "step": 8543 + }, + { + "epoch": 1.7563983965464076, + "grad_norm": 0.19525660574436188, + "learning_rate": 3.481136003415402e-05, + "loss": 0.5298, + "step": 8544 + }, + { + "epoch": 1.7566039675197862, + "grad_norm": 0.19333256781101227, + "learning_rate": 3.4801631558072374e-05, + "loss": 0.5217, + "step": 8545 + }, + { + "epoch": 1.7568095384931648, + "grad_norm": 0.19645366072654724, + "learning_rate": 3.4791903584416667e-05, + "loss": 0.5334, + "step": 8546 + }, + { + "epoch": 1.7570151094665434, + "grad_norm": 0.1938944011926651, + "learning_rate": 3.478217611366615e-05, + "loss": 0.5287, + "step": 8547 + }, + { + "epoch": 1.757220680439922, + "grad_norm": 0.1910870373249054, + "learning_rate": 3.477244914630007e-05, + "loss": 0.5285, + "step": 8548 + }, + { + "epoch": 1.7574262514133006, + "grad_norm": 0.20212024450302124, + "learning_rate": 3.4762722682797614e-05, + "loss": 0.5529, + "step": 8549 + }, + { + "epoch": 1.7576318223866791, + "grad_norm": 0.19146008789539337, + "learning_rate": 3.475299672363795e-05, + "loss": 0.5124, + "step": 8550 + }, + { + "epoch": 1.7578373933600575, + "grad_norm": 0.1885506808757782, + "learning_rate": 3.474327126930026e-05, + "loss": 0.4892, + "step": 8551 + }, + { + "epoch": 1.758042964333436, + "grad_norm": 0.18597213923931122, + "learning_rate": 3.473354632026365e-05, + "loss": 0.5208, + "step": 8552 + }, + { + "epoch": 1.7582485353068147, + "grad_norm": 0.19762767851352692, + "learning_rate": 3.472382187700723e-05, + "loss": 0.5474, + "step": 8553 + }, + { + "epoch": 1.7584541062801933, + "grad_norm": 0.1776442676782608, + "learning_rate": 3.471409794001008e-05, + "loss": 0.5056, + "step": 8554 + }, + { + "epoch": 1.7586596772535716, + "grad_norm": 0.16057129204273224, + "learning_rate": 3.4704374509751246e-05, + "loss": 0.5486, + "step": 8555 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 0.1970880925655365, + "learning_rate": 3.469465158670973e-05, + "loss": 0.5392, + "step": 8556 + }, + { + "epoch": 1.7590708192003288, + "grad_norm": 0.16427253186702728, + "learning_rate": 3.4684929171364594e-05, + "loss": 0.5139, + "step": 8557 + }, + { + "epoch": 1.7592763901737074, + "grad_norm": 0.13471728563308716, + "learning_rate": 3.4675207264194776e-05, + "loss": 0.5029, + "step": 8558 + }, + { + "epoch": 1.759481961147086, + "grad_norm": 0.15767881274223328, + "learning_rate": 3.4665485865679233e-05, + "loss": 0.5285, + "step": 8559 + }, + { + "epoch": 1.7596875321204646, + "grad_norm": 0.19089291989803314, + "learning_rate": 3.465576497629691e-05, + "loss": 0.52, + "step": 8560 + }, + { + "epoch": 1.7598931030938432, + "grad_norm": 0.1997915655374527, + "learning_rate": 3.46460445965267e-05, + "loss": 0.5741, + "step": 8561 + }, + { + "epoch": 1.7600986740672218, + "grad_norm": 0.20209218561649323, + "learning_rate": 3.4636324726847474e-05, + "loss": 0.5263, + "step": 8562 + }, + { + "epoch": 1.7603042450406003, + "grad_norm": 0.1946118324995041, + "learning_rate": 3.4626605367738065e-05, + "loss": 0.5425, + "step": 8563 + }, + { + "epoch": 1.760509816013979, + "grad_norm": 0.1665966659784317, + "learning_rate": 3.4616886519677345e-05, + "loss": 0.4985, + "step": 8564 + }, + { + "epoch": 1.7607153869873575, + "grad_norm": 0.16211137175559998, + "learning_rate": 3.4607168183144104e-05, + "loss": 0.5197, + "step": 8565 + }, + { + "epoch": 1.7609209579607359, + "grad_norm": 0.20239417254924774, + "learning_rate": 3.4597450358617106e-05, + "loss": 0.5397, + "step": 8566 + }, + { + "epoch": 1.7611265289341145, + "grad_norm": 0.20297926664352417, + "learning_rate": 3.458773304657511e-05, + "loss": 0.5313, + "step": 8567 + }, + { + "epoch": 1.761332099907493, + "grad_norm": 0.1988510936498642, + "learning_rate": 3.457801624749683e-05, + "loss": 0.5136, + "step": 8568 + }, + { + "epoch": 1.7615376708808717, + "grad_norm": 0.19971226155757904, + "learning_rate": 3.4568299961860965e-05, + "loss": 0.5421, + "step": 8569 + }, + { + "epoch": 1.76174324185425, + "grad_norm": 0.19609498977661133, + "learning_rate": 3.4558584190146226e-05, + "loss": 0.5286, + "step": 8570 + }, + { + "epoch": 1.7619488128276286, + "grad_norm": 0.20023983716964722, + "learning_rate": 3.4548868932831235e-05, + "loss": 0.5378, + "step": 8571 + }, + { + "epoch": 1.7621543838010072, + "grad_norm": 0.20097847282886505, + "learning_rate": 3.453915419039462e-05, + "loss": 0.5509, + "step": 8572 + }, + { + "epoch": 1.7623599547743858, + "grad_norm": 0.1950797438621521, + "learning_rate": 3.452943996331499e-05, + "loss": 0.5162, + "step": 8573 + }, + { + "epoch": 1.7625655257477644, + "grad_norm": 0.19853217899799347, + "learning_rate": 3.451972625207091e-05, + "loss": 0.5326, + "step": 8574 + }, + { + "epoch": 1.762771096721143, + "grad_norm": 0.19258736073970795, + "learning_rate": 3.451001305714094e-05, + "loss": 0.5194, + "step": 8575 + }, + { + "epoch": 1.7629766676945215, + "grad_norm": 0.1989012360572815, + "learning_rate": 3.450030037900357e-05, + "loss": 0.5142, + "step": 8576 + }, + { + "epoch": 1.7631822386679001, + "grad_norm": 0.19085493683815002, + "learning_rate": 3.4490588218137356e-05, + "loss": 0.5305, + "step": 8577 + }, + { + "epoch": 1.7633878096412787, + "grad_norm": 0.16426925361156464, + "learning_rate": 3.448087657502073e-05, + "loss": 0.5152, + "step": 8578 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 0.16579680144786835, + "learning_rate": 3.447116545013215e-05, + "loss": 0.5536, + "step": 8579 + }, + { + "epoch": 1.763798951588036, + "grad_norm": 0.19505342841148376, + "learning_rate": 3.4461454843950035e-05, + "loss": 0.5208, + "step": 8580 + }, + { + "epoch": 1.7640045225614145, + "grad_norm": 0.19521526992321014, + "learning_rate": 3.445174475695277e-05, + "loss": 0.5593, + "step": 8581 + }, + { + "epoch": 1.7642100935347929, + "grad_norm": 0.19059976935386658, + "learning_rate": 3.4442035189618756e-05, + "loss": 0.5199, + "step": 8582 + }, + { + "epoch": 1.7644156645081714, + "grad_norm": 0.1944594383239746, + "learning_rate": 3.443232614242631e-05, + "loss": 0.5527, + "step": 8583 + }, + { + "epoch": 1.76462123548155, + "grad_norm": 0.1929233968257904, + "learning_rate": 3.442261761585376e-05, + "loss": 0.5206, + "step": 8584 + }, + { + "epoch": 1.7648268064549284, + "grad_norm": 0.1909506469964981, + "learning_rate": 3.441290961037941e-05, + "loss": 0.518, + "step": 8585 + }, + { + "epoch": 1.765032377428307, + "grad_norm": 0.20194295048713684, + "learning_rate": 3.440320212648152e-05, + "loss": 0.5559, + "step": 8586 + }, + { + "epoch": 1.7652379484016856, + "grad_norm": 0.20173272490501404, + "learning_rate": 3.439349516463833e-05, + "loss": 0.5235, + "step": 8587 + }, + { + "epoch": 1.7654435193750642, + "grad_norm": 0.1989864706993103, + "learning_rate": 3.438378872532806e-05, + "loss": 0.5219, + "step": 8588 + }, + { + "epoch": 1.7656490903484428, + "grad_norm": 0.17310731112957, + "learning_rate": 3.43740828090289e-05, + "loss": 0.506, + "step": 8589 + }, + { + "epoch": 1.7658546613218213, + "grad_norm": 0.16371743381023407, + "learning_rate": 3.4364377416219e-05, + "loss": 0.5388, + "step": 8590 + }, + { + "epoch": 1.7660602322952, + "grad_norm": 0.19252368807792664, + "learning_rate": 3.4354672547376524e-05, + "loss": 0.5251, + "step": 8591 + }, + { + "epoch": 1.7662658032685785, + "grad_norm": 0.1986730545759201, + "learning_rate": 3.4344968202979584e-05, + "loss": 0.5353, + "step": 8592 + }, + { + "epoch": 1.766471374241957, + "grad_norm": 0.22330817580223083, + "learning_rate": 3.433526438350625e-05, + "loss": 0.5355, + "step": 8593 + }, + { + "epoch": 1.7666769452153357, + "grad_norm": 0.19446399807929993, + "learning_rate": 3.43255610894346e-05, + "loss": 0.518, + "step": 8594 + }, + { + "epoch": 1.7668825161887143, + "grad_norm": 0.19539190828800201, + "learning_rate": 3.431585832124266e-05, + "loss": 0.5334, + "step": 8595 + }, + { + "epoch": 1.7670880871620929, + "grad_norm": 0.20236273109912872, + "learning_rate": 3.430615607940844e-05, + "loss": 0.5315, + "step": 8596 + }, + { + "epoch": 1.7672936581354712, + "grad_norm": 0.1652330905199051, + "learning_rate": 3.429645436440991e-05, + "loss": 0.5177, + "step": 8597 + }, + { + "epoch": 1.7674992291088498, + "grad_norm": 0.16170786321163177, + "learning_rate": 3.428675317672507e-05, + "loss": 0.508, + "step": 8598 + }, + { + "epoch": 1.7677048000822284, + "grad_norm": 0.1644188016653061, + "learning_rate": 3.427705251683182e-05, + "loss": 0.5064, + "step": 8599 + }, + { + "epoch": 1.767910371055607, + "grad_norm": 0.1265815794467926, + "learning_rate": 3.4267352385208086e-05, + "loss": 0.4951, + "step": 8600 + }, + { + "epoch": 1.7681159420289854, + "grad_norm": 0.16070230305194855, + "learning_rate": 3.425765278233172e-05, + "loss": 0.5369, + "step": 8601 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 0.19323338568210602, + "learning_rate": 3.42479537086806e-05, + "loss": 0.5374, + "step": 8602 + }, + { + "epoch": 1.7685270839757425, + "grad_norm": 0.19410564005374908, + "learning_rate": 3.423825516473254e-05, + "loss": 0.5405, + "step": 8603 + }, + { + "epoch": 1.7687326549491211, + "grad_norm": 0.19003941118717194, + "learning_rate": 3.422855715096534e-05, + "loss": 0.5468, + "step": 8604 + }, + { + "epoch": 1.7689382259224997, + "grad_norm": 0.19323213398456573, + "learning_rate": 3.421885966785679e-05, + "loss": 0.5257, + "step": 8605 + }, + { + "epoch": 1.7691437968958783, + "grad_norm": 0.1951300948858261, + "learning_rate": 3.420916271588464e-05, + "loss": 0.5289, + "step": 8606 + }, + { + "epoch": 1.769349367869257, + "grad_norm": 0.17118534445762634, + "learning_rate": 3.419946629552661e-05, + "loss": 0.5018, + "step": 8607 + }, + { + "epoch": 1.7695549388426355, + "grad_norm": 0.18546664714813232, + "learning_rate": 3.418977040726039e-05, + "loss": 0.5171, + "step": 8608 + }, + { + "epoch": 1.769760509816014, + "grad_norm": 0.2011442631483078, + "learning_rate": 3.418007505156365e-05, + "loss": 0.5485, + "step": 8609 + }, + { + "epoch": 1.7699660807893927, + "grad_norm": 0.20571096241474152, + "learning_rate": 3.417038022891405e-05, + "loss": 0.5286, + "step": 8610 + }, + { + "epoch": 1.7701716517627712, + "grad_norm": 0.19118675589561462, + "learning_rate": 3.416068593978917e-05, + "loss": 0.5185, + "step": 8611 + }, + { + "epoch": 1.7703772227361496, + "grad_norm": 0.19263485074043274, + "learning_rate": 3.415099218466666e-05, + "loss": 0.5285, + "step": 8612 + }, + { + "epoch": 1.7705827937095282, + "grad_norm": 0.15957173705101013, + "learning_rate": 3.4141298964024046e-05, + "loss": 0.4959, + "step": 8613 + }, + { + "epoch": 1.7707883646829068, + "grad_norm": 0.13335727155208588, + "learning_rate": 3.4131606278338875e-05, + "loss": 0.5116, + "step": 8614 + }, + { + "epoch": 1.7709939356562854, + "grad_norm": 0.15437600016593933, + "learning_rate": 3.412191412808868e-05, + "loss": 0.545, + "step": 8615 + }, + { + "epoch": 1.7711995066296637, + "grad_norm": 0.19646428525447845, + "learning_rate": 3.411222251375092e-05, + "loss": 0.5433, + "step": 8616 + }, + { + "epoch": 1.7714050776030423, + "grad_norm": 0.21242785453796387, + "learning_rate": 3.410253143580307e-05, + "loss": 0.546, + "step": 8617 + }, + { + "epoch": 1.771610648576421, + "grad_norm": 0.19566522538661957, + "learning_rate": 3.4092840894722545e-05, + "loss": 0.5379, + "step": 8618 + }, + { + "epoch": 1.7718162195497995, + "grad_norm": 0.19648124277591705, + "learning_rate": 3.40831508909868e-05, + "loss": 0.566, + "step": 8619 + }, + { + "epoch": 1.772021790523178, + "grad_norm": 0.17227233946323395, + "learning_rate": 3.407346142507317e-05, + "loss": 0.5122, + "step": 8620 + }, + { + "epoch": 1.7722273614965567, + "grad_norm": 0.1685340255498886, + "learning_rate": 3.406377249745902e-05, + "loss": 0.5275, + "step": 8621 + }, + { + "epoch": 1.7724329324699353, + "grad_norm": 0.19663850963115692, + "learning_rate": 3.4054084108621695e-05, + "loss": 0.5189, + "step": 8622 + }, + { + "epoch": 1.7726385034433139, + "grad_norm": 0.19546058773994446, + "learning_rate": 3.4044396259038475e-05, + "loss": 0.5577, + "step": 8623 + }, + { + "epoch": 1.7728440744166925, + "grad_norm": 0.19245782494544983, + "learning_rate": 3.4034708949186655e-05, + "loss": 0.5378, + "step": 8624 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 0.191994771361351, + "learning_rate": 3.402502217954346e-05, + "loss": 0.5207, + "step": 8625 + }, + { + "epoch": 1.7732552163634496, + "grad_norm": 0.19610409438610077, + "learning_rate": 3.401533595058612e-05, + "loss": 0.5512, + "step": 8626 + }, + { + "epoch": 1.773460787336828, + "grad_norm": 0.19996674358844757, + "learning_rate": 3.400565026279186e-05, + "loss": 0.5401, + "step": 8627 + }, + { + "epoch": 1.7736663583102066, + "grad_norm": 0.19628667831420898, + "learning_rate": 3.3995965116637814e-05, + "loss": 0.5596, + "step": 8628 + }, + { + "epoch": 1.7738719292835852, + "grad_norm": 0.2043389528989792, + "learning_rate": 3.398628051260114e-05, + "loss": 0.545, + "step": 8629 + }, + { + "epoch": 1.7740775002569638, + "grad_norm": 0.1604812741279602, + "learning_rate": 3.397659645115894e-05, + "loss": 0.4899, + "step": 8630 + }, + { + "epoch": 1.7742830712303421, + "grad_norm": 0.15960481762886047, + "learning_rate": 3.396691293278831e-05, + "loss": 0.5164, + "step": 8631 + }, + { + "epoch": 1.7744886422037207, + "grad_norm": 0.1938653588294983, + "learning_rate": 3.395722995796629e-05, + "loss": 0.537, + "step": 8632 + }, + { + "epoch": 1.7746942131770993, + "grad_norm": 0.202159583568573, + "learning_rate": 3.3947547527169964e-05, + "loss": 0.5427, + "step": 8633 + }, + { + "epoch": 1.7748997841504779, + "grad_norm": 0.19612862169742584, + "learning_rate": 3.3937865640876305e-05, + "loss": 0.5301, + "step": 8634 + }, + { + "epoch": 1.7751053551238565, + "grad_norm": 0.20528623461723328, + "learning_rate": 3.39281842995623e-05, + "loss": 0.5452, + "step": 8635 + }, + { + "epoch": 1.775310926097235, + "grad_norm": 0.16790783405303955, + "learning_rate": 3.3918503503704905e-05, + "loss": 0.5233, + "step": 8636 + }, + { + "epoch": 1.7755164970706137, + "grad_norm": 0.12118061631917953, + "learning_rate": 3.390882325378105e-05, + "loss": 0.5104, + "step": 8637 + }, + { + "epoch": 1.7757220680439922, + "grad_norm": 0.1575068235397339, + "learning_rate": 3.389914355026764e-05, + "loss": 0.5293, + "step": 8638 + }, + { + "epoch": 1.7759276390173708, + "grad_norm": 0.16683286428451538, + "learning_rate": 3.3889464393641516e-05, + "loss": 0.5083, + "step": 8639 + }, + { + "epoch": 1.7761332099907494, + "grad_norm": 0.15598100423812866, + "learning_rate": 3.387978578437957e-05, + "loss": 0.5133, + "step": 8640 + }, + { + "epoch": 1.776338780964128, + "grad_norm": 0.202442929148674, + "learning_rate": 3.387010772295861e-05, + "loss": 0.5476, + "step": 8641 + }, + { + "epoch": 1.7765443519375064, + "grad_norm": 0.16517791152000427, + "learning_rate": 3.3860430209855415e-05, + "loss": 0.504, + "step": 8642 + }, + { + "epoch": 1.776749922910885, + "grad_norm": 0.18405590951442719, + "learning_rate": 3.3850753245546756e-05, + "loss": 0.5372, + "step": 8643 + }, + { + "epoch": 1.7769554938842635, + "grad_norm": 0.16793282330036163, + "learning_rate": 3.384107683050938e-05, + "loss": 0.5214, + "step": 8644 + }, + { + "epoch": 1.7771610648576421, + "grad_norm": 0.1541900336742401, + "learning_rate": 3.383140096521997e-05, + "loss": 0.5294, + "step": 8645 + }, + { + "epoch": 1.7773666358310205, + "grad_norm": 0.22302818298339844, + "learning_rate": 3.3821725650155247e-05, + "loss": 0.5564, + "step": 8646 + }, + { + "epoch": 1.777572206804399, + "grad_norm": 0.1978428214788437, + "learning_rate": 3.381205088579185e-05, + "loss": 0.5236, + "step": 8647 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.1918904036283493, + "learning_rate": 3.380237667260642e-05, + "loss": 0.5082, + "step": 8648 + }, + { + "epoch": 1.7779833487511563, + "grad_norm": 0.1877550482749939, + "learning_rate": 3.379270301107555e-05, + "loss": 0.5303, + "step": 8649 + }, + { + "epoch": 1.7781889197245349, + "grad_norm": 0.1972031146287918, + "learning_rate": 3.3783029901675826e-05, + "loss": 0.5553, + "step": 8650 + }, + { + "epoch": 1.7783944906979134, + "grad_norm": 0.19708669185638428, + "learning_rate": 3.377335734488379e-05, + "loss": 0.5414, + "step": 8651 + }, + { + "epoch": 1.778600061671292, + "grad_norm": 0.1898190975189209, + "learning_rate": 3.376368534117595e-05, + "loss": 0.5227, + "step": 8652 + }, + { + "epoch": 1.7788056326446706, + "grad_norm": 0.19257086515426636, + "learning_rate": 3.3754013891028826e-05, + "loss": 0.5288, + "step": 8653 + }, + { + "epoch": 1.7790112036180492, + "grad_norm": 0.20055457949638367, + "learning_rate": 3.374434299491888e-05, + "loss": 0.5422, + "step": 8654 + }, + { + "epoch": 1.7792167745914278, + "grad_norm": 0.19281229376792908, + "learning_rate": 3.373467265332254e-05, + "loss": 0.5286, + "step": 8655 + }, + { + "epoch": 1.7794223455648064, + "grad_norm": 0.16794267296791077, + "learning_rate": 3.372500286671622e-05, + "loss": 0.5123, + "step": 8656 + }, + { + "epoch": 1.779627916538185, + "grad_norm": 0.17391237616539001, + "learning_rate": 3.371533363557631e-05, + "loss": 0.5325, + "step": 8657 + }, + { + "epoch": 1.7798334875115633, + "grad_norm": 0.19158299267292023, + "learning_rate": 3.3705664960379176e-05, + "loss": 0.5259, + "step": 8658 + }, + { + "epoch": 1.780039058484942, + "grad_norm": 0.19506706297397614, + "learning_rate": 3.3695996841601114e-05, + "loss": 0.5304, + "step": 8659 + }, + { + "epoch": 1.7802446294583205, + "grad_norm": 0.19340963661670685, + "learning_rate": 3.3686329279718484e-05, + "loss": 0.5321, + "step": 8660 + }, + { + "epoch": 1.7804502004316989, + "grad_norm": 0.16678109765052795, + "learning_rate": 3.367666227520752e-05, + "loss": 0.4986, + "step": 8661 + }, + { + "epoch": 1.7806557714050775, + "grad_norm": 0.1627744436264038, + "learning_rate": 3.366699582854449e-05, + "loss": 0.5676, + "step": 8662 + }, + { + "epoch": 1.780861342378456, + "grad_norm": 0.2138591855764389, + "learning_rate": 3.365732994020559e-05, + "loss": 0.5439, + "step": 8663 + }, + { + "epoch": 1.7810669133518346, + "grad_norm": 0.197159543633461, + "learning_rate": 3.3647664610667036e-05, + "loss": 0.5174, + "step": 8664 + }, + { + "epoch": 1.7812724843252132, + "grad_norm": 0.16026876866817474, + "learning_rate": 3.363799984040499e-05, + "loss": 0.5147, + "step": 8665 + }, + { + "epoch": 1.7814780552985918, + "grad_norm": 0.1291634738445282, + "learning_rate": 3.3628335629895564e-05, + "loss": 0.5185, + "step": 8666 + }, + { + "epoch": 1.7816836262719704, + "grad_norm": 0.12628033757209778, + "learning_rate": 3.3618671979614906e-05, + "loss": 0.5011, + "step": 8667 + }, + { + "epoch": 1.781889197245349, + "grad_norm": 0.16729123890399933, + "learning_rate": 3.360900889003909e-05, + "loss": 0.5241, + "step": 8668 + }, + { + "epoch": 1.7820947682187276, + "grad_norm": 0.199641615152359, + "learning_rate": 3.3599346361644154e-05, + "loss": 0.5325, + "step": 8669 + }, + { + "epoch": 1.7823003391921062, + "grad_norm": 0.1920914649963379, + "learning_rate": 3.3589684394906144e-05, + "loss": 0.5149, + "step": 8670 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 0.19104242324829102, + "learning_rate": 3.358002299030105e-05, + "loss": 0.5151, + "step": 8671 + }, + { + "epoch": 1.7827114811388634, + "grad_norm": 0.1941048502922058, + "learning_rate": 3.3570362148304846e-05, + "loss": 0.5251, + "step": 8672 + }, + { + "epoch": 1.7829170521122417, + "grad_norm": 0.18796855211257935, + "learning_rate": 3.356070186939346e-05, + "loss": 0.5222, + "step": 8673 + }, + { + "epoch": 1.7831226230856203, + "grad_norm": 0.1918378323316574, + "learning_rate": 3.355104215404284e-05, + "loss": 0.5433, + "step": 8674 + }, + { + "epoch": 1.783328194058999, + "grad_norm": 0.19748041033744812, + "learning_rate": 3.354138300272887e-05, + "loss": 0.5324, + "step": 8675 + }, + { + "epoch": 1.7835337650323775, + "grad_norm": 0.20158739387989044, + "learning_rate": 3.35317244159274e-05, + "loss": 0.5326, + "step": 8676 + }, + { + "epoch": 1.7837393360057558, + "grad_norm": 0.19620271027088165, + "learning_rate": 3.352206639411426e-05, + "loss": 0.5195, + "step": 8677 + }, + { + "epoch": 1.7839449069791344, + "grad_norm": 0.2002163529396057, + "learning_rate": 3.3512408937765256e-05, + "loss": 0.5183, + "step": 8678 + }, + { + "epoch": 1.784150477952513, + "grad_norm": 0.16237804293632507, + "learning_rate": 3.350275204735618e-05, + "loss": 0.5099, + "step": 8679 + }, + { + "epoch": 1.7843560489258916, + "grad_norm": 0.16660307347774506, + "learning_rate": 3.349309572336276e-05, + "loss": 0.5543, + "step": 8680 + }, + { + "epoch": 1.7845616198992702, + "grad_norm": 0.2031785398721695, + "learning_rate": 3.3483439966260734e-05, + "loss": 0.5341, + "step": 8681 + }, + { + "epoch": 1.7847671908726488, + "grad_norm": 0.19732213020324707, + "learning_rate": 3.34737847765258e-05, + "loss": 0.5229, + "step": 8682 + }, + { + "epoch": 1.7849727618460274, + "grad_norm": 0.20520782470703125, + "learning_rate": 3.3464130154633616e-05, + "loss": 0.5356, + "step": 8683 + }, + { + "epoch": 1.785178332819406, + "grad_norm": 0.1953929215669632, + "learning_rate": 3.345447610105983e-05, + "loss": 0.546, + "step": 8684 + }, + { + "epoch": 1.7853839037927846, + "grad_norm": 0.19621872901916504, + "learning_rate": 3.344482261628003e-05, + "loss": 0.5344, + "step": 8685 + }, + { + "epoch": 1.7855894747661631, + "grad_norm": 0.18925665318965912, + "learning_rate": 3.3435169700769815e-05, + "loss": 0.5191, + "step": 8686 + }, + { + "epoch": 1.7857950457395417, + "grad_norm": 0.16739846765995026, + "learning_rate": 3.3425517355004715e-05, + "loss": 0.5152, + "step": 8687 + }, + { + "epoch": 1.78600061671292, + "grad_norm": 0.198704332113266, + "learning_rate": 3.3415865579460305e-05, + "loss": 0.5538, + "step": 8688 + }, + { + "epoch": 1.7862061876862987, + "grad_norm": 0.24151252210140228, + "learning_rate": 3.340621437461204e-05, + "loss": 0.5001, + "step": 8689 + }, + { + "epoch": 1.7864117586596773, + "grad_norm": 0.20363937318325043, + "learning_rate": 3.3396563740935406e-05, + "loss": 0.5489, + "step": 8690 + }, + { + "epoch": 1.7866173296330559, + "grad_norm": 0.19773469865322113, + "learning_rate": 3.338691367890584e-05, + "loss": 0.5149, + "step": 8691 + }, + { + "epoch": 1.7868229006064342, + "grad_norm": 0.1598690301179886, + "learning_rate": 3.3377264188998764e-05, + "loss": 0.516, + "step": 8692 + }, + { + "epoch": 1.7870284715798128, + "grad_norm": 0.12722936272621155, + "learning_rate": 3.3367615271689555e-05, + "loss": 0.5108, + "step": 8693 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.15668757259845734, + "learning_rate": 3.335796692745356e-05, + "loss": 0.5482, + "step": 8694 + }, + { + "epoch": 1.78743961352657, + "grad_norm": 0.20184186100959778, + "learning_rate": 3.3348319156766126e-05, + "loss": 0.5621, + "step": 8695 + }, + { + "epoch": 1.7876451844999486, + "grad_norm": 0.20065537095069885, + "learning_rate": 3.333867196010255e-05, + "loss": 0.5341, + "step": 8696 + }, + { + "epoch": 1.7878507554733272, + "grad_norm": 0.19662179052829742, + "learning_rate": 3.3329025337938106e-05, + "loss": 0.5398, + "step": 8697 + }, + { + "epoch": 1.7880563264467058, + "grad_norm": 0.1874885857105255, + "learning_rate": 3.331937929074804e-05, + "loss": 0.53, + "step": 8698 + }, + { + "epoch": 1.7882618974200843, + "grad_norm": 0.19309687614440918, + "learning_rate": 3.330973381900754e-05, + "loss": 0.5446, + "step": 8699 + }, + { + "epoch": 1.788467468393463, + "grad_norm": 0.1995777040719986, + "learning_rate": 3.330008892319183e-05, + "loss": 0.5365, + "step": 8700 + }, + { + "epoch": 1.7886730393668415, + "grad_norm": 0.20065602660179138, + "learning_rate": 3.3290444603776045e-05, + "loss": 0.5344, + "step": 8701 + }, + { + "epoch": 1.78887861034022, + "grad_norm": 0.19749754667282104, + "learning_rate": 3.328080086123532e-05, + "loss": 0.5441, + "step": 8702 + }, + { + "epoch": 1.7890841813135985, + "grad_norm": 0.1977740079164505, + "learning_rate": 3.3271157696044774e-05, + "loss": 0.4977, + "step": 8703 + }, + { + "epoch": 1.789289752286977, + "grad_norm": 0.19205152988433838, + "learning_rate": 3.3261515108679465e-05, + "loss": 0.5375, + "step": 8704 + }, + { + "epoch": 1.7894953232603557, + "grad_norm": 0.19639678299427032, + "learning_rate": 3.325187309961445e-05, + "loss": 0.5507, + "step": 8705 + }, + { + "epoch": 1.7897008942337342, + "grad_norm": 0.17259171605110168, + "learning_rate": 3.3242231669324727e-05, + "loss": 0.5051, + "step": 8706 + }, + { + "epoch": 1.7899064652071126, + "grad_norm": 0.13026978075504303, + "learning_rate": 3.323259081828529e-05, + "loss": 0.5009, + "step": 8707 + }, + { + "epoch": 1.7901120361804912, + "grad_norm": 0.16744323074817657, + "learning_rate": 3.322295054697109e-05, + "loss": 0.5441, + "step": 8708 + }, + { + "epoch": 1.7903176071538698, + "grad_norm": 0.1634387969970703, + "learning_rate": 3.3213310855857096e-05, + "loss": 0.5119, + "step": 8709 + }, + { + "epoch": 1.7905231781272484, + "grad_norm": 0.15612611174583435, + "learning_rate": 3.3203671745418175e-05, + "loss": 0.5536, + "step": 8710 + }, + { + "epoch": 1.790728749100627, + "grad_norm": 0.23640312254428864, + "learning_rate": 3.31940332161292e-05, + "loss": 0.5475, + "step": 8711 + }, + { + "epoch": 1.7909343200740055, + "grad_norm": 0.19410596787929535, + "learning_rate": 3.318439526846505e-05, + "loss": 0.5559, + "step": 8712 + }, + { + "epoch": 1.7911398910473841, + "grad_norm": 0.19222721457481384, + "learning_rate": 3.317475790290051e-05, + "loss": 0.5342, + "step": 8713 + }, + { + "epoch": 1.7913454620207627, + "grad_norm": 0.17586657404899597, + "learning_rate": 3.316512111991038e-05, + "loss": 0.4957, + "step": 8714 + }, + { + "epoch": 1.7915510329941413, + "grad_norm": 0.1588892936706543, + "learning_rate": 3.31554849199694e-05, + "loss": 0.5331, + "step": 8715 + }, + { + "epoch": 1.79175660396752, + "grad_norm": 0.202567458152771, + "learning_rate": 3.3145849303552333e-05, + "loss": 0.5418, + "step": 8716 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 0.19263319671154022, + "learning_rate": 3.3136214271133865e-05, + "loss": 0.5144, + "step": 8717 + }, + { + "epoch": 1.7921677459142769, + "grad_norm": 0.17071235179901123, + "learning_rate": 3.312657982318866e-05, + "loss": 0.5028, + "step": 8718 + }, + { + "epoch": 1.7923733168876554, + "grad_norm": 0.16436687111854553, + "learning_rate": 3.311694596019138e-05, + "loss": 0.5452, + "step": 8719 + }, + { + "epoch": 1.792578887861034, + "grad_norm": 0.16867224872112274, + "learning_rate": 3.310731268261662e-05, + "loss": 0.5006, + "step": 8720 + }, + { + "epoch": 1.7927844588344126, + "grad_norm": 0.16634447872638702, + "learning_rate": 3.3097679990938975e-05, + "loss": 0.5141, + "step": 8721 + }, + { + "epoch": 1.792990029807791, + "grad_norm": 0.16255205869674683, + "learning_rate": 3.308804788563302e-05, + "loss": 0.5167, + "step": 8722 + }, + { + "epoch": 1.7931956007811696, + "grad_norm": 0.12086722999811172, + "learning_rate": 3.307841636717326e-05, + "loss": 0.5256, + "step": 8723 + }, + { + "epoch": 1.7934011717545482, + "grad_norm": 0.16073068976402283, + "learning_rate": 3.3068785436034214e-05, + "loss": 0.522, + "step": 8724 + }, + { + "epoch": 1.7936067427279268, + "grad_norm": 0.19669640064239502, + "learning_rate": 3.305915509269034e-05, + "loss": 0.5543, + "step": 8725 + }, + { + "epoch": 1.7938123137013053, + "grad_norm": 0.17151986062526703, + "learning_rate": 3.304952533761608e-05, + "loss": 0.5286, + "step": 8726 + }, + { + "epoch": 1.794017884674684, + "grad_norm": 0.15375934541225433, + "learning_rate": 3.303989617128586e-05, + "loss": 0.5302, + "step": 8727 + }, + { + "epoch": 1.7942234556480625, + "grad_norm": 0.19120700657367706, + "learning_rate": 3.303026759417403e-05, + "loss": 0.5134, + "step": 8728 + }, + { + "epoch": 1.794429026621441, + "grad_norm": 0.15886104106903076, + "learning_rate": 3.302063960675498e-05, + "loss": 0.4965, + "step": 8729 + }, + { + "epoch": 1.7946345975948197, + "grad_norm": 0.1626490205526352, + "learning_rate": 3.3011012209503034e-05, + "loss": 0.5338, + "step": 8730 + }, + { + "epoch": 1.7948401685681983, + "grad_norm": 0.20152784883975983, + "learning_rate": 3.300138540289248e-05, + "loss": 0.5339, + "step": 8731 + }, + { + "epoch": 1.7950457395415769, + "grad_norm": 0.1927708387374878, + "learning_rate": 3.2991759187397575e-05, + "loss": 0.5188, + "step": 8732 + }, + { + "epoch": 1.7952513105149552, + "grad_norm": 0.16235662996768951, + "learning_rate": 3.2982133563492586e-05, + "loss": 0.4898, + "step": 8733 + }, + { + "epoch": 1.7954568814883338, + "grad_norm": 0.1614857167005539, + "learning_rate": 3.2972508531651686e-05, + "loss": 0.5315, + "step": 8734 + }, + { + "epoch": 1.7956624524617124, + "grad_norm": 0.19275487959384918, + "learning_rate": 3.2962884092349074e-05, + "loss": 0.532, + "step": 8735 + }, + { + "epoch": 1.795868023435091, + "grad_norm": 0.1601790487766266, + "learning_rate": 3.295326024605891e-05, + "loss": 0.4982, + "step": 8736 + }, + { + "epoch": 1.7960735944084694, + "grad_norm": 0.16727516055107117, + "learning_rate": 3.2943636993255316e-05, + "loss": 0.5415, + "step": 8737 + }, + { + "epoch": 1.796279165381848, + "grad_norm": 0.19914865493774414, + "learning_rate": 3.293401433441237e-05, + "loss": 0.507, + "step": 8738 + }, + { + "epoch": 1.7964847363552265, + "grad_norm": 0.20361186563968658, + "learning_rate": 3.2924392270004136e-05, + "loss": 0.5369, + "step": 8739 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 0.19120000302791595, + "learning_rate": 3.2914770800504665e-05, + "loss": 0.5204, + "step": 8740 + }, + { + "epoch": 1.7968958783019837, + "grad_norm": 0.19319793581962585, + "learning_rate": 3.2905149926387946e-05, + "loss": 0.5346, + "step": 8741 + }, + { + "epoch": 1.7971014492753623, + "grad_norm": 0.19608697295188904, + "learning_rate": 3.289552964812793e-05, + "loss": 0.528, + "step": 8742 + }, + { + "epoch": 1.797307020248741, + "grad_norm": 0.19638018310070038, + "learning_rate": 3.2885909966198625e-05, + "loss": 0.5554, + "step": 8743 + }, + { + "epoch": 1.7975125912221195, + "grad_norm": 0.19244056940078735, + "learning_rate": 3.28762908810739e-05, + "loss": 0.5331, + "step": 8744 + }, + { + "epoch": 1.797718162195498, + "grad_norm": 0.1934266835451126, + "learning_rate": 3.2866672393227665e-05, + "loss": 0.523, + "step": 8745 + }, + { + "epoch": 1.7979237331688767, + "grad_norm": 0.1938556432723999, + "learning_rate": 3.285705450313377e-05, + "loss": 0.5195, + "step": 8746 + }, + { + "epoch": 1.7981293041422552, + "grad_norm": 0.1935025453567505, + "learning_rate": 3.284743721126605e-05, + "loss": 0.5507, + "step": 8747 + }, + { + "epoch": 1.7983348751156338, + "grad_norm": 0.16398179531097412, + "learning_rate": 3.2837820518098294e-05, + "loss": 0.5202, + "step": 8748 + }, + { + "epoch": 1.7985404460890122, + "grad_norm": 0.13340577483177185, + "learning_rate": 3.2828204424104256e-05, + "loss": 0.5141, + "step": 8749 + }, + { + "epoch": 1.7987460170623908, + "grad_norm": 0.16972105205059052, + "learning_rate": 3.2818588929757714e-05, + "loss": 0.5168, + "step": 8750 + }, + { + "epoch": 1.7989515880357694, + "grad_norm": 0.19210562109947205, + "learning_rate": 3.2808974035532354e-05, + "loss": 0.5188, + "step": 8751 + }, + { + "epoch": 1.7991571590091477, + "grad_norm": 0.19385598599910736, + "learning_rate": 3.279935974190187e-05, + "loss": 0.5101, + "step": 8752 + }, + { + "epoch": 1.7993627299825263, + "grad_norm": 0.20557603240013123, + "learning_rate": 3.278974604933991e-05, + "loss": 0.5596, + "step": 8753 + }, + { + "epoch": 1.799568300955905, + "grad_norm": 0.17109614610671997, + "learning_rate": 3.2780132958320075e-05, + "loss": 0.5028, + "step": 8754 + }, + { + "epoch": 1.7997738719292835, + "grad_norm": 0.1690118908882141, + "learning_rate": 3.277052046931598e-05, + "loss": 0.5425, + "step": 8755 + }, + { + "epoch": 1.799979442902662, + "grad_norm": 0.20591634511947632, + "learning_rate": 3.276090858280118e-05, + "loss": 0.5446, + "step": 8756 + }, + { + "epoch": 1.8001850138760407, + "grad_norm": 0.19887815415859222, + "learning_rate": 3.27512972992492e-05, + "loss": 0.557, + "step": 8757 + }, + { + "epoch": 1.8003905848494193, + "grad_norm": 0.1681029200553894, + "learning_rate": 3.274168661913357e-05, + "loss": 0.5276, + "step": 8758 + }, + { + "epoch": 1.8005961558227979, + "grad_norm": 0.1590951383113861, + "learning_rate": 3.273207654292774e-05, + "loss": 0.5437, + "step": 8759 + }, + { + "epoch": 1.8008017267961764, + "grad_norm": 0.19525672495365143, + "learning_rate": 3.272246707110516e-05, + "loss": 0.5255, + "step": 8760 + }, + { + "epoch": 1.801007297769555, + "grad_norm": 0.19866180419921875, + "learning_rate": 3.271285820413924e-05, + "loss": 0.5169, + "step": 8761 + }, + { + "epoch": 1.8012128687429336, + "grad_norm": 0.18859532475471497, + "learning_rate": 3.270324994250337e-05, + "loss": 0.5348, + "step": 8762 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 0.19412027299404144, + "learning_rate": 3.2693642286670884e-05, + "loss": 0.5319, + "step": 8763 + }, + { + "epoch": 1.8016240106896906, + "grad_norm": 0.1961035430431366, + "learning_rate": 3.2684035237115134e-05, + "loss": 0.5443, + "step": 8764 + }, + { + "epoch": 1.8018295816630692, + "grad_norm": 0.19343046844005585, + "learning_rate": 3.2674428794309405e-05, + "loss": 0.5361, + "step": 8765 + }, + { + "epoch": 1.8020351526364478, + "grad_norm": 0.1937210112810135, + "learning_rate": 3.266482295872695e-05, + "loss": 0.5418, + "step": 8766 + }, + { + "epoch": 1.8022407236098263, + "grad_norm": 0.19837218523025513, + "learning_rate": 3.265521773084103e-05, + "loss": 0.5194, + "step": 8767 + }, + { + "epoch": 1.8024462945832047, + "grad_norm": 0.19206029176712036, + "learning_rate": 3.264561311112483e-05, + "loss": 0.535, + "step": 8768 + }, + { + "epoch": 1.8026518655565833, + "grad_norm": 0.19339123368263245, + "learning_rate": 3.263600910005152e-05, + "loss": 0.509, + "step": 8769 + }, + { + "epoch": 1.8028574365299619, + "grad_norm": 0.16199949383735657, + "learning_rate": 3.262640569809424e-05, + "loss": 0.4946, + "step": 8770 + }, + { + "epoch": 1.8030630075033405, + "grad_norm": 0.1284702867269516, + "learning_rate": 3.261680290572613e-05, + "loss": 0.5095, + "step": 8771 + }, + { + "epoch": 1.803268578476719, + "grad_norm": 0.16122448444366455, + "learning_rate": 3.2607200723420274e-05, + "loss": 0.5157, + "step": 8772 + }, + { + "epoch": 1.8034741494500977, + "grad_norm": 0.1663200855255127, + "learning_rate": 3.259759915164971e-05, + "loss": 0.5245, + "step": 8773 + }, + { + "epoch": 1.8036797204234762, + "grad_norm": 0.15676259994506836, + "learning_rate": 3.258799819088746e-05, + "loss": 0.5267, + "step": 8774 + }, + { + "epoch": 1.8038852913968548, + "grad_norm": 0.20118048787117004, + "learning_rate": 3.257839784160652e-05, + "loss": 0.5446, + "step": 8775 + }, + { + "epoch": 1.8040908623702334, + "grad_norm": 0.198233500123024, + "learning_rate": 3.256879810427987e-05, + "loss": 0.5508, + "step": 8776 + }, + { + "epoch": 1.804296433343612, + "grad_norm": 0.20336700975894928, + "learning_rate": 3.255919897938043e-05, + "loss": 0.5365, + "step": 8777 + }, + { + "epoch": 1.8045020043169906, + "grad_norm": 0.16567686200141907, + "learning_rate": 3.2549600467381096e-05, + "loss": 0.5033, + "step": 8778 + }, + { + "epoch": 1.804707575290369, + "grad_norm": 0.16237860918045044, + "learning_rate": 3.2540002568754776e-05, + "loss": 0.5379, + "step": 8779 + }, + { + "epoch": 1.8049131462637475, + "grad_norm": 0.19675461947917938, + "learning_rate": 3.2530405283974284e-05, + "loss": 0.5328, + "step": 8780 + }, + { + "epoch": 1.8051187172371261, + "grad_norm": 0.19670812785625458, + "learning_rate": 3.2520808613512446e-05, + "loss": 0.5439, + "step": 8781 + }, + { + "epoch": 1.8053242882105047, + "grad_norm": 0.19091184437274933, + "learning_rate": 3.2511212557842036e-05, + "loss": 0.5355, + "step": 8782 + }, + { + "epoch": 1.805529859183883, + "grad_norm": 0.19207298755645752, + "learning_rate": 3.250161711743581e-05, + "loss": 0.5185, + "step": 8783 + }, + { + "epoch": 1.8057354301572617, + "grad_norm": 0.19454807043075562, + "learning_rate": 3.2492022292766476e-05, + "loss": 0.5098, + "step": 8784 + }, + { + "epoch": 1.8059410011306403, + "grad_norm": 0.1931590735912323, + "learning_rate": 3.248242808430676e-05, + "loss": 0.5535, + "step": 8785 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 0.19954814016819, + "learning_rate": 3.24728344925293e-05, + "loss": 0.5226, + "step": 8786 + }, + { + "epoch": 1.8063521430773974, + "grad_norm": 0.18844476342201233, + "learning_rate": 3.2463241517906725e-05, + "loss": 0.502, + "step": 8787 + }, + { + "epoch": 1.806557714050776, + "grad_norm": 0.20117510855197906, + "learning_rate": 3.245364916091166e-05, + "loss": 0.5558, + "step": 8788 + }, + { + "epoch": 1.8067632850241546, + "grad_norm": 0.16829055547714233, + "learning_rate": 3.244405742201665e-05, + "loss": 0.5025, + "step": 8789 + }, + { + "epoch": 1.8069688559975332, + "grad_norm": 0.16325733065605164, + "learning_rate": 3.243446630169425e-05, + "loss": 0.5352, + "step": 8790 + }, + { + "epoch": 1.8071744269709118, + "grad_norm": 0.2082752287387848, + "learning_rate": 3.242487580041695e-05, + "loss": 0.5324, + "step": 8791 + }, + { + "epoch": 1.8073799979442904, + "grad_norm": 0.19825617969036102, + "learning_rate": 3.2415285918657254e-05, + "loss": 0.5495, + "step": 8792 + }, + { + "epoch": 1.807585568917669, + "grad_norm": 0.19314411282539368, + "learning_rate": 3.24056966568876e-05, + "loss": 0.5279, + "step": 8793 + }, + { + "epoch": 1.8077911398910473, + "grad_norm": 0.18960903584957123, + "learning_rate": 3.2396108015580414e-05, + "loss": 0.5288, + "step": 8794 + }, + { + "epoch": 1.807996710864426, + "grad_norm": 0.1918916255235672, + "learning_rate": 3.2386519995208064e-05, + "loss": 0.5223, + "step": 8795 + }, + { + "epoch": 1.8082022818378045, + "grad_norm": 0.1647498458623886, + "learning_rate": 3.2376932596242916e-05, + "loss": 0.4907, + "step": 8796 + }, + { + "epoch": 1.808407852811183, + "grad_norm": 0.296530157327652, + "learning_rate": 3.236734581915732e-05, + "loss": 0.5286, + "step": 8797 + }, + { + "epoch": 1.8086134237845615, + "grad_norm": 0.20278790593147278, + "learning_rate": 3.235775966442352e-05, + "loss": 0.5266, + "step": 8798 + }, + { + "epoch": 1.80881899475794, + "grad_norm": 0.19407640397548676, + "learning_rate": 3.234817413251382e-05, + "loss": 0.5374, + "step": 8799 + }, + { + "epoch": 1.8090245657313186, + "grad_norm": 0.1970399171113968, + "learning_rate": 3.233858922390045e-05, + "loss": 0.5392, + "step": 8800 + }, + { + "epoch": 1.8092301367046972, + "grad_norm": 0.19362105429172516, + "learning_rate": 3.232900493905562e-05, + "loss": 0.5208, + "step": 8801 + }, + { + "epoch": 1.8094357076780758, + "grad_norm": 0.17109599709510803, + "learning_rate": 3.2319421278451495e-05, + "loss": 0.5229, + "step": 8802 + }, + { + "epoch": 1.8096412786514544, + "grad_norm": 0.16294682025909424, + "learning_rate": 3.230983824256021e-05, + "loss": 0.5206, + "step": 8803 + }, + { + "epoch": 1.809846849624833, + "grad_norm": 0.2055048942565918, + "learning_rate": 3.2300255831853856e-05, + "loss": 0.5383, + "step": 8804 + }, + { + "epoch": 1.8100524205982116, + "grad_norm": 0.1635579615831375, + "learning_rate": 3.229067404680456e-05, + "loss": 0.492, + "step": 8805 + }, + { + "epoch": 1.8102579915715902, + "grad_norm": 0.11798587441444397, + "learning_rate": 3.228109288788435e-05, + "loss": 0.493, + "step": 8806 + }, + { + "epoch": 1.8104635625449688, + "grad_norm": 0.16314202547073364, + "learning_rate": 3.227151235556525e-05, + "loss": 0.5462, + "step": 8807 + }, + { + "epoch": 1.8106691335183474, + "grad_norm": 0.19977326691150665, + "learning_rate": 3.2261932450319237e-05, + "loss": 0.5128, + "step": 8808 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 0.19161002337932587, + "learning_rate": 3.2252353172618275e-05, + "loss": 0.5166, + "step": 8809 + }, + { + "epoch": 1.8110802754651043, + "grad_norm": 0.19649870693683624, + "learning_rate": 3.2242774522934294e-05, + "loss": 0.5205, + "step": 8810 + }, + { + "epoch": 1.811285846438483, + "grad_norm": 0.19603271782398224, + "learning_rate": 3.2233196501739164e-05, + "loss": 0.5314, + "step": 8811 + }, + { + "epoch": 1.8114914174118615, + "grad_norm": 0.19260643422603607, + "learning_rate": 3.222361910950479e-05, + "loss": 0.4841, + "step": 8812 + }, + { + "epoch": 1.8116969883852398, + "grad_norm": 0.19248011708259583, + "learning_rate": 3.221404234670299e-05, + "loss": 0.5179, + "step": 8813 + }, + { + "epoch": 1.8119025593586184, + "grad_norm": 0.21127529442310333, + "learning_rate": 3.2204466213805556e-05, + "loss": 0.5432, + "step": 8814 + }, + { + "epoch": 1.812108130331997, + "grad_norm": 0.20110765099525452, + "learning_rate": 3.219489071128427e-05, + "loss": 0.5333, + "step": 8815 + }, + { + "epoch": 1.8123137013053756, + "grad_norm": 0.19356440007686615, + "learning_rate": 3.2185315839610864e-05, + "loss": 0.5167, + "step": 8816 + }, + { + "epoch": 1.8125192722787542, + "grad_norm": 0.19028930366039276, + "learning_rate": 3.217574159925706e-05, + "loss": 0.526, + "step": 8817 + }, + { + "epoch": 1.8127248432521328, + "grad_norm": 0.16638804972171783, + "learning_rate": 3.21661679906945e-05, + "loss": 0.5041, + "step": 8818 + }, + { + "epoch": 1.8129304142255114, + "grad_norm": 0.13588784635066986, + "learning_rate": 3.2156595014394874e-05, + "loss": 0.5016, + "step": 8819 + }, + { + "epoch": 1.81313598519889, + "grad_norm": 0.16222389042377472, + "learning_rate": 3.214702267082978e-05, + "loss": 0.5194, + "step": 8820 + }, + { + "epoch": 1.8133415561722686, + "grad_norm": 0.1995917111635208, + "learning_rate": 3.213745096047081e-05, + "loss": 0.5335, + "step": 8821 + }, + { + "epoch": 1.8135471271456471, + "grad_norm": 0.20120279490947723, + "learning_rate": 3.212787988378951e-05, + "loss": 0.5491, + "step": 8822 + }, + { + "epoch": 1.8137526981190257, + "grad_norm": 0.19153755903244019, + "learning_rate": 3.211830944125741e-05, + "loss": 0.5137, + "step": 8823 + }, + { + "epoch": 1.8139582690924043, + "grad_norm": 0.19813428819179535, + "learning_rate": 3.210873963334599e-05, + "loss": 0.5302, + "step": 8824 + }, + { + "epoch": 1.8141638400657827, + "grad_norm": 0.16227731108665466, + "learning_rate": 3.20991704605267e-05, + "loss": 0.5092, + "step": 8825 + }, + { + "epoch": 1.8143694110391613, + "grad_norm": 0.13235917687416077, + "learning_rate": 3.2089601923270996e-05, + "loss": 0.5202, + "step": 8826 + }, + { + "epoch": 1.8145749820125399, + "grad_norm": 0.16478480398654938, + "learning_rate": 3.208003402205027e-05, + "loss": 0.5245, + "step": 8827 + }, + { + "epoch": 1.8147805529859182, + "grad_norm": 0.19610688090324402, + "learning_rate": 3.207046675733587e-05, + "loss": 0.514, + "step": 8828 + }, + { + "epoch": 1.8149861239592968, + "grad_norm": 0.19508203864097595, + "learning_rate": 3.206090012959915e-05, + "loss": 0.5138, + "step": 8829 + }, + { + "epoch": 1.8151916949326754, + "grad_norm": 0.20295090973377228, + "learning_rate": 3.205133413931139e-05, + "loss": 0.5225, + "step": 8830 + }, + { + "epoch": 1.815397265906054, + "grad_norm": 0.17680945992469788, + "learning_rate": 3.204176878694388e-05, + "loss": 0.5177, + "step": 8831 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 0.16306188702583313, + "learning_rate": 3.203220407296784e-05, + "loss": 0.5388, + "step": 8832 + }, + { + "epoch": 1.8158084078528112, + "grad_norm": 0.22620275616645813, + "learning_rate": 3.2022639997854516e-05, + "loss": 0.5522, + "step": 8833 + }, + { + "epoch": 1.8160139788261898, + "grad_norm": 0.19404473900794983, + "learning_rate": 3.201307656207506e-05, + "loss": 0.5213, + "step": 8834 + }, + { + "epoch": 1.8162195497995683, + "grad_norm": 0.1629197895526886, + "learning_rate": 3.200351376610062e-05, + "loss": 0.4795, + "step": 8835 + }, + { + "epoch": 1.816425120772947, + "grad_norm": 0.16560040414333344, + "learning_rate": 3.199395161040231e-05, + "loss": 0.524, + "step": 8836 + }, + { + "epoch": 1.8166306917463255, + "grad_norm": 0.19243641197681427, + "learning_rate": 3.198439009545122e-05, + "loss": 0.5346, + "step": 8837 + }, + { + "epoch": 1.816836262719704, + "grad_norm": 0.16431495547294617, + "learning_rate": 3.1974829221718386e-05, + "loss": 0.5181, + "step": 8838 + }, + { + "epoch": 1.8170418336930827, + "grad_norm": 0.15657220780849457, + "learning_rate": 3.196526898967483e-05, + "loss": 0.5453, + "step": 8839 + }, + { + "epoch": 1.817247404666461, + "grad_norm": 0.1584593802690506, + "learning_rate": 3.1955709399791556e-05, + "loss": 0.4755, + "step": 8840 + }, + { + "epoch": 1.8174529756398397, + "grad_norm": 0.16270606219768524, + "learning_rate": 3.194615045253952e-05, + "loss": 0.5515, + "step": 8841 + }, + { + "epoch": 1.8176585466132182, + "grad_norm": 0.2008228451013565, + "learning_rate": 3.193659214838962e-05, + "loss": 0.5282, + "step": 8842 + }, + { + "epoch": 1.8178641175865966, + "grad_norm": 0.17246867716312408, + "learning_rate": 3.192703448781278e-05, + "loss": 0.5119, + "step": 8843 + }, + { + "epoch": 1.8180696885599752, + "grad_norm": 0.124653160572052, + "learning_rate": 3.1917477471279846e-05, + "loss": 0.509, + "step": 8844 + }, + { + "epoch": 1.8182752595333538, + "grad_norm": 0.1591031700372696, + "learning_rate": 3.1907921099261654e-05, + "loss": 0.5118, + "step": 8845 + }, + { + "epoch": 1.8184808305067324, + "grad_norm": 0.20176099240779877, + "learning_rate": 3.189836537222897e-05, + "loss": 0.5179, + "step": 8846 + }, + { + "epoch": 1.818686401480111, + "grad_norm": 0.19873826205730438, + "learning_rate": 3.1888810290652606e-05, + "loss": 0.5189, + "step": 8847 + }, + { + "epoch": 1.8188919724534895, + "grad_norm": 0.19450412690639496, + "learning_rate": 3.187925585500329e-05, + "loss": 0.5346, + "step": 8848 + }, + { + "epoch": 1.8190975434268681, + "grad_norm": 0.19407886266708374, + "learning_rate": 3.18697020657517e-05, + "loss": 0.5105, + "step": 8849 + }, + { + "epoch": 1.8193031144002467, + "grad_norm": 0.1899595409631729, + "learning_rate": 3.186014892336852e-05, + "loss": 0.5173, + "step": 8850 + }, + { + "epoch": 1.8195086853736253, + "grad_norm": 0.19497114419937134, + "learning_rate": 3.185059642832438e-05, + "loss": 0.5314, + "step": 8851 + }, + { + "epoch": 1.819714256347004, + "grad_norm": 0.18996943533420563, + "learning_rate": 3.184104458108991e-05, + "loss": 0.5371, + "step": 8852 + }, + { + "epoch": 1.8199198273203825, + "grad_norm": 0.20072153210639954, + "learning_rate": 3.1831493382135644e-05, + "loss": 0.5398, + "step": 8853 + }, + { + "epoch": 1.820125398293761, + "grad_norm": 0.18781349062919617, + "learning_rate": 3.182194283193216e-05, + "loss": 0.5452, + "step": 8854 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 0.19404610991477966, + "learning_rate": 3.181239293094997e-05, + "loss": 0.5322, + "step": 8855 + }, + { + "epoch": 1.820536540240518, + "grad_norm": 0.19891172647476196, + "learning_rate": 3.180284367965953e-05, + "loss": 0.5382, + "step": 8856 + }, + { + "epoch": 1.8207421112138966, + "grad_norm": 0.19251051545143127, + "learning_rate": 3.179329507853131e-05, + "loss": 0.5373, + "step": 8857 + }, + { + "epoch": 1.8209476821872752, + "grad_norm": 0.16882584989070892, + "learning_rate": 3.178374712803571e-05, + "loss": 0.5063, + "step": 8858 + }, + { + "epoch": 1.8211532531606536, + "grad_norm": 0.12265215069055557, + "learning_rate": 3.177419982864312e-05, + "loss": 0.5056, + "step": 8859 + }, + { + "epoch": 1.8213588241340322, + "grad_norm": 0.17124712467193604, + "learning_rate": 3.176465318082386e-05, + "loss": 0.5412, + "step": 8860 + }, + { + "epoch": 1.8215643951074107, + "grad_norm": 0.20257225632667542, + "learning_rate": 3.1755107185048296e-05, + "loss": 0.5302, + "step": 8861 + }, + { + "epoch": 1.8217699660807893, + "grad_norm": 0.2040639966726303, + "learning_rate": 3.17455618417867e-05, + "loss": 0.5245, + "step": 8862 + }, + { + "epoch": 1.821975537054168, + "grad_norm": 0.16710929572582245, + "learning_rate": 3.173601715150931e-05, + "loss": 0.5054, + "step": 8863 + }, + { + "epoch": 1.8221811080275465, + "grad_norm": 0.16893459856510162, + "learning_rate": 3.172647311468637e-05, + "loss": 0.5471, + "step": 8864 + }, + { + "epoch": 1.822386679000925, + "grad_norm": 0.208717480301857, + "learning_rate": 3.171692973178805e-05, + "loss": 0.5122, + "step": 8865 + }, + { + "epoch": 1.8225922499743037, + "grad_norm": 0.20160967111587524, + "learning_rate": 3.170738700328453e-05, + "loss": 0.537, + "step": 8866 + }, + { + "epoch": 1.8227978209476823, + "grad_norm": 0.20131829380989075, + "learning_rate": 3.16978449296459e-05, + "loss": 0.5303, + "step": 8867 + }, + { + "epoch": 1.8230033919210609, + "grad_norm": 0.1935744732618332, + "learning_rate": 3.168830351134229e-05, + "loss": 0.5284, + "step": 8868 + }, + { + "epoch": 1.8232089628944395, + "grad_norm": 0.16306568682193756, + "learning_rate": 3.167876274884375e-05, + "loss": 0.5169, + "step": 8869 + }, + { + "epoch": 1.8234145338678178, + "grad_norm": 0.17691202461719513, + "learning_rate": 3.166922264262031e-05, + "loss": 0.5176, + "step": 8870 + }, + { + "epoch": 1.8236201048411964, + "grad_norm": 0.1908418834209442, + "learning_rate": 3.165968319314196e-05, + "loss": 0.5321, + "step": 8871 + }, + { + "epoch": 1.823825675814575, + "grad_norm": 0.2010890543460846, + "learning_rate": 3.1650144400878655e-05, + "loss": 0.534, + "step": 8872 + }, + { + "epoch": 1.8240312467879536, + "grad_norm": 0.19382749497890472, + "learning_rate": 3.164060626630035e-05, + "loss": 0.528, + "step": 8873 + }, + { + "epoch": 1.824236817761332, + "grad_norm": 0.16458258032798767, + "learning_rate": 3.163106878987692e-05, + "loss": 0.5106, + "step": 8874 + }, + { + "epoch": 1.8244423887347105, + "grad_norm": 0.15716025233268738, + "learning_rate": 3.162153197207825e-05, + "loss": 0.5123, + "step": 8875 + }, + { + "epoch": 1.8246479597080891, + "grad_norm": 0.19664350152015686, + "learning_rate": 3.161199581337418e-05, + "loss": 0.5322, + "step": 8876 + }, + { + "epoch": 1.8248535306814677, + "grad_norm": 0.19546431303024292, + "learning_rate": 3.160246031423449e-05, + "loss": 0.5382, + "step": 8877 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 0.15982358157634735, + "learning_rate": 3.1592925475128965e-05, + "loss": 0.5175, + "step": 8878 + }, + { + "epoch": 1.825264672628225, + "grad_norm": 0.1651008278131485, + "learning_rate": 3.1583391296527345e-05, + "loss": 0.55, + "step": 8879 + }, + { + "epoch": 1.8254702436016035, + "grad_norm": 0.19452379643917084, + "learning_rate": 3.15738577788993e-05, + "loss": 0.5374, + "step": 8880 + }, + { + "epoch": 1.825675814574982, + "grad_norm": 0.1954164355993271, + "learning_rate": 3.1564324922714546e-05, + "loss": 0.5137, + "step": 8881 + }, + { + "epoch": 1.8258813855483607, + "grad_norm": 0.19587047398090363, + "learning_rate": 3.155479272844271e-05, + "loss": 0.5334, + "step": 8882 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.19509205222129822, + "learning_rate": 3.154526119655339e-05, + "loss": 0.5252, + "step": 8883 + }, + { + "epoch": 1.8262925274951178, + "grad_norm": 0.1942068487405777, + "learning_rate": 3.153573032751616e-05, + "loss": 0.487, + "step": 8884 + }, + { + "epoch": 1.8264980984684962, + "grad_norm": 0.19760790467262268, + "learning_rate": 3.152620012180057e-05, + "loss": 0.5228, + "step": 8885 + }, + { + "epoch": 1.8267036694418748, + "grad_norm": 0.20716530084609985, + "learning_rate": 3.151667057987612e-05, + "loss": 0.5397, + "step": 8886 + }, + { + "epoch": 1.8269092404152534, + "grad_norm": 0.17616085708141327, + "learning_rate": 3.1507141702212276e-05, + "loss": 0.5052, + "step": 8887 + }, + { + "epoch": 1.827114811388632, + "grad_norm": 0.16189555823802948, + "learning_rate": 3.149761348927851e-05, + "loss": 0.54, + "step": 8888 + }, + { + "epoch": 1.8273203823620103, + "grad_norm": 0.19330181181430817, + "learning_rate": 3.148808594154422e-05, + "loss": 0.5388, + "step": 8889 + }, + { + "epoch": 1.827525953335389, + "grad_norm": 0.1901981681585312, + "learning_rate": 3.1478559059478784e-05, + "loss": 0.522, + "step": 8890 + }, + { + "epoch": 1.8277315243087675, + "grad_norm": 0.2037983387708664, + "learning_rate": 3.146903284355154e-05, + "loss": 0.552, + "step": 8891 + }, + { + "epoch": 1.827937095282146, + "grad_norm": 0.19158220291137695, + "learning_rate": 3.14595072942318e-05, + "loss": 0.5287, + "step": 8892 + }, + { + "epoch": 1.8281426662555247, + "grad_norm": 0.19278709590435028, + "learning_rate": 3.1449982411988846e-05, + "loss": 0.5276, + "step": 8893 + }, + { + "epoch": 1.8283482372289033, + "grad_norm": 0.19211730360984802, + "learning_rate": 3.144045819729193e-05, + "loss": 0.5246, + "step": 8894 + }, + { + "epoch": 1.8285538082022819, + "grad_norm": 0.16746920347213745, + "learning_rate": 3.143093465061026e-05, + "loss": 0.5085, + "step": 8895 + }, + { + "epoch": 1.8287593791756604, + "grad_norm": 0.16652396321296692, + "learning_rate": 3.142141177241301e-05, + "loss": 0.5325, + "step": 8896 + }, + { + "epoch": 1.828964950149039, + "grad_norm": 0.19498294591903687, + "learning_rate": 3.141188956316935e-05, + "loss": 0.5344, + "step": 8897 + }, + { + "epoch": 1.8291705211224176, + "grad_norm": 0.19246521592140198, + "learning_rate": 3.140236802334837e-05, + "loss": 0.5317, + "step": 8898 + }, + { + "epoch": 1.8293760920957962, + "grad_norm": 0.15903492271900177, + "learning_rate": 3.139284715341918e-05, + "loss": 0.4802, + "step": 8899 + }, + { + "epoch": 1.8295816630691746, + "grad_norm": 0.16071657836437225, + "learning_rate": 3.1383326953850794e-05, + "loss": 0.5418, + "step": 8900 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 0.19449485838413239, + "learning_rate": 3.1373807425112236e-05, + "loss": 0.5342, + "step": 8901 + }, + { + "epoch": 1.8299928050159318, + "grad_norm": 0.19907018542289734, + "learning_rate": 3.136428856767252e-05, + "loss": 0.5059, + "step": 8902 + }, + { + "epoch": 1.8301983759893103, + "grad_norm": 0.19666697084903717, + "learning_rate": 3.135477038200057e-05, + "loss": 0.5349, + "step": 8903 + }, + { + "epoch": 1.8304039469626887, + "grad_norm": 0.20423884689807892, + "learning_rate": 3.13452528685653e-05, + "loss": 0.5282, + "step": 8904 + }, + { + "epoch": 1.8306095179360673, + "grad_norm": 0.1999506801366806, + "learning_rate": 3.133573602783559e-05, + "loss": 0.5322, + "step": 8905 + }, + { + "epoch": 1.8308150889094459, + "grad_norm": 0.1950768530368805, + "learning_rate": 3.132621986028031e-05, + "loss": 0.5047, + "step": 8906 + }, + { + "epoch": 1.8310206598828245, + "grad_norm": 0.1928025186061859, + "learning_rate": 3.131670436636827e-05, + "loss": 0.5322, + "step": 8907 + }, + { + "epoch": 1.831226230856203, + "grad_norm": 0.18942581117153168, + "learning_rate": 3.1307189546568223e-05, + "loss": 0.5073, + "step": 8908 + }, + { + "epoch": 1.8314318018295817, + "grad_norm": 0.19761119782924652, + "learning_rate": 3.129767540134898e-05, + "loss": 0.5366, + "step": 8909 + }, + { + "epoch": 1.8316373728029602, + "grad_norm": 0.19659826159477234, + "learning_rate": 3.1288161931179216e-05, + "loss": 0.5442, + "step": 8910 + }, + { + "epoch": 1.8318429437763388, + "grad_norm": 0.19665639102458954, + "learning_rate": 3.1278649136527626e-05, + "loss": 0.5245, + "step": 8911 + }, + { + "epoch": 1.8320485147497174, + "grad_norm": 0.19645392894744873, + "learning_rate": 3.1269137017862864e-05, + "loss": 0.5327, + "step": 8912 + }, + { + "epoch": 1.832254085723096, + "grad_norm": 0.1934535652399063, + "learning_rate": 3.1259625575653535e-05, + "loss": 0.5234, + "step": 8913 + }, + { + "epoch": 1.8324596566964746, + "grad_norm": 0.19229261577129364, + "learning_rate": 3.125011481036823e-05, + "loss": 0.528, + "step": 8914 + }, + { + "epoch": 1.8326652276698532, + "grad_norm": 0.1917877197265625, + "learning_rate": 3.124060472247549e-05, + "loss": 0.5341, + "step": 8915 + }, + { + "epoch": 1.8328707986432315, + "grad_norm": 0.1936071366071701, + "learning_rate": 3.1231095312443864e-05, + "loss": 0.5224, + "step": 8916 + }, + { + "epoch": 1.8330763696166101, + "grad_norm": 0.2054835557937622, + "learning_rate": 3.12215865807418e-05, + "loss": 0.5125, + "step": 8917 + }, + { + "epoch": 1.8332819405899887, + "grad_norm": 0.1977832019329071, + "learning_rate": 3.121207852783778e-05, + "loss": 0.5194, + "step": 8918 + }, + { + "epoch": 1.833487511563367, + "grad_norm": 0.19401463866233826, + "learning_rate": 3.1202571154200206e-05, + "loss": 0.5293, + "step": 8919 + }, + { + "epoch": 1.8336930825367457, + "grad_norm": 0.16132505238056183, + "learning_rate": 3.119306446029746e-05, + "loss": 0.4925, + "step": 8920 + }, + { + "epoch": 1.8338986535101243, + "grad_norm": 0.15574200451374054, + "learning_rate": 3.1183558446597894e-05, + "loss": 0.5394, + "step": 8921 + }, + { + "epoch": 1.8341042244835029, + "grad_norm": 0.2005411684513092, + "learning_rate": 3.117405311356981e-05, + "loss": 0.5304, + "step": 8922 + }, + { + "epoch": 1.8343097954568814, + "grad_norm": 0.1618259698152542, + "learning_rate": 3.116454846168153e-05, + "loss": 0.5146, + "step": 8923 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 0.17110760509967804, + "learning_rate": 3.115504449140127e-05, + "loss": 0.5491, + "step": 8924 + }, + { + "epoch": 1.8347209374036386, + "grad_norm": 0.1980321705341339, + "learning_rate": 3.114554120319726e-05, + "loss": 0.523, + "step": 8925 + }, + { + "epoch": 1.8349265083770172, + "grad_norm": 0.19496552646160126, + "learning_rate": 3.113603859753768e-05, + "loss": 0.5267, + "step": 8926 + }, + { + "epoch": 1.8351320793503958, + "grad_norm": 0.19820396602153778, + "learning_rate": 3.112653667489067e-05, + "loss": 0.5187, + "step": 8927 + }, + { + "epoch": 1.8353376503237744, + "grad_norm": 0.16340111196041107, + "learning_rate": 3.111703543572436e-05, + "loss": 0.5077, + "step": 8928 + }, + { + "epoch": 1.835543221297153, + "grad_norm": 0.17557556927204132, + "learning_rate": 3.110753488050682e-05, + "loss": 0.5238, + "step": 8929 + }, + { + "epoch": 1.8357487922705316, + "grad_norm": 0.2018451690673828, + "learning_rate": 3.10980350097061e-05, + "loss": 0.5438, + "step": 8930 + }, + { + "epoch": 1.83595436324391, + "grad_norm": 0.18654681742191315, + "learning_rate": 3.108853582379023e-05, + "loss": 0.5106, + "step": 8931 + }, + { + "epoch": 1.8361599342172885, + "grad_norm": 0.18761597573757172, + "learning_rate": 3.1079037323227176e-05, + "loss": 0.5089, + "step": 8932 + }, + { + "epoch": 1.836365505190667, + "grad_norm": 0.19022609293460846, + "learning_rate": 3.1069539508484894e-05, + "loss": 0.5145, + "step": 8933 + }, + { + "epoch": 1.8365710761640457, + "grad_norm": 0.18628591299057007, + "learning_rate": 3.106004238003128e-05, + "loss": 0.5176, + "step": 8934 + }, + { + "epoch": 1.836776647137424, + "grad_norm": 0.19824586808681488, + "learning_rate": 3.105054593833422e-05, + "loss": 0.522, + "step": 8935 + }, + { + "epoch": 1.8369822181108026, + "grad_norm": 0.16452063620090485, + "learning_rate": 3.1041050183861545e-05, + "loss": 0.5126, + "step": 8936 + }, + { + "epoch": 1.8371877890841812, + "grad_norm": 0.1635677069425583, + "learning_rate": 3.103155511708111e-05, + "loss": 0.5449, + "step": 8937 + }, + { + "epoch": 1.8373933600575598, + "grad_norm": 0.17322902381420135, + "learning_rate": 3.1022060738460663e-05, + "loss": 0.5107, + "step": 8938 + }, + { + "epoch": 1.8375989310309384, + "grad_norm": 0.17800618708133698, + "learning_rate": 3.101256704846794e-05, + "loss": 0.5426, + "step": 8939 + }, + { + "epoch": 1.837804502004317, + "grad_norm": 0.1971377432346344, + "learning_rate": 3.100307404757067e-05, + "loss": 0.5059, + "step": 8940 + }, + { + "epoch": 1.8380100729776956, + "grad_norm": 0.18664588034152985, + "learning_rate": 3.099358173623652e-05, + "loss": 0.5143, + "step": 8941 + }, + { + "epoch": 1.8382156439510742, + "grad_norm": 0.1906706988811493, + "learning_rate": 3.0984090114933135e-05, + "loss": 0.5337, + "step": 8942 + }, + { + "epoch": 1.8384212149244528, + "grad_norm": 0.19608831405639648, + "learning_rate": 3.09745991841281e-05, + "loss": 0.5284, + "step": 8943 + }, + { + "epoch": 1.8386267858978314, + "grad_norm": 0.19849687814712524, + "learning_rate": 3.096510894428902e-05, + "loss": 0.5223, + "step": 8944 + }, + { + "epoch": 1.83883235687121, + "grad_norm": 0.19968105852603912, + "learning_rate": 3.095561939588344e-05, + "loss": 0.5307, + "step": 8945 + }, + { + "epoch": 1.8390379278445883, + "grad_norm": 0.17240165174007416, + "learning_rate": 3.094613053937883e-05, + "loss": 0.5226, + "step": 8946 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 0.16927485167980194, + "learning_rate": 3.0936642375242697e-05, + "loss": 0.5411, + "step": 8947 + }, + { + "epoch": 1.8394490697913455, + "grad_norm": 0.19402731955051422, + "learning_rate": 3.092715490394245e-05, + "loss": 0.5159, + "step": 8948 + }, + { + "epoch": 1.839654640764724, + "grad_norm": 0.16462527215480804, + "learning_rate": 3.091766812594551e-05, + "loss": 0.5177, + "step": 8949 + }, + { + "epoch": 1.8398602117381024, + "grad_norm": 0.16749082505702972, + "learning_rate": 3.0908182041719226e-05, + "loss": 0.5446, + "step": 8950 + }, + { + "epoch": 1.840065782711481, + "grad_norm": 0.1971343457698822, + "learning_rate": 3.089869665173095e-05, + "loss": 0.5092, + "step": 8951 + }, + { + "epoch": 1.8402713536848596, + "grad_norm": 0.20525288581848145, + "learning_rate": 3.0889211956447994e-05, + "loss": 0.5572, + "step": 8952 + }, + { + "epoch": 1.8404769246582382, + "grad_norm": 0.20502051711082458, + "learning_rate": 3.0879727956337605e-05, + "loss": 0.5269, + "step": 8953 + }, + { + "epoch": 1.8406824956316168, + "grad_norm": 0.20041027665138245, + "learning_rate": 3.087024465186704e-05, + "loss": 0.5216, + "step": 8954 + }, + { + "epoch": 1.8408880666049954, + "grad_norm": 0.1678602546453476, + "learning_rate": 3.086076204350346e-05, + "loss": 0.4868, + "step": 8955 + }, + { + "epoch": 1.841093637578374, + "grad_norm": 0.1751408874988556, + "learning_rate": 3.085128013171403e-05, + "loss": 0.549, + "step": 8956 + }, + { + "epoch": 1.8412992085517526, + "grad_norm": 0.19476006925106049, + "learning_rate": 3.084179891696592e-05, + "loss": 0.5015, + "step": 8957 + }, + { + "epoch": 1.8415047795251311, + "grad_norm": 0.20983824133872986, + "learning_rate": 3.083231839972621e-05, + "loss": 0.5351, + "step": 8958 + }, + { + "epoch": 1.8417103504985097, + "grad_norm": 0.19999557733535767, + "learning_rate": 3.082283858046194e-05, + "loss": 0.5268, + "step": 8959 + }, + { + "epoch": 1.8419159214718883, + "grad_norm": 0.2033097892999649, + "learning_rate": 3.081335945964014e-05, + "loss": 0.5243, + "step": 8960 + }, + { + "epoch": 1.8421214924452667, + "grad_norm": 0.19662059843540192, + "learning_rate": 3.080388103772783e-05, + "loss": 0.5197, + "step": 8961 + }, + { + "epoch": 1.8423270634186453, + "grad_norm": 0.19115544855594635, + "learning_rate": 3.079440331519194e-05, + "loss": 0.5119, + "step": 8962 + }, + { + "epoch": 1.8425326343920239, + "grad_norm": 0.19383569061756134, + "learning_rate": 3.078492629249939e-05, + "loss": 0.5221, + "step": 8963 + }, + { + "epoch": 1.8427382053654024, + "grad_norm": 0.19358788430690765, + "learning_rate": 3.077544997011709e-05, + "loss": 0.5366, + "step": 8964 + }, + { + "epoch": 1.8429437763387808, + "grad_norm": 0.19317568838596344, + "learning_rate": 3.0765974348511895e-05, + "loss": 0.5127, + "step": 8965 + }, + { + "epoch": 1.8431493473121594, + "grad_norm": 0.19126683473587036, + "learning_rate": 3.075649942815061e-05, + "loss": 0.5027, + "step": 8966 + }, + { + "epoch": 1.843354918285538, + "grad_norm": 0.2007630318403244, + "learning_rate": 3.0747025209500024e-05, + "loss": 0.5352, + "step": 8967 + }, + { + "epoch": 1.8435604892589166, + "grad_norm": 0.1700150966644287, + "learning_rate": 3.073755169302689e-05, + "loss": 0.4973, + "step": 8968 + }, + { + "epoch": 1.8437660602322952, + "grad_norm": 0.1250450313091278, + "learning_rate": 3.0728078879197913e-05, + "loss": 0.5154, + "step": 8969 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 0.16313976049423218, + "learning_rate": 3.071860676847978e-05, + "loss": 0.5528, + "step": 8970 + }, + { + "epoch": 1.8441772021790523, + "grad_norm": 0.16907833516597748, + "learning_rate": 3.070913536133915e-05, + "loss": 0.5112, + "step": 8971 + }, + { + "epoch": 1.844382773152431, + "grad_norm": 0.11831056326627731, + "learning_rate": 3.0699664658242614e-05, + "loss": 0.4941, + "step": 8972 + }, + { + "epoch": 1.8445883441258095, + "grad_norm": 0.15390439331531525, + "learning_rate": 3.0690194659656774e-05, + "loss": 0.514, + "step": 8973 + }, + { + "epoch": 1.844793915099188, + "grad_norm": 0.16013920307159424, + "learning_rate": 3.0680725366048155e-05, + "loss": 0.4877, + "step": 8974 + }, + { + "epoch": 1.8449994860725667, + "grad_norm": 0.1253557801246643, + "learning_rate": 3.067125677788327e-05, + "loss": 0.5135, + "step": 8975 + }, + { + "epoch": 1.845205057045945, + "grad_norm": 0.16094715893268585, + "learning_rate": 3.0661788895628595e-05, + "loss": 0.533, + "step": 8976 + }, + { + "epoch": 1.8454106280193237, + "grad_norm": 0.190200075507164, + "learning_rate": 3.065232171975054e-05, + "loss": 0.5279, + "step": 8977 + }, + { + "epoch": 1.8456161989927022, + "grad_norm": 0.20202942192554474, + "learning_rate": 3.064285525071556e-05, + "loss": 0.5234, + "step": 8978 + }, + { + "epoch": 1.8458217699660808, + "grad_norm": 0.20393583178520203, + "learning_rate": 3.063338948898999e-05, + "loss": 0.5437, + "step": 8979 + }, + { + "epoch": 1.8460273409394592, + "grad_norm": 0.19702088832855225, + "learning_rate": 3.062392443504017e-05, + "loss": 0.5375, + "step": 8980 + }, + { + "epoch": 1.8462329119128378, + "grad_norm": 0.19736789166927338, + "learning_rate": 3.061446008933239e-05, + "loss": 0.5485, + "step": 8981 + }, + { + "epoch": 1.8464384828862164, + "grad_norm": 0.19195613265037537, + "learning_rate": 3.060499645233294e-05, + "loss": 0.5325, + "step": 8982 + }, + { + "epoch": 1.846644053859595, + "grad_norm": 0.19624346494674683, + "learning_rate": 3.059553352450803e-05, + "loss": 0.5307, + "step": 8983 + }, + { + "epoch": 1.8468496248329735, + "grad_norm": 0.17116032540798187, + "learning_rate": 3.058607130632383e-05, + "loss": 0.4922, + "step": 8984 + }, + { + "epoch": 1.8470551958063521, + "grad_norm": 0.1617693156003952, + "learning_rate": 3.057660979824655e-05, + "loss": 0.5326, + "step": 8985 + }, + { + "epoch": 1.8472607667797307, + "grad_norm": 0.19013215601444244, + "learning_rate": 3.05671490007423e-05, + "loss": 0.5203, + "step": 8986 + }, + { + "epoch": 1.8474663377531093, + "grad_norm": 0.1693909615278244, + "learning_rate": 3.055768891427715e-05, + "loss": 0.5208, + "step": 8987 + }, + { + "epoch": 1.847671908726488, + "grad_norm": 0.1591087430715561, + "learning_rate": 3.054822953931716e-05, + "loss": 0.5186, + "step": 8988 + }, + { + "epoch": 1.8478774796998665, + "grad_norm": 0.19975587725639343, + "learning_rate": 3.0538770876328365e-05, + "loss": 0.5238, + "step": 8989 + }, + { + "epoch": 1.848083050673245, + "grad_norm": 0.21245107054710388, + "learning_rate": 3.052931292577673e-05, + "loss": 0.5405, + "step": 8990 + }, + { + "epoch": 1.8482886216466234, + "grad_norm": 0.19569487869739532, + "learning_rate": 3.051985568812819e-05, + "loss": 0.5452, + "step": 8991 + }, + { + "epoch": 1.848494192620002, + "grad_norm": 0.19539184868335724, + "learning_rate": 3.0510399163848704e-05, + "loss": 0.518, + "step": 8992 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 0.1992693692445755, + "learning_rate": 3.0500943353404117e-05, + "loss": 0.5521, + "step": 8993 + }, + { + "epoch": 1.8489053345667592, + "grad_norm": 0.1907494217157364, + "learning_rate": 3.0491488257260293e-05, + "loss": 0.5105, + "step": 8994 + }, + { + "epoch": 1.8491109055401376, + "grad_norm": 0.1949932873249054, + "learning_rate": 3.0482033875883026e-05, + "loss": 0.5214, + "step": 8995 + }, + { + "epoch": 1.8493164765135162, + "grad_norm": 0.19968056678771973, + "learning_rate": 3.0472580209738096e-05, + "loss": 0.5388, + "step": 8996 + }, + { + "epoch": 1.8495220474868947, + "grad_norm": 0.1978997439146042, + "learning_rate": 3.0463127259291236e-05, + "loss": 0.5319, + "step": 8997 + }, + { + "epoch": 1.8497276184602733, + "grad_norm": 0.2019454538822174, + "learning_rate": 3.0453675025008134e-05, + "loss": 0.532, + "step": 8998 + }, + { + "epoch": 1.849933189433652, + "grad_norm": 0.20041054487228394, + "learning_rate": 3.0444223507354492e-05, + "loss": 0.5036, + "step": 8999 + }, + { + "epoch": 1.8501387604070305, + "grad_norm": 0.20030958950519562, + "learning_rate": 3.0434772706795925e-05, + "loss": 0.5458, + "step": 9000 + }, + { + "epoch": 1.850344331380409, + "grad_norm": 0.16151051223278046, + "learning_rate": 3.042532262379803e-05, + "loss": 0.5085, + "step": 9001 + }, + { + "epoch": 1.8505499023537877, + "grad_norm": 0.15830279886722565, + "learning_rate": 3.0415873258826368e-05, + "loss": 0.536, + "step": 9002 + }, + { + "epoch": 1.8507554733271663, + "grad_norm": 0.19460676610469818, + "learning_rate": 3.040642461234645e-05, + "loss": 0.5357, + "step": 9003 + }, + { + "epoch": 1.8509610443005449, + "grad_norm": 0.1874227076768875, + "learning_rate": 3.0396976684823795e-05, + "loss": 0.5028, + "step": 9004 + }, + { + "epoch": 1.8511666152739235, + "grad_norm": 0.19529518485069275, + "learning_rate": 3.0387529476723823e-05, + "loss": 0.548, + "step": 9005 + }, + { + "epoch": 1.851372186247302, + "grad_norm": 0.19249314069747925, + "learning_rate": 3.0378082988511997e-05, + "loss": 0.4975, + "step": 9006 + }, + { + "epoch": 1.8515777572206804, + "grad_norm": 0.15955211222171783, + "learning_rate": 3.0368637220653672e-05, + "loss": 0.4934, + "step": 9007 + }, + { + "epoch": 1.851783328194059, + "grad_norm": 0.16296181082725525, + "learning_rate": 3.0359192173614212e-05, + "loss": 0.5391, + "step": 9008 + }, + { + "epoch": 1.8519888991674376, + "grad_norm": 0.18920543789863586, + "learning_rate": 3.0349747847858923e-05, + "loss": 0.5126, + "step": 9009 + }, + { + "epoch": 1.852194470140816, + "grad_norm": 0.20026175677776337, + "learning_rate": 3.0340304243853077e-05, + "loss": 0.5336, + "step": 9010 + }, + { + "epoch": 1.8524000411141945, + "grad_norm": 0.1883440762758255, + "learning_rate": 3.0330861362061927e-05, + "loss": 0.4898, + "step": 9011 + }, + { + "epoch": 1.8526056120875731, + "grad_norm": 0.19095604121685028, + "learning_rate": 3.0321419202950652e-05, + "loss": 0.5055, + "step": 9012 + }, + { + "epoch": 1.8528111830609517, + "grad_norm": 0.19625356793403625, + "learning_rate": 3.0311977766984462e-05, + "loss": 0.5161, + "step": 9013 + }, + { + "epoch": 1.8530167540343303, + "grad_norm": 0.19662852585315704, + "learning_rate": 3.0302537054628483e-05, + "loss": 0.5448, + "step": 9014 + }, + { + "epoch": 1.853222325007709, + "grad_norm": 0.20150268077850342, + "learning_rate": 3.0293097066347794e-05, + "loss": 0.503, + "step": 9015 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 0.20207509398460388, + "learning_rate": 3.0283657802607484e-05, + "loss": 0.5437, + "step": 9016 + }, + { + "epoch": 1.853633466954466, + "grad_norm": 0.20044514536857605, + "learning_rate": 3.027421926387257e-05, + "loss": 0.5406, + "step": 9017 + }, + { + "epoch": 1.8538390379278447, + "grad_norm": 0.2027798891067505, + "learning_rate": 3.026478145060804e-05, + "loss": 0.5493, + "step": 9018 + }, + { + "epoch": 1.8540446089012232, + "grad_norm": 0.19402191042900085, + "learning_rate": 3.025534436327884e-05, + "loss": 0.5346, + "step": 9019 + }, + { + "epoch": 1.8542501798746018, + "grad_norm": 0.2023455947637558, + "learning_rate": 3.0245908002349927e-05, + "loss": 0.5521, + "step": 9020 + }, + { + "epoch": 1.8544557508479804, + "grad_norm": 0.19758723676204681, + "learning_rate": 3.0236472368286162e-05, + "loss": 0.5314, + "step": 9021 + }, + { + "epoch": 1.8546613218213588, + "grad_norm": 0.19898824393749237, + "learning_rate": 3.0227037461552405e-05, + "loss": 0.5221, + "step": 9022 + }, + { + "epoch": 1.8548668927947374, + "grad_norm": 0.18745093047618866, + "learning_rate": 3.021760328261346e-05, + "loss": 0.5196, + "step": 9023 + }, + { + "epoch": 1.855072463768116, + "grad_norm": 0.1959155946969986, + "learning_rate": 3.0208169831934095e-05, + "loss": 0.515, + "step": 9024 + }, + { + "epoch": 1.8552780347414946, + "grad_norm": 0.19238829612731934, + "learning_rate": 3.0198737109979084e-05, + "loss": 0.5023, + "step": 9025 + }, + { + "epoch": 1.855483605714873, + "grad_norm": 0.19325855374336243, + "learning_rate": 3.01893051172131e-05, + "loss": 0.5261, + "step": 9026 + }, + { + "epoch": 1.8556891766882515, + "grad_norm": 0.1941802203655243, + "learning_rate": 3.017987385410083e-05, + "loss": 0.5381, + "step": 9027 + }, + { + "epoch": 1.85589474766163, + "grad_norm": 0.19294960796833038, + "learning_rate": 3.0170443321106913e-05, + "loss": 0.5493, + "step": 9028 + }, + { + "epoch": 1.8561003186350087, + "grad_norm": 0.20181645452976227, + "learning_rate": 3.0161013518695943e-05, + "loss": 0.5268, + "step": 9029 + }, + { + "epoch": 1.8563058896083873, + "grad_norm": 0.1975722759962082, + "learning_rate": 3.0151584447332476e-05, + "loss": 0.5342, + "step": 9030 + }, + { + "epoch": 1.8565114605817659, + "grad_norm": 0.20087282359600067, + "learning_rate": 3.0142156107481048e-05, + "loss": 0.522, + "step": 9031 + }, + { + "epoch": 1.8567170315551444, + "grad_norm": 0.19749726355075836, + "learning_rate": 3.013272849960612e-05, + "loss": 0.5077, + "step": 9032 + }, + { + "epoch": 1.856922602528523, + "grad_norm": 0.19727249443531036, + "learning_rate": 3.0123301624172185e-05, + "loss": 0.5261, + "step": 9033 + }, + { + "epoch": 1.8571281735019016, + "grad_norm": 0.2018827497959137, + "learning_rate": 3.0113875481643647e-05, + "loss": 0.5258, + "step": 9034 + }, + { + "epoch": 1.8573337444752802, + "grad_norm": 0.17497631907463074, + "learning_rate": 3.0104450072484895e-05, + "loss": 0.525, + "step": 9035 + }, + { + "epoch": 1.8575393154486588, + "grad_norm": 0.16717809438705444, + "learning_rate": 3.0095025397160248e-05, + "loss": 0.5311, + "step": 9036 + }, + { + "epoch": 1.8577448864220372, + "grad_norm": 0.19906377792358398, + "learning_rate": 3.0085601456134044e-05, + "loss": 0.521, + "step": 9037 + }, + { + "epoch": 1.8579504573954158, + "grad_norm": 0.19669370353221893, + "learning_rate": 3.0076178249870547e-05, + "loss": 0.495, + "step": 9038 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 0.1930094212293625, + "learning_rate": 3.006675577883398e-05, + "loss": 0.5243, + "step": 9039 + }, + { + "epoch": 1.858361599342173, + "grad_norm": 0.1566167026758194, + "learning_rate": 3.0057334043488573e-05, + "loss": 0.4969, + "step": 9040 + }, + { + "epoch": 1.8585671703155513, + "grad_norm": 0.1628788709640503, + "learning_rate": 3.0047913044298474e-05, + "loss": 0.534, + "step": 9041 + }, + { + "epoch": 1.8587727412889299, + "grad_norm": 0.19704844057559967, + "learning_rate": 3.0038492781727817e-05, + "loss": 0.5278, + "step": 9042 + }, + { + "epoch": 1.8589783122623085, + "grad_norm": 0.2023809403181076, + "learning_rate": 3.002907325624069e-05, + "loss": 0.5197, + "step": 9043 + }, + { + "epoch": 1.859183883235687, + "grad_norm": 0.1649642139673233, + "learning_rate": 3.0019654468301153e-05, + "loss": 0.5043, + "step": 9044 + }, + { + "epoch": 1.8593894542090657, + "grad_norm": 0.16661269962787628, + "learning_rate": 3.001023641837321e-05, + "loss": 0.5443, + "step": 9045 + }, + { + "epoch": 1.8595950251824442, + "grad_norm": 0.19846822321414948, + "learning_rate": 3.000081910692085e-05, + "loss": 0.5275, + "step": 9046 + }, + { + "epoch": 1.8598005961558228, + "grad_norm": 0.1986059993505478, + "learning_rate": 2.9991402534408043e-05, + "loss": 0.5404, + "step": 9047 + }, + { + "epoch": 1.8600061671292014, + "grad_norm": 0.20389589667320251, + "learning_rate": 2.9981986701298672e-05, + "loss": 0.5433, + "step": 9048 + }, + { + "epoch": 1.86021173810258, + "grad_norm": 0.1978558897972107, + "learning_rate": 2.9972571608056634e-05, + "loss": 0.5279, + "step": 9049 + }, + { + "epoch": 1.8604173090759586, + "grad_norm": 0.19354282319545746, + "learning_rate": 2.996315725514575e-05, + "loss": 0.5127, + "step": 9050 + }, + { + "epoch": 1.8606228800493372, + "grad_norm": 0.2036632001399994, + "learning_rate": 2.995374364302983e-05, + "loss": 0.5386, + "step": 9051 + }, + { + "epoch": 1.8608284510227155, + "grad_norm": 0.19390377402305603, + "learning_rate": 2.9944330772172635e-05, + "loss": 0.5256, + "step": 9052 + }, + { + "epoch": 1.8610340219960941, + "grad_norm": 0.16378752887248993, + "learning_rate": 2.9934918643037872e-05, + "loss": 0.523, + "step": 9053 + }, + { + "epoch": 1.8612395929694727, + "grad_norm": 0.28664878010749817, + "learning_rate": 2.9925507256089277e-05, + "loss": 0.5099, + "step": 9054 + }, + { + "epoch": 1.8614451639428513, + "grad_norm": 0.16860786080360413, + "learning_rate": 2.9916096611790473e-05, + "loss": 0.5528, + "step": 9055 + }, + { + "epoch": 1.8616507349162297, + "grad_norm": 0.1950398087501526, + "learning_rate": 2.990668671060509e-05, + "loss": 0.5442, + "step": 9056 + }, + { + "epoch": 1.8618563058896083, + "grad_norm": 0.1937715858221054, + "learning_rate": 2.98972775529967e-05, + "loss": 0.5393, + "step": 9057 + }, + { + "epoch": 1.8620618768629869, + "grad_norm": 0.1997014582157135, + "learning_rate": 2.988786913942886e-05, + "loss": 0.5328, + "step": 9058 + }, + { + "epoch": 1.8622674478363654, + "grad_norm": 0.19975896179676056, + "learning_rate": 2.9878461470365082e-05, + "loss": 0.5348, + "step": 9059 + }, + { + "epoch": 1.862473018809744, + "grad_norm": 0.15818095207214355, + "learning_rate": 2.986905454626881e-05, + "loss": 0.4831, + "step": 9060 + }, + { + "epoch": 1.8626785897831226, + "grad_norm": 0.1348692923784256, + "learning_rate": 2.9859648367603506e-05, + "loss": 0.4821, + "step": 9061 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 0.1564972698688507, + "learning_rate": 2.9850242934832573e-05, + "loss": 0.5206, + "step": 9062 + }, + { + "epoch": 1.8630897317298798, + "grad_norm": 0.19233213365077972, + "learning_rate": 2.9840838248419352e-05, + "loss": 0.5317, + "step": 9063 + }, + { + "epoch": 1.8632953027032584, + "grad_norm": 0.20065823197364807, + "learning_rate": 2.983143430882718e-05, + "loss": 0.5267, + "step": 9064 + }, + { + "epoch": 1.863500873676637, + "grad_norm": 0.20056359469890594, + "learning_rate": 2.9822031116519345e-05, + "loss": 0.5365, + "step": 9065 + }, + { + "epoch": 1.8637064446500156, + "grad_norm": 0.2093392014503479, + "learning_rate": 2.9812628671959084e-05, + "loss": 0.53, + "step": 9066 + }, + { + "epoch": 1.863912015623394, + "grad_norm": 0.20502184331417084, + "learning_rate": 2.9803226975609622e-05, + "loss": 0.5227, + "step": 9067 + }, + { + "epoch": 1.8641175865967725, + "grad_norm": 0.19816361367702484, + "learning_rate": 2.9793826027934147e-05, + "loss": 0.5054, + "step": 9068 + }, + { + "epoch": 1.864323157570151, + "grad_norm": 0.2030927836894989, + "learning_rate": 2.9784425829395777e-05, + "loss": 0.5327, + "step": 9069 + }, + { + "epoch": 1.8645287285435297, + "grad_norm": 0.20948849618434906, + "learning_rate": 2.9775026380457645e-05, + "loss": 0.5415, + "step": 9070 + }, + { + "epoch": 1.864734299516908, + "grad_norm": 0.19347041845321655, + "learning_rate": 2.97656276815828e-05, + "loss": 0.5216, + "step": 9071 + }, + { + "epoch": 1.8649398704902866, + "grad_norm": 0.16669385135173798, + "learning_rate": 2.975622973323427e-05, + "loss": 0.4946, + "step": 9072 + }, + { + "epoch": 1.8651454414636652, + "grad_norm": 0.1577424257993698, + "learning_rate": 2.9746832535875054e-05, + "loss": 0.5255, + "step": 9073 + }, + { + "epoch": 1.8653510124370438, + "grad_norm": 0.19290731847286224, + "learning_rate": 2.973743608996809e-05, + "loss": 0.5162, + "step": 9074 + }, + { + "epoch": 1.8655565834104224, + "grad_norm": 0.22282882034778595, + "learning_rate": 2.9728040395976326e-05, + "loss": 0.5466, + "step": 9075 + }, + { + "epoch": 1.865762154383801, + "grad_norm": 0.19410258531570435, + "learning_rate": 2.9718645454362635e-05, + "loss": 0.5002, + "step": 9076 + }, + { + "epoch": 1.8659677253571796, + "grad_norm": 0.199889674782753, + "learning_rate": 2.9709251265589857e-05, + "loss": 0.5468, + "step": 9077 + }, + { + "epoch": 1.8661732963305582, + "grad_norm": 0.19312036037445068, + "learning_rate": 2.969985783012079e-05, + "loss": 0.5278, + "step": 9078 + }, + { + "epoch": 1.8663788673039368, + "grad_norm": 0.1675236076116562, + "learning_rate": 2.9690465148418225e-05, + "loss": 0.5274, + "step": 9079 + }, + { + "epoch": 1.8665844382773153, + "grad_norm": 0.1646019071340561, + "learning_rate": 2.9681073220944887e-05, + "loss": 0.5227, + "step": 9080 + }, + { + "epoch": 1.866790009250694, + "grad_norm": 0.19794802367687225, + "learning_rate": 2.9671682048163452e-05, + "loss": 0.5234, + "step": 9081 + }, + { + "epoch": 1.8669955802240725, + "grad_norm": 0.19309046864509583, + "learning_rate": 2.9662291630536612e-05, + "loss": 0.5235, + "step": 9082 + }, + { + "epoch": 1.867201151197451, + "grad_norm": 0.19643832743167877, + "learning_rate": 2.965290196852698e-05, + "loss": 0.5161, + "step": 9083 + }, + { + "epoch": 1.8674067221708295, + "grad_norm": 0.19640390574932098, + "learning_rate": 2.964351306259713e-05, + "loss": 0.5374, + "step": 9084 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 0.2005051225423813, + "learning_rate": 2.9634124913209623e-05, + "loss": 0.5183, + "step": 9085 + }, + { + "epoch": 1.8678178641175864, + "grad_norm": 0.20132414996623993, + "learning_rate": 2.9624737520826958e-05, + "loss": 0.5101, + "step": 9086 + }, + { + "epoch": 1.868023435090965, + "grad_norm": 0.19228623807430267, + "learning_rate": 2.9615350885911618e-05, + "loss": 0.5274, + "step": 9087 + }, + { + "epoch": 1.8682290060643436, + "grad_norm": 0.16477905213832855, + "learning_rate": 2.9605965008926004e-05, + "loss": 0.4958, + "step": 9088 + }, + { + "epoch": 1.8684345770377222, + "grad_norm": 0.16282616555690765, + "learning_rate": 2.9596579890332563e-05, + "loss": 0.5274, + "step": 9089 + }, + { + "epoch": 1.8686401480111008, + "grad_norm": 0.19304989278316498, + "learning_rate": 2.958719553059363e-05, + "loss": 0.5014, + "step": 9090 + }, + { + "epoch": 1.8688457189844794, + "grad_norm": 0.16735190153121948, + "learning_rate": 2.957781193017154e-05, + "loss": 0.513, + "step": 9091 + }, + { + "epoch": 1.869051289957858, + "grad_norm": 0.11929447948932648, + "learning_rate": 2.9568429089528573e-05, + "loss": 0.502, + "step": 9092 + }, + { + "epoch": 1.8692568609312366, + "grad_norm": 0.16071327030658722, + "learning_rate": 2.955904700912698e-05, + "loss": 0.5299, + "step": 9093 + }, + { + "epoch": 1.8694624319046151, + "grad_norm": 0.20334213972091675, + "learning_rate": 2.954966568942897e-05, + "loss": 0.534, + "step": 9094 + }, + { + "epoch": 1.8696680028779937, + "grad_norm": 0.19918020069599152, + "learning_rate": 2.9540285130896692e-05, + "loss": 0.5146, + "step": 9095 + }, + { + "epoch": 1.8698735738513723, + "grad_norm": 0.16279511153697968, + "learning_rate": 2.9530905333992337e-05, + "loss": 0.4845, + "step": 9096 + }, + { + "epoch": 1.870079144824751, + "grad_norm": 0.16142159700393677, + "learning_rate": 2.9521526299177962e-05, + "loss": 0.5215, + "step": 9097 + }, + { + "epoch": 1.8702847157981293, + "grad_norm": 0.20098471641540527, + "learning_rate": 2.951214802691565e-05, + "loss": 0.5314, + "step": 9098 + }, + { + "epoch": 1.8704902867715079, + "grad_norm": 0.19500714540481567, + "learning_rate": 2.950277051766741e-05, + "loss": 0.5257, + "step": 9099 + }, + { + "epoch": 1.8706958577448864, + "grad_norm": 0.19438640773296356, + "learning_rate": 2.949339377189522e-05, + "loss": 0.5412, + "step": 9100 + }, + { + "epoch": 1.870901428718265, + "grad_norm": 0.16201166808605194, + "learning_rate": 2.9484017790061058e-05, + "loss": 0.4902, + "step": 9101 + }, + { + "epoch": 1.8711069996916434, + "grad_norm": 0.15816009044647217, + "learning_rate": 2.9474642572626804e-05, + "loss": 0.5344, + "step": 9102 + }, + { + "epoch": 1.871312570665022, + "grad_norm": 0.19865500926971436, + "learning_rate": 2.9465268120054347e-05, + "loss": 0.5286, + "step": 9103 + }, + { + "epoch": 1.8715181416384006, + "grad_norm": 0.19558370113372803, + "learning_rate": 2.945589443280553e-05, + "loss": 0.5003, + "step": 9104 + }, + { + "epoch": 1.8717237126117792, + "grad_norm": 0.19835351407527924, + "learning_rate": 2.944652151134214e-05, + "loss": 0.5106, + "step": 9105 + }, + { + "epoch": 1.8719292835851578, + "grad_norm": 0.1939878761768341, + "learning_rate": 2.9437149356125937e-05, + "loss": 0.5295, + "step": 9106 + }, + { + "epoch": 1.8721348545585363, + "grad_norm": 0.18890373408794403, + "learning_rate": 2.9427777967618645e-05, + "loss": 0.5072, + "step": 9107 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.20133374631404877, + "learning_rate": 2.9418407346281948e-05, + "loss": 0.5436, + "step": 9108 + }, + { + "epoch": 1.8725459965052935, + "grad_norm": 0.19078055024147034, + "learning_rate": 2.940903749257748e-05, + "loss": 0.4905, + "step": 9109 + }, + { + "epoch": 1.872751567478672, + "grad_norm": 0.1644534170627594, + "learning_rate": 2.9399668406966874e-05, + "loss": 0.5029, + "step": 9110 + }, + { + "epoch": 1.8729571384520507, + "grad_norm": 0.1681102067232132, + "learning_rate": 2.9390300089911696e-05, + "loss": 0.54, + "step": 9111 + }, + { + "epoch": 1.8731627094254293, + "grad_norm": 0.16144701838493347, + "learning_rate": 2.938093254187346e-05, + "loss": 0.5035, + "step": 9112 + }, + { + "epoch": 1.8733682803988077, + "grad_norm": 0.16440553963184357, + "learning_rate": 2.937156576331368e-05, + "loss": 0.5317, + "step": 9113 + }, + { + "epoch": 1.8735738513721862, + "grad_norm": 0.19571755826473236, + "learning_rate": 2.936219975469382e-05, + "loss": 0.5346, + "step": 9114 + }, + { + "epoch": 1.8737794223455648, + "grad_norm": 0.6120141744613647, + "learning_rate": 2.9352834516475254e-05, + "loss": 0.5264, + "step": 9115 + }, + { + "epoch": 1.8739849933189434, + "grad_norm": 0.1942664086818695, + "learning_rate": 2.9343470049119426e-05, + "loss": 0.5409, + "step": 9116 + }, + { + "epoch": 1.8741905642923218, + "grad_norm": 0.19954350590705872, + "learning_rate": 2.9334106353087646e-05, + "loss": 0.5159, + "step": 9117 + }, + { + "epoch": 1.8743961352657004, + "grad_norm": 0.156526118516922, + "learning_rate": 2.9324743428841223e-05, + "loss": 0.4767, + "step": 9118 + }, + { + "epoch": 1.874601706239079, + "grad_norm": 0.1730726808309555, + "learning_rate": 2.9315381276841425e-05, + "loss": 0.5267, + "step": 9119 + }, + { + "epoch": 1.8748072772124575, + "grad_norm": 0.20543117821216583, + "learning_rate": 2.9306019897549483e-05, + "loss": 0.5323, + "step": 9120 + }, + { + "epoch": 1.8750128481858361, + "grad_norm": 0.17114831507205963, + "learning_rate": 2.9296659291426576e-05, + "loss": 0.5179, + "step": 9121 + }, + { + "epoch": 1.8752184191592147, + "grad_norm": 0.16466084122657776, + "learning_rate": 2.928729945893387e-05, + "loss": 0.5224, + "step": 9122 + }, + { + "epoch": 1.8754239901325933, + "grad_norm": 0.19490589201450348, + "learning_rate": 2.927794040053249e-05, + "loss": 0.5288, + "step": 9123 + }, + { + "epoch": 1.875629561105972, + "grad_norm": 0.165072500705719, + "learning_rate": 2.926858211668349e-05, + "loss": 0.4984, + "step": 9124 + }, + { + "epoch": 1.8758351320793505, + "grad_norm": 0.1546640247106552, + "learning_rate": 2.9259224607847928e-05, + "loss": 0.544, + "step": 9125 + }, + { + "epoch": 1.876040703052729, + "grad_norm": 0.18832944333553314, + "learning_rate": 2.9249867874486802e-05, + "loss": 0.5269, + "step": 9126 + }, + { + "epoch": 1.8762462740261077, + "grad_norm": 0.19997960329055786, + "learning_rate": 2.924051191706107e-05, + "loss": 0.539, + "step": 9127 + }, + { + "epoch": 1.876451844999486, + "grad_norm": 0.21653611958026886, + "learning_rate": 2.9231156736031653e-05, + "loss": 0.5414, + "step": 9128 + }, + { + "epoch": 1.8766574159728646, + "grad_norm": 0.1963309794664383, + "learning_rate": 2.922180233185942e-05, + "loss": 0.5308, + "step": 9129 + }, + { + "epoch": 1.8768629869462432, + "grad_norm": 0.19767159223556519, + "learning_rate": 2.921244870500526e-05, + "loss": 0.5479, + "step": 9130 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 0.19611725211143494, + "learning_rate": 2.9203095855929962e-05, + "loss": 0.5213, + "step": 9131 + }, + { + "epoch": 1.8772741288930002, + "grad_norm": 0.19497352838516235, + "learning_rate": 2.91937437850943e-05, + "loss": 0.5212, + "step": 9132 + }, + { + "epoch": 1.8774796998663787, + "grad_norm": 0.19085940718650818, + "learning_rate": 2.918439249295899e-05, + "loss": 0.532, + "step": 9133 + }, + { + "epoch": 1.8776852708397573, + "grad_norm": 0.1957186907529831, + "learning_rate": 2.917504197998475e-05, + "loss": 0.5046, + "step": 9134 + }, + { + "epoch": 1.877890841813136, + "grad_norm": 0.17413191497325897, + "learning_rate": 2.916569224663223e-05, + "loss": 0.5181, + "step": 9135 + }, + { + "epoch": 1.8780964127865145, + "grad_norm": 0.16276034712791443, + "learning_rate": 2.9156343293362013e-05, + "loss": 0.5378, + "step": 9136 + }, + { + "epoch": 1.878301983759893, + "grad_norm": 0.19842852652072906, + "learning_rate": 2.914699512063474e-05, + "loss": 0.5238, + "step": 9137 + }, + { + "epoch": 1.8785075547332717, + "grad_norm": 0.20142436027526855, + "learning_rate": 2.9137647728910915e-05, + "loss": 0.53, + "step": 9138 + }, + { + "epoch": 1.8787131257066503, + "grad_norm": 0.1989341378211975, + "learning_rate": 2.9128301118651043e-05, + "loss": 0.5447, + "step": 9139 + }, + { + "epoch": 1.8789186966800289, + "grad_norm": 0.15917402505874634, + "learning_rate": 2.9118955290315593e-05, + "loss": 0.4962, + "step": 9140 + }, + { + "epoch": 1.8791242676534075, + "grad_norm": 0.16550025343894958, + "learning_rate": 2.9109610244364994e-05, + "loss": 0.5044, + "step": 9141 + }, + { + "epoch": 1.879329838626786, + "grad_norm": 0.19528605043888092, + "learning_rate": 2.9100265981259613e-05, + "loss": 0.52, + "step": 9142 + }, + { + "epoch": 1.8795354096001644, + "grad_norm": 0.15918217599391937, + "learning_rate": 2.909092250145981e-05, + "loss": 0.5113, + "step": 9143 + }, + { + "epoch": 1.879740980573543, + "grad_norm": 0.19821493327617645, + "learning_rate": 2.9081579805425912e-05, + "loss": 0.5407, + "step": 9144 + }, + { + "epoch": 1.8799465515469216, + "grad_norm": 0.19132786989212036, + "learning_rate": 2.9072237893618154e-05, + "loss": 0.5243, + "step": 9145 + }, + { + "epoch": 1.8801521225203002, + "grad_norm": 0.20594055950641632, + "learning_rate": 2.9062896766496812e-05, + "loss": 0.5245, + "step": 9146 + }, + { + "epoch": 1.8803576934936785, + "grad_norm": 0.1643124520778656, + "learning_rate": 2.9053556424522043e-05, + "loss": 0.5056, + "step": 9147 + }, + { + "epoch": 1.8805632644670571, + "grad_norm": 0.16157592833042145, + "learning_rate": 2.9044216868154028e-05, + "loss": 0.5309, + "step": 9148 + }, + { + "epoch": 1.8807688354404357, + "grad_norm": 0.1968626081943512, + "learning_rate": 2.9034878097852863e-05, + "loss": 0.5417, + "step": 9149 + }, + { + "epoch": 1.8809744064138143, + "grad_norm": 0.1947098821401596, + "learning_rate": 2.9025540114078615e-05, + "loss": 0.5476, + "step": 9150 + }, + { + "epoch": 1.881179977387193, + "grad_norm": 0.1969742625951767, + "learning_rate": 2.9016202917291363e-05, + "loss": 0.5182, + "step": 9151 + }, + { + "epoch": 1.8813855483605715, + "grad_norm": 0.199255108833313, + "learning_rate": 2.9006866507951085e-05, + "loss": 0.5049, + "step": 9152 + }, + { + "epoch": 1.88159111933395, + "grad_norm": 0.1936180591583252, + "learning_rate": 2.899753088651774e-05, + "loss": 0.5345, + "step": 9153 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 0.1694704294204712, + "learning_rate": 2.8988196053451242e-05, + "loss": 0.4989, + "step": 9154 + }, + { + "epoch": 1.8820022612807072, + "grad_norm": 0.16625025868415833, + "learning_rate": 2.89788620092115e-05, + "loss": 0.5487, + "step": 9155 + }, + { + "epoch": 1.8822078322540858, + "grad_norm": 0.2028672993183136, + "learning_rate": 2.8969528754258344e-05, + "loss": 0.5241, + "step": 9156 + }, + { + "epoch": 1.8824134032274644, + "grad_norm": 0.20288680493831635, + "learning_rate": 2.896019628905156e-05, + "loss": 0.5528, + "step": 9157 + }, + { + "epoch": 1.8826189742008428, + "grad_norm": 0.1986982226371765, + "learning_rate": 2.8950864614050947e-05, + "loss": 0.5214, + "step": 9158 + }, + { + "epoch": 1.8828245451742214, + "grad_norm": 0.16761691868305206, + "learning_rate": 2.8941533729716225e-05, + "loss": 0.5164, + "step": 9159 + }, + { + "epoch": 1.8830301161476, + "grad_norm": 0.17081286013126373, + "learning_rate": 2.8932203636507085e-05, + "loss": 0.5364, + "step": 9160 + }, + { + "epoch": 1.8832356871209786, + "grad_norm": 0.19834405183792114, + "learning_rate": 2.8922874334883166e-05, + "loss": 0.5329, + "step": 9161 + }, + { + "epoch": 1.883441258094357, + "grad_norm": 0.1979091614484787, + "learning_rate": 2.8913545825304082e-05, + "loss": 0.5363, + "step": 9162 + }, + { + "epoch": 1.8836468290677355, + "grad_norm": 0.19581232964992523, + "learning_rate": 2.8904218108229417e-05, + "loss": 0.5278, + "step": 9163 + }, + { + "epoch": 1.883852400041114, + "grad_norm": 0.1917477548122406, + "learning_rate": 2.8894891184118666e-05, + "loss": 0.5051, + "step": 9164 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.1971418410539627, + "learning_rate": 2.888556505343137e-05, + "loss": 0.5333, + "step": 9165 + }, + { + "epoch": 1.8842635419878713, + "grad_norm": 0.19777396321296692, + "learning_rate": 2.8876239716626963e-05, + "loss": 0.5299, + "step": 9166 + }, + { + "epoch": 1.8844691129612499, + "grad_norm": 0.20348910987377167, + "learning_rate": 2.8866915174164866e-05, + "loss": 0.5299, + "step": 9167 + }, + { + "epoch": 1.8846746839346284, + "grad_norm": 0.16637156903743744, + "learning_rate": 2.8857591426504452e-05, + "loss": 0.5043, + "step": 9168 + }, + { + "epoch": 1.884880254908007, + "grad_norm": 0.1301034539937973, + "learning_rate": 2.8848268474105064e-05, + "loss": 0.5234, + "step": 9169 + }, + { + "epoch": 1.8850858258813856, + "grad_norm": 0.16645324230194092, + "learning_rate": 2.8838946317425992e-05, + "loss": 0.5215, + "step": 9170 + }, + { + "epoch": 1.8852913968547642, + "grad_norm": 0.19991381466388702, + "learning_rate": 2.882962495692648e-05, + "loss": 0.5436, + "step": 9171 + }, + { + "epoch": 1.8854969678281428, + "grad_norm": 0.19237017631530762, + "learning_rate": 2.8820304393065785e-05, + "loss": 0.5136, + "step": 9172 + }, + { + "epoch": 1.8857025388015214, + "grad_norm": 0.19153942167758942, + "learning_rate": 2.8810984626303068e-05, + "loss": 0.5263, + "step": 9173 + }, + { + "epoch": 1.8859081097748998, + "grad_norm": 0.20043647289276123, + "learning_rate": 2.8801665657097478e-05, + "loss": 0.5285, + "step": 9174 + }, + { + "epoch": 1.8861136807482783, + "grad_norm": 0.1646028459072113, + "learning_rate": 2.87923474859081e-05, + "loss": 0.4842, + "step": 9175 + }, + { + "epoch": 1.886319251721657, + "grad_norm": 0.1615920066833496, + "learning_rate": 2.8783030113194004e-05, + "loss": 0.5396, + "step": 9176 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 0.19795995950698853, + "learning_rate": 2.8773713539414224e-05, + "loss": 0.5248, + "step": 9177 + }, + { + "epoch": 1.8867303936684139, + "grad_norm": 0.19956757128238678, + "learning_rate": 2.8764397765027717e-05, + "loss": 0.5554, + "step": 9178 + }, + { + "epoch": 1.8869359646417925, + "grad_norm": 0.20267313718795776, + "learning_rate": 2.8755082790493463e-05, + "loss": 0.5167, + "step": 9179 + }, + { + "epoch": 1.887141535615171, + "grad_norm": 0.2114168256521225, + "learning_rate": 2.8745768616270358e-05, + "loss": 0.5346, + "step": 9180 + }, + { + "epoch": 1.8873471065885497, + "grad_norm": 0.1968614161014557, + "learning_rate": 2.873645524281726e-05, + "loss": 0.5359, + "step": 9181 + }, + { + "epoch": 1.8875526775619282, + "grad_norm": 0.19454684853553772, + "learning_rate": 2.8727142670592992e-05, + "loss": 0.5156, + "step": 9182 + }, + { + "epoch": 1.8877582485353068, + "grad_norm": 0.19672390818595886, + "learning_rate": 2.8717830900056353e-05, + "loss": 0.521, + "step": 9183 + }, + { + "epoch": 1.8879638195086854, + "grad_norm": 0.1960788369178772, + "learning_rate": 2.8708519931666074e-05, + "loss": 0.5307, + "step": 9184 + }, + { + "epoch": 1.888169390482064, + "grad_norm": 0.20308300852775574, + "learning_rate": 2.869920976588086e-05, + "loss": 0.535, + "step": 9185 + }, + { + "epoch": 1.8883749614554426, + "grad_norm": 0.2008272409439087, + "learning_rate": 2.86899004031594e-05, + "loss": 0.5472, + "step": 9186 + }, + { + "epoch": 1.8885805324288212, + "grad_norm": 0.20526312291622162, + "learning_rate": 2.8680591843960325e-05, + "loss": 0.5531, + "step": 9187 + }, + { + "epoch": 1.8887861034021998, + "grad_norm": 0.19478276371955872, + "learning_rate": 2.8671284088742203e-05, + "loss": 0.5059, + "step": 9188 + }, + { + "epoch": 1.8889916743755781, + "grad_norm": 0.19619226455688477, + "learning_rate": 2.86619771379636e-05, + "loss": 0.5273, + "step": 9189 + }, + { + "epoch": 1.8891972453489567, + "grad_norm": 0.1992175430059433, + "learning_rate": 2.8652670992083012e-05, + "loss": 0.502, + "step": 9190 + }, + { + "epoch": 1.8894028163223353, + "grad_norm": 0.20220716297626495, + "learning_rate": 2.864336565155891e-05, + "loss": 0.5146, + "step": 9191 + }, + { + "epoch": 1.889608387295714, + "grad_norm": 0.2015795111656189, + "learning_rate": 2.863406111684975e-05, + "loss": 0.5589, + "step": 9192 + }, + { + "epoch": 1.8898139582690923, + "grad_norm": 0.19961073994636536, + "learning_rate": 2.86247573884139e-05, + "loss": 0.5222, + "step": 9193 + }, + { + "epoch": 1.8900195292424709, + "grad_norm": 0.1948811113834381, + "learning_rate": 2.8615454466709714e-05, + "loss": 0.5291, + "step": 9194 + }, + { + "epoch": 1.8902251002158494, + "grad_norm": 0.16690470278263092, + "learning_rate": 2.8606152352195506e-05, + "loss": 0.4997, + "step": 9195 + }, + { + "epoch": 1.890430671189228, + "grad_norm": 0.12287265807390213, + "learning_rate": 2.8596851045329547e-05, + "loss": 0.4877, + "step": 9196 + }, + { + "epoch": 1.8906362421626066, + "grad_norm": 0.1636972278356552, + "learning_rate": 2.8587550546570063e-05, + "loss": 0.5293, + "step": 9197 + }, + { + "epoch": 1.8908418131359852, + "grad_norm": 0.1975325345993042, + "learning_rate": 2.8578250856375253e-05, + "loss": 0.5406, + "step": 9198 + }, + { + "epoch": 1.8910473841093638, + "grad_norm": 0.19792711734771729, + "learning_rate": 2.8568951975203272e-05, + "loss": 0.5435, + "step": 9199 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 0.1665029525756836, + "learning_rate": 2.8559653903512225e-05, + "loss": 0.5264, + "step": 9200 + }, + { + "epoch": 1.891458526056121, + "grad_norm": 0.1603616178035736, + "learning_rate": 2.855035664176019e-05, + "loss": 0.5334, + "step": 9201 + }, + { + "epoch": 1.8916640970294996, + "grad_norm": 0.16326431930065155, + "learning_rate": 2.8541060190405204e-05, + "loss": 0.5195, + "step": 9202 + }, + { + "epoch": 1.8918696680028781, + "grad_norm": 0.17168152332305908, + "learning_rate": 2.8531764549905253e-05, + "loss": 0.5336, + "step": 9203 + }, + { + "epoch": 1.8920752389762565, + "grad_norm": 0.19132192432880402, + "learning_rate": 2.8522469720718287e-05, + "loss": 0.5346, + "step": 9204 + }, + { + "epoch": 1.892280809949635, + "grad_norm": 0.20156370103359222, + "learning_rate": 2.851317570330221e-05, + "loss": 0.53, + "step": 9205 + }, + { + "epoch": 1.8924863809230137, + "grad_norm": 0.19648964703083038, + "learning_rate": 2.850388249811492e-05, + "loss": 0.5264, + "step": 9206 + }, + { + "epoch": 1.8926919518963923, + "grad_norm": 0.19839881360530853, + "learning_rate": 2.8494590105614233e-05, + "loss": 0.5382, + "step": 9207 + }, + { + "epoch": 1.8928975228697706, + "grad_norm": 0.19902363419532776, + "learning_rate": 2.8485298526257956e-05, + "loss": 0.501, + "step": 9208 + }, + { + "epoch": 1.8931030938431492, + "grad_norm": 0.19624055922031403, + "learning_rate": 2.8476007760503814e-05, + "loss": 0.5242, + "step": 9209 + }, + { + "epoch": 1.8933086648165278, + "grad_norm": 0.1998245269060135, + "learning_rate": 2.8466717808809548e-05, + "loss": 0.5281, + "step": 9210 + }, + { + "epoch": 1.8935142357899064, + "grad_norm": 0.2012401521205902, + "learning_rate": 2.845742867163282e-05, + "loss": 0.5282, + "step": 9211 + }, + { + "epoch": 1.893719806763285, + "grad_norm": 0.22999098896980286, + "learning_rate": 2.844814034943124e-05, + "loss": 0.4983, + "step": 9212 + }, + { + "epoch": 1.8939253777366636, + "grad_norm": 0.1972244828939438, + "learning_rate": 2.8438852842662445e-05, + "loss": 0.5242, + "step": 9213 + }, + { + "epoch": 1.8941309487100422, + "grad_norm": 0.19226615130901337, + "learning_rate": 2.8429566151783957e-05, + "loss": 0.5224, + "step": 9214 + }, + { + "epoch": 1.8943365196834208, + "grad_norm": 0.19878603518009186, + "learning_rate": 2.8420280277253303e-05, + "loss": 0.5235, + "step": 9215 + }, + { + "epoch": 1.8945420906567993, + "grad_norm": 0.19549743831157684, + "learning_rate": 2.8410995219527937e-05, + "loss": 0.5459, + "step": 9216 + }, + { + "epoch": 1.894747661630178, + "grad_norm": 0.19706833362579346, + "learning_rate": 2.8401710979065313e-05, + "loss": 0.5388, + "step": 9217 + }, + { + "epoch": 1.8949532326035565, + "grad_norm": 0.19738836586475372, + "learning_rate": 2.839242755632279e-05, + "loss": 0.5026, + "step": 9218 + }, + { + "epoch": 1.895158803576935, + "grad_norm": 0.1891833394765854, + "learning_rate": 2.838314495175774e-05, + "loss": 0.5232, + "step": 9219 + }, + { + "epoch": 1.8953643745503135, + "grad_norm": 0.2042219191789627, + "learning_rate": 2.837386316582748e-05, + "loss": 0.5171, + "step": 9220 + }, + { + "epoch": 1.895569945523692, + "grad_norm": 0.19573958218097687, + "learning_rate": 2.8364582198989256e-05, + "loss": 0.521, + "step": 9221 + }, + { + "epoch": 1.8957755164970707, + "grad_norm": 0.1948520541191101, + "learning_rate": 2.835530205170033e-05, + "loss": 0.5316, + "step": 9222 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 0.1974736452102661, + "learning_rate": 2.8346022724417877e-05, + "loss": 0.5227, + "step": 9223 + }, + { + "epoch": 1.8961866584438276, + "grad_norm": 0.19401709735393524, + "learning_rate": 2.8336744217599044e-05, + "loss": 0.5546, + "step": 9224 + }, + { + "epoch": 1.8963922294172062, + "grad_norm": 0.2048596292734146, + "learning_rate": 2.832746653170093e-05, + "loss": 0.536, + "step": 9225 + }, + { + "epoch": 1.8965978003905848, + "grad_norm": 0.19587990641593933, + "learning_rate": 2.8318189667180604e-05, + "loss": 0.5109, + "step": 9226 + }, + { + "epoch": 1.8968033713639634, + "grad_norm": 0.20599375665187836, + "learning_rate": 2.8308913624495113e-05, + "loss": 0.5001, + "step": 9227 + }, + { + "epoch": 1.897008942337342, + "grad_norm": 0.12338798493146896, + "learning_rate": 2.829963840410144e-05, + "loss": 0.5183, + "step": 9228 + }, + { + "epoch": 1.8972145133107206, + "grad_norm": 0.16888199746608734, + "learning_rate": 2.829036400645652e-05, + "loss": 0.5375, + "step": 9229 + }, + { + "epoch": 1.8974200842840991, + "grad_norm": 0.1930612325668335, + "learning_rate": 2.8281090432017264e-05, + "loss": 0.5138, + "step": 9230 + }, + { + "epoch": 1.8976256552574777, + "grad_norm": 0.19552965462207794, + "learning_rate": 2.827181768124054e-05, + "loss": 0.5154, + "step": 9231 + }, + { + "epoch": 1.8978312262308563, + "grad_norm": 0.1639067679643631, + "learning_rate": 2.8262545754583176e-05, + "loss": 0.5142, + "step": 9232 + }, + { + "epoch": 1.898036797204235, + "grad_norm": 0.15904250741004944, + "learning_rate": 2.8253274652501932e-05, + "loss": 0.5298, + "step": 9233 + }, + { + "epoch": 1.8982423681776133, + "grad_norm": 0.19867335259914398, + "learning_rate": 2.824400437545359e-05, + "loss": 0.5261, + "step": 9234 + }, + { + "epoch": 1.8984479391509919, + "grad_norm": 0.16743482649326324, + "learning_rate": 2.8234734923894837e-05, + "loss": 0.4987, + "step": 9235 + }, + { + "epoch": 1.8986535101243704, + "grad_norm": 0.15749593079090118, + "learning_rate": 2.822546629828233e-05, + "loss": 0.5142, + "step": 9236 + }, + { + "epoch": 1.898859081097749, + "grad_norm": 0.21486081182956696, + "learning_rate": 2.8216198499072697e-05, + "loss": 0.5442, + "step": 9237 + }, + { + "epoch": 1.8990646520711274, + "grad_norm": 0.16459722816944122, + "learning_rate": 2.8206931526722516e-05, + "loss": 0.5078, + "step": 9238 + }, + { + "epoch": 1.899270223044506, + "grad_norm": 0.16410863399505615, + "learning_rate": 2.819766538168832e-05, + "loss": 0.5079, + "step": 9239 + }, + { + "epoch": 1.8994757940178846, + "grad_norm": 0.20152071118354797, + "learning_rate": 2.8188400064426613e-05, + "loss": 0.5097, + "step": 9240 + }, + { + "epoch": 1.8996813649912632, + "grad_norm": 0.19304706156253815, + "learning_rate": 2.8179135575393867e-05, + "loss": 0.5271, + "step": 9241 + }, + { + "epoch": 1.8998869359646418, + "grad_norm": 0.19228653609752655, + "learning_rate": 2.8169871915046488e-05, + "loss": 0.5202, + "step": 9242 + }, + { + "epoch": 1.9000925069380203, + "grad_norm": 0.20129820704460144, + "learning_rate": 2.816060908384086e-05, + "loss": 0.527, + "step": 9243 + }, + { + "epoch": 1.900298077911399, + "grad_norm": 0.203932523727417, + "learning_rate": 2.8151347082233317e-05, + "loss": 0.5486, + "step": 9244 + }, + { + "epoch": 1.9005036488847775, + "grad_norm": 0.1936814785003662, + "learning_rate": 2.8142085910680153e-05, + "loss": 0.5111, + "step": 9245 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 0.198873832821846, + "learning_rate": 2.813282556963762e-05, + "loss": 0.5324, + "step": 9246 + }, + { + "epoch": 1.9009147908315347, + "grad_norm": 0.19669774174690247, + "learning_rate": 2.8123566059561917e-05, + "loss": 0.5218, + "step": 9247 + }, + { + "epoch": 1.9011203618049133, + "grad_norm": 0.19902367889881134, + "learning_rate": 2.8114307380909255e-05, + "loss": 0.5155, + "step": 9248 + }, + { + "epoch": 1.9013259327782919, + "grad_norm": 0.1605267971754074, + "learning_rate": 2.8105049534135744e-05, + "loss": 0.502, + "step": 9249 + }, + { + "epoch": 1.9015315037516702, + "grad_norm": 0.16073958575725555, + "learning_rate": 2.8095792519697472e-05, + "loss": 0.5487, + "step": 9250 + }, + { + "epoch": 1.9017370747250488, + "grad_norm": 0.19656385481357574, + "learning_rate": 2.8086536338050488e-05, + "loss": 0.5008, + "step": 9251 + }, + { + "epoch": 1.9019426456984274, + "grad_norm": 0.20274598896503448, + "learning_rate": 2.807728098965081e-05, + "loss": 0.5181, + "step": 9252 + }, + { + "epoch": 1.9021482166718058, + "grad_norm": 0.1996408849954605, + "learning_rate": 2.8068026474954407e-05, + "loss": 0.5041, + "step": 9253 + }, + { + "epoch": 1.9023537876451844, + "grad_norm": 0.20199070870876312, + "learning_rate": 2.805877279441717e-05, + "loss": 0.5173, + "step": 9254 + }, + { + "epoch": 1.902559358618563, + "grad_norm": 0.16346138715744019, + "learning_rate": 2.804951994849504e-05, + "loss": 0.5007, + "step": 9255 + }, + { + "epoch": 1.9027649295919415, + "grad_norm": 0.16361282765865326, + "learning_rate": 2.8040267937643842e-05, + "loss": 0.5564, + "step": 9256 + }, + { + "epoch": 1.9029705005653201, + "grad_norm": 0.1939617544412613, + "learning_rate": 2.8031016762319366e-05, + "loss": 0.5369, + "step": 9257 + }, + { + "epoch": 1.9031760715386987, + "grad_norm": 0.20072026550769806, + "learning_rate": 2.802176642297738e-05, + "loss": 0.5177, + "step": 9258 + }, + { + "epoch": 1.9033816425120773, + "grad_norm": 0.20429256558418274, + "learning_rate": 2.801251692007361e-05, + "loss": 0.5275, + "step": 9259 + }, + { + "epoch": 1.903587213485456, + "grad_norm": 0.20607547461986542, + "learning_rate": 2.8003268254063734e-05, + "loss": 0.5278, + "step": 9260 + }, + { + "epoch": 1.9037927844588345, + "grad_norm": 0.19499348104000092, + "learning_rate": 2.7994020425403363e-05, + "loss": 0.5303, + "step": 9261 + }, + { + "epoch": 1.903998355432213, + "grad_norm": 0.19774754345417023, + "learning_rate": 2.7984773434548146e-05, + "loss": 0.5161, + "step": 9262 + }, + { + "epoch": 1.9042039264055917, + "grad_norm": 0.2001447230577469, + "learning_rate": 2.79755272819536e-05, + "loss": 0.5278, + "step": 9263 + }, + { + "epoch": 1.9044094973789703, + "grad_norm": 0.18907634913921356, + "learning_rate": 2.7966281968075258e-05, + "loss": 0.5056, + "step": 9264 + }, + { + "epoch": 1.9046150683523486, + "grad_norm": 0.19662658870220184, + "learning_rate": 2.7957037493368595e-05, + "loss": 0.5241, + "step": 9265 + }, + { + "epoch": 1.9048206393257272, + "grad_norm": 0.16739560663700104, + "learning_rate": 2.794779385828903e-05, + "loss": 0.5178, + "step": 9266 + }, + { + "epoch": 1.9050262102991058, + "grad_norm": 0.16151371598243713, + "learning_rate": 2.7938551063291945e-05, + "loss": 0.5285, + "step": 9267 + }, + { + "epoch": 1.9052317812724842, + "grad_norm": 0.20817650854587555, + "learning_rate": 2.7929309108832727e-05, + "loss": 0.5066, + "step": 9268 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 0.20188304781913757, + "learning_rate": 2.7920067995366655e-05, + "loss": 0.5425, + "step": 9269 + }, + { + "epoch": 1.9056429232192413, + "grad_norm": 0.20524436235427856, + "learning_rate": 2.7910827723348997e-05, + "loss": 0.5229, + "step": 9270 + }, + { + "epoch": 1.90584849419262, + "grad_norm": 0.1990385204553604, + "learning_rate": 2.790158829323499e-05, + "loss": 0.5126, + "step": 9271 + }, + { + "epoch": 1.9060540651659985, + "grad_norm": 0.19942565262317657, + "learning_rate": 2.7892349705479808e-05, + "loss": 0.5362, + "step": 9272 + }, + { + "epoch": 1.906259636139377, + "grad_norm": 0.2039434313774109, + "learning_rate": 2.7883111960538585e-05, + "loss": 0.517, + "step": 9273 + }, + { + "epoch": 1.9064652071127557, + "grad_norm": 0.20296645164489746, + "learning_rate": 2.7873875058866438e-05, + "loss": 0.5199, + "step": 9274 + }, + { + "epoch": 1.9066707780861343, + "grad_norm": 0.16485995054244995, + "learning_rate": 2.786463900091842e-05, + "loss": 0.4991, + "step": 9275 + }, + { + "epoch": 1.9068763490595129, + "grad_norm": 0.12144458293914795, + "learning_rate": 2.7855403787149536e-05, + "loss": 0.5215, + "step": 9276 + }, + { + "epoch": 1.9070819200328915, + "grad_norm": 0.12911517918109894, + "learning_rate": 2.7846169418014794e-05, + "loss": 0.5127, + "step": 9277 + }, + { + "epoch": 1.90728749100627, + "grad_norm": 0.16436396539211273, + "learning_rate": 2.78369358939691e-05, + "loss": 0.5474, + "step": 9278 + }, + { + "epoch": 1.9074930619796486, + "grad_norm": 0.19795219600200653, + "learning_rate": 2.782770321546736e-05, + "loss": 0.5031, + "step": 9279 + }, + { + "epoch": 1.907698632953027, + "grad_norm": 0.20450328290462494, + "learning_rate": 2.7818471382964418e-05, + "loss": 0.5389, + "step": 9280 + }, + { + "epoch": 1.9079042039264056, + "grad_norm": 0.19718262553215027, + "learning_rate": 2.7809240396915066e-05, + "loss": 0.5057, + "step": 9281 + }, + { + "epoch": 1.9081097748997842, + "grad_norm": 0.19920390844345093, + "learning_rate": 2.7800010257774107e-05, + "loss": 0.5347, + "step": 9282 + }, + { + "epoch": 1.9083153458731628, + "grad_norm": 0.19880931079387665, + "learning_rate": 2.7790780965996248e-05, + "loss": 0.525, + "step": 9283 + }, + { + "epoch": 1.9085209168465411, + "grad_norm": 0.19791699945926666, + "learning_rate": 2.778155252203618e-05, + "loss": 0.5229, + "step": 9284 + }, + { + "epoch": 1.9087264878199197, + "grad_norm": 0.19774897396564484, + "learning_rate": 2.7772324926348524e-05, + "loss": 0.4856, + "step": 9285 + }, + { + "epoch": 1.9089320587932983, + "grad_norm": 0.20268942415714264, + "learning_rate": 2.7763098179387917e-05, + "loss": 0.5158, + "step": 9286 + }, + { + "epoch": 1.909137629766677, + "grad_norm": 0.19894084334373474, + "learning_rate": 2.7753872281608892e-05, + "loss": 0.5155, + "step": 9287 + }, + { + "epoch": 1.9093432007400555, + "grad_norm": 0.20365378260612488, + "learning_rate": 2.774464723346595e-05, + "loss": 0.5379, + "step": 9288 + }, + { + "epoch": 1.909548771713434, + "grad_norm": 0.20348501205444336, + "learning_rate": 2.773542303541361e-05, + "loss": 0.5442, + "step": 9289 + }, + { + "epoch": 1.9097543426868127, + "grad_norm": 0.20363953709602356, + "learning_rate": 2.772619968790628e-05, + "loss": 0.5028, + "step": 9290 + }, + { + "epoch": 1.9099599136601912, + "grad_norm": 0.20089566707611084, + "learning_rate": 2.771697719139836e-05, + "loss": 0.5419, + "step": 9291 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 0.19938144087791443, + "learning_rate": 2.7707755546344185e-05, + "loss": 0.5269, + "step": 9292 + }, + { + "epoch": 1.9103710556069484, + "grad_norm": 0.1974303126335144, + "learning_rate": 2.7698534753198074e-05, + "loss": 0.5038, + "step": 9293 + }, + { + "epoch": 1.910576626580327, + "grad_norm": 0.16670559346675873, + "learning_rate": 2.768931481241428e-05, + "loss": 0.498, + "step": 9294 + }, + { + "epoch": 1.9107821975537054, + "grad_norm": 0.16241280734539032, + "learning_rate": 2.768009572444703e-05, + "loss": 0.4979, + "step": 9295 + }, + { + "epoch": 1.910987768527084, + "grad_norm": 0.2039078176021576, + "learning_rate": 2.767087748975053e-05, + "loss": 0.5426, + "step": 9296 + }, + { + "epoch": 1.9111933395004626, + "grad_norm": 0.20147615671157837, + "learning_rate": 2.766166010877889e-05, + "loss": 0.5098, + "step": 9297 + }, + { + "epoch": 1.9113989104738411, + "grad_norm": 0.19318887591362, + "learning_rate": 2.765244358198621e-05, + "loss": 0.5412, + "step": 9298 + }, + { + "epoch": 1.9116044814472195, + "grad_norm": 0.19322159886360168, + "learning_rate": 2.7643227909826573e-05, + "loss": 0.5412, + "step": 9299 + }, + { + "epoch": 1.911810052420598, + "grad_norm": 0.1994897574186325, + "learning_rate": 2.7634013092753962e-05, + "loss": 0.5362, + "step": 9300 + }, + { + "epoch": 1.9120156233939767, + "grad_norm": 0.19642673432826996, + "learning_rate": 2.762479913122236e-05, + "loss": 0.5088, + "step": 9301 + }, + { + "epoch": 1.9122211943673553, + "grad_norm": 0.20559348165988922, + "learning_rate": 2.761558602568567e-05, + "loss": 0.5465, + "step": 9302 + }, + { + "epoch": 1.9124267653407339, + "grad_norm": 0.1719941794872284, + "learning_rate": 2.760637377659782e-05, + "loss": 0.4991, + "step": 9303 + }, + { + "epoch": 1.9126323363141124, + "grad_norm": 0.1635911911725998, + "learning_rate": 2.7597162384412645e-05, + "loss": 0.5127, + "step": 9304 + }, + { + "epoch": 1.912837907287491, + "grad_norm": 0.19266277551651, + "learning_rate": 2.7587951849583936e-05, + "loss": 0.5235, + "step": 9305 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.20263995230197906, + "learning_rate": 2.757874217256544e-05, + "loss": 0.5287, + "step": 9306 + }, + { + "epoch": 1.9132490492342482, + "grad_norm": 0.21031515300273895, + "learning_rate": 2.75695333538109e-05, + "loss": 0.5176, + "step": 9307 + }, + { + "epoch": 1.9134546202076268, + "grad_norm": 0.19321498274803162, + "learning_rate": 2.7560325393773992e-05, + "loss": 0.5296, + "step": 9308 + }, + { + "epoch": 1.9136601911810054, + "grad_norm": 0.16547061502933502, + "learning_rate": 2.7551118292908317e-05, + "loss": 0.5214, + "step": 9309 + }, + { + "epoch": 1.9138657621543838, + "grad_norm": 0.15889470279216766, + "learning_rate": 2.7541912051667503e-05, + "loss": 0.5355, + "step": 9310 + }, + { + "epoch": 1.9140713331277623, + "grad_norm": 0.19868826866149902, + "learning_rate": 2.7532706670505082e-05, + "loss": 0.5194, + "step": 9311 + }, + { + "epoch": 1.914276904101141, + "grad_norm": 0.20042477548122406, + "learning_rate": 2.7523502149874562e-05, + "loss": 0.4973, + "step": 9312 + }, + { + "epoch": 1.9144824750745195, + "grad_norm": 0.2017516791820526, + "learning_rate": 2.7514298490229408e-05, + "loss": 0.5121, + "step": 9313 + }, + { + "epoch": 1.9146880460478979, + "grad_norm": 0.19849123060703278, + "learning_rate": 2.7505095692023043e-05, + "loss": 0.527, + "step": 9314 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 0.19380970299243927, + "learning_rate": 2.7495893755708823e-05, + "loss": 0.5175, + "step": 9315 + }, + { + "epoch": 1.915099187994655, + "grad_norm": 0.16672258079051971, + "learning_rate": 2.748669268174011e-05, + "loss": 0.4853, + "step": 9316 + }, + { + "epoch": 1.9153047589680336, + "grad_norm": 0.1614876687526703, + "learning_rate": 2.74774924705702e-05, + "loss": 0.5163, + "step": 9317 + }, + { + "epoch": 1.9155103299414122, + "grad_norm": 0.1949508935213089, + "learning_rate": 2.746829312265233e-05, + "loss": 0.515, + "step": 9318 + }, + { + "epoch": 1.9157159009147908, + "grad_norm": 0.2158740758895874, + "learning_rate": 2.745909463843972e-05, + "loss": 0.5089, + "step": 9319 + }, + { + "epoch": 1.9159214718881694, + "grad_norm": 0.16678479313850403, + "learning_rate": 2.744989701838553e-05, + "loss": 0.5147, + "step": 9320 + }, + { + "epoch": 1.916127042861548, + "grad_norm": 0.16045857965946198, + "learning_rate": 2.7440700262942893e-05, + "loss": 0.505, + "step": 9321 + }, + { + "epoch": 1.9163326138349266, + "grad_norm": 0.17274217307567596, + "learning_rate": 2.7431504372564874e-05, + "loss": 0.5002, + "step": 9322 + }, + { + "epoch": 1.9165381848083052, + "grad_norm": 0.16283760964870453, + "learning_rate": 2.7422309347704505e-05, + "loss": 0.5303, + "step": 9323 + }, + { + "epoch": 1.9167437557816838, + "grad_norm": 0.1970645785331726, + "learning_rate": 2.741311518881481e-05, + "loss": 0.5198, + "step": 9324 + }, + { + "epoch": 1.9169493267550621, + "grad_norm": 0.20442216098308563, + "learning_rate": 2.7403921896348735e-05, + "loss": 0.4928, + "step": 9325 + }, + { + "epoch": 1.9171548977284407, + "grad_norm": 0.19744066894054413, + "learning_rate": 2.739472947075918e-05, + "loss": 0.5099, + "step": 9326 + }, + { + "epoch": 1.9173604687018193, + "grad_norm": 0.20807257294654846, + "learning_rate": 2.7385537912499014e-05, + "loss": 0.5173, + "step": 9327 + }, + { + "epoch": 1.917566039675198, + "grad_norm": 0.19921061396598816, + "learning_rate": 2.7376347222021067e-05, + "loss": 0.5094, + "step": 9328 + }, + { + "epoch": 1.9177716106485763, + "grad_norm": 0.1887097805738449, + "learning_rate": 2.7367157399778125e-05, + "loss": 0.514, + "step": 9329 + }, + { + "epoch": 1.9179771816219549, + "grad_norm": 0.1967703402042389, + "learning_rate": 2.7357968446222903e-05, + "loss": 0.5085, + "step": 9330 + }, + { + "epoch": 1.9181827525953334, + "grad_norm": 0.1980697363615036, + "learning_rate": 2.734878036180813e-05, + "loss": 0.5417, + "step": 9331 + }, + { + "epoch": 1.918388323568712, + "grad_norm": 0.20071645081043243, + "learning_rate": 2.733959314698645e-05, + "loss": 0.5293, + "step": 9332 + }, + { + "epoch": 1.9185938945420906, + "grad_norm": 0.1977865993976593, + "learning_rate": 2.7330406802210472e-05, + "loss": 0.5359, + "step": 9333 + }, + { + "epoch": 1.9187994655154692, + "grad_norm": 0.19883140921592712, + "learning_rate": 2.7321221327932762e-05, + "loss": 0.5049, + "step": 9334 + }, + { + "epoch": 1.9190050364888478, + "grad_norm": 0.19968102872371674, + "learning_rate": 2.7312036724605848e-05, + "loss": 0.5255, + "step": 9335 + }, + { + "epoch": 1.9192106074622264, + "grad_norm": 0.19368182122707367, + "learning_rate": 2.7302852992682212e-05, + "loss": 0.5299, + "step": 9336 + }, + { + "epoch": 1.919416178435605, + "grad_norm": 0.18962502479553223, + "learning_rate": 2.7293670132614277e-05, + "loss": 0.505, + "step": 9337 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 0.19553595781326294, + "learning_rate": 2.7284488144854465e-05, + "loss": 0.5214, + "step": 9338 + }, + { + "epoch": 1.9198273203823621, + "grad_norm": 0.1957550048828125, + "learning_rate": 2.7275307029855118e-05, + "loss": 0.5377, + "step": 9339 + }, + { + "epoch": 1.9200328913557407, + "grad_norm": 0.19873984158039093, + "learning_rate": 2.726612678806856e-05, + "loss": 0.53, + "step": 9340 + }, + { + "epoch": 1.920238462329119, + "grad_norm": 0.2044048011302948, + "learning_rate": 2.7256947419947038e-05, + "loss": 0.5364, + "step": 9341 + }, + { + "epoch": 1.9204440333024977, + "grad_norm": 0.1971905678510666, + "learning_rate": 2.7247768925942793e-05, + "loss": 0.5233, + "step": 9342 + }, + { + "epoch": 1.9206496042758763, + "grad_norm": 0.15951332449913025, + "learning_rate": 2.7238591306507985e-05, + "loss": 0.5017, + "step": 9343 + }, + { + "epoch": 1.9208551752492546, + "grad_norm": 0.172995924949646, + "learning_rate": 2.722941456209478e-05, + "loss": 0.5254, + "step": 9344 + }, + { + "epoch": 1.9210607462226332, + "grad_norm": 0.2066241055727005, + "learning_rate": 2.7220238693155255e-05, + "loss": 0.5268, + "step": 9345 + }, + { + "epoch": 1.9212663171960118, + "grad_norm": 0.19944432377815247, + "learning_rate": 2.721106370014147e-05, + "loss": 0.5281, + "step": 9346 + }, + { + "epoch": 1.9214718881693904, + "grad_norm": 0.20762419700622559, + "learning_rate": 2.7201889583505433e-05, + "loss": 0.5314, + "step": 9347 + }, + { + "epoch": 1.921677459142769, + "grad_norm": 0.16759265959262848, + "learning_rate": 2.7192716343699114e-05, + "loss": 0.4948, + "step": 9348 + }, + { + "epoch": 1.9218830301161476, + "grad_norm": 0.12171138823032379, + "learning_rate": 2.718354398117441e-05, + "loss": 0.4984, + "step": 9349 + }, + { + "epoch": 1.9220886010895262, + "grad_norm": 0.17452505230903625, + "learning_rate": 2.7174372496383224e-05, + "loss": 0.5404, + "step": 9350 + }, + { + "epoch": 1.9222941720629048, + "grad_norm": 0.16276037693023682, + "learning_rate": 2.716520188977739e-05, + "loss": 0.5044, + "step": 9351 + }, + { + "epoch": 1.9224997430362833, + "grad_norm": 0.1610327959060669, + "learning_rate": 2.7156032161808704e-05, + "loss": 0.5304, + "step": 9352 + }, + { + "epoch": 1.922705314009662, + "grad_norm": 0.2447415590286255, + "learning_rate": 2.7146863312928917e-05, + "loss": 0.5119, + "step": 9353 + }, + { + "epoch": 1.9229108849830405, + "grad_norm": 0.19157683849334717, + "learning_rate": 2.7137695343589725e-05, + "loss": 0.5232, + "step": 9354 + }, + { + "epoch": 1.9231164559564191, + "grad_norm": 0.20079728960990906, + "learning_rate": 2.71285282542428e-05, + "loss": 0.5146, + "step": 9355 + }, + { + "epoch": 1.9233220269297975, + "grad_norm": 0.20246592164039612, + "learning_rate": 2.7119362045339755e-05, + "loss": 0.5289, + "step": 9356 + }, + { + "epoch": 1.923527597903176, + "grad_norm": 0.1998445987701416, + "learning_rate": 2.7110196717332144e-05, + "loss": 0.537, + "step": 9357 + }, + { + "epoch": 1.9237331688765547, + "grad_norm": 0.20412832498550415, + "learning_rate": 2.7101032270671548e-05, + "loss": 0.5388, + "step": 9358 + }, + { + "epoch": 1.9239387398499332, + "grad_norm": 0.19689737260341644, + "learning_rate": 2.709186870580943e-05, + "loss": 0.529, + "step": 9359 + }, + { + "epoch": 1.9241443108233116, + "grad_norm": 0.19693289697170258, + "learning_rate": 2.7082706023197238e-05, + "loss": 0.5362, + "step": 9360 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 0.1994449496269226, + "learning_rate": 2.707354422328637e-05, + "loss": 0.5326, + "step": 9361 + }, + { + "epoch": 1.9245554527700688, + "grad_norm": 0.20162896811962128, + "learning_rate": 2.7064383306528194e-05, + "loss": 0.5167, + "step": 9362 + }, + { + "epoch": 1.9247610237434474, + "grad_norm": 0.19568750262260437, + "learning_rate": 2.7055223273374027e-05, + "loss": 0.5314, + "step": 9363 + }, + { + "epoch": 1.924966594716826, + "grad_norm": 0.20198176801204681, + "learning_rate": 2.7046064124275115e-05, + "loss": 0.5225, + "step": 9364 + }, + { + "epoch": 1.9251721656902046, + "grad_norm": 0.22592489421367645, + "learning_rate": 2.7036905859682726e-05, + "loss": 0.5282, + "step": 9365 + }, + { + "epoch": 1.9253777366635831, + "grad_norm": 0.19349443912506104, + "learning_rate": 2.7027748480048022e-05, + "loss": 0.4946, + "step": 9366 + }, + { + "epoch": 1.9255833076369617, + "grad_norm": 0.20024524629116058, + "learning_rate": 2.701859198582215e-05, + "loss": 0.5214, + "step": 9367 + }, + { + "epoch": 1.9257888786103403, + "grad_norm": 0.19572319090366364, + "learning_rate": 2.700943637745621e-05, + "loss": 0.5243, + "step": 9368 + }, + { + "epoch": 1.925994449583719, + "grad_norm": 0.20359370112419128, + "learning_rate": 2.7000281655401248e-05, + "loss": 0.5192, + "step": 9369 + }, + { + "epoch": 1.9262000205570975, + "grad_norm": 0.17284277081489563, + "learning_rate": 2.6991127820108274e-05, + "loss": 0.5126, + "step": 9370 + }, + { + "epoch": 1.9264055915304759, + "grad_norm": 0.17155306041240692, + "learning_rate": 2.6981974872028255e-05, + "loss": 0.5354, + "step": 9371 + }, + { + "epoch": 1.9266111625038544, + "grad_norm": 0.20635953545570374, + "learning_rate": 2.6972822811612127e-05, + "loss": 0.5047, + "step": 9372 + }, + { + "epoch": 1.926816733477233, + "grad_norm": 0.16604094207286835, + "learning_rate": 2.696367163931075e-05, + "loss": 0.5067, + "step": 9373 + }, + { + "epoch": 1.9270223044506116, + "grad_norm": 0.15949425101280212, + "learning_rate": 2.695452135557498e-05, + "loss": 0.5084, + "step": 9374 + }, + { + "epoch": 1.92722787542399, + "grad_norm": 0.19722892343997955, + "learning_rate": 2.69453719608556e-05, + "loss": 0.5247, + "step": 9375 + }, + { + "epoch": 1.9274334463973686, + "grad_norm": 0.189317524433136, + "learning_rate": 2.6936223455603357e-05, + "loss": 0.5275, + "step": 9376 + }, + { + "epoch": 1.9276390173707472, + "grad_norm": 0.193404883146286, + "learning_rate": 2.6927075840268952e-05, + "loss": 0.5003, + "step": 9377 + }, + { + "epoch": 1.9278445883441258, + "grad_norm": 0.18967877328395844, + "learning_rate": 2.6917929115303032e-05, + "loss": 0.5414, + "step": 9378 + }, + { + "epoch": 1.9280501593175043, + "grad_norm": 0.2023673802614212, + "learning_rate": 2.690878328115625e-05, + "loss": 0.5172, + "step": 9379 + }, + { + "epoch": 1.928255730290883, + "grad_norm": 0.1624782383441925, + "learning_rate": 2.6899638338279148e-05, + "loss": 0.5076, + "step": 9380 + }, + { + "epoch": 1.9284613012642615, + "grad_norm": 0.1585642248392105, + "learning_rate": 2.6890494287122268e-05, + "loss": 0.5232, + "step": 9381 + }, + { + "epoch": 1.92866687223764, + "grad_norm": 0.20032867789268494, + "learning_rate": 2.6881351128136084e-05, + "loss": 0.5015, + "step": 9382 + }, + { + "epoch": 1.9288724432110187, + "grad_norm": 0.20595210790634155, + "learning_rate": 2.6872208861771055e-05, + "loss": 0.5079, + "step": 9383 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 0.2049880176782608, + "learning_rate": 2.6863067488477565e-05, + "loss": 0.5073, + "step": 9384 + }, + { + "epoch": 1.9292835851577759, + "grad_norm": 0.19586196541786194, + "learning_rate": 2.6853927008705945e-05, + "loss": 0.5362, + "step": 9385 + }, + { + "epoch": 1.9294891561311542, + "grad_norm": 0.19678068161010742, + "learning_rate": 2.684478742290655e-05, + "loss": 0.507, + "step": 9386 + }, + { + "epoch": 1.9296947271045328, + "grad_norm": 0.15755969285964966, + "learning_rate": 2.683564873152962e-05, + "loss": 0.5057, + "step": 9387 + }, + { + "epoch": 1.9299002980779114, + "grad_norm": 0.16498331725597382, + "learning_rate": 2.6826510935025375e-05, + "loss": 0.5442, + "step": 9388 + }, + { + "epoch": 1.93010586905129, + "grad_norm": 0.19928227365016937, + "learning_rate": 2.681737403384399e-05, + "loss": 0.521, + "step": 9389 + }, + { + "epoch": 1.9303114400246684, + "grad_norm": 0.1977323740720749, + "learning_rate": 2.680823802843561e-05, + "loss": 0.528, + "step": 9390 + }, + { + "epoch": 1.930517010998047, + "grad_norm": 0.1704244613647461, + "learning_rate": 2.67991029192503e-05, + "loss": 0.509, + "step": 9391 + }, + { + "epoch": 1.9307225819714255, + "grad_norm": 0.16151131689548492, + "learning_rate": 2.6789968706738123e-05, + "loss": 0.5298, + "step": 9392 + }, + { + "epoch": 1.9309281529448041, + "grad_norm": 0.18972033262252808, + "learning_rate": 2.678083539134908e-05, + "loss": 0.5135, + "step": 9393 + }, + { + "epoch": 1.9311337239181827, + "grad_norm": 0.19905173778533936, + "learning_rate": 2.677170297353311e-05, + "loss": 0.496, + "step": 9394 + }, + { + "epoch": 1.9313392948915613, + "grad_norm": 0.21623218059539795, + "learning_rate": 2.6762571453740148e-05, + "loss": 0.539, + "step": 9395 + }, + { + "epoch": 1.93154486586494, + "grad_norm": 0.16825906932353973, + "learning_rate": 2.675344083242005e-05, + "loss": 0.5124, + "step": 9396 + }, + { + "epoch": 1.9317504368383185, + "grad_norm": 0.12174926698207855, + "learning_rate": 2.674431111002263e-05, + "loss": 0.5125, + "step": 9397 + }, + { + "epoch": 1.931956007811697, + "grad_norm": 0.16127155721187592, + "learning_rate": 2.6735182286997685e-05, + "loss": 0.5231, + "step": 9398 + }, + { + "epoch": 1.9321615787850757, + "grad_norm": 0.19533561170101166, + "learning_rate": 2.6726054363794914e-05, + "loss": 0.5015, + "step": 9399 + }, + { + "epoch": 1.9323671497584543, + "grad_norm": 0.1924934983253479, + "learning_rate": 2.671692734086405e-05, + "loss": 0.5085, + "step": 9400 + }, + { + "epoch": 1.9325727207318326, + "grad_norm": 0.1985793113708496, + "learning_rate": 2.6707801218654726e-05, + "loss": 0.5133, + "step": 9401 + }, + { + "epoch": 1.9327782917052112, + "grad_norm": 0.17007775604724884, + "learning_rate": 2.669867599761654e-05, + "loss": 0.5008, + "step": 9402 + }, + { + "epoch": 1.9329838626785898, + "grad_norm": 0.16425763070583344, + "learning_rate": 2.6689551678199035e-05, + "loss": 0.5018, + "step": 9403 + }, + { + "epoch": 1.9331894336519684, + "grad_norm": 0.17384882271289825, + "learning_rate": 2.6680428260851744e-05, + "loss": 0.4811, + "step": 9404 + }, + { + "epoch": 1.9333950046253467, + "grad_norm": 0.1561937779188156, + "learning_rate": 2.6671305746024126e-05, + "loss": 0.5158, + "step": 9405 + }, + { + "epoch": 1.9336005755987253, + "grad_norm": 0.20057018101215363, + "learning_rate": 2.6662184134165594e-05, + "loss": 0.5178, + "step": 9406 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 0.17240118980407715, + "learning_rate": 2.6653063425725552e-05, + "loss": 0.4964, + "step": 9407 + }, + { + "epoch": 1.9340117175454825, + "grad_norm": 0.16643132269382477, + "learning_rate": 2.664394362115332e-05, + "loss": 0.5116, + "step": 9408 + }, + { + "epoch": 1.934217288518861, + "grad_norm": 0.19673089683055878, + "learning_rate": 2.6634824720898195e-05, + "loss": 0.5233, + "step": 9409 + }, + { + "epoch": 1.9344228594922397, + "grad_norm": 0.19296656548976898, + "learning_rate": 2.6625706725409412e-05, + "loss": 0.5305, + "step": 9410 + }, + { + "epoch": 1.9346284304656183, + "grad_norm": 0.18779776990413666, + "learning_rate": 2.6616589635136185e-05, + "loss": 0.5354, + "step": 9411 + }, + { + "epoch": 1.9348340014389969, + "grad_norm": 0.19164229929447174, + "learning_rate": 2.6607473450527648e-05, + "loss": 0.5135, + "step": 9412 + }, + { + "epoch": 1.9350395724123755, + "grad_norm": 0.19808048009872437, + "learning_rate": 2.6598358172032928e-05, + "loss": 0.4932, + "step": 9413 + }, + { + "epoch": 1.935245143385754, + "grad_norm": 0.19213752448558807, + "learning_rate": 2.65892438001011e-05, + "loss": 0.5196, + "step": 9414 + }, + { + "epoch": 1.9354507143591326, + "grad_norm": 0.19726723432540894, + "learning_rate": 2.658013033518117e-05, + "loss": 0.523, + "step": 9415 + }, + { + "epoch": 1.935656285332511, + "grad_norm": 0.19998745620250702, + "learning_rate": 2.657101777772214e-05, + "loss": 0.5311, + "step": 9416 + }, + { + "epoch": 1.9358618563058896, + "grad_norm": 0.2027643620967865, + "learning_rate": 2.6561906128172917e-05, + "loss": 0.5243, + "step": 9417 + }, + { + "epoch": 1.9360674272792682, + "grad_norm": 0.20316363871097565, + "learning_rate": 2.6552795386982405e-05, + "loss": 0.5291, + "step": 9418 + }, + { + "epoch": 1.9362729982526468, + "grad_norm": 0.20627467334270477, + "learning_rate": 2.6543685554599437e-05, + "loss": 0.4963, + "step": 9419 + }, + { + "epoch": 1.9364785692260251, + "grad_norm": 0.19964690506458282, + "learning_rate": 2.6534576631472806e-05, + "loss": 0.5131, + "step": 9420 + }, + { + "epoch": 1.9366841401994037, + "grad_norm": 0.19893944263458252, + "learning_rate": 2.6525468618051296e-05, + "loss": 0.5256, + "step": 9421 + }, + { + "epoch": 1.9368897111727823, + "grad_norm": 0.17132525146007538, + "learning_rate": 2.6516361514783592e-05, + "loss": 0.5057, + "step": 9422 + }, + { + "epoch": 1.937095282146161, + "grad_norm": 0.16164752840995789, + "learning_rate": 2.6507255322118362e-05, + "loss": 0.5165, + "step": 9423 + }, + { + "epoch": 1.9373008531195395, + "grad_norm": 0.19539949297904968, + "learning_rate": 2.6498150040504224e-05, + "loss": 0.5299, + "step": 9424 + }, + { + "epoch": 1.937506424092918, + "grad_norm": 0.1996447741985321, + "learning_rate": 2.6489045670389765e-05, + "loss": 0.5199, + "step": 9425 + }, + { + "epoch": 1.9377119950662967, + "grad_norm": 0.20690996944904327, + "learning_rate": 2.6479942212223494e-05, + "loss": 0.4806, + "step": 9426 + }, + { + "epoch": 1.9379175660396752, + "grad_norm": 0.19668295979499817, + "learning_rate": 2.6470839666453906e-05, + "loss": 0.5259, + "step": 9427 + }, + { + "epoch": 1.9381231370130538, + "grad_norm": 0.200824573636055, + "learning_rate": 2.6461738033529452e-05, + "loss": 0.5383, + "step": 9428 + }, + { + "epoch": 1.9383287079864324, + "grad_norm": 0.1936202496290207, + "learning_rate": 2.6452637313898524e-05, + "loss": 0.5199, + "step": 9429 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 0.1961507350206375, + "learning_rate": 2.644353750800946e-05, + "loss": 0.5261, + "step": 9430 + }, + { + "epoch": 1.9387398499331896, + "grad_norm": 0.1959598958492279, + "learning_rate": 2.643443861631057e-05, + "loss": 0.5204, + "step": 9431 + }, + { + "epoch": 1.938945420906568, + "grad_norm": 0.200755774974823, + "learning_rate": 2.642534063925012e-05, + "loss": 0.5277, + "step": 9432 + }, + { + "epoch": 1.9391509918799466, + "grad_norm": 0.1906225085258484, + "learning_rate": 2.6416243577276295e-05, + "loss": 0.5116, + "step": 9433 + }, + { + "epoch": 1.9393565628533251, + "grad_norm": 0.20025970041751862, + "learning_rate": 2.6407147430837307e-05, + "loss": 0.5146, + "step": 9434 + }, + { + "epoch": 1.9395621338267035, + "grad_norm": 0.19855552911758423, + "learning_rate": 2.6398052200381266e-05, + "loss": 0.5334, + "step": 9435 + }, + { + "epoch": 1.939767704800082, + "grad_norm": 0.19425593316555023, + "learning_rate": 2.638895788635623e-05, + "loss": 0.5349, + "step": 9436 + }, + { + "epoch": 1.9399732757734607, + "grad_norm": 0.1955750733613968, + "learning_rate": 2.637986448921027e-05, + "loss": 0.5267, + "step": 9437 + }, + { + "epoch": 1.9401788467468393, + "grad_norm": 0.19604718685150146, + "learning_rate": 2.637077200939135e-05, + "loss": 0.5303, + "step": 9438 + }, + { + "epoch": 1.9403844177202179, + "grad_norm": 0.1608019322156906, + "learning_rate": 2.6361680447347424e-05, + "loss": 0.4905, + "step": 9439 + }, + { + "epoch": 1.9405899886935964, + "grad_norm": 0.16983415186405182, + "learning_rate": 2.635258980352637e-05, + "loss": 0.533, + "step": 9440 + }, + { + "epoch": 1.940795559666975, + "grad_norm": 0.2078002691268921, + "learning_rate": 2.6343500078376077e-05, + "loss": 0.5277, + "step": 9441 + }, + { + "epoch": 1.9410011306403536, + "grad_norm": 0.20735982060432434, + "learning_rate": 2.6334411272344328e-05, + "loss": 0.5188, + "step": 9442 + }, + { + "epoch": 1.9412067016137322, + "grad_norm": 0.1942051202058792, + "learning_rate": 2.63253233858789e-05, + "loss": 0.5166, + "step": 9443 + }, + { + "epoch": 1.9414122725871108, + "grad_norm": 0.1942778080701828, + "learning_rate": 2.6316236419427502e-05, + "loss": 0.5302, + "step": 9444 + }, + { + "epoch": 1.9416178435604894, + "grad_norm": 0.19624213874340057, + "learning_rate": 2.6307150373437803e-05, + "loss": 0.5376, + "step": 9445 + }, + { + "epoch": 1.941823414533868, + "grad_norm": 0.19899539649486542, + "learning_rate": 2.629806524835743e-05, + "loss": 0.5369, + "step": 9446 + }, + { + "epoch": 1.9420289855072463, + "grad_norm": 0.1954500824213028, + "learning_rate": 2.628898104463397e-05, + "loss": 0.5101, + "step": 9447 + }, + { + "epoch": 1.942234556480625, + "grad_norm": 0.17353855073451996, + "learning_rate": 2.627989776271496e-05, + "loss": 0.5164, + "step": 9448 + }, + { + "epoch": 1.9424401274540035, + "grad_norm": 0.16081948578357697, + "learning_rate": 2.6270815403047906e-05, + "loss": 0.5429, + "step": 9449 + }, + { + "epoch": 1.942645698427382, + "grad_norm": 0.19543206691741943, + "learning_rate": 2.626173396608023e-05, + "loss": 0.5165, + "step": 9450 + }, + { + "epoch": 1.9428512694007605, + "grad_norm": 0.20097336173057556, + "learning_rate": 2.6252653452259336e-05, + "loss": 0.5329, + "step": 9451 + }, + { + "epoch": 1.943056840374139, + "grad_norm": 0.20384319126605988, + "learning_rate": 2.6243573862032566e-05, + "loss": 0.5314, + "step": 9452 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 0.19779393076896667, + "learning_rate": 2.6234495195847262e-05, + "loss": 0.489, + "step": 9453 + }, + { + "epoch": 1.9434679823208962, + "grad_norm": 0.16127046942710876, + "learning_rate": 2.6225417454150668e-05, + "loss": 0.5033, + "step": 9454 + }, + { + "epoch": 1.9436735532942748, + "grad_norm": 0.1624097228050232, + "learning_rate": 2.6216340637389987e-05, + "loss": 0.532, + "step": 9455 + }, + { + "epoch": 1.9438791242676534, + "grad_norm": 0.16971097886562347, + "learning_rate": 2.620726474601243e-05, + "loss": 0.5058, + "step": 9456 + }, + { + "epoch": 1.944084695241032, + "grad_norm": 0.12277817726135254, + "learning_rate": 2.619818978046509e-05, + "loss": 0.4925, + "step": 9457 + }, + { + "epoch": 1.9442902662144106, + "grad_norm": 0.16644692420959473, + "learning_rate": 2.618911574119507e-05, + "loss": 0.5217, + "step": 9458 + }, + { + "epoch": 1.9444958371877892, + "grad_norm": 0.16563105583190918, + "learning_rate": 2.61800426286494e-05, + "loss": 0.5091, + "step": 9459 + }, + { + "epoch": 1.9447014081611678, + "grad_norm": 0.1673881858587265, + "learning_rate": 2.6170970443275054e-05, + "loss": 0.5416, + "step": 9460 + }, + { + "epoch": 1.9449069791345464, + "grad_norm": 0.20645494759082794, + "learning_rate": 2.6161899185518977e-05, + "loss": 0.5182, + "step": 9461 + }, + { + "epoch": 1.9451125501079247, + "grad_norm": 0.19935904443264008, + "learning_rate": 2.615282885582809e-05, + "loss": 0.5234, + "step": 9462 + }, + { + "epoch": 1.9453181210813033, + "grad_norm": 0.1983654797077179, + "learning_rate": 2.614375945464924e-05, + "loss": 0.5292, + "step": 9463 + }, + { + "epoch": 1.945523692054682, + "grad_norm": 0.20159868896007538, + "learning_rate": 2.6134690982429228e-05, + "loss": 0.5162, + "step": 9464 + }, + { + "epoch": 1.9457292630280605, + "grad_norm": 0.2034175992012024, + "learning_rate": 2.612562343961481e-05, + "loss": 0.5495, + "step": 9465 + }, + { + "epoch": 1.9459348340014389, + "grad_norm": 0.16713906824588776, + "learning_rate": 2.611655682665271e-05, + "loss": 0.5031, + "step": 9466 + }, + { + "epoch": 1.9461404049748174, + "grad_norm": 0.1686525195837021, + "learning_rate": 2.6107491143989593e-05, + "loss": 0.5444, + "step": 9467 + }, + { + "epoch": 1.946345975948196, + "grad_norm": 0.19990558922290802, + "learning_rate": 2.6098426392072068e-05, + "loss": 0.5149, + "step": 9468 + }, + { + "epoch": 1.9465515469215746, + "grad_norm": 0.1923760622739792, + "learning_rate": 2.608936257134675e-05, + "loss": 0.5353, + "step": 9469 + }, + { + "epoch": 1.9467571178949532, + "grad_norm": 0.20133623480796814, + "learning_rate": 2.6080299682260142e-05, + "loss": 0.5079, + "step": 9470 + }, + { + "epoch": 1.9469626888683318, + "grad_norm": 0.19276608526706696, + "learning_rate": 2.6071237725258744e-05, + "loss": 0.5221, + "step": 9471 + }, + { + "epoch": 1.9471682598417104, + "grad_norm": 0.20174479484558105, + "learning_rate": 2.6062176700788986e-05, + "loss": 0.5174, + "step": 9472 + }, + { + "epoch": 1.947373830815089, + "grad_norm": 0.2010992169380188, + "learning_rate": 2.605311660929725e-05, + "loss": 0.5325, + "step": 9473 + }, + { + "epoch": 1.9475794017884676, + "grad_norm": 0.16488604247570038, + "learning_rate": 2.604405745122992e-05, + "loss": 0.4957, + "step": 9474 + }, + { + "epoch": 1.9477849727618461, + "grad_norm": 0.12317883968353271, + "learning_rate": 2.6034999227033278e-05, + "loss": 0.527, + "step": 9475 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 0.16236087679862976, + "learning_rate": 2.602594193715357e-05, + "loss": 0.5246, + "step": 9476 + }, + { + "epoch": 1.948196114708603, + "grad_norm": 0.20070423185825348, + "learning_rate": 2.6016885582037027e-05, + "loss": 0.5258, + "step": 9477 + }, + { + "epoch": 1.9484016856819817, + "grad_norm": 0.20194244384765625, + "learning_rate": 2.6007830162129808e-05, + "loss": 0.5142, + "step": 9478 + }, + { + "epoch": 1.9486072566553603, + "grad_norm": 0.20240890979766846, + "learning_rate": 2.599877567787803e-05, + "loss": 0.5443, + "step": 9479 + }, + { + "epoch": 1.9488128276287389, + "grad_norm": 0.19648049771785736, + "learning_rate": 2.598972212972776e-05, + "loss": 0.534, + "step": 9480 + }, + { + "epoch": 1.9490183986021172, + "grad_norm": 0.2065919041633606, + "learning_rate": 2.5980669518125028e-05, + "loss": 0.5381, + "step": 9481 + }, + { + "epoch": 1.9492239695754958, + "grad_norm": 0.20330984890460968, + "learning_rate": 2.59716178435158e-05, + "loss": 0.5169, + "step": 9482 + }, + { + "epoch": 1.9494295405488744, + "grad_norm": 0.20162275433540344, + "learning_rate": 2.5962567106346034e-05, + "loss": 0.521, + "step": 9483 + }, + { + "epoch": 1.949635111522253, + "grad_norm": 0.16646580398082733, + "learning_rate": 2.5953517307061608e-05, + "loss": 0.5098, + "step": 9484 + }, + { + "epoch": 1.9498406824956316, + "grad_norm": 0.16191188991069794, + "learning_rate": 2.594446844610836e-05, + "loss": 0.5327, + "step": 9485 + }, + { + "epoch": 1.9500462534690102, + "grad_norm": 0.1962418407201767, + "learning_rate": 2.593542052393209e-05, + "loss": 0.5035, + "step": 9486 + }, + { + "epoch": 1.9502518244423888, + "grad_norm": 0.16942986845970154, + "learning_rate": 2.5926373540978536e-05, + "loss": 0.4919, + "step": 9487 + }, + { + "epoch": 1.9504573954157673, + "grad_norm": 0.16431602835655212, + "learning_rate": 2.5917327497693413e-05, + "loss": 0.5368, + "step": 9488 + }, + { + "epoch": 1.950662966389146, + "grad_norm": 0.1935006082057953, + "learning_rate": 2.590828239452235e-05, + "loss": 0.5189, + "step": 9489 + }, + { + "epoch": 1.9508685373625245, + "grad_norm": 0.20640498399734497, + "learning_rate": 2.5899238231911006e-05, + "loss": 0.4955, + "step": 9490 + }, + { + "epoch": 1.9510741083359031, + "grad_norm": 0.19904139637947083, + "learning_rate": 2.5890195010304913e-05, + "loss": 0.5199, + "step": 9491 + }, + { + "epoch": 1.9512796793092815, + "grad_norm": 0.19545705616474152, + "learning_rate": 2.5881152730149588e-05, + "loss": 0.491, + "step": 9492 + }, + { + "epoch": 1.95148525028266, + "grad_norm": 0.203142449259758, + "learning_rate": 2.5872111391890512e-05, + "loss": 0.5364, + "step": 9493 + }, + { + "epoch": 1.9516908212560387, + "grad_norm": 0.2026265263557434, + "learning_rate": 2.586307099597308e-05, + "loss": 0.5117, + "step": 9494 + }, + { + "epoch": 1.9518963922294172, + "grad_norm": 0.1932077556848526, + "learning_rate": 2.585403154284272e-05, + "loss": 0.4905, + "step": 9495 + }, + { + "epoch": 1.9521019632027956, + "grad_norm": 0.19804999232292175, + "learning_rate": 2.5844993032944735e-05, + "loss": 0.5318, + "step": 9496 + }, + { + "epoch": 1.9523075341761742, + "grad_norm": 0.19540899991989136, + "learning_rate": 2.58359554667244e-05, + "loss": 0.5114, + "step": 9497 + }, + { + "epoch": 1.9525131051495528, + "grad_norm": 0.1968623399734497, + "learning_rate": 2.5826918844626975e-05, + "loss": 0.5148, + "step": 9498 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 0.19433245062828064, + "learning_rate": 2.5817883167097644e-05, + "loss": 0.5073, + "step": 9499 + }, + { + "epoch": 1.95292424709631, + "grad_norm": 0.2015180140733719, + "learning_rate": 2.580884843458156e-05, + "loss": 0.5178, + "step": 9500 + }, + { + "epoch": 1.9531298180696886, + "grad_norm": 0.199843630194664, + "learning_rate": 2.579981464752381e-05, + "loss": 0.4852, + "step": 9501 + }, + { + "epoch": 1.9533353890430671, + "grad_norm": 0.16600465774536133, + "learning_rate": 2.5790781806369435e-05, + "loss": 0.4928, + "step": 9502 + }, + { + "epoch": 1.9535409600164457, + "grad_norm": 0.16455240547657013, + "learning_rate": 2.578174991156347e-05, + "loss": 0.5443, + "step": 9503 + }, + { + "epoch": 1.9537465309898243, + "grad_norm": 0.16569803655147552, + "learning_rate": 2.5772718963550868e-05, + "loss": 0.5102, + "step": 9504 + }, + { + "epoch": 1.953952101963203, + "grad_norm": 0.1648106575012207, + "learning_rate": 2.5763688962776526e-05, + "loss": 0.5247, + "step": 9505 + }, + { + "epoch": 1.9541576729365815, + "grad_norm": 0.20244595408439636, + "learning_rate": 2.5754659909685322e-05, + "loss": 0.5192, + "step": 9506 + }, + { + "epoch": 1.95436324390996, + "grad_norm": 0.20293334126472473, + "learning_rate": 2.5745631804722077e-05, + "loss": 0.5294, + "step": 9507 + }, + { + "epoch": 1.9545688148833384, + "grad_norm": 0.19975414872169495, + "learning_rate": 2.5736604648331552e-05, + "loss": 0.5245, + "step": 9508 + }, + { + "epoch": 1.954774385856717, + "grad_norm": 0.19464215636253357, + "learning_rate": 2.5727578440958465e-05, + "loss": 0.5115, + "step": 9509 + }, + { + "epoch": 1.9549799568300956, + "grad_norm": 0.19542162120342255, + "learning_rate": 2.571855318304753e-05, + "loss": 0.5251, + "step": 9510 + }, + { + "epoch": 1.955185527803474, + "grad_norm": 0.19343827664852142, + "learning_rate": 2.570952887504335e-05, + "loss": 0.5204, + "step": 9511 + }, + { + "epoch": 1.9553910987768526, + "grad_norm": 0.17137175798416138, + "learning_rate": 2.5700505517390526e-05, + "loss": 0.5097, + "step": 9512 + }, + { + "epoch": 1.9555966697502312, + "grad_norm": 0.11898645013570786, + "learning_rate": 2.569148311053358e-05, + "loss": 0.5127, + "step": 9513 + }, + { + "epoch": 1.9558022407236098, + "grad_norm": 0.12215547263622284, + "learning_rate": 2.5682461654917025e-05, + "loss": 0.5129, + "step": 9514 + }, + { + "epoch": 1.9560078116969883, + "grad_norm": 0.1605924665927887, + "learning_rate": 2.5673441150985286e-05, + "loss": 0.508, + "step": 9515 + }, + { + "epoch": 1.956213382670367, + "grad_norm": 0.16376885771751404, + "learning_rate": 2.5664421599182757e-05, + "loss": 0.4932, + "step": 9516 + }, + { + "epoch": 1.9564189536437455, + "grad_norm": 0.15705506503582, + "learning_rate": 2.5655402999953816e-05, + "loss": 0.5217, + "step": 9517 + }, + { + "epoch": 1.956624524617124, + "grad_norm": 0.19561244547367096, + "learning_rate": 2.5646385353742732e-05, + "loss": 0.5084, + "step": 9518 + }, + { + "epoch": 1.9568300955905027, + "grad_norm": 0.19796496629714966, + "learning_rate": 2.563736866099381e-05, + "loss": 0.5076, + "step": 9519 + }, + { + "epoch": 1.9570356665638813, + "grad_norm": 0.20186658203601837, + "learning_rate": 2.562835292215123e-05, + "loss": 0.5518, + "step": 9520 + }, + { + "epoch": 1.9572412375372599, + "grad_norm": 0.16234740614891052, + "learning_rate": 2.5619338137659155e-05, + "loss": 0.4855, + "step": 9521 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.1610114425420761, + "learning_rate": 2.5610324307961708e-05, + "loss": 0.5212, + "step": 9522 + }, + { + "epoch": 1.9576523794840168, + "grad_norm": 0.19542771577835083, + "learning_rate": 2.560131143350294e-05, + "loss": 0.5029, + "step": 9523 + }, + { + "epoch": 1.9578579504573954, + "grad_norm": 0.20270508527755737, + "learning_rate": 2.55922995147269e-05, + "loss": 0.5271, + "step": 9524 + }, + { + "epoch": 1.958063521430774, + "grad_norm": 0.18990576267242432, + "learning_rate": 2.5583288552077552e-05, + "loss": 0.5104, + "step": 9525 + }, + { + "epoch": 1.9582690924041524, + "grad_norm": 0.15766002237796783, + "learning_rate": 2.5574278545998827e-05, + "loss": 0.4951, + "step": 9526 + }, + { + "epoch": 1.958474663377531, + "grad_norm": 0.16321411728858948, + "learning_rate": 2.5565269496934602e-05, + "loss": 0.5406, + "step": 9527 + }, + { + "epoch": 1.9586802343509095, + "grad_norm": 0.2016243040561676, + "learning_rate": 2.5556261405328712e-05, + "loss": 0.5152, + "step": 9528 + }, + { + "epoch": 1.9588858053242881, + "grad_norm": 0.17193591594696045, + "learning_rate": 2.554725427162494e-05, + "loss": 0.5029, + "step": 9529 + }, + { + "epoch": 1.9590913762976667, + "grad_norm": 0.16781920194625854, + "learning_rate": 2.553824809626701e-05, + "loss": 0.5322, + "step": 9530 + }, + { + "epoch": 1.9592969472710453, + "grad_norm": 0.17390578985214233, + "learning_rate": 2.5529242879698655e-05, + "loss": 0.5052, + "step": 9531 + }, + { + "epoch": 1.959502518244424, + "grad_norm": 0.12395156174898148, + "learning_rate": 2.552023862236349e-05, + "loss": 0.4978, + "step": 9532 + }, + { + "epoch": 1.9597080892178025, + "grad_norm": 0.16266000270843506, + "learning_rate": 2.5511235324705127e-05, + "loss": 0.5345, + "step": 9533 + }, + { + "epoch": 1.959913660191181, + "grad_norm": 0.2078227996826172, + "learning_rate": 2.5502232987167103e-05, + "loss": 0.5167, + "step": 9534 + }, + { + "epoch": 1.9601192311645597, + "grad_norm": 0.20280295610427856, + "learning_rate": 2.549323161019293e-05, + "loss": 0.5316, + "step": 9535 + }, + { + "epoch": 1.9603248021379382, + "grad_norm": 0.20383380353450775, + "learning_rate": 2.5484231194226058e-05, + "loss": 0.5124, + "step": 9536 + }, + { + "epoch": 1.9605303731113168, + "grad_norm": 0.19895561039447784, + "learning_rate": 2.547523173970989e-05, + "loss": 0.5198, + "step": 9537 + }, + { + "epoch": 1.9607359440846952, + "grad_norm": 0.20123358070850372, + "learning_rate": 2.546623324708781e-05, + "loss": 0.5255, + "step": 9538 + }, + { + "epoch": 1.9609415150580738, + "grad_norm": 0.2038145512342453, + "learning_rate": 2.5457235716803115e-05, + "loss": 0.5309, + "step": 9539 + }, + { + "epoch": 1.9611470860314524, + "grad_norm": 0.20116189122200012, + "learning_rate": 2.5448239149299055e-05, + "loss": 0.5075, + "step": 9540 + }, + { + "epoch": 1.961352657004831, + "grad_norm": 0.2058117836713791, + "learning_rate": 2.5439243545018884e-05, + "loss": 0.533, + "step": 9541 + }, + { + "epoch": 1.9615582279782093, + "grad_norm": 0.2008356899023056, + "learning_rate": 2.543024890440576e-05, + "loss": 0.5321, + "step": 9542 + }, + { + "epoch": 1.961763798951588, + "grad_norm": 0.19685760140419006, + "learning_rate": 2.5421255227902804e-05, + "loss": 0.4969, + "step": 9543 + }, + { + "epoch": 1.9619693699249665, + "grad_norm": 0.1951378434896469, + "learning_rate": 2.541226251595307e-05, + "loss": 0.4999, + "step": 9544 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 0.19807179272174835, + "learning_rate": 2.5403270768999633e-05, + "loss": 0.5146, + "step": 9545 + }, + { + "epoch": 1.9623805118717237, + "grad_norm": 0.19595085084438324, + "learning_rate": 2.539427998748544e-05, + "loss": 0.5223, + "step": 9546 + }, + { + "epoch": 1.9625860828451023, + "grad_norm": 0.19711394608020782, + "learning_rate": 2.5385290171853446e-05, + "loss": 0.5196, + "step": 9547 + }, + { + "epoch": 1.9627916538184809, + "grad_norm": 0.20173287391662598, + "learning_rate": 2.5376301322546523e-05, + "loss": 0.5277, + "step": 9548 + }, + { + "epoch": 1.9629972247918595, + "grad_norm": 0.20318298041820526, + "learning_rate": 2.5367313440007513e-05, + "loss": 0.5174, + "step": 9549 + }, + { + "epoch": 1.963202795765238, + "grad_norm": 0.20232440531253815, + "learning_rate": 2.5358326524679206e-05, + "loss": 0.5325, + "step": 9550 + }, + { + "epoch": 1.9634083667386166, + "grad_norm": 0.2035774439573288, + "learning_rate": 2.534934057700433e-05, + "loss": 0.516, + "step": 9551 + }, + { + "epoch": 1.9636139377119952, + "grad_norm": 0.20142172276973724, + "learning_rate": 2.534035559742561e-05, + "loss": 0.5189, + "step": 9552 + }, + { + "epoch": 1.9638195086853736, + "grad_norm": 0.2012597769498825, + "learning_rate": 2.5331371586385683e-05, + "loss": 0.5166, + "step": 9553 + }, + { + "epoch": 1.9640250796587522, + "grad_norm": 0.1986485868692398, + "learning_rate": 2.532238854432715e-05, + "loss": 0.5134, + "step": 9554 + }, + { + "epoch": 1.9642306506321308, + "grad_norm": 0.20366504788398743, + "learning_rate": 2.531340647169256e-05, + "loss": 0.5146, + "step": 9555 + }, + { + "epoch": 1.9644362216055093, + "grad_norm": 0.19817805290222168, + "learning_rate": 2.530442536892442e-05, + "loss": 0.4911, + "step": 9556 + }, + { + "epoch": 1.9646417925788877, + "grad_norm": 0.20008954405784607, + "learning_rate": 2.529544523646518e-05, + "loss": 0.574, + "step": 9557 + }, + { + "epoch": 1.9648473635522663, + "grad_norm": 0.2054361253976822, + "learning_rate": 2.5286466074757237e-05, + "loss": 0.5204, + "step": 9558 + }, + { + "epoch": 1.965052934525645, + "grad_norm": 0.19738180935382843, + "learning_rate": 2.527748788424299e-05, + "loss": 0.5198, + "step": 9559 + }, + { + "epoch": 1.9652585054990235, + "grad_norm": 0.20528697967529297, + "learning_rate": 2.526851066536473e-05, + "loss": 0.5439, + "step": 9560 + }, + { + "epoch": 1.965464076472402, + "grad_norm": 0.21813803911209106, + "learning_rate": 2.5259534418564713e-05, + "loss": 0.5442, + "step": 9561 + }, + { + "epoch": 1.9656696474457807, + "grad_norm": 0.20172588527202606, + "learning_rate": 2.5250559144285174e-05, + "loss": 0.5133, + "step": 9562 + }, + { + "epoch": 1.9658752184191592, + "grad_norm": 0.19807198643684387, + "learning_rate": 2.5241584842968285e-05, + "loss": 0.5051, + "step": 9563 + }, + { + "epoch": 1.9660807893925378, + "grad_norm": 0.2078738957643509, + "learning_rate": 2.5232611515056168e-05, + "loss": 0.5716, + "step": 9564 + }, + { + "epoch": 1.9662863603659164, + "grad_norm": 0.19806239008903503, + "learning_rate": 2.522363916099086e-05, + "loss": 0.5293, + "step": 9565 + }, + { + "epoch": 1.966491931339295, + "grad_norm": 0.20789627730846405, + "learning_rate": 2.5214667781214436e-05, + "loss": 0.5446, + "step": 9566 + }, + { + "epoch": 1.9666975023126736, + "grad_norm": 0.20237933099269867, + "learning_rate": 2.5205697376168853e-05, + "loss": 0.5286, + "step": 9567 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 0.2071990966796875, + "learning_rate": 2.5196727946296043e-05, + "loss": 0.5321, + "step": 9568 + }, + { + "epoch": 1.9671086442594305, + "grad_norm": 0.19845061004161835, + "learning_rate": 2.518775949203789e-05, + "loss": 0.5272, + "step": 9569 + }, + { + "epoch": 1.9673142152328091, + "grad_norm": 0.2033272236585617, + "learning_rate": 2.5178792013836224e-05, + "loss": 0.513, + "step": 9570 + }, + { + "epoch": 1.9675197862061877, + "grad_norm": 0.20528094470500946, + "learning_rate": 2.5169825512132833e-05, + "loss": 0.5322, + "step": 9571 + }, + { + "epoch": 1.967725357179566, + "grad_norm": 0.19687287509441376, + "learning_rate": 2.516085998736943e-05, + "loss": 0.5129, + "step": 9572 + }, + { + "epoch": 1.9679309281529447, + "grad_norm": 0.16771896183490753, + "learning_rate": 2.5151895439987746e-05, + "loss": 0.5116, + "step": 9573 + }, + { + "epoch": 1.9681364991263233, + "grad_norm": 0.16580241918563843, + "learning_rate": 2.5142931870429404e-05, + "loss": 0.527, + "step": 9574 + }, + { + "epoch": 1.9683420700997019, + "grad_norm": 0.20436574518680573, + "learning_rate": 2.5133969279136e-05, + "loss": 0.521, + "step": 9575 + }, + { + "epoch": 1.9685476410730804, + "grad_norm": 0.1928415149450302, + "learning_rate": 2.5125007666549074e-05, + "loss": 0.5062, + "step": 9576 + }, + { + "epoch": 1.968753212046459, + "grad_norm": 0.19831101596355438, + "learning_rate": 2.5116047033110125e-05, + "loss": 0.5124, + "step": 9577 + }, + { + "epoch": 1.9689587830198376, + "grad_norm": 0.1986418068408966, + "learning_rate": 2.510708737926058e-05, + "loss": 0.547, + "step": 9578 + }, + { + "epoch": 1.9691643539932162, + "grad_norm": 0.19999928772449493, + "learning_rate": 2.509812870544189e-05, + "loss": 0.5286, + "step": 9579 + }, + { + "epoch": 1.9693699249665948, + "grad_norm": 0.1935226321220398, + "learning_rate": 2.5089171012095367e-05, + "loss": 0.5285, + "step": 9580 + }, + { + "epoch": 1.9695754959399734, + "grad_norm": 0.18808215856552124, + "learning_rate": 2.5080214299662322e-05, + "loss": 0.5051, + "step": 9581 + }, + { + "epoch": 1.969781066913352, + "grad_norm": 0.20196162164211273, + "learning_rate": 2.507125856858401e-05, + "loss": 0.5222, + "step": 9582 + }, + { + "epoch": 1.9699866378867303, + "grad_norm": 0.20096677541732788, + "learning_rate": 2.5062303819301645e-05, + "loss": 0.5405, + "step": 9583 + }, + { + "epoch": 1.970192208860109, + "grad_norm": 0.20000407099723816, + "learning_rate": 2.5053350052256393e-05, + "loss": 0.5173, + "step": 9584 + }, + { + "epoch": 1.9703977798334875, + "grad_norm": 0.19387024641036987, + "learning_rate": 2.5044397267889327e-05, + "loss": 0.4956, + "step": 9585 + }, + { + "epoch": 1.970603350806866, + "grad_norm": 0.16809746623039246, + "learning_rate": 2.5035445466641558e-05, + "loss": 0.5046, + "step": 9586 + }, + { + "epoch": 1.9708089217802445, + "grad_norm": 0.16820058226585388, + "learning_rate": 2.502649464895408e-05, + "loss": 0.5309, + "step": 9587 + }, + { + "epoch": 1.971014492753623, + "grad_norm": 0.21059322357177734, + "learning_rate": 2.501754481526785e-05, + "loss": 0.5047, + "step": 9588 + }, + { + "epoch": 1.9712200637270016, + "grad_norm": 0.20109686255455017, + "learning_rate": 2.5008595966023786e-05, + "loss": 0.5069, + "step": 9589 + }, + { + "epoch": 1.9714256347003802, + "grad_norm": 0.20082977414131165, + "learning_rate": 2.4999648101662763e-05, + "loss": 0.5329, + "step": 9590 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 0.1989169418811798, + "learning_rate": 2.4990701222625602e-05, + "loss": 0.5102, + "step": 9591 + }, + { + "epoch": 1.9718367766471374, + "grad_norm": 0.19520479440689087, + "learning_rate": 2.4981755329353043e-05, + "loss": 0.5116, + "step": 9592 + }, + { + "epoch": 1.972042347620516, + "grad_norm": 0.17147661745548248, + "learning_rate": 2.4972810422285853e-05, + "loss": 0.4902, + "step": 9593 + }, + { + "epoch": 1.9722479185938946, + "grad_norm": 0.16414588689804077, + "learning_rate": 2.496386650186469e-05, + "loss": 0.5109, + "step": 9594 + }, + { + "epoch": 1.9724534895672732, + "grad_norm": 0.20732592046260834, + "learning_rate": 2.4954923568530175e-05, + "loss": 0.5128, + "step": 9595 + }, + { + "epoch": 1.9726590605406518, + "grad_norm": 0.19795072078704834, + "learning_rate": 2.4945981622722878e-05, + "loss": 0.5122, + "step": 9596 + }, + { + "epoch": 1.9728646315140304, + "grad_norm": 0.2000289112329483, + "learning_rate": 2.493704066488334e-05, + "loss": 0.5282, + "step": 9597 + }, + { + "epoch": 1.973070202487409, + "grad_norm": 0.1769014447927475, + "learning_rate": 2.4928100695452037e-05, + "loss": 0.4991, + "step": 9598 + }, + { + "epoch": 1.9732757734607873, + "grad_norm": 0.16739298403263092, + "learning_rate": 2.4919161714869377e-05, + "loss": 0.507, + "step": 9599 + }, + { + "epoch": 1.973481344434166, + "grad_norm": 0.199861079454422, + "learning_rate": 2.4910223723575778e-05, + "loss": 0.5178, + "step": 9600 + }, + { + "epoch": 1.9736869154075445, + "grad_norm": 0.16744980216026306, + "learning_rate": 2.490128672201156e-05, + "loss": 0.4671, + "step": 9601 + }, + { + "epoch": 1.9738924863809229, + "grad_norm": 0.16180412471294403, + "learning_rate": 2.4892350710617003e-05, + "loss": 0.5274, + "step": 9602 + }, + { + "epoch": 1.9740980573543014, + "grad_norm": 0.2564503848552704, + "learning_rate": 2.488341568983232e-05, + "loss": 0.5285, + "step": 9603 + }, + { + "epoch": 1.97430362832768, + "grad_norm": 0.16161498427391052, + "learning_rate": 2.4874481660097748e-05, + "loss": 0.4968, + "step": 9604 + }, + { + "epoch": 1.9745091993010586, + "grad_norm": 0.11919713020324707, + "learning_rate": 2.4865548621853394e-05, + "loss": 0.5128, + "step": 9605 + }, + { + "epoch": 1.9747147702744372, + "grad_norm": 0.16267365217208862, + "learning_rate": 2.4856616575539334e-05, + "loss": 0.5247, + "step": 9606 + }, + { + "epoch": 1.9749203412478158, + "grad_norm": 0.16840054094791412, + "learning_rate": 2.4847685521595643e-05, + "loss": 0.4839, + "step": 9607 + }, + { + "epoch": 1.9751259122211944, + "grad_norm": 0.17324216663837433, + "learning_rate": 2.48387554604623e-05, + "loss": 0.5092, + "step": 9608 + }, + { + "epoch": 1.975331483194573, + "grad_norm": 0.16955405473709106, + "learning_rate": 2.4829826392579227e-05, + "loss": 0.4955, + "step": 9609 + }, + { + "epoch": 1.9755370541679516, + "grad_norm": 0.16968326270580292, + "learning_rate": 2.4820898318386345e-05, + "loss": 0.5285, + "step": 9610 + }, + { + "epoch": 1.9757426251413301, + "grad_norm": 0.2073184996843338, + "learning_rate": 2.481197123832348e-05, + "loss": 0.5258, + "step": 9611 + }, + { + "epoch": 1.9759481961147087, + "grad_norm": 0.2012372761964798, + "learning_rate": 2.4803045152830442e-05, + "loss": 0.5157, + "step": 9612 + }, + { + "epoch": 1.9761537670880873, + "grad_norm": 0.1959368884563446, + "learning_rate": 2.4794120062346946e-05, + "loss": 0.5346, + "step": 9613 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 0.19632303714752197, + "learning_rate": 2.478519596731273e-05, + "loss": 0.5138, + "step": 9614 + }, + { + "epoch": 1.9765649090348443, + "grad_norm": 0.19955292344093323, + "learning_rate": 2.4776272868167424e-05, + "loss": 0.535, + "step": 9615 + }, + { + "epoch": 1.9767704800082229, + "grad_norm": 0.19841422140598297, + "learning_rate": 2.476735076535063e-05, + "loss": 0.5054, + "step": 9616 + }, + { + "epoch": 1.9769760509816015, + "grad_norm": 0.19676409661769867, + "learning_rate": 2.4758429659301894e-05, + "loss": 0.5238, + "step": 9617 + }, + { + "epoch": 1.9771816219549798, + "grad_norm": 0.19223178923130035, + "learning_rate": 2.4749509550460724e-05, + "loss": 0.5013, + "step": 9618 + }, + { + "epoch": 1.9773871929283584, + "grad_norm": 0.20213696360588074, + "learning_rate": 2.474059043926656e-05, + "loss": 0.5086, + "step": 9619 + }, + { + "epoch": 1.977592763901737, + "grad_norm": 0.2001548409461975, + "learning_rate": 2.4731672326158804e-05, + "loss": 0.4985, + "step": 9620 + }, + { + "epoch": 1.9777983348751156, + "grad_norm": 0.20245525240898132, + "learning_rate": 2.4722755211576836e-05, + "loss": 0.5327, + "step": 9621 + }, + { + "epoch": 1.9780039058484942, + "grad_norm": 0.2233567237854004, + "learning_rate": 2.4713839095959936e-05, + "loss": 0.5095, + "step": 9622 + }, + { + "epoch": 1.9782094768218728, + "grad_norm": 0.19729016721248627, + "learning_rate": 2.470492397974737e-05, + "loss": 0.4831, + "step": 9623 + }, + { + "epoch": 1.9784150477952513, + "grad_norm": 0.20027440786361694, + "learning_rate": 2.4696009863378342e-05, + "loss": 0.5315, + "step": 9624 + }, + { + "epoch": 1.97862061876863, + "grad_norm": 0.20336763560771942, + "learning_rate": 2.4687096747291987e-05, + "loss": 0.5019, + "step": 9625 + }, + { + "epoch": 1.9788261897420085, + "grad_norm": 0.16322872042655945, + "learning_rate": 2.4678184631927453e-05, + "loss": 0.4873, + "step": 9626 + }, + { + "epoch": 1.9790317607153871, + "grad_norm": 0.1632460653781891, + "learning_rate": 2.4669273517723777e-05, + "loss": 0.524, + "step": 9627 + }, + { + "epoch": 1.9792373316887657, + "grad_norm": 0.19479408860206604, + "learning_rate": 2.466036340511995e-05, + "loss": 0.5186, + "step": 9628 + }, + { + "epoch": 1.979442902662144, + "grad_norm": 0.19414758682250977, + "learning_rate": 2.4651454294554972e-05, + "loss": 0.5153, + "step": 9629 + }, + { + "epoch": 1.9796484736355227, + "grad_norm": 0.1960826814174652, + "learning_rate": 2.464254618646773e-05, + "loss": 0.5356, + "step": 9630 + }, + { + "epoch": 1.9798540446089012, + "grad_norm": 0.19612587988376617, + "learning_rate": 2.4633639081297088e-05, + "loss": 0.5033, + "step": 9631 + }, + { + "epoch": 1.9800596155822798, + "grad_norm": 1.9576839208602905, + "learning_rate": 2.462473297948186e-05, + "loss": 0.5465, + "step": 9632 + }, + { + "epoch": 1.9802651865556582, + "grad_norm": 0.2153571993112564, + "learning_rate": 2.4615827881460797e-05, + "loss": 0.531, + "step": 9633 + }, + { + "epoch": 1.9804707575290368, + "grad_norm": 0.20636354386806488, + "learning_rate": 2.4606923787672607e-05, + "loss": 0.5394, + "step": 9634 + }, + { + "epoch": 1.9806763285024154, + "grad_norm": 0.19910024106502533, + "learning_rate": 2.4598020698555975e-05, + "loss": 0.5212, + "step": 9635 + }, + { + "epoch": 1.980881899475794, + "grad_norm": 0.19475533068180084, + "learning_rate": 2.458911861454951e-05, + "loss": 0.5175, + "step": 9636 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 0.20673874020576477, + "learning_rate": 2.4580217536091772e-05, + "loss": 0.5258, + "step": 9637 + }, + { + "epoch": 1.9812930414225511, + "grad_norm": 0.20791196823120117, + "learning_rate": 2.4571317463621278e-05, + "loss": 0.5278, + "step": 9638 + }, + { + "epoch": 1.9814986123959297, + "grad_norm": 0.20311853289604187, + "learning_rate": 2.4562418397576482e-05, + "loss": 0.5103, + "step": 9639 + }, + { + "epoch": 1.9817041833693083, + "grad_norm": 0.18043197691440582, + "learning_rate": 2.4553520338395808e-05, + "loss": 0.5009, + "step": 9640 + }, + { + "epoch": 1.981909754342687, + "grad_norm": 0.16400253772735596, + "learning_rate": 2.45446232865176e-05, + "loss": 0.5219, + "step": 9641 + }, + { + "epoch": 1.9821153253160655, + "grad_norm": 0.20592088997364044, + "learning_rate": 2.453572724238022e-05, + "loss": 0.5247, + "step": 9642 + }, + { + "epoch": 1.982320896289444, + "grad_norm": 0.22053800523281097, + "learning_rate": 2.45268322064219e-05, + "loss": 0.5272, + "step": 9643 + }, + { + "epoch": 1.9825264672628224, + "grad_norm": 0.21963202953338623, + "learning_rate": 2.451793817908087e-05, + "loss": 0.51, + "step": 9644 + }, + { + "epoch": 1.982732038236201, + "grad_norm": 0.21020135283470154, + "learning_rate": 2.4509045160795295e-05, + "loss": 0.5338, + "step": 9645 + }, + { + "epoch": 1.9829376092095796, + "grad_norm": 0.17611977458000183, + "learning_rate": 2.450015315200327e-05, + "loss": 0.5083, + "step": 9646 + }, + { + "epoch": 1.9831431801829582, + "grad_norm": 0.16838988661766052, + "learning_rate": 2.44912621531429e-05, + "loss": 0.5075, + "step": 9647 + }, + { + "epoch": 1.9833487511563366, + "grad_norm": 0.20639371871948242, + "learning_rate": 2.448237216465219e-05, + "loss": 0.5329, + "step": 9648 + }, + { + "epoch": 1.9835543221297152, + "grad_norm": 0.20562691986560822, + "learning_rate": 2.4473483186969085e-05, + "loss": 0.5001, + "step": 9649 + }, + { + "epoch": 1.9837598931030938, + "grad_norm": 0.20028932392597198, + "learning_rate": 2.4464595220531542e-05, + "loss": 0.5145, + "step": 9650 + }, + { + "epoch": 1.9839654640764723, + "grad_norm": 0.205689936876297, + "learning_rate": 2.4455708265777406e-05, + "loss": 0.5347, + "step": 9651 + }, + { + "epoch": 1.984171035049851, + "grad_norm": 0.20499835908412933, + "learning_rate": 2.4446822323144497e-05, + "loss": 0.5239, + "step": 9652 + }, + { + "epoch": 1.9843766060232295, + "grad_norm": 0.20297472178936005, + "learning_rate": 2.4437937393070596e-05, + "loss": 0.5307, + "step": 9653 + }, + { + "epoch": 1.984582176996608, + "grad_norm": 0.1985624134540558, + "learning_rate": 2.442905347599339e-05, + "loss": 0.5076, + "step": 9654 + }, + { + "epoch": 1.9847877479699867, + "grad_norm": 0.20252910256385803, + "learning_rate": 2.442017057235059e-05, + "loss": 0.528, + "step": 9655 + }, + { + "epoch": 1.9849933189433653, + "grad_norm": 0.2101006656885147, + "learning_rate": 2.441128868257979e-05, + "loss": 0.5188, + "step": 9656 + }, + { + "epoch": 1.9851988899167439, + "grad_norm": 0.1986953169107437, + "learning_rate": 2.4402407807118577e-05, + "loss": 0.5267, + "step": 9657 + }, + { + "epoch": 1.9854044608901225, + "grad_norm": 0.20518286526203156, + "learning_rate": 2.4393527946404447e-05, + "loss": 0.5362, + "step": 9658 + }, + { + "epoch": 1.9856100318635008, + "grad_norm": 0.20495247840881348, + "learning_rate": 2.438464910087489e-05, + "loss": 0.5306, + "step": 9659 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 0.20301851630210876, + "learning_rate": 2.437577127096731e-05, + "loss": 0.5106, + "step": 9660 + }, + { + "epoch": 1.986021173810258, + "grad_norm": 0.17709769308567047, + "learning_rate": 2.4366894457119066e-05, + "loss": 0.4954, + "step": 9661 + }, + { + "epoch": 1.9862267447836366, + "grad_norm": 0.1601599156856537, + "learning_rate": 2.4358018659767514e-05, + "loss": 0.4917, + "step": 9662 + }, + { + "epoch": 1.986432315757015, + "grad_norm": 0.19886882603168488, + "learning_rate": 2.4349143879349898e-05, + "loss": 0.5363, + "step": 9663 + }, + { + "epoch": 1.9866378867303935, + "grad_norm": 0.16597384214401245, + "learning_rate": 2.434027011630344e-05, + "loss": 0.5121, + "step": 9664 + }, + { + "epoch": 1.9868434577037721, + "grad_norm": 0.163084477186203, + "learning_rate": 2.4331397371065314e-05, + "loss": 0.5358, + "step": 9665 + }, + { + "epoch": 1.9870490286771507, + "grad_norm": 0.19397611916065216, + "learning_rate": 2.4322525644072636e-05, + "loss": 0.4968, + "step": 9666 + }, + { + "epoch": 1.9872545996505293, + "grad_norm": 0.19655869901180267, + "learning_rate": 2.4313654935762452e-05, + "loss": 0.5081, + "step": 9667 + }, + { + "epoch": 1.987460170623908, + "grad_norm": 0.20611554384231567, + "learning_rate": 2.4304785246571817e-05, + "loss": 0.5285, + "step": 9668 + }, + { + "epoch": 1.9876657415972865, + "grad_norm": 0.20290662348270416, + "learning_rate": 2.4295916576937687e-05, + "loss": 0.5153, + "step": 9669 + }, + { + "epoch": 1.987871312570665, + "grad_norm": 0.20132143795490265, + "learning_rate": 2.428704892729696e-05, + "loss": 0.5342, + "step": 9670 + }, + { + "epoch": 1.9880768835440437, + "grad_norm": 0.20216117799282074, + "learning_rate": 2.4278182298086535e-05, + "loss": 0.517, + "step": 9671 + }, + { + "epoch": 1.9882824545174222, + "grad_norm": 0.19936327636241913, + "learning_rate": 2.426931668974322e-05, + "loss": 0.5252, + "step": 9672 + }, + { + "epoch": 1.9884880254908008, + "grad_norm": 0.33940476179122925, + "learning_rate": 2.426045210270377e-05, + "loss": 0.5247, + "step": 9673 + }, + { + "epoch": 1.9886935964641792, + "grad_norm": 0.20160600543022156, + "learning_rate": 2.4251588537404913e-05, + "loss": 0.5223, + "step": 9674 + }, + { + "epoch": 1.9888991674375578, + "grad_norm": 0.2030128389596939, + "learning_rate": 2.4242725994283292e-05, + "loss": 0.5135, + "step": 9675 + }, + { + "epoch": 1.9891047384109364, + "grad_norm": 0.22344298660755157, + "learning_rate": 2.4233864473775556e-05, + "loss": 0.5226, + "step": 9676 + }, + { + "epoch": 1.989310309384315, + "grad_norm": 0.20270341634750366, + "learning_rate": 2.422500397631826e-05, + "loss": 0.5173, + "step": 9677 + }, + { + "epoch": 1.9895158803576933, + "grad_norm": 0.17036183178424835, + "learning_rate": 2.421614450234792e-05, + "loss": 0.4997, + "step": 9678 + }, + { + "epoch": 1.989721451331072, + "grad_norm": 0.16131217777729034, + "learning_rate": 2.420728605230099e-05, + "loss": 0.5233, + "step": 9679 + }, + { + "epoch": 1.9899270223044505, + "grad_norm": 0.19689194858074188, + "learning_rate": 2.4198428626613895e-05, + "loss": 0.5235, + "step": 9680 + }, + { + "epoch": 1.990132593277829, + "grad_norm": 0.1997881680727005, + "learning_rate": 2.418957222572299e-05, + "loss": 0.5469, + "step": 9681 + }, + { + "epoch": 1.9903381642512077, + "grad_norm": 0.1594388335943222, + "learning_rate": 2.4180716850064584e-05, + "loss": 0.4688, + "step": 9682 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 0.1173081025481224, + "learning_rate": 2.4171862500074968e-05, + "loss": 0.5151, + "step": 9683 + }, + { + "epoch": 1.9907493061979649, + "grad_norm": 0.16193978488445282, + "learning_rate": 2.416300917619033e-05, + "loss": 0.507, + "step": 9684 + }, + { + "epoch": 1.9909548771713435, + "grad_norm": 0.19650469720363617, + "learning_rate": 2.415415687884684e-05, + "loss": 0.512, + "step": 9685 + }, + { + "epoch": 1.991160448144722, + "grad_norm": 0.19806897640228271, + "learning_rate": 2.414530560848061e-05, + "loss": 0.5165, + "step": 9686 + }, + { + "epoch": 1.9913660191181006, + "grad_norm": 0.20564566552639008, + "learning_rate": 2.4136455365527692e-05, + "loss": 0.5088, + "step": 9687 + }, + { + "epoch": 1.9915715900914792, + "grad_norm": 0.20067964494228363, + "learning_rate": 2.412760615042411e-05, + "loss": 0.5163, + "step": 9688 + }, + { + "epoch": 1.9917771610648578, + "grad_norm": 0.20195259153842926, + "learning_rate": 2.4118757963605788e-05, + "loss": 0.5013, + "step": 9689 + }, + { + "epoch": 1.9919827320382362, + "grad_norm": 0.2007036656141281, + "learning_rate": 2.410991080550869e-05, + "loss": 0.5301, + "step": 9690 + }, + { + "epoch": 1.9921883030116148, + "grad_norm": 0.16521452367305756, + "learning_rate": 2.4101064676568624e-05, + "loss": 0.4947, + "step": 9691 + }, + { + "epoch": 1.9923938739849933, + "grad_norm": 0.16318975389003754, + "learning_rate": 2.4092219577221435e-05, + "loss": 0.5409, + "step": 9692 + }, + { + "epoch": 1.9925994449583717, + "grad_norm": 0.20644515752792358, + "learning_rate": 2.4083375507902872e-05, + "loss": 0.5451, + "step": 9693 + }, + { + "epoch": 1.9928050159317503, + "grad_norm": 0.19570566713809967, + "learning_rate": 2.407453246904863e-05, + "loss": 0.5005, + "step": 9694 + }, + { + "epoch": 1.993010586905129, + "grad_norm": 0.19532164931297302, + "learning_rate": 2.4065690461094367e-05, + "loss": 0.5377, + "step": 9695 + }, + { + "epoch": 1.9932161578785075, + "grad_norm": 0.20121091604232788, + "learning_rate": 2.405684948447567e-05, + "loss": 0.5096, + "step": 9696 + }, + { + "epoch": 1.993421728851886, + "grad_norm": 0.1667921096086502, + "learning_rate": 2.4048009539628128e-05, + "loss": 0.5165, + "step": 9697 + }, + { + "epoch": 1.9936272998252647, + "grad_norm": 0.12459738552570343, + "learning_rate": 2.403917062698723e-05, + "loss": 0.5162, + "step": 9698 + }, + { + "epoch": 1.9938328707986432, + "grad_norm": 0.16275346279144287, + "learning_rate": 2.4030332746988426e-05, + "loss": 0.513, + "step": 9699 + }, + { + "epoch": 1.9940384417720218, + "grad_norm": 0.2167256772518158, + "learning_rate": 2.4021495900067113e-05, + "loss": 0.5138, + "step": 9700 + }, + { + "epoch": 1.9942440127454004, + "grad_norm": 0.20247885584831238, + "learning_rate": 2.4012660086658642e-05, + "loss": 0.5086, + "step": 9701 + }, + { + "epoch": 1.994449583718779, + "grad_norm": 0.20237302780151367, + "learning_rate": 2.400382530719832e-05, + "loss": 0.4994, + "step": 9702 + }, + { + "epoch": 1.9946551546921576, + "grad_norm": 0.193708136677742, + "learning_rate": 2.3994991562121362e-05, + "loss": 0.5112, + "step": 9703 + }, + { + "epoch": 1.9948607256655362, + "grad_norm": 0.20271430909633636, + "learning_rate": 2.3986158851863016e-05, + "loss": 0.5148, + "step": 9704 + }, + { + "epoch": 1.9950662966389145, + "grad_norm": 0.16858288645744324, + "learning_rate": 2.39773271768584e-05, + "loss": 0.502, + "step": 9705 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 0.1224452555179596, + "learning_rate": 2.3968496537542624e-05, + "loss": 0.5069, + "step": 9706 + }, + { + "epoch": 1.9954774385856717, + "grad_norm": 0.1615760177373886, + "learning_rate": 2.3959666934350715e-05, + "loss": 0.5327, + "step": 9707 + }, + { + "epoch": 1.9956830095590503, + "grad_norm": 0.19293002784252167, + "learning_rate": 2.3950838367717675e-05, + "loss": 0.5051, + "step": 9708 + }, + { + "epoch": 1.9958885805324287, + "grad_norm": 0.20506036281585693, + "learning_rate": 2.394201083807845e-05, + "loss": 0.5306, + "step": 9709 + }, + { + "epoch": 1.9960941515058073, + "grad_norm": 0.19566957652568817, + "learning_rate": 2.3933184345867902e-05, + "loss": 0.5146, + "step": 9710 + }, + { + "epoch": 1.9962997224791859, + "grad_norm": 0.19693787395954132, + "learning_rate": 2.3924358891520916e-05, + "loss": 0.5098, + "step": 9711 + }, + { + "epoch": 1.9965052934525644, + "grad_norm": 0.20601771771907806, + "learning_rate": 2.391553447547226e-05, + "loss": 0.5345, + "step": 9712 + }, + { + "epoch": 1.996710864425943, + "grad_norm": 0.19721956551074982, + "learning_rate": 2.3906711098156654e-05, + "loss": 0.5034, + "step": 9713 + }, + { + "epoch": 1.9969164353993216, + "grad_norm": 0.19830164313316345, + "learning_rate": 2.389788876000882e-05, + "loss": 0.5055, + "step": 9714 + }, + { + "epoch": 1.9971220063727002, + "grad_norm": 0.19704151153564453, + "learning_rate": 2.3889067461463375e-05, + "loss": 0.4994, + "step": 9715 + }, + { + "epoch": 1.9973275773460788, + "grad_norm": 0.2041328102350235, + "learning_rate": 2.3880247202954906e-05, + "loss": 0.5322, + "step": 9716 + }, + { + "epoch": 1.9975331483194574, + "grad_norm": 0.20206472277641296, + "learning_rate": 2.387142798491792e-05, + "loss": 0.5115, + "step": 9717 + }, + { + "epoch": 1.997738719292836, + "grad_norm": 0.20135797560214996, + "learning_rate": 2.386260980778695e-05, + "loss": 0.5294, + "step": 9718 + }, + { + "epoch": 1.9979442902662146, + "grad_norm": 0.19181190431118011, + "learning_rate": 2.3853792671996394e-05, + "loss": 0.5249, + "step": 9719 + }, + { + "epoch": 1.998149861239593, + "grad_norm": 0.199905663728714, + "learning_rate": 2.3844976577980637e-05, + "loss": 0.5133, + "step": 9720 + }, + { + "epoch": 1.9983554322129715, + "grad_norm": 0.19756287336349487, + "learning_rate": 2.3836161526173998e-05, + "loss": 0.491, + "step": 9721 + }, + { + "epoch": 1.99856100318635, + "grad_norm": 0.16492635011672974, + "learning_rate": 2.382734751701077e-05, + "loss": 0.4839, + "step": 9722 + }, + { + "epoch": 1.9987665741597287, + "grad_norm": 0.16064047813415527, + "learning_rate": 2.3818534550925166e-05, + "loss": 0.525, + "step": 9723 + }, + { + "epoch": 1.998972145133107, + "grad_norm": 0.1621170938014984, + "learning_rate": 2.3809722628351345e-05, + "loss": 0.5041, + "step": 9724 + }, + { + "epoch": 1.9991777161064856, + "grad_norm": 0.1653175801038742, + "learning_rate": 2.3800911749723466e-05, + "loss": 0.5125, + "step": 9725 + }, + { + "epoch": 1.9993832870798642, + "grad_norm": 0.16732336580753326, + "learning_rate": 2.3792101915475583e-05, + "loss": 0.5047, + "step": 9726 + }, + { + "epoch": 1.9995888580532428, + "grad_norm": 0.1221918985247612, + "learning_rate": 2.378329312604171e-05, + "loss": 0.5094, + "step": 9727 + }, + { + "epoch": 1.9997944290266214, + "grad_norm": 0.15841197967529297, + "learning_rate": 2.3774485381855812e-05, + "loss": 0.5167, + "step": 9728 + }, + { + "epoch": 2.0, + "grad_norm": 0.17381541430950165, + "learning_rate": 2.3765678683351824e-05, + "loss": 0.5104, + "step": 9729 + }, + { + "epoch": 2.0002055709733786, + "grad_norm": 0.35462313890457153, + "learning_rate": 2.375687303096359e-05, + "loss": 0.4014, + "step": 9730 + }, + { + "epoch": 2.000411141946757, + "grad_norm": 0.3547631800174713, + "learning_rate": 2.3748068425124914e-05, + "loss": 0.4388, + "step": 9731 + }, + { + "epoch": 2.0006167129201358, + "grad_norm": 0.28014928102493286, + "learning_rate": 2.373926486626959e-05, + "loss": 0.4034, + "step": 9732 + }, + { + "epoch": 2.0008222838935144, + "grad_norm": 0.23848789930343628, + "learning_rate": 2.3730462354831326e-05, + "loss": 0.4079, + "step": 9733 + }, + { + "epoch": 2.001027854866893, + "grad_norm": 0.22783653438091278, + "learning_rate": 2.3721660891243738e-05, + "loss": 0.4117, + "step": 9734 + }, + { + "epoch": 2.0012334258402715, + "grad_norm": 0.27238190174102783, + "learning_rate": 2.371286047594049e-05, + "loss": 0.3957, + "step": 9735 + }, + { + "epoch": 2.00143899681365, + "grad_norm": 0.35664230585098267, + "learning_rate": 2.3704061109355107e-05, + "loss": 0.4034, + "step": 9736 + }, + { + "epoch": 2.0016445677870283, + "grad_norm": 0.3655121326446533, + "learning_rate": 2.369526279192108e-05, + "loss": 0.3992, + "step": 9737 + }, + { + "epoch": 2.001850138760407, + "grad_norm": 0.31957703828811646, + "learning_rate": 2.3686465524071887e-05, + "loss": 0.4125, + "step": 9738 + }, + { + "epoch": 2.0020557097337854, + "grad_norm": 0.2534150183200836, + "learning_rate": 2.3677669306240927e-05, + "loss": 0.3829, + "step": 9739 + }, + { + "epoch": 2.002261280707164, + "grad_norm": 0.23875583708286285, + "learning_rate": 2.3668874138861533e-05, + "loss": 0.4053, + "step": 9740 + }, + { + "epoch": 2.0024668516805426, + "grad_norm": 0.24184350669384003, + "learning_rate": 2.366008002236702e-05, + "loss": 0.4061, + "step": 9741 + }, + { + "epoch": 2.002672422653921, + "grad_norm": 0.232225701212883, + "learning_rate": 2.3651286957190612e-05, + "loss": 0.4083, + "step": 9742 + }, + { + "epoch": 2.0028779936273, + "grad_norm": 0.1986769735813141, + "learning_rate": 2.3642494943765516e-05, + "loss": 0.4456, + "step": 9743 + }, + { + "epoch": 2.0030835646006784, + "grad_norm": 0.1628189980983734, + "learning_rate": 2.363370398252485e-05, + "loss": 0.4615, + "step": 9744 + }, + { + "epoch": 2.003289135574057, + "grad_norm": 0.13968214392662048, + "learning_rate": 2.362491407390174e-05, + "loss": 0.4571, + "step": 9745 + }, + { + "epoch": 2.0034947065474356, + "grad_norm": 0.2863624095916748, + "learning_rate": 2.3616125218329208e-05, + "loss": 0.3981, + "step": 9746 + }, + { + "epoch": 2.003700277520814, + "grad_norm": 0.27160152792930603, + "learning_rate": 2.360733741624024e-05, + "loss": 0.3855, + "step": 9747 + }, + { + "epoch": 2.0039058484941927, + "grad_norm": 0.24677185714244843, + "learning_rate": 2.3598550668067765e-05, + "loss": 0.373, + "step": 9748 + }, + { + "epoch": 2.0041114194675713, + "grad_norm": 0.23863226175308228, + "learning_rate": 2.358976497424467e-05, + "loss": 0.4122, + "step": 9749 + }, + { + "epoch": 2.00431699044095, + "grad_norm": 0.23597677052021027, + "learning_rate": 2.3580980335203787e-05, + "loss": 0.4114, + "step": 9750 + }, + { + "epoch": 2.0045225614143285, + "grad_norm": 0.24519526958465576, + "learning_rate": 2.357219675137787e-05, + "loss": 0.4, + "step": 9751 + }, + { + "epoch": 2.0047281323877066, + "grad_norm": 0.26484453678131104, + "learning_rate": 2.356341422319968e-05, + "loss": 0.4106, + "step": 9752 + }, + { + "epoch": 2.0049337033610852, + "grad_norm": 0.2829241156578064, + "learning_rate": 2.3554632751101882e-05, + "loss": 0.4101, + "step": 9753 + }, + { + "epoch": 2.005139274334464, + "grad_norm": 0.2894810438156128, + "learning_rate": 2.354585233551709e-05, + "loss": 0.4072, + "step": 9754 + }, + { + "epoch": 2.0053448453078424, + "grad_norm": 0.26924699544906616, + "learning_rate": 2.3537072976877862e-05, + "loss": 0.3941, + "step": 9755 + }, + { + "epoch": 2.005550416281221, + "grad_norm": 0.26175355911254883, + "learning_rate": 2.352829467561675e-05, + "loss": 0.4, + "step": 9756 + }, + { + "epoch": 2.0057559872545996, + "grad_norm": 0.15815532207489014, + "learning_rate": 2.3519517432166195e-05, + "loss": 0.4488, + "step": 9757 + }, + { + "epoch": 2.005961558227978, + "grad_norm": 0.1532447189092636, + "learning_rate": 2.3510741246958602e-05, + "loss": 0.4491, + "step": 9758 + }, + { + "epoch": 2.0061671292013568, + "grad_norm": 0.2337024062871933, + "learning_rate": 2.3501966120426364e-05, + "loss": 0.4038, + "step": 9759 + }, + { + "epoch": 2.0063727001747353, + "grad_norm": 0.2317887842655182, + "learning_rate": 2.3493192053001774e-05, + "loss": 0.4058, + "step": 9760 + }, + { + "epoch": 2.006578271148114, + "grad_norm": 0.22884777188301086, + "learning_rate": 2.3484419045117088e-05, + "loss": 0.3987, + "step": 9761 + }, + { + "epoch": 2.0067838421214925, + "grad_norm": 0.2271248698234558, + "learning_rate": 2.3475647097204513e-05, + "loss": 0.3916, + "step": 9762 + }, + { + "epoch": 2.006989413094871, + "grad_norm": 0.2272649109363556, + "learning_rate": 2.3466876209696204e-05, + "loss": 0.4061, + "step": 9763 + }, + { + "epoch": 2.0071949840682497, + "grad_norm": 0.22100196778774261, + "learning_rate": 2.345810638302425e-05, + "loss": 0.4063, + "step": 9764 + }, + { + "epoch": 2.0074005550416283, + "grad_norm": 0.22727227210998535, + "learning_rate": 2.3449337617620705e-05, + "loss": 0.3931, + "step": 9765 + }, + { + "epoch": 2.007606126015007, + "grad_norm": 0.24030756950378418, + "learning_rate": 2.344056991391757e-05, + "loss": 0.4071, + "step": 9766 + }, + { + "epoch": 2.007811696988385, + "grad_norm": 0.2378872036933899, + "learning_rate": 2.3431803272346795e-05, + "loss": 0.4167, + "step": 9767 + }, + { + "epoch": 2.0080172679617636, + "grad_norm": 0.23873169720172882, + "learning_rate": 2.3423037693340263e-05, + "loss": 0.4025, + "step": 9768 + }, + { + "epoch": 2.008222838935142, + "grad_norm": 0.16406850516796112, + "learning_rate": 2.341427317732981e-05, + "loss": 0.4482, + "step": 9769 + }, + { + "epoch": 2.008428409908521, + "grad_norm": 0.24554254114627838, + "learning_rate": 2.340550972474723e-05, + "loss": 0.4149, + "step": 9770 + }, + { + "epoch": 2.0086339808818994, + "grad_norm": 0.24509701132774353, + "learning_rate": 2.339674733602425e-05, + "loss": 0.3931, + "step": 9771 + }, + { + "epoch": 2.008839551855278, + "grad_norm": 0.2255314290523529, + "learning_rate": 2.3387986011592542e-05, + "loss": 0.4023, + "step": 9772 + }, + { + "epoch": 2.0090451228286565, + "grad_norm": 0.22587113082408905, + "learning_rate": 2.3379225751883768e-05, + "loss": 0.403, + "step": 9773 + }, + { + "epoch": 2.009250693802035, + "grad_norm": 0.13071568310260773, + "learning_rate": 2.337046655732948e-05, + "loss": 0.4701, + "step": 9774 + }, + { + "epoch": 2.0094562647754137, + "grad_norm": 0.2212098240852356, + "learning_rate": 2.336170842836121e-05, + "loss": 0.394, + "step": 9775 + }, + { + "epoch": 2.0096618357487923, + "grad_norm": 0.23073311150074005, + "learning_rate": 2.3352951365410414e-05, + "loss": 0.421, + "step": 9776 + }, + { + "epoch": 2.009867406722171, + "grad_norm": 0.21537451446056366, + "learning_rate": 2.334419536890854e-05, + "loss": 0.3929, + "step": 9777 + }, + { + "epoch": 2.0100729776955495, + "grad_norm": 0.21932470798492432, + "learning_rate": 2.3335440439286943e-05, + "loss": 0.3989, + "step": 9778 + }, + { + "epoch": 2.010278548668928, + "grad_norm": 0.2174750566482544, + "learning_rate": 2.332668657697692e-05, + "loss": 0.3909, + "step": 9779 + }, + { + "epoch": 2.0104841196423067, + "grad_norm": 0.21708469092845917, + "learning_rate": 2.3317933782409764e-05, + "loss": 0.3854, + "step": 9780 + }, + { + "epoch": 2.0106896906156853, + "grad_norm": 0.22329485416412354, + "learning_rate": 2.330918205601667e-05, + "loss": 0.4068, + "step": 9781 + }, + { + "epoch": 2.010895261589064, + "grad_norm": 0.22749973833560944, + "learning_rate": 2.3300431398228786e-05, + "loss": 0.4065, + "step": 9782 + }, + { + "epoch": 2.011100832562442, + "grad_norm": 0.2266959398984909, + "learning_rate": 2.3291681809477235e-05, + "loss": 0.4044, + "step": 9783 + }, + { + "epoch": 2.0113064035358206, + "grad_norm": 0.22487907111644745, + "learning_rate": 2.3282933290193048e-05, + "loss": 0.3902, + "step": 9784 + }, + { + "epoch": 2.011511974509199, + "grad_norm": 0.22450290620326996, + "learning_rate": 2.327418584080724e-05, + "loss": 0.4144, + "step": 9785 + }, + { + "epoch": 2.0117175454825778, + "grad_norm": 0.13316728174686432, + "learning_rate": 2.3265439461750727e-05, + "loss": 0.4369, + "step": 9786 + }, + { + "epoch": 2.0119231164559563, + "grad_norm": 0.23068048059940338, + "learning_rate": 2.3256694153454446e-05, + "loss": 0.4071, + "step": 9787 + }, + { + "epoch": 2.012128687429335, + "grad_norm": 0.22546036541461945, + "learning_rate": 2.324794991634921e-05, + "loss": 0.392, + "step": 9788 + }, + { + "epoch": 2.0123342584027135, + "grad_norm": 0.2214207649230957, + "learning_rate": 2.3239206750865813e-05, + "loss": 0.3871, + "step": 9789 + }, + { + "epoch": 2.012539829376092, + "grad_norm": 0.12996140122413635, + "learning_rate": 2.3230464657434995e-05, + "loss": 0.4446, + "step": 9790 + }, + { + "epoch": 2.0127454003494707, + "grad_norm": 0.126758873462677, + "learning_rate": 2.322172363648743e-05, + "loss": 0.4344, + "step": 9791 + }, + { + "epoch": 2.0129509713228493, + "grad_norm": 0.21626314520835876, + "learning_rate": 2.3212983688453753e-05, + "loss": 0.4197, + "step": 9792 + }, + { + "epoch": 2.013156542296228, + "grad_norm": 0.11778894811868668, + "learning_rate": 2.3204244813764516e-05, + "loss": 0.4603, + "step": 9793 + }, + { + "epoch": 2.0133621132696065, + "grad_norm": 0.13116705417633057, + "learning_rate": 2.3195507012850284e-05, + "loss": 0.4376, + "step": 9794 + }, + { + "epoch": 2.013567684242985, + "grad_norm": 0.21736088395118713, + "learning_rate": 2.3186770286141507e-05, + "loss": 0.3973, + "step": 9795 + }, + { + "epoch": 2.0137732552163636, + "grad_norm": 0.2278052121400833, + "learning_rate": 2.31780346340686e-05, + "loss": 0.4055, + "step": 9796 + }, + { + "epoch": 2.013978826189742, + "grad_norm": 0.2270914614200592, + "learning_rate": 2.3169300057061935e-05, + "loss": 0.3941, + "step": 9797 + }, + { + "epoch": 2.0141843971631204, + "grad_norm": 0.22449646890163422, + "learning_rate": 2.31605665555518e-05, + "loss": 0.3728, + "step": 9798 + }, + { + "epoch": 2.014389968136499, + "grad_norm": 0.12993952631950378, + "learning_rate": 2.3151834129968495e-05, + "loss": 0.4542, + "step": 9799 + }, + { + "epoch": 2.0145955391098775, + "grad_norm": 0.21774081885814667, + "learning_rate": 2.3143102780742185e-05, + "loss": 0.3867, + "step": 9800 + }, + { + "epoch": 2.014801110083256, + "grad_norm": 0.13234397768974304, + "learning_rate": 2.3134372508303055e-05, + "loss": 0.4441, + "step": 9801 + }, + { + "epoch": 2.0150066810566347, + "grad_norm": 0.22552914917469025, + "learning_rate": 2.3125643313081194e-05, + "loss": 0.3967, + "step": 9802 + }, + { + "epoch": 2.0152122520300133, + "grad_norm": 0.22355657815933228, + "learning_rate": 2.311691519550665e-05, + "loss": 0.3994, + "step": 9803 + }, + { + "epoch": 2.015417823003392, + "grad_norm": 0.22515852749347687, + "learning_rate": 2.3108188156009412e-05, + "loss": 0.3941, + "step": 9804 + }, + { + "epoch": 2.0156233939767705, + "grad_norm": 0.2237560749053955, + "learning_rate": 2.3099462195019416e-05, + "loss": 0.4045, + "step": 9805 + }, + { + "epoch": 2.015828964950149, + "grad_norm": 0.1298869103193283, + "learning_rate": 2.309073731296656e-05, + "loss": 0.4567, + "step": 9806 + }, + { + "epoch": 2.0160345359235277, + "grad_norm": 0.22776378691196442, + "learning_rate": 2.3082013510280656e-05, + "loss": 0.4082, + "step": 9807 + }, + { + "epoch": 2.0162401068969062, + "grad_norm": 0.22463855147361755, + "learning_rate": 2.307329078739152e-05, + "loss": 0.4022, + "step": 9808 + }, + { + "epoch": 2.016445677870285, + "grad_norm": 0.22342638671398163, + "learning_rate": 2.3064569144728855e-05, + "loss": 0.4131, + "step": 9809 + }, + { + "epoch": 2.0166512488436634, + "grad_norm": 0.22417156398296356, + "learning_rate": 2.3055848582722352e-05, + "loss": 0.3981, + "step": 9810 + }, + { + "epoch": 2.016856819817042, + "grad_norm": 0.2322673499584198, + "learning_rate": 2.3047129101801618e-05, + "loss": 0.4035, + "step": 9811 + }, + { + "epoch": 2.0170623907904206, + "grad_norm": 0.2153014838695526, + "learning_rate": 2.303841070239622e-05, + "loss": 0.3957, + "step": 9812 + }, + { + "epoch": 2.0172679617637987, + "grad_norm": 0.22393642365932465, + "learning_rate": 2.302969338493567e-05, + "loss": 0.3947, + "step": 9813 + }, + { + "epoch": 2.0174735327371773, + "grad_norm": 0.23003719747066498, + "learning_rate": 2.302097714984945e-05, + "loss": 0.3909, + "step": 9814 + }, + { + "epoch": 2.017679103710556, + "grad_norm": 0.22402851283550262, + "learning_rate": 2.301226199756696e-05, + "loss": 0.3974, + "step": 9815 + }, + { + "epoch": 2.0178846746839345, + "grad_norm": 0.2208302617073059, + "learning_rate": 2.3003547928517547e-05, + "loss": 0.3763, + "step": 9816 + }, + { + "epoch": 2.018090245657313, + "grad_norm": 0.1260402798652649, + "learning_rate": 2.299483494313052e-05, + "loss": 0.4457, + "step": 9817 + }, + { + "epoch": 2.0182958166306917, + "grad_norm": 0.226173534989357, + "learning_rate": 2.298612304183512e-05, + "loss": 0.4093, + "step": 9818 + }, + { + "epoch": 2.0185013876040703, + "grad_norm": 0.12185024470090866, + "learning_rate": 2.297741222506053e-05, + "loss": 0.4517, + "step": 9819 + }, + { + "epoch": 2.018706958577449, + "grad_norm": 0.2621656358242035, + "learning_rate": 2.2968702493235923e-05, + "loss": 0.4059, + "step": 9820 + }, + { + "epoch": 2.0189125295508275, + "grad_norm": 0.2253510057926178, + "learning_rate": 2.2959993846790372e-05, + "loss": 0.4052, + "step": 9821 + }, + { + "epoch": 2.019118100524206, + "grad_norm": 0.12481515854597092, + "learning_rate": 2.2951286286152893e-05, + "loss": 0.4528, + "step": 9822 + }, + { + "epoch": 2.0193236714975846, + "grad_norm": 0.21684333682060242, + "learning_rate": 2.2942579811752496e-05, + "loss": 0.3872, + "step": 9823 + }, + { + "epoch": 2.019529242470963, + "grad_norm": 0.13086971640586853, + "learning_rate": 2.2933874424018093e-05, + "loss": 0.4632, + "step": 9824 + }, + { + "epoch": 2.019734813444342, + "grad_norm": 0.21728526055812836, + "learning_rate": 2.292517012337857e-05, + "loss": 0.3812, + "step": 9825 + }, + { + "epoch": 2.0199403844177204, + "grad_norm": 0.23790940642356873, + "learning_rate": 2.291646691026273e-05, + "loss": 0.4011, + "step": 9826 + }, + { + "epoch": 2.020145955391099, + "grad_norm": 0.12328074872493744, + "learning_rate": 2.290776478509933e-05, + "loss": 0.4421, + "step": 9827 + }, + { + "epoch": 2.020351526364477, + "grad_norm": 0.23319554328918457, + "learning_rate": 2.2899063748317123e-05, + "loss": 0.3795, + "step": 9828 + }, + { + "epoch": 2.0205570973378557, + "grad_norm": 0.21926866471767426, + "learning_rate": 2.2890363800344744e-05, + "loss": 0.3943, + "step": 9829 + }, + { + "epoch": 2.0207626683112343, + "grad_norm": 0.2243729531764984, + "learning_rate": 2.2881664941610796e-05, + "loss": 0.4123, + "step": 9830 + }, + { + "epoch": 2.020968239284613, + "grad_norm": 0.12581419944763184, + "learning_rate": 2.2872967172543843e-05, + "loss": 0.4629, + "step": 9831 + }, + { + "epoch": 2.0211738102579915, + "grad_norm": 0.12486526370048523, + "learning_rate": 2.286427049357237e-05, + "loss": 0.4541, + "step": 9832 + }, + { + "epoch": 2.02137938123137, + "grad_norm": 0.2228085696697235, + "learning_rate": 2.2855574905124826e-05, + "loss": 0.4007, + "step": 9833 + }, + { + "epoch": 2.0215849522047487, + "grad_norm": 0.1320047229528427, + "learning_rate": 2.284688040762959e-05, + "loss": 0.4513, + "step": 9834 + }, + { + "epoch": 2.0217905231781272, + "grad_norm": 0.22697174549102783, + "learning_rate": 2.283818700151503e-05, + "loss": 0.3806, + "step": 9835 + }, + { + "epoch": 2.021996094151506, + "grad_norm": 0.12552069127559662, + "learning_rate": 2.2829494687209413e-05, + "loss": 0.4545, + "step": 9836 + }, + { + "epoch": 2.0222016651248844, + "grad_norm": 0.22603359818458557, + "learning_rate": 2.282080346514097e-05, + "loss": 0.3866, + "step": 9837 + }, + { + "epoch": 2.022407236098263, + "grad_norm": 0.22030943632125854, + "learning_rate": 2.2812113335737867e-05, + "loss": 0.3983, + "step": 9838 + }, + { + "epoch": 2.0226128070716416, + "grad_norm": 0.23014822602272034, + "learning_rate": 2.280342429942824e-05, + "loss": 0.4008, + "step": 9839 + }, + { + "epoch": 2.02281837804502, + "grad_norm": 0.2164926677942276, + "learning_rate": 2.279473635664013e-05, + "loss": 0.4113, + "step": 9840 + }, + { + "epoch": 2.0230239490183988, + "grad_norm": 0.23505493998527527, + "learning_rate": 2.2786049507801594e-05, + "loss": 0.4073, + "step": 9841 + }, + { + "epoch": 2.0232295199917774, + "grad_norm": 0.21695363521575928, + "learning_rate": 2.277736375334057e-05, + "loss": 0.3937, + "step": 9842 + }, + { + "epoch": 2.0234350909651555, + "grad_norm": 0.21634046733379364, + "learning_rate": 2.2768679093684948e-05, + "loss": 0.4001, + "step": 9843 + }, + { + "epoch": 2.023640661938534, + "grad_norm": 0.22589579224586487, + "learning_rate": 2.2759995529262617e-05, + "loss": 0.3816, + "step": 9844 + }, + { + "epoch": 2.0238462329119127, + "grad_norm": 0.22080455720424652, + "learning_rate": 2.2751313060501353e-05, + "loss": 0.3994, + "step": 9845 + }, + { + "epoch": 2.0240518038852913, + "grad_norm": 0.23890239000320435, + "learning_rate": 2.2742631687828906e-05, + "loss": 0.4072, + "step": 9846 + }, + { + "epoch": 2.02425737485867, + "grad_norm": 0.2339673787355423, + "learning_rate": 2.2733951411672963e-05, + "loss": 0.4084, + "step": 9847 + }, + { + "epoch": 2.0244629458320484, + "grad_norm": 0.22778230905532837, + "learning_rate": 2.272527223246115e-05, + "loss": 0.3973, + "step": 9848 + }, + { + "epoch": 2.024668516805427, + "grad_norm": 0.22321897745132446, + "learning_rate": 2.271659415062108e-05, + "loss": 0.4052, + "step": 9849 + }, + { + "epoch": 2.0248740877788056, + "grad_norm": 0.13747207820415497, + "learning_rate": 2.270791716658026e-05, + "loss": 0.4596, + "step": 9850 + }, + { + "epoch": 2.025079658752184, + "grad_norm": 0.22815275192260742, + "learning_rate": 2.2699241280766174e-05, + "loss": 0.3894, + "step": 9851 + }, + { + "epoch": 2.025285229725563, + "grad_norm": 0.219502255320549, + "learning_rate": 2.269056649360623e-05, + "loss": 0.3969, + "step": 9852 + }, + { + "epoch": 2.0254908006989414, + "grad_norm": 0.229275181889534, + "learning_rate": 2.26818928055278e-05, + "loss": 0.4055, + "step": 9853 + }, + { + "epoch": 2.02569637167232, + "grad_norm": 0.21822713315486908, + "learning_rate": 2.2673220216958206e-05, + "loss": 0.3896, + "step": 9854 + }, + { + "epoch": 2.0259019426456986, + "grad_norm": 0.218753844499588, + "learning_rate": 2.266454872832467e-05, + "loss": 0.3951, + "step": 9855 + }, + { + "epoch": 2.026107513619077, + "grad_norm": 0.2237304002046585, + "learning_rate": 2.2655878340054446e-05, + "loss": 0.4035, + "step": 9856 + }, + { + "epoch": 2.0263130845924557, + "grad_norm": 0.2183140218257904, + "learning_rate": 2.2647209052574658e-05, + "loss": 0.3968, + "step": 9857 + }, + { + "epoch": 2.026518655565834, + "grad_norm": 0.22163569927215576, + "learning_rate": 2.26385408663124e-05, + "loss": 0.3805, + "step": 9858 + }, + { + "epoch": 2.0267242265392125, + "grad_norm": 0.22751082479953766, + "learning_rate": 2.2629873781694717e-05, + "loss": 0.3994, + "step": 9859 + }, + { + "epoch": 2.026929797512591, + "grad_norm": 0.21998751163482666, + "learning_rate": 2.2621207799148598e-05, + "loss": 0.3864, + "step": 9860 + }, + { + "epoch": 2.0271353684859696, + "grad_norm": 0.1337684839963913, + "learning_rate": 2.2612542919100973e-05, + "loss": 0.444, + "step": 9861 + }, + { + "epoch": 2.0273409394593482, + "grad_norm": 0.23163475096225739, + "learning_rate": 2.2603879141978702e-05, + "loss": 0.4133, + "step": 9862 + }, + { + "epoch": 2.027546510432727, + "grad_norm": 0.1254424899816513, + "learning_rate": 2.2595216468208643e-05, + "loss": 0.4527, + "step": 9863 + }, + { + "epoch": 2.0277520814061054, + "grad_norm": 0.23382841050624847, + "learning_rate": 2.258655489821753e-05, + "loss": 0.4075, + "step": 9864 + }, + { + "epoch": 2.027957652379484, + "grad_norm": 0.2241084724664688, + "learning_rate": 2.2577894432432115e-05, + "loss": 0.4089, + "step": 9865 + }, + { + "epoch": 2.0281632233528626, + "grad_norm": 0.12018263339996338, + "learning_rate": 2.2569235071279042e-05, + "loss": 0.4616, + "step": 9866 + }, + { + "epoch": 2.028368794326241, + "grad_norm": 0.21912699937820435, + "learning_rate": 2.256057681518491e-05, + "loss": 0.4057, + "step": 9867 + }, + { + "epoch": 2.0285743652996198, + "grad_norm": 0.12558940052986145, + "learning_rate": 2.255191966457629e-05, + "loss": 0.437, + "step": 9868 + }, + { + "epoch": 2.0287799362729984, + "grad_norm": 0.22069305181503296, + "learning_rate": 2.254326361987964e-05, + "loss": 0.3903, + "step": 9869 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.12789428234100342, + "learning_rate": 2.2534608681521443e-05, + "loss": 0.4686, + "step": 9870 + }, + { + "epoch": 2.0291910782197555, + "grad_norm": 0.22064268589019775, + "learning_rate": 2.252595484992808e-05, + "loss": 0.3867, + "step": 9871 + }, + { + "epoch": 2.029396649193134, + "grad_norm": 0.1297440379858017, + "learning_rate": 2.251730212552587e-05, + "loss": 0.471, + "step": 9872 + }, + { + "epoch": 2.0296022201665127, + "grad_norm": 0.227555051445961, + "learning_rate": 2.2508650508741107e-05, + "loss": 0.4138, + "step": 9873 + }, + { + "epoch": 2.029807791139891, + "grad_norm": 0.2229832112789154, + "learning_rate": 2.250000000000001e-05, + "loss": 0.3846, + "step": 9874 + }, + { + "epoch": 2.0300133621132694, + "grad_norm": 0.12331897765398026, + "learning_rate": 2.2491350599728745e-05, + "loss": 0.4309, + "step": 9875 + }, + { + "epoch": 2.030218933086648, + "grad_norm": 0.12525731325149536, + "learning_rate": 2.2482702308353416e-05, + "loss": 0.4642, + "step": 9876 + }, + { + "epoch": 2.0304245040600266, + "grad_norm": 0.22697319090366364, + "learning_rate": 2.2474055126300116e-05, + "loss": 0.3967, + "step": 9877 + }, + { + "epoch": 2.030630075033405, + "grad_norm": 0.21771733462810516, + "learning_rate": 2.2465409053994835e-05, + "loss": 0.396, + "step": 9878 + }, + { + "epoch": 2.030835646006784, + "grad_norm": 0.21557028591632843, + "learning_rate": 2.2456764091863518e-05, + "loss": 0.3904, + "step": 9879 + }, + { + "epoch": 2.0310412169801624, + "grad_norm": 0.22535440325737, + "learning_rate": 2.244812024033207e-05, + "loss": 0.4019, + "step": 9880 + }, + { + "epoch": 2.031246787953541, + "grad_norm": 0.22445163130760193, + "learning_rate": 2.243947749982633e-05, + "loss": 0.3986, + "step": 9881 + }, + { + "epoch": 2.0314523589269196, + "grad_norm": 0.21911373734474182, + "learning_rate": 2.243083587077209e-05, + "loss": 0.3931, + "step": 9882 + }, + { + "epoch": 2.031657929900298, + "grad_norm": 0.21471014618873596, + "learning_rate": 2.2422195353595056e-05, + "loss": 0.3839, + "step": 9883 + }, + { + "epoch": 2.0318635008736767, + "grad_norm": 0.2156352996826172, + "learning_rate": 2.2413555948720952e-05, + "loss": 0.3843, + "step": 9884 + }, + { + "epoch": 2.0320690718470553, + "grad_norm": 0.22156722843647003, + "learning_rate": 2.240491765657537e-05, + "loss": 0.4147, + "step": 9885 + }, + { + "epoch": 2.032274642820434, + "grad_norm": 0.22945941984653473, + "learning_rate": 2.2396280477583874e-05, + "loss": 0.4038, + "step": 9886 + }, + { + "epoch": 2.0324802137938125, + "grad_norm": 0.217056542634964, + "learning_rate": 2.2387644412172005e-05, + "loss": 0.3978, + "step": 9887 + }, + { + "epoch": 2.032685784767191, + "grad_norm": 0.22490544617176056, + "learning_rate": 2.2379009460765203e-05, + "loss": 0.3874, + "step": 9888 + }, + { + "epoch": 2.0328913557405692, + "grad_norm": 0.224374920129776, + "learning_rate": 2.2370375623788862e-05, + "loss": 0.4149, + "step": 9889 + }, + { + "epoch": 2.033096926713948, + "grad_norm": 0.13248522579669952, + "learning_rate": 2.236174290166836e-05, + "loss": 0.4294, + "step": 9890 + }, + { + "epoch": 2.0333024976873264, + "grad_norm": 0.23234902322292328, + "learning_rate": 2.235311129482897e-05, + "loss": 0.395, + "step": 9891 + }, + { + "epoch": 2.033508068660705, + "grad_norm": 0.2269185483455658, + "learning_rate": 2.234448080369594e-05, + "loss": 0.3915, + "step": 9892 + }, + { + "epoch": 2.0337136396340836, + "grad_norm": 0.22412073612213135, + "learning_rate": 2.2335851428694447e-05, + "loss": 0.3766, + "step": 9893 + }, + { + "epoch": 2.033919210607462, + "grad_norm": 0.22921979427337646, + "learning_rate": 2.2327223170249626e-05, + "loss": 0.4075, + "step": 9894 + }, + { + "epoch": 2.0341247815808408, + "grad_norm": 0.12206049263477325, + "learning_rate": 2.2318596028786543e-05, + "loss": 0.4533, + "step": 9895 + }, + { + "epoch": 2.0343303525542193, + "grad_norm": 0.22003917396068573, + "learning_rate": 2.2309970004730204e-05, + "loss": 0.3874, + "step": 9896 + }, + { + "epoch": 2.034535923527598, + "grad_norm": 0.22223718464374542, + "learning_rate": 2.2301345098505608e-05, + "loss": 0.4057, + "step": 9897 + }, + { + "epoch": 2.0347414945009765, + "grad_norm": 0.2259814292192459, + "learning_rate": 2.2292721310537645e-05, + "loss": 0.3888, + "step": 9898 + }, + { + "epoch": 2.034947065474355, + "grad_norm": 0.21883010864257812, + "learning_rate": 2.2284098641251172e-05, + "loss": 0.4222, + "step": 9899 + }, + { + "epoch": 2.0351526364477337, + "grad_norm": 0.12656092643737793, + "learning_rate": 2.227547709107098e-05, + "loss": 0.4542, + "step": 9900 + }, + { + "epoch": 2.0353582074211123, + "grad_norm": 0.22307392954826355, + "learning_rate": 2.2266856660421823e-05, + "loss": 0.4201, + "step": 9901 + }, + { + "epoch": 2.035563778394491, + "grad_norm": 0.2214750051498413, + "learning_rate": 2.2258237349728382e-05, + "loss": 0.3773, + "step": 9902 + }, + { + "epoch": 2.0357693493678695, + "grad_norm": 0.22282758355140686, + "learning_rate": 2.2249619159415273e-05, + "loss": 0.4047, + "step": 9903 + }, + { + "epoch": 2.0359749203412476, + "grad_norm": 0.12212900072336197, + "learning_rate": 2.2241002089907114e-05, + "loss": 0.4447, + "step": 9904 + }, + { + "epoch": 2.036180491314626, + "grad_norm": 0.2368995100259781, + "learning_rate": 2.2232386141628407e-05, + "loss": 0.3965, + "step": 9905 + }, + { + "epoch": 2.036386062288005, + "grad_norm": 0.12619584798812866, + "learning_rate": 2.222377131500361e-05, + "loss": 0.453, + "step": 9906 + }, + { + "epoch": 2.0365916332613834, + "grad_norm": 0.1249145120382309, + "learning_rate": 2.221515761045714e-05, + "loss": 0.4397, + "step": 9907 + }, + { + "epoch": 2.036797204234762, + "grad_norm": 0.22991523146629333, + "learning_rate": 2.220654502841337e-05, + "loss": 0.3909, + "step": 9908 + }, + { + "epoch": 2.0370027752081405, + "grad_norm": 0.2213556170463562, + "learning_rate": 2.2197933569296587e-05, + "loss": 0.3813, + "step": 9909 + }, + { + "epoch": 2.037208346181519, + "grad_norm": 0.24218927323818207, + "learning_rate": 2.218932323353103e-05, + "loss": 0.4047, + "step": 9910 + }, + { + "epoch": 2.0374139171548977, + "grad_norm": 0.21407100558280945, + "learning_rate": 2.2180714021540913e-05, + "loss": 0.3848, + "step": 9911 + }, + { + "epoch": 2.0376194881282763, + "grad_norm": 0.12527808547019958, + "learning_rate": 2.217210593375036e-05, + "loss": 0.4478, + "step": 9912 + }, + { + "epoch": 2.037825059101655, + "grad_norm": 0.22188331186771393, + "learning_rate": 2.216349897058345e-05, + "loss": 0.3957, + "step": 9913 + }, + { + "epoch": 2.0380306300750335, + "grad_norm": 0.23192797601222992, + "learning_rate": 2.2154893132464207e-05, + "loss": 0.3889, + "step": 9914 + }, + { + "epoch": 2.038236201048412, + "grad_norm": 0.2198922336101532, + "learning_rate": 2.21462884198166e-05, + "loss": 0.3865, + "step": 9915 + }, + { + "epoch": 2.0384417720217907, + "grad_norm": 0.1324300318956375, + "learning_rate": 2.213768483306455e-05, + "loss": 0.4286, + "step": 9916 + }, + { + "epoch": 2.0386473429951693, + "grad_norm": 0.22883708775043488, + "learning_rate": 2.212908237263189e-05, + "loss": 0.3945, + "step": 9917 + }, + { + "epoch": 2.038852913968548, + "grad_norm": 0.2269202619791031, + "learning_rate": 2.212048103894246e-05, + "loss": 0.4058, + "step": 9918 + }, + { + "epoch": 2.039058484941926, + "grad_norm": 0.23420077562332153, + "learning_rate": 2.2111880832419995e-05, + "loss": 0.4064, + "step": 9919 + }, + { + "epoch": 2.0392640559153046, + "grad_norm": 0.12553973495960236, + "learning_rate": 2.210328175348818e-05, + "loss": 0.4317, + "step": 9920 + }, + { + "epoch": 2.039469626888683, + "grad_norm": 0.22346656024456024, + "learning_rate": 2.209468380257065e-05, + "loss": 0.3767, + "step": 9921 + }, + { + "epoch": 2.0396751978620618, + "grad_norm": 0.2343178391456604, + "learning_rate": 2.208608698009099e-05, + "loss": 0.3972, + "step": 9922 + }, + { + "epoch": 2.0398807688354403, + "grad_norm": 0.21974226832389832, + "learning_rate": 2.207749128647273e-05, + "loss": 0.3937, + "step": 9923 + }, + { + "epoch": 2.040086339808819, + "grad_norm": 0.12265095114707947, + "learning_rate": 2.206889672213932e-05, + "loss": 0.4691, + "step": 9924 + }, + { + "epoch": 2.0402919107821975, + "grad_norm": 0.13189628720283508, + "learning_rate": 2.2060303287514198e-05, + "loss": 0.4569, + "step": 9925 + }, + { + "epoch": 2.040497481755576, + "grad_norm": 0.22592967748641968, + "learning_rate": 2.2051710983020714e-05, + "loss": 0.411, + "step": 9926 + }, + { + "epoch": 2.0407030527289547, + "grad_norm": 0.22591936588287354, + "learning_rate": 2.2043119809082176e-05, + "loss": 0.4061, + "step": 9927 + }, + { + "epoch": 2.0409086237023333, + "grad_norm": 0.22242794930934906, + "learning_rate": 2.2034529766121802e-05, + "loss": 0.398, + "step": 9928 + }, + { + "epoch": 2.041114194675712, + "grad_norm": 0.21335627138614655, + "learning_rate": 2.2025940854562824e-05, + "loss": 0.4109, + "step": 9929 + }, + { + "epoch": 2.0413197656490905, + "grad_norm": 0.2250111699104309, + "learning_rate": 2.2017353074828363e-05, + "loss": 0.3865, + "step": 9930 + }, + { + "epoch": 2.041525336622469, + "grad_norm": 0.2197580337524414, + "learning_rate": 2.2008766427341477e-05, + "loss": 0.379, + "step": 9931 + }, + { + "epoch": 2.0417309075958476, + "grad_norm": 0.23078951239585876, + "learning_rate": 2.2000180912525225e-05, + "loss": 0.4046, + "step": 9932 + }, + { + "epoch": 2.041936478569226, + "grad_norm": 0.22051231563091278, + "learning_rate": 2.1991596530802558e-05, + "loss": 0.3925, + "step": 9933 + }, + { + "epoch": 2.042142049542605, + "grad_norm": 0.23026688396930695, + "learning_rate": 2.198301328259639e-05, + "loss": 0.3941, + "step": 9934 + }, + { + "epoch": 2.042347620515983, + "grad_norm": 0.23431305587291718, + "learning_rate": 2.197443116832958e-05, + "loss": 0.3928, + "step": 9935 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.22884812951087952, + "learning_rate": 2.1965850188424914e-05, + "loss": 0.3915, + "step": 9936 + }, + { + "epoch": 2.04275876246274, + "grad_norm": 0.22704505920410156, + "learning_rate": 2.195727034330516e-05, + "loss": 0.4077, + "step": 9937 + }, + { + "epoch": 2.0429643334361187, + "grad_norm": 0.13225297629833221, + "learning_rate": 2.194869163339297e-05, + "loss": 0.4588, + "step": 9938 + }, + { + "epoch": 2.0431699044094973, + "grad_norm": 0.220863476395607, + "learning_rate": 2.194011405911102e-05, + "loss": 0.3732, + "step": 9939 + }, + { + "epoch": 2.043375475382876, + "grad_norm": 0.22376231849193573, + "learning_rate": 2.193153762088187e-05, + "loss": 0.4105, + "step": 9940 + }, + { + "epoch": 2.0435810463562545, + "grad_norm": 0.22367540001869202, + "learning_rate": 2.192296231912804e-05, + "loss": 0.393, + "step": 9941 + }, + { + "epoch": 2.043786617329633, + "grad_norm": 0.22071625292301178, + "learning_rate": 2.1914388154271993e-05, + "loss": 0.3973, + "step": 9942 + }, + { + "epoch": 2.0439921883030117, + "grad_norm": 0.22081826627254486, + "learning_rate": 2.1905815126736143e-05, + "loss": 0.4125, + "step": 9943 + }, + { + "epoch": 2.0441977592763902, + "grad_norm": 0.22600281238555908, + "learning_rate": 2.1897243236942836e-05, + "loss": 0.3986, + "step": 9944 + }, + { + "epoch": 2.044403330249769, + "grad_norm": 0.2240431308746338, + "learning_rate": 2.1888672485314357e-05, + "loss": 0.4019, + "step": 9945 + }, + { + "epoch": 2.0446089012231474, + "grad_norm": 0.22377148270606995, + "learning_rate": 2.188010287227298e-05, + "loss": 0.4098, + "step": 9946 + }, + { + "epoch": 2.044814472196526, + "grad_norm": 0.2262306958436966, + "learning_rate": 2.1871534398240877e-05, + "loss": 0.3999, + "step": 9947 + }, + { + "epoch": 2.0450200431699046, + "grad_norm": 0.22286969423294067, + "learning_rate": 2.1862967063640164e-05, + "loss": 0.3974, + "step": 9948 + }, + { + "epoch": 2.045225614143283, + "grad_norm": 0.1264716535806656, + "learning_rate": 2.1854400868892905e-05, + "loss": 0.4572, + "step": 9949 + }, + { + "epoch": 2.0454311851166613, + "grad_norm": 0.22342973947525024, + "learning_rate": 2.1845835814421155e-05, + "loss": 0.3999, + "step": 9950 + }, + { + "epoch": 2.04563675609004, + "grad_norm": 0.22479073703289032, + "learning_rate": 2.1837271900646852e-05, + "loss": 0.3997, + "step": 9951 + }, + { + "epoch": 2.0458423270634185, + "grad_norm": 0.22151948511600494, + "learning_rate": 2.1828709127991884e-05, + "loss": 0.3914, + "step": 9952 + }, + { + "epoch": 2.046047898036797, + "grad_norm": 0.1296972632408142, + "learning_rate": 2.1820147496878126e-05, + "loss": 0.4305, + "step": 9953 + }, + { + "epoch": 2.0462534690101757, + "grad_norm": 0.25065821409225464, + "learning_rate": 2.181158700772736e-05, + "loss": 0.3911, + "step": 9954 + }, + { + "epoch": 2.0464590399835543, + "grad_norm": 0.2304956465959549, + "learning_rate": 2.180302766096132e-05, + "loss": 0.3961, + "step": 9955 + }, + { + "epoch": 2.046664610956933, + "grad_norm": 0.22731968760490417, + "learning_rate": 2.179446945700169e-05, + "loss": 0.3846, + "step": 9956 + }, + { + "epoch": 2.0468701819303114, + "grad_norm": 0.23249146342277527, + "learning_rate": 2.1785912396270084e-05, + "loss": 0.4109, + "step": 9957 + }, + { + "epoch": 2.04707575290369, + "grad_norm": 0.22886785864830017, + "learning_rate": 2.177735647918807e-05, + "loss": 0.3894, + "step": 9958 + }, + { + "epoch": 2.0472813238770686, + "grad_norm": 0.22079876065254211, + "learning_rate": 2.176880170617715e-05, + "loss": 0.4036, + "step": 9959 + }, + { + "epoch": 2.047486894850447, + "grad_norm": 0.21782319247722626, + "learning_rate": 2.1760248077658796e-05, + "loss": 0.3954, + "step": 9960 + }, + { + "epoch": 2.047692465823826, + "grad_norm": 0.22487705945968628, + "learning_rate": 2.1751695594054398e-05, + "loss": 0.4007, + "step": 9961 + }, + { + "epoch": 2.0478980367972044, + "grad_norm": 0.22865137457847595, + "learning_rate": 2.1743144255785294e-05, + "loss": 0.3998, + "step": 9962 + }, + { + "epoch": 2.048103607770583, + "grad_norm": 0.2298915535211563, + "learning_rate": 2.173459406327278e-05, + "loss": 0.4107, + "step": 9963 + }, + { + "epoch": 2.0483091787439616, + "grad_norm": 0.2230944037437439, + "learning_rate": 2.1726045016938065e-05, + "loss": 0.3866, + "step": 9964 + }, + { + "epoch": 2.0485147497173397, + "grad_norm": 0.23378700017929077, + "learning_rate": 2.1717497117202314e-05, + "loss": 0.4049, + "step": 9965 + }, + { + "epoch": 2.0487203206907183, + "grad_norm": 0.22423069179058075, + "learning_rate": 2.170895036448668e-05, + "loss": 0.3989, + "step": 9966 + }, + { + "epoch": 2.048925891664097, + "grad_norm": 0.2279648631811142, + "learning_rate": 2.17004047592122e-05, + "loss": 0.4052, + "step": 9967 + }, + { + "epoch": 2.0491314626374755, + "grad_norm": 0.2262582629919052, + "learning_rate": 2.1691860301799867e-05, + "loss": 0.391, + "step": 9968 + }, + { + "epoch": 2.049337033610854, + "grad_norm": 0.2182939350605011, + "learning_rate": 2.1683316992670644e-05, + "loss": 0.3879, + "step": 9969 + }, + { + "epoch": 2.0495426045842327, + "grad_norm": 0.21680088341236115, + "learning_rate": 2.1674774832245406e-05, + "loss": 0.3804, + "step": 9970 + }, + { + "epoch": 2.0497481755576112, + "grad_norm": 0.22588318586349487, + "learning_rate": 2.166623382094497e-05, + "loss": 0.4107, + "step": 9971 + }, + { + "epoch": 2.04995374653099, + "grad_norm": 0.22498705983161926, + "learning_rate": 2.165769395919015e-05, + "loss": 0.3904, + "step": 9972 + }, + { + "epoch": 2.0501593175043684, + "grad_norm": 0.1259543001651764, + "learning_rate": 2.1649155247401637e-05, + "loss": 0.4644, + "step": 9973 + }, + { + "epoch": 2.050364888477747, + "grad_norm": 0.22000350058078766, + "learning_rate": 2.1640617686000116e-05, + "loss": 0.3917, + "step": 9974 + }, + { + "epoch": 2.0505704594511256, + "grad_norm": 0.23319876194000244, + "learning_rate": 2.163208127540618e-05, + "loss": 0.4, + "step": 9975 + }, + { + "epoch": 2.050776030424504, + "grad_norm": 0.22796432673931122, + "learning_rate": 2.1623546016040378e-05, + "loss": 0.4044, + "step": 9976 + }, + { + "epoch": 2.0509816013978828, + "grad_norm": 0.2386104017496109, + "learning_rate": 2.16150119083232e-05, + "loss": 0.4046, + "step": 9977 + }, + { + "epoch": 2.0511871723712614, + "grad_norm": 0.22699424624443054, + "learning_rate": 2.160647895267509e-05, + "loss": 0.3846, + "step": 9978 + }, + { + "epoch": 2.05139274334464, + "grad_norm": 0.22776249051094055, + "learning_rate": 2.1597947149516403e-05, + "loss": 0.4042, + "step": 9979 + }, + { + "epoch": 2.051598314318018, + "grad_norm": 0.22444364428520203, + "learning_rate": 2.1589416499267495e-05, + "loss": 0.4076, + "step": 9980 + }, + { + "epoch": 2.0518038852913967, + "grad_norm": 0.21514415740966797, + "learning_rate": 2.158088700234861e-05, + "loss": 0.391, + "step": 9981 + }, + { + "epoch": 2.0520094562647753, + "grad_norm": 0.12512782216072083, + "learning_rate": 2.1572358659179968e-05, + "loss": 0.4546, + "step": 9982 + }, + { + "epoch": 2.052215027238154, + "grad_norm": 0.217271625995636, + "learning_rate": 2.1563831470181714e-05, + "loss": 0.392, + "step": 9983 + }, + { + "epoch": 2.0524205982115324, + "grad_norm": 0.12956684827804565, + "learning_rate": 2.155530543577394e-05, + "loss": 0.4561, + "step": 9984 + }, + { + "epoch": 2.052626169184911, + "grad_norm": 0.2247815728187561, + "learning_rate": 2.1546780556376692e-05, + "loss": 0.401, + "step": 9985 + }, + { + "epoch": 2.0528317401582896, + "grad_norm": 0.22784893214702606, + "learning_rate": 2.1538256832409923e-05, + "loss": 0.3878, + "step": 9986 + }, + { + "epoch": 2.053037311131668, + "grad_norm": 0.22039231657981873, + "learning_rate": 2.1529734264293597e-05, + "loss": 0.4089, + "step": 9987 + }, + { + "epoch": 2.053242882105047, + "grad_norm": 0.22087042033672333, + "learning_rate": 2.152121285244757e-05, + "loss": 0.4153, + "step": 9988 + }, + { + "epoch": 2.0534484530784254, + "grad_norm": 0.15735208988189697, + "learning_rate": 2.1512692597291642e-05, + "loss": 0.4635, + "step": 9989 + }, + { + "epoch": 2.053654024051804, + "grad_norm": 0.22379711270332336, + "learning_rate": 2.1504173499245572e-05, + "loss": 0.4056, + "step": 9990 + }, + { + "epoch": 2.0538595950251826, + "grad_norm": 0.22105872631072998, + "learning_rate": 2.1495655558729053e-05, + "loss": 0.407, + "step": 9991 + }, + { + "epoch": 2.054065165998561, + "grad_norm": 0.2312091439962387, + "learning_rate": 2.1487138776161708e-05, + "loss": 0.3885, + "step": 9992 + }, + { + "epoch": 2.0542707369719397, + "grad_norm": 0.22999829053878784, + "learning_rate": 2.1478623151963156e-05, + "loss": 0.3916, + "step": 9993 + }, + { + "epoch": 2.0544763079453183, + "grad_norm": 0.2265433371067047, + "learning_rate": 2.14701086865529e-05, + "loss": 0.3997, + "step": 9994 + }, + { + "epoch": 2.0546818789186965, + "grad_norm": 0.21633121371269226, + "learning_rate": 2.1461595380350395e-05, + "loss": 0.3746, + "step": 9995 + }, + { + "epoch": 2.054887449892075, + "grad_norm": 0.22249945998191833, + "learning_rate": 2.1453083233775083e-05, + "loss": 0.3946, + "step": 9996 + }, + { + "epoch": 2.0550930208654536, + "grad_norm": 0.22257232666015625, + "learning_rate": 2.1444572247246306e-05, + "loss": 0.4039, + "step": 9997 + }, + { + "epoch": 2.0552985918388322, + "grad_norm": 0.1395193338394165, + "learning_rate": 2.143606242118335e-05, + "loss": 0.4434, + "step": 9998 + }, + { + "epoch": 2.055504162812211, + "grad_norm": 0.22854886949062347, + "learning_rate": 2.1427553756005467e-05, + "loss": 0.409, + "step": 9999 + }, + { + "epoch": 2.0557097337855894, + "grad_norm": 0.23623695969581604, + "learning_rate": 2.1419046252131813e-05, + "loss": 0.3945, + "step": 10000 + }, + { + "epoch": 2.055915304758968, + "grad_norm": 0.22533413767814636, + "learning_rate": 2.1410539909981554e-05, + "loss": 0.4078, + "step": 10001 + }, + { + "epoch": 2.0561208757323466, + "grad_norm": 0.21484293043613434, + "learning_rate": 2.1402034729973735e-05, + "loss": 0.3971, + "step": 10002 + }, + { + "epoch": 2.056326446705725, + "grad_norm": 0.12295730412006378, + "learning_rate": 2.1393530712527364e-05, + "loss": 0.4583, + "step": 10003 + }, + { + "epoch": 2.0565320176791038, + "grad_norm": 0.21692106127738953, + "learning_rate": 2.1385027858061404e-05, + "loss": 0.3951, + "step": 10004 + }, + { + "epoch": 2.0567375886524824, + "grad_norm": 0.23760221898555756, + "learning_rate": 2.137652616699474e-05, + "loss": 0.4146, + "step": 10005 + }, + { + "epoch": 2.056943159625861, + "grad_norm": 0.2326803058385849, + "learning_rate": 2.1368025639746222e-05, + "loss": 0.3751, + "step": 10006 + }, + { + "epoch": 2.0571487305992395, + "grad_norm": 0.12141763418912888, + "learning_rate": 2.13595262767346e-05, + "loss": 0.4688, + "step": 10007 + }, + { + "epoch": 2.057354301572618, + "grad_norm": 0.1330864131450653, + "learning_rate": 2.135102807837865e-05, + "loss": 0.463, + "step": 10008 + }, + { + "epoch": 2.0575598725459967, + "grad_norm": 0.12697000801563263, + "learning_rate": 2.1342531045097006e-05, + "loss": 0.4498, + "step": 10009 + }, + { + "epoch": 2.057765443519375, + "grad_norm": 0.12423637509346008, + "learning_rate": 2.1334035177308284e-05, + "loss": 0.4417, + "step": 10010 + }, + { + "epoch": 2.0579710144927534, + "grad_norm": 0.23774953186511993, + "learning_rate": 2.1325540475431032e-05, + "loss": 0.4171, + "step": 10011 + }, + { + "epoch": 2.058176585466132, + "grad_norm": 0.12215947359800339, + "learning_rate": 2.131704693988375e-05, + "loss": 0.4431, + "step": 10012 + }, + { + "epoch": 2.0583821564395106, + "grad_norm": 0.22526676952838898, + "learning_rate": 2.130855457108485e-05, + "loss": 0.4086, + "step": 10013 + }, + { + "epoch": 2.058587727412889, + "grad_norm": 0.2246025949716568, + "learning_rate": 2.1300063369452754e-05, + "loss": 0.3882, + "step": 10014 + }, + { + "epoch": 2.058793298386268, + "grad_norm": 0.22365763783454895, + "learning_rate": 2.1291573335405763e-05, + "loss": 0.3854, + "step": 10015 + }, + { + "epoch": 2.0589988693596464, + "grad_norm": 0.2273135632276535, + "learning_rate": 2.1283084469362117e-05, + "loss": 0.4483, + "step": 10016 + }, + { + "epoch": 2.059204440333025, + "grad_norm": 0.2241649329662323, + "learning_rate": 2.1274596771740074e-05, + "loss": 0.4028, + "step": 10017 + }, + { + "epoch": 2.0594100113064036, + "grad_norm": 0.1520613133907318, + "learning_rate": 2.1266110242957747e-05, + "loss": 0.4413, + "step": 10018 + }, + { + "epoch": 2.059615582279782, + "grad_norm": 0.22100979089736938, + "learning_rate": 2.125762488343324e-05, + "loss": 0.4095, + "step": 10019 + }, + { + "epoch": 2.0598211532531607, + "grad_norm": 0.22822456061840057, + "learning_rate": 2.1249140693584583e-05, + "loss": 0.4182, + "step": 10020 + }, + { + "epoch": 2.0600267242265393, + "grad_norm": 0.22433196008205414, + "learning_rate": 2.1240657673829736e-05, + "loss": 0.3938, + "step": 10021 + }, + { + "epoch": 2.060232295199918, + "grad_norm": 0.2217511087656021, + "learning_rate": 2.1232175824586653e-05, + "loss": 0.3991, + "step": 10022 + }, + { + "epoch": 2.0604378661732965, + "grad_norm": 0.2158900797367096, + "learning_rate": 2.1223695146273172e-05, + "loss": 0.3928, + "step": 10023 + }, + { + "epoch": 2.060643437146675, + "grad_norm": 0.22462232410907745, + "learning_rate": 2.1215215639307106e-05, + "loss": 0.4, + "step": 10024 + }, + { + "epoch": 2.0608490081200532, + "grad_norm": 0.235184445977211, + "learning_rate": 2.1206737304106196e-05, + "loss": 0.4223, + "step": 10025 + }, + { + "epoch": 2.061054579093432, + "grad_norm": 0.23646195232868195, + "learning_rate": 2.1198260141088127e-05, + "loss": 0.377, + "step": 10026 + }, + { + "epoch": 2.0612601500668104, + "grad_norm": 0.23219510912895203, + "learning_rate": 2.1189784150670534e-05, + "loss": 0.4182, + "step": 10027 + }, + { + "epoch": 2.061465721040189, + "grad_norm": 0.22460506856441498, + "learning_rate": 2.1181309333270966e-05, + "loss": 0.3969, + "step": 10028 + }, + { + "epoch": 2.0616712920135676, + "grad_norm": 0.2338314950466156, + "learning_rate": 2.1172835689306973e-05, + "loss": 0.3975, + "step": 10029 + }, + { + "epoch": 2.061876862986946, + "grad_norm": 0.22709804773330688, + "learning_rate": 2.116436321919601e-05, + "loss": 0.4034, + "step": 10030 + }, + { + "epoch": 2.0620824339603248, + "grad_norm": 0.2227647751569748, + "learning_rate": 2.115589192335545e-05, + "loss": 0.3797, + "step": 10031 + }, + { + "epoch": 2.0622880049337033, + "grad_norm": 0.2209719717502594, + "learning_rate": 2.1147421802202655e-05, + "loss": 0.3913, + "step": 10032 + }, + { + "epoch": 2.062493575907082, + "grad_norm": 0.2267482727766037, + "learning_rate": 2.1138952856154907e-05, + "loss": 0.4176, + "step": 10033 + }, + { + "epoch": 2.0626991468804605, + "grad_norm": 0.22682222723960876, + "learning_rate": 2.1130485085629413e-05, + "loss": 0.4015, + "step": 10034 + }, + { + "epoch": 2.062904717853839, + "grad_norm": 0.23114748299121857, + "learning_rate": 2.1122018491043344e-05, + "loss": 0.3889, + "step": 10035 + }, + { + "epoch": 2.0631102888272177, + "grad_norm": 0.22637394070625305, + "learning_rate": 2.1113553072813834e-05, + "loss": 0.4254, + "step": 10036 + }, + { + "epoch": 2.0633158598005963, + "grad_norm": 0.2336263358592987, + "learning_rate": 2.1105088831357904e-05, + "loss": 0.4082, + "step": 10037 + }, + { + "epoch": 2.063521430773975, + "grad_norm": 0.6490523815155029, + "learning_rate": 2.1096625767092575e-05, + "loss": 0.438, + "step": 10038 + }, + { + "epoch": 2.0637270017473535, + "grad_norm": 0.22613218426704407, + "learning_rate": 2.108816388043477e-05, + "loss": 0.3998, + "step": 10039 + }, + { + "epoch": 2.0639325727207316, + "grad_norm": 0.23520736396312714, + "learning_rate": 2.1079703171801374e-05, + "loss": 0.4229, + "step": 10040 + }, + { + "epoch": 2.06413814369411, + "grad_norm": 0.22257588803768158, + "learning_rate": 2.1071243641609196e-05, + "loss": 0.3859, + "step": 10041 + }, + { + "epoch": 2.064343714667489, + "grad_norm": 0.22676822543144226, + "learning_rate": 2.106278529027498e-05, + "loss": 0.3839, + "step": 10042 + }, + { + "epoch": 2.0645492856408674, + "grad_norm": 0.22315295040607452, + "learning_rate": 2.1054328118215475e-05, + "loss": 0.3921, + "step": 10043 + }, + { + "epoch": 2.064754856614246, + "grad_norm": 0.22379836440086365, + "learning_rate": 2.1045872125847298e-05, + "loss": 0.3746, + "step": 10044 + }, + { + "epoch": 2.0649604275876245, + "grad_norm": 0.21513979136943817, + "learning_rate": 2.103741731358704e-05, + "loss": 0.393, + "step": 10045 + }, + { + "epoch": 2.065165998561003, + "grad_norm": 0.24278521537780762, + "learning_rate": 2.102896368185123e-05, + "loss": 0.4002, + "step": 10046 + }, + { + "epoch": 2.0653715695343817, + "grad_norm": 0.1328233927488327, + "learning_rate": 2.1020511231056337e-05, + "loss": 0.4638, + "step": 10047 + }, + { + "epoch": 2.0655771405077603, + "grad_norm": 0.23675784468650818, + "learning_rate": 2.101205996161876e-05, + "loss": 0.3953, + "step": 10048 + }, + { + "epoch": 2.065782711481139, + "grad_norm": 0.22523106634616852, + "learning_rate": 2.1003609873954888e-05, + "loss": 0.4019, + "step": 10049 + }, + { + "epoch": 2.0659882824545175, + "grad_norm": 0.12683424353599548, + "learning_rate": 2.0995160968480998e-05, + "loss": 0.4565, + "step": 10050 + }, + { + "epoch": 2.066193853427896, + "grad_norm": 0.22555489838123322, + "learning_rate": 2.098671324561333e-05, + "loss": 0.4062, + "step": 10051 + }, + { + "epoch": 2.0663994244012747, + "grad_norm": 0.23419348895549774, + "learning_rate": 2.0978266705768064e-05, + "loss": 0.4253, + "step": 10052 + }, + { + "epoch": 2.0666049953746533, + "grad_norm": 0.2320510447025299, + "learning_rate": 2.0969821349361312e-05, + "loss": 0.4052, + "step": 10053 + }, + { + "epoch": 2.066810566348032, + "grad_norm": 0.2119479775428772, + "learning_rate": 2.0961377176809152e-05, + "loss": 0.3983, + "step": 10054 + }, + { + "epoch": 2.0670161373214104, + "grad_norm": 0.21941865980625153, + "learning_rate": 2.0952934188527566e-05, + "loss": 0.3949, + "step": 10055 + }, + { + "epoch": 2.0672217082947886, + "grad_norm": 0.1271030455827713, + "learning_rate": 2.094449238493253e-05, + "loss": 0.4436, + "step": 10056 + }, + { + "epoch": 2.067427279268167, + "grad_norm": 0.22050043940544128, + "learning_rate": 2.093605176643992e-05, + "loss": 0.4041, + "step": 10057 + }, + { + "epoch": 2.0676328502415457, + "grad_norm": 0.22902661561965942, + "learning_rate": 2.0927612333465567e-05, + "loss": 0.4003, + "step": 10058 + }, + { + "epoch": 2.0678384212149243, + "grad_norm": 0.2170822024345398, + "learning_rate": 2.091917408642522e-05, + "loss": 0.391, + "step": 10059 + }, + { + "epoch": 2.068043992188303, + "grad_norm": 0.2229936420917511, + "learning_rate": 2.0910737025734634e-05, + "loss": 0.403, + "step": 10060 + }, + { + "epoch": 2.0682495631616815, + "grad_norm": 0.2259387969970703, + "learning_rate": 2.090230115180944e-05, + "loss": 0.3887, + "step": 10061 + }, + { + "epoch": 2.06845513413506, + "grad_norm": 0.22917728126049042, + "learning_rate": 2.0893866465065215e-05, + "loss": 0.4047, + "step": 10062 + }, + { + "epoch": 2.0686607051084387, + "grad_norm": 0.22916476428508759, + "learning_rate": 2.088543296591754e-05, + "loss": 0.3906, + "step": 10063 + }, + { + "epoch": 2.0688662760818173, + "grad_norm": 0.22529999911785126, + "learning_rate": 2.087700065478187e-05, + "loss": 0.4009, + "step": 10064 + }, + { + "epoch": 2.069071847055196, + "grad_norm": 0.22376291453838348, + "learning_rate": 2.0868569532073623e-05, + "loss": 0.4003, + "step": 10065 + }, + { + "epoch": 2.0692774180285745, + "grad_norm": 0.21545644104480743, + "learning_rate": 2.0860139598208166e-05, + "loss": 0.4031, + "step": 10066 + }, + { + "epoch": 2.069482989001953, + "grad_norm": 0.1322476714849472, + "learning_rate": 2.0851710853600806e-05, + "loss": 0.4664, + "step": 10067 + }, + { + "epoch": 2.0696885599753316, + "grad_norm": 0.22991631925106049, + "learning_rate": 2.0843283298666783e-05, + "loss": 0.4024, + "step": 10068 + }, + { + "epoch": 2.06989413094871, + "grad_norm": 0.22085146605968475, + "learning_rate": 2.0834856933821267e-05, + "loss": 0.3827, + "step": 10069 + }, + { + "epoch": 2.070099701922089, + "grad_norm": 0.1257437914609909, + "learning_rate": 2.0826431759479416e-05, + "loss": 0.4524, + "step": 10070 + }, + { + "epoch": 2.070305272895467, + "grad_norm": 0.1249329000711441, + "learning_rate": 2.081800777605628e-05, + "loss": 0.4446, + "step": 10071 + }, + { + "epoch": 2.0705108438688455, + "grad_norm": 0.12916463613510132, + "learning_rate": 2.0809584983966886e-05, + "loss": 0.4477, + "step": 10072 + }, + { + "epoch": 2.070716414842224, + "grad_norm": 0.22638201713562012, + "learning_rate": 2.080116338362617e-05, + "loss": 0.3862, + "step": 10073 + }, + { + "epoch": 2.0709219858156027, + "grad_norm": 0.21907664835453033, + "learning_rate": 2.0792742975449027e-05, + "loss": 0.3962, + "step": 10074 + }, + { + "epoch": 2.0711275567889813, + "grad_norm": 0.12063062191009521, + "learning_rate": 2.0784323759850295e-05, + "loss": 0.4442, + "step": 10075 + }, + { + "epoch": 2.07133312776236, + "grad_norm": 0.22785618901252747, + "learning_rate": 2.0775905737244727e-05, + "loss": 0.4005, + "step": 10076 + }, + { + "epoch": 2.0715386987357385, + "grad_norm": 0.2289772778749466, + "learning_rate": 2.076748890804708e-05, + "loss": 0.4268, + "step": 10077 + }, + { + "epoch": 2.071744269709117, + "grad_norm": 0.22283616662025452, + "learning_rate": 2.0759073272671997e-05, + "loss": 0.4003, + "step": 10078 + }, + { + "epoch": 2.0719498406824957, + "grad_norm": 0.23021160066127777, + "learning_rate": 2.0750658831534067e-05, + "loss": 0.3948, + "step": 10079 + }, + { + "epoch": 2.0721554116558742, + "grad_norm": 0.22141693532466888, + "learning_rate": 2.0742245585047817e-05, + "loss": 0.4089, + "step": 10080 + }, + { + "epoch": 2.072360982629253, + "grad_norm": 0.2241126000881195, + "learning_rate": 2.0733833533627767e-05, + "loss": 0.3935, + "step": 10081 + }, + { + "epoch": 2.0725665536026314, + "grad_norm": 0.1273168921470642, + "learning_rate": 2.0725422677688313e-05, + "loss": 0.4539, + "step": 10082 + }, + { + "epoch": 2.07277212457601, + "grad_norm": 0.2204464226961136, + "learning_rate": 2.0717013017643815e-05, + "loss": 0.4002, + "step": 10083 + }, + { + "epoch": 2.0729776955493886, + "grad_norm": 0.22708940505981445, + "learning_rate": 2.0708604553908598e-05, + "loss": 0.4088, + "step": 10084 + }, + { + "epoch": 2.073183266522767, + "grad_norm": 0.23681271076202393, + "learning_rate": 2.07001972868969e-05, + "loss": 0.4163, + "step": 10085 + }, + { + "epoch": 2.0733888374961453, + "grad_norm": 0.22358982264995575, + "learning_rate": 2.0691791217022905e-05, + "loss": 0.4071, + "step": 10086 + }, + { + "epoch": 2.073594408469524, + "grad_norm": 0.2268630564212799, + "learning_rate": 2.068338634470074e-05, + "loss": 0.4045, + "step": 10087 + }, + { + "epoch": 2.0737999794429025, + "grad_norm": 0.22552597522735596, + "learning_rate": 2.0674982670344475e-05, + "loss": 0.4144, + "step": 10088 + }, + { + "epoch": 2.074005550416281, + "grad_norm": 0.22645661234855652, + "learning_rate": 2.0666580194368117e-05, + "loss": 0.4017, + "step": 10089 + }, + { + "epoch": 2.0742111213896597, + "grad_norm": 0.2267918735742569, + "learning_rate": 2.0658178917185603e-05, + "loss": 0.3807, + "step": 10090 + }, + { + "epoch": 2.0744166923630383, + "grad_norm": 0.2314879298210144, + "learning_rate": 2.0649778839210855e-05, + "loss": 0.405, + "step": 10091 + }, + { + "epoch": 2.074622263336417, + "grad_norm": 0.22707362473011017, + "learning_rate": 2.0641379960857693e-05, + "loss": 0.4071, + "step": 10092 + }, + { + "epoch": 2.0748278343097954, + "grad_norm": 0.22233855724334717, + "learning_rate": 2.0632982282539892e-05, + "loss": 0.409, + "step": 10093 + }, + { + "epoch": 2.075033405283174, + "grad_norm": 0.2284967601299286, + "learning_rate": 2.0624585804671157e-05, + "loss": 0.3873, + "step": 10094 + }, + { + "epoch": 2.0752389762565526, + "grad_norm": 0.22250832617282867, + "learning_rate": 2.0616190527665155e-05, + "loss": 0.4054, + "step": 10095 + }, + { + "epoch": 2.075444547229931, + "grad_norm": 0.2331288605928421, + "learning_rate": 2.0607796451935468e-05, + "loss": 0.3975, + "step": 10096 + }, + { + "epoch": 2.07565011820331, + "grad_norm": 0.2304941862821579, + "learning_rate": 2.059940357789563e-05, + "loss": 0.3924, + "step": 10097 + }, + { + "epoch": 2.0758556891766884, + "grad_norm": 0.2210913896560669, + "learning_rate": 2.0591011905959142e-05, + "loss": 0.383, + "step": 10098 + }, + { + "epoch": 2.076061260150067, + "grad_norm": 0.22776024043560028, + "learning_rate": 2.0582621436539415e-05, + "loss": 0.4058, + "step": 10099 + }, + { + "epoch": 2.0762668311234456, + "grad_norm": 0.21400035917758942, + "learning_rate": 2.0574232170049804e-05, + "loss": 0.3827, + "step": 10100 + }, + { + "epoch": 2.076472402096824, + "grad_norm": 0.2280118465423584, + "learning_rate": 2.0565844106903584e-05, + "loss": 0.4127, + "step": 10101 + }, + { + "epoch": 2.0766779730702023, + "grad_norm": 0.2156902402639389, + "learning_rate": 2.0557457247514045e-05, + "loss": 0.4023, + "step": 10102 + }, + { + "epoch": 2.076883544043581, + "grad_norm": 0.22840487957000732, + "learning_rate": 2.0549071592294338e-05, + "loss": 0.411, + "step": 10103 + }, + { + "epoch": 2.0770891150169595, + "grad_norm": 0.22176077961921692, + "learning_rate": 2.0540687141657576e-05, + "loss": 0.3836, + "step": 10104 + }, + { + "epoch": 2.077294685990338, + "grad_norm": 0.2274215966463089, + "learning_rate": 2.053230389601685e-05, + "loss": 0.4141, + "step": 10105 + }, + { + "epoch": 2.0775002569637167, + "grad_norm": 0.2207675725221634, + "learning_rate": 2.052392185578515e-05, + "loss": 0.3992, + "step": 10106 + }, + { + "epoch": 2.0777058279370952, + "grad_norm": 0.23283138871192932, + "learning_rate": 2.051554102137542e-05, + "loss": 0.3971, + "step": 10107 + }, + { + "epoch": 2.077911398910474, + "grad_norm": 0.24435223639011383, + "learning_rate": 2.0507161393200547e-05, + "loss": 0.3989, + "step": 10108 + }, + { + "epoch": 2.0781169698838524, + "grad_norm": 0.22710062563419342, + "learning_rate": 2.0498782971673353e-05, + "loss": 0.3999, + "step": 10109 + }, + { + "epoch": 2.078322540857231, + "grad_norm": 0.22904515266418457, + "learning_rate": 2.0490405757206597e-05, + "loss": 0.3923, + "step": 10110 + }, + { + "epoch": 2.0785281118306096, + "grad_norm": 0.13042299449443817, + "learning_rate": 2.0482029750212982e-05, + "loss": 0.4425, + "step": 10111 + }, + { + "epoch": 2.078733682803988, + "grad_norm": 0.13699179887771606, + "learning_rate": 2.0473654951105176e-05, + "loss": 0.4472, + "step": 10112 + }, + { + "epoch": 2.0789392537773668, + "grad_norm": 0.228811115026474, + "learning_rate": 2.046528136029576e-05, + "loss": 0.4027, + "step": 10113 + }, + { + "epoch": 2.0791448247507454, + "grad_norm": 0.21991683542728424, + "learning_rate": 2.0456908978197252e-05, + "loss": 0.3894, + "step": 10114 + }, + { + "epoch": 2.079350395724124, + "grad_norm": 0.2570091485977173, + "learning_rate": 2.0448537805222124e-05, + "loss": 0.3982, + "step": 10115 + }, + { + "epoch": 2.0795559666975025, + "grad_norm": 0.22256134450435638, + "learning_rate": 2.0440167841782787e-05, + "loss": 0.387, + "step": 10116 + }, + { + "epoch": 2.0797615376708807, + "grad_norm": 0.22056585550308228, + "learning_rate": 2.0431799088291588e-05, + "loss": 0.3988, + "step": 10117 + }, + { + "epoch": 2.0799671086442593, + "grad_norm": 0.21491390466690063, + "learning_rate": 2.04234315451608e-05, + "loss": 0.4163, + "step": 10118 + }, + { + "epoch": 2.080172679617638, + "grad_norm": 0.21639686822891235, + "learning_rate": 2.0415065212802687e-05, + "loss": 0.3965, + "step": 10119 + }, + { + "epoch": 2.0803782505910164, + "grad_norm": 0.2295675426721573, + "learning_rate": 2.04067000916294e-05, + "loss": 0.3914, + "step": 10120 + }, + { + "epoch": 2.080583821564395, + "grad_norm": 0.12334294617176056, + "learning_rate": 2.039833618205305e-05, + "loss": 0.4585, + "step": 10121 + }, + { + "epoch": 2.0807893925377736, + "grad_norm": 0.221688911318779, + "learning_rate": 2.0389973484485674e-05, + "loss": 0.3932, + "step": 10122 + }, + { + "epoch": 2.080994963511152, + "grad_norm": 0.22646862268447876, + "learning_rate": 2.0381611999339288e-05, + "loss": 0.3961, + "step": 10123 + }, + { + "epoch": 2.081200534484531, + "grad_norm": 0.12576346099376678, + "learning_rate": 2.037325172702582e-05, + "loss": 0.4689, + "step": 10124 + }, + { + "epoch": 2.0814061054579094, + "grad_norm": 0.2239895462989807, + "learning_rate": 2.0364892667957114e-05, + "loss": 0.3882, + "step": 10125 + }, + { + "epoch": 2.081611676431288, + "grad_norm": 0.2263174057006836, + "learning_rate": 2.035653482254502e-05, + "loss": 0.4017, + "step": 10126 + }, + { + "epoch": 2.0818172474046666, + "grad_norm": 0.22486351430416107, + "learning_rate": 2.034817819120127e-05, + "loss": 0.3867, + "step": 10127 + }, + { + "epoch": 2.082022818378045, + "grad_norm": 0.12152829766273499, + "learning_rate": 2.0339822774337562e-05, + "loss": 0.467, + "step": 10128 + }, + { + "epoch": 2.0822283893514237, + "grad_norm": 0.2354230135679245, + "learning_rate": 2.0331468572365525e-05, + "loss": 0.4021, + "step": 10129 + }, + { + "epoch": 2.0824339603248023, + "grad_norm": 0.23212236166000366, + "learning_rate": 2.0323115585696726e-05, + "loss": 0.3827, + "step": 10130 + }, + { + "epoch": 2.082639531298181, + "grad_norm": 0.2421758621931076, + "learning_rate": 2.031476381474267e-05, + "loss": 0.3984, + "step": 10131 + }, + { + "epoch": 2.082845102271559, + "grad_norm": 0.21930502355098724, + "learning_rate": 2.0306413259914836e-05, + "loss": 0.3948, + "step": 10132 + }, + { + "epoch": 2.0830506732449376, + "grad_norm": 0.2258896678686142, + "learning_rate": 2.0298063921624603e-05, + "loss": 0.3935, + "step": 10133 + }, + { + "epoch": 2.0832562442183162, + "grad_norm": 0.2229209989309311, + "learning_rate": 2.0289715800283306e-05, + "loss": 0.3999, + "step": 10134 + }, + { + "epoch": 2.083461815191695, + "grad_norm": 0.22103843092918396, + "learning_rate": 2.0281368896302212e-05, + "loss": 0.3988, + "step": 10135 + }, + { + "epoch": 2.0836673861650734, + "grad_norm": 0.22075578570365906, + "learning_rate": 2.0273023210092543e-05, + "loss": 0.394, + "step": 10136 + }, + { + "epoch": 2.083872957138452, + "grad_norm": 0.22386351227760315, + "learning_rate": 2.026467874206545e-05, + "loss": 0.3929, + "step": 10137 + }, + { + "epoch": 2.0840785281118306, + "grad_norm": 0.22971957921981812, + "learning_rate": 2.0256335492631997e-05, + "loss": 0.402, + "step": 10138 + }, + { + "epoch": 2.084284099085209, + "grad_norm": 0.2303125262260437, + "learning_rate": 2.024799346220326e-05, + "loss": 0.3955, + "step": 10139 + }, + { + "epoch": 2.0844896700585878, + "grad_norm": 0.23009240627288818, + "learning_rate": 2.0239652651190203e-05, + "loss": 0.3969, + "step": 10140 + }, + { + "epoch": 2.0846952410319664, + "grad_norm": 0.22021248936653137, + "learning_rate": 2.0231313060003725e-05, + "loss": 0.4248, + "step": 10141 + }, + { + "epoch": 2.084900812005345, + "grad_norm": 0.2278214991092682, + "learning_rate": 2.0222974689054684e-05, + "loss": 0.4051, + "step": 10142 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 0.2289620041847229, + "learning_rate": 2.0214637538753872e-05, + "loss": 0.3883, + "step": 10143 + }, + { + "epoch": 2.085311953952102, + "grad_norm": 0.21833720803260803, + "learning_rate": 2.0206301609512006e-05, + "loss": 0.3837, + "step": 10144 + }, + { + "epoch": 2.0855175249254807, + "grad_norm": 0.225063756108284, + "learning_rate": 2.0197966901739792e-05, + "loss": 0.4063, + "step": 10145 + }, + { + "epoch": 2.0857230958988593, + "grad_norm": 0.22253869473934174, + "learning_rate": 2.0189633415847808e-05, + "loss": 0.3882, + "step": 10146 + }, + { + "epoch": 2.0859286668722374, + "grad_norm": 0.22410309314727783, + "learning_rate": 2.0181301152246636e-05, + "loss": 0.4163, + "step": 10147 + }, + { + "epoch": 2.086134237845616, + "grad_norm": 0.2274530529975891, + "learning_rate": 2.0172970111346756e-05, + "loss": 0.405, + "step": 10148 + }, + { + "epoch": 2.0863398088189946, + "grad_norm": 0.12682540714740753, + "learning_rate": 2.01646402935586e-05, + "loss": 0.4591, + "step": 10149 + }, + { + "epoch": 2.086545379792373, + "grad_norm": 0.22507302463054657, + "learning_rate": 2.015631169929253e-05, + "loss": 0.4096, + "step": 10150 + }, + { + "epoch": 2.086750950765752, + "grad_norm": 0.2234722524881363, + "learning_rate": 2.014798432895887e-05, + "loss": 0.3815, + "step": 10151 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.231471449136734, + "learning_rate": 2.0139658182967842e-05, + "loss": 0.4016, + "step": 10152 + }, + { + "epoch": 2.087162092712509, + "grad_norm": 0.2210719883441925, + "learning_rate": 2.0131333261729683e-05, + "loss": 0.3896, + "step": 10153 + }, + { + "epoch": 2.0873676636858876, + "grad_norm": 0.22725726664066315, + "learning_rate": 2.012300956565449e-05, + "loss": 0.3893, + "step": 10154 + }, + { + "epoch": 2.087573234659266, + "grad_norm": 0.21838733553886414, + "learning_rate": 2.011468709515234e-05, + "loss": 0.3981, + "step": 10155 + }, + { + "epoch": 2.0877788056326447, + "grad_norm": 0.22779439389705658, + "learning_rate": 2.010636585063325e-05, + "loss": 0.4055, + "step": 10156 + }, + { + "epoch": 2.0879843766060233, + "grad_norm": 0.2215360850095749, + "learning_rate": 2.009804583250716e-05, + "loss": 0.3861, + "step": 10157 + }, + { + "epoch": 2.088189947579402, + "grad_norm": 0.22047077119350433, + "learning_rate": 2.008972704118396e-05, + "loss": 0.3813, + "step": 10158 + }, + { + "epoch": 2.0883955185527805, + "grad_norm": 0.22012098133563995, + "learning_rate": 2.008140947707346e-05, + "loss": 0.4157, + "step": 10159 + }, + { + "epoch": 2.088601089526159, + "grad_norm": 0.22172100841999054, + "learning_rate": 2.0073093140585463e-05, + "loss": 0.4031, + "step": 10160 + }, + { + "epoch": 2.0888066604995377, + "grad_norm": 0.2272823601961136, + "learning_rate": 2.0064778032129662e-05, + "loss": 0.4071, + "step": 10161 + }, + { + "epoch": 2.089012231472916, + "grad_norm": 0.22149771451950073, + "learning_rate": 2.0056464152115694e-05, + "loss": 0.3809, + "step": 10162 + }, + { + "epoch": 2.0892178024462944, + "grad_norm": 0.1271590292453766, + "learning_rate": 2.004815150095316e-05, + "loss": 0.4552, + "step": 10163 + }, + { + "epoch": 2.089423373419673, + "grad_norm": 0.21896865963935852, + "learning_rate": 2.003984007905157e-05, + "loss": 0.3918, + "step": 10164 + }, + { + "epoch": 2.0896289443930516, + "grad_norm": 0.1302296221256256, + "learning_rate": 2.003152988682038e-05, + "loss": 0.4527, + "step": 10165 + }, + { + "epoch": 2.08983451536643, + "grad_norm": 0.2259882539510727, + "learning_rate": 2.002322092466903e-05, + "loss": 0.3874, + "step": 10166 + }, + { + "epoch": 2.0900400863398088, + "grad_norm": 0.22086426615715027, + "learning_rate": 2.001491319300684e-05, + "loss": 0.3821, + "step": 10167 + }, + { + "epoch": 2.0902456573131873, + "grad_norm": 0.1255428045988083, + "learning_rate": 2.0006606692243083e-05, + "loss": 0.4736, + "step": 10168 + }, + { + "epoch": 2.090451228286566, + "grad_norm": 0.21999509632587433, + "learning_rate": 1.9998301422787013e-05, + "loss": 0.3945, + "step": 10169 + }, + { + "epoch": 2.0906567992599445, + "grad_norm": 0.22573313117027283, + "learning_rate": 1.9989997385047776e-05, + "loss": 0.4072, + "step": 10170 + }, + { + "epoch": 2.090862370233323, + "grad_norm": 0.13236026465892792, + "learning_rate": 1.9981694579434462e-05, + "loss": 0.4539, + "step": 10171 + }, + { + "epoch": 2.0910679412067017, + "grad_norm": 0.22156447172164917, + "learning_rate": 1.997339300635613e-05, + "loss": 0.3903, + "step": 10172 + }, + { + "epoch": 2.0912735121800803, + "grad_norm": 0.22315345704555511, + "learning_rate": 1.996509266622173e-05, + "loss": 0.4011, + "step": 10173 + }, + { + "epoch": 2.091479083153459, + "grad_norm": 0.210943341255188, + "learning_rate": 1.9956793559440223e-05, + "loss": 0.4072, + "step": 10174 + }, + { + "epoch": 2.0916846541268375, + "grad_norm": 0.21411919593811035, + "learning_rate": 1.994849568642044e-05, + "loss": 0.3907, + "step": 10175 + }, + { + "epoch": 2.091890225100216, + "grad_norm": 0.21381069719791412, + "learning_rate": 1.9940199047571183e-05, + "loss": 0.3825, + "step": 10176 + }, + { + "epoch": 2.092095796073594, + "grad_norm": 0.22450023889541626, + "learning_rate": 1.9931903643301194e-05, + "loss": 0.4092, + "step": 10177 + }, + { + "epoch": 2.092301367046973, + "grad_norm": 0.22319452464580536, + "learning_rate": 1.9923609474019144e-05, + "loss": 0.3992, + "step": 10178 + }, + { + "epoch": 2.0925069380203514, + "grad_norm": 0.22775287926197052, + "learning_rate": 1.9915316540133648e-05, + "loss": 0.4082, + "step": 10179 + }, + { + "epoch": 2.09271250899373, + "grad_norm": 0.22660957276821136, + "learning_rate": 1.990702484205324e-05, + "loss": 0.4158, + "step": 10180 + }, + { + "epoch": 2.0929180799671085, + "grad_norm": 0.21837587654590607, + "learning_rate": 1.9898734380186455e-05, + "loss": 0.4005, + "step": 10181 + }, + { + "epoch": 2.093123650940487, + "grad_norm": 0.22248844802379608, + "learning_rate": 1.98904451549417e-05, + "loss": 0.4156, + "step": 10182 + }, + { + "epoch": 2.0933292219138657, + "grad_norm": 0.2321203351020813, + "learning_rate": 1.988215716672736e-05, + "loss": 0.3997, + "step": 10183 + }, + { + "epoch": 2.0935347928872443, + "grad_norm": 0.22257383167743683, + "learning_rate": 1.9873870415951728e-05, + "loss": 0.4017, + "step": 10184 + }, + { + "epoch": 2.093740363860623, + "grad_norm": 0.22243481874465942, + "learning_rate": 1.986558490302306e-05, + "loss": 0.3935, + "step": 10185 + }, + { + "epoch": 2.0939459348340015, + "grad_norm": 0.2248910516500473, + "learning_rate": 1.9857300628349532e-05, + "loss": 0.3968, + "step": 10186 + }, + { + "epoch": 2.09415150580738, + "grad_norm": 0.22491000592708588, + "learning_rate": 1.98490175923393e-05, + "loss": 0.3925, + "step": 10187 + }, + { + "epoch": 2.0943570767807587, + "grad_norm": 0.22509317100048065, + "learning_rate": 1.9840735795400418e-05, + "loss": 0.4006, + "step": 10188 + }, + { + "epoch": 2.0945626477541373, + "grad_norm": 0.2187567800283432, + "learning_rate": 1.9832455237940873e-05, + "loss": 0.4097, + "step": 10189 + }, + { + "epoch": 2.094768218727516, + "grad_norm": 0.12614451348781586, + "learning_rate": 1.9824175920368644e-05, + "loss": 0.4585, + "step": 10190 + }, + { + "epoch": 2.0949737897008944, + "grad_norm": 0.2270839512348175, + "learning_rate": 1.981589784309159e-05, + "loss": 0.4005, + "step": 10191 + }, + { + "epoch": 2.0951793606742726, + "grad_norm": 0.22762618958950043, + "learning_rate": 1.9807621006517543e-05, + "loss": 0.386, + "step": 10192 + }, + { + "epoch": 2.095384931647651, + "grad_norm": 0.23058810830116272, + "learning_rate": 1.9799345411054263e-05, + "loss": 0.3889, + "step": 10193 + }, + { + "epoch": 2.0955905026210297, + "grad_norm": 0.22418002784252167, + "learning_rate": 1.9791071057109426e-05, + "loss": 0.3864, + "step": 10194 + }, + { + "epoch": 2.0957960735944083, + "grad_norm": 0.23092950880527496, + "learning_rate": 1.9782797945090707e-05, + "loss": 0.4238, + "step": 10195 + }, + { + "epoch": 2.096001644567787, + "grad_norm": 0.2287166863679886, + "learning_rate": 1.977452607540567e-05, + "loss": 0.3985, + "step": 10196 + }, + { + "epoch": 2.0962072155411655, + "grad_norm": 0.22596527636051178, + "learning_rate": 1.9766255448461836e-05, + "loss": 0.4052, + "step": 10197 + }, + { + "epoch": 2.096412786514544, + "grad_norm": 0.1205587163567543, + "learning_rate": 1.9757986064666647e-05, + "loss": 0.4629, + "step": 10198 + }, + { + "epoch": 2.0966183574879227, + "grad_norm": 0.2247573435306549, + "learning_rate": 1.9749717924427508e-05, + "loss": 0.389, + "step": 10199 + }, + { + "epoch": 2.0968239284613013, + "grad_norm": 0.12539906799793243, + "learning_rate": 1.9741451028151723e-05, + "loss": 0.4471, + "step": 10200 + }, + { + "epoch": 2.09702949943468, + "grad_norm": 0.22345465421676636, + "learning_rate": 1.9733185376246612e-05, + "loss": 0.3977, + "step": 10201 + }, + { + "epoch": 2.0972350704080585, + "grad_norm": 0.21945199370384216, + "learning_rate": 1.9724920969119356e-05, + "loss": 0.3732, + "step": 10202 + }, + { + "epoch": 2.097440641381437, + "grad_norm": 0.2249259501695633, + "learning_rate": 1.9716657807177112e-05, + "loss": 0.3822, + "step": 10203 + }, + { + "epoch": 2.0976462123548156, + "grad_norm": 0.22166243195533752, + "learning_rate": 1.9708395890826962e-05, + "loss": 0.3932, + "step": 10204 + }, + { + "epoch": 2.097851783328194, + "grad_norm": 0.22853392362594604, + "learning_rate": 1.9700135220475934e-05, + "loss": 0.4078, + "step": 10205 + }, + { + "epoch": 2.098057354301573, + "grad_norm": 0.22204367816448212, + "learning_rate": 1.969187579653099e-05, + "loss": 0.3897, + "step": 10206 + }, + { + "epoch": 2.098262925274951, + "grad_norm": 0.21821551024913788, + "learning_rate": 1.968361761939902e-05, + "loss": 0.4099, + "step": 10207 + }, + { + "epoch": 2.0984684962483295, + "grad_norm": 0.21198779344558716, + "learning_rate": 1.96753606894869e-05, + "loss": 0.4046, + "step": 10208 + }, + { + "epoch": 2.098674067221708, + "grad_norm": 0.22482189536094666, + "learning_rate": 1.966710500720139e-05, + "loss": 0.4052, + "step": 10209 + }, + { + "epoch": 2.0988796381950867, + "grad_norm": 0.22468796372413635, + "learning_rate": 1.9658850572949195e-05, + "loss": 0.3828, + "step": 10210 + }, + { + "epoch": 2.0990852091684653, + "grad_norm": 0.1260218471288681, + "learning_rate": 1.9650597387137008e-05, + "loss": 0.4485, + "step": 10211 + }, + { + "epoch": 2.099290780141844, + "grad_norm": 0.23379628360271454, + "learning_rate": 1.96423454501714e-05, + "loss": 0.4066, + "step": 10212 + }, + { + "epoch": 2.0994963511152225, + "grad_norm": 0.22664855420589447, + "learning_rate": 1.9634094762458916e-05, + "loss": 0.4069, + "step": 10213 + }, + { + "epoch": 2.099701922088601, + "grad_norm": 0.23156146705150604, + "learning_rate": 1.9625845324406e-05, + "loss": 0.4082, + "step": 10214 + }, + { + "epoch": 2.0999074930619797, + "grad_norm": 0.12826383113861084, + "learning_rate": 1.9617597136419107e-05, + "loss": 0.4626, + "step": 10215 + }, + { + "epoch": 2.1001130640353582, + "grad_norm": 0.2237342894077301, + "learning_rate": 1.960935019890456e-05, + "loss": 0.4013, + "step": 10216 + }, + { + "epoch": 2.100318635008737, + "grad_norm": 0.1258496642112732, + "learning_rate": 1.960110451226866e-05, + "loss": 0.4512, + "step": 10217 + }, + { + "epoch": 2.1005242059821154, + "grad_norm": 0.23888415098190308, + "learning_rate": 1.9592860076917626e-05, + "loss": 0.4139, + "step": 10218 + }, + { + "epoch": 2.100729776955494, + "grad_norm": 0.12545999884605408, + "learning_rate": 1.9584616893257618e-05, + "loss": 0.4433, + "step": 10219 + }, + { + "epoch": 2.1009353479288726, + "grad_norm": 0.22233633697032928, + "learning_rate": 1.9576374961694747e-05, + "loss": 0.4026, + "step": 10220 + }, + { + "epoch": 2.101140918902251, + "grad_norm": 0.218837171792984, + "learning_rate": 1.956813428263504e-05, + "loss": 0.3964, + "step": 10221 + }, + { + "epoch": 2.1013464898756298, + "grad_norm": 0.22407136857509613, + "learning_rate": 1.9559894856484503e-05, + "loss": 0.3996, + "step": 10222 + }, + { + "epoch": 2.101552060849008, + "grad_norm": 0.22463653981685638, + "learning_rate": 1.9551656683649034e-05, + "loss": 0.3896, + "step": 10223 + }, + { + "epoch": 2.1017576318223865, + "grad_norm": 0.22171586751937866, + "learning_rate": 1.95434197645345e-05, + "loss": 0.3992, + "step": 10224 + }, + { + "epoch": 2.101963202795765, + "grad_norm": 0.2200179100036621, + "learning_rate": 1.9535184099546695e-05, + "loss": 0.4082, + "step": 10225 + }, + { + "epoch": 2.1021687737691437, + "grad_norm": 0.11994064599275589, + "learning_rate": 1.952694968909134e-05, + "loss": 0.4613, + "step": 10226 + }, + { + "epoch": 2.1023743447425223, + "grad_norm": 0.23581825196743011, + "learning_rate": 1.9518716533574114e-05, + "loss": 0.4014, + "step": 10227 + }, + { + "epoch": 2.102579915715901, + "grad_norm": 0.2292327582836151, + "learning_rate": 1.9510484633400608e-05, + "loss": 0.3876, + "step": 10228 + }, + { + "epoch": 2.1027854866892794, + "grad_norm": 0.22429661452770233, + "learning_rate": 1.9502253988976407e-05, + "loss": 0.3974, + "step": 10229 + }, + { + "epoch": 2.102991057662658, + "grad_norm": 0.1279507577419281, + "learning_rate": 1.9494024600706973e-05, + "loss": 0.458, + "step": 10230 + }, + { + "epoch": 2.1031966286360366, + "grad_norm": 0.14293161034584045, + "learning_rate": 1.9485796468997733e-05, + "loss": 0.4781, + "step": 10231 + }, + { + "epoch": 2.103402199609415, + "grad_norm": 2.012324571609497, + "learning_rate": 1.947756959425403e-05, + "loss": 0.417, + "step": 10232 + }, + { + "epoch": 2.103607770582794, + "grad_norm": 0.2304982990026474, + "learning_rate": 1.94693439768812e-05, + "loss": 0.4055, + "step": 10233 + }, + { + "epoch": 2.1038133415561724, + "grad_norm": 0.23225271701812744, + "learning_rate": 1.946111961728446e-05, + "loss": 0.4127, + "step": 10234 + }, + { + "epoch": 2.104018912529551, + "grad_norm": 0.2252538651227951, + "learning_rate": 1.9452896515868974e-05, + "loss": 0.3986, + "step": 10235 + }, + { + "epoch": 2.1042244835029296, + "grad_norm": 0.2263312041759491, + "learning_rate": 1.9444674673039884e-05, + "loss": 0.3912, + "step": 10236 + }, + { + "epoch": 2.104430054476308, + "grad_norm": 0.13533739745616913, + "learning_rate": 1.9436454089202226e-05, + "loss": 0.4608, + "step": 10237 + }, + { + "epoch": 2.1046356254496863, + "grad_norm": 0.22458425164222717, + "learning_rate": 1.9428234764760997e-05, + "loss": 0.4091, + "step": 10238 + }, + { + "epoch": 2.104841196423065, + "grad_norm": 0.23281507194042206, + "learning_rate": 1.9420016700121114e-05, + "loss": 0.4005, + "step": 10239 + }, + { + "epoch": 2.1050467673964435, + "grad_norm": 0.13586680591106415, + "learning_rate": 1.941179989568745e-05, + "loss": 0.4477, + "step": 10240 + }, + { + "epoch": 2.105252338369822, + "grad_norm": 0.23734326660633087, + "learning_rate": 1.9403584351864806e-05, + "loss": 0.4007, + "step": 10241 + }, + { + "epoch": 2.1054579093432007, + "grad_norm": 0.14735311269760132, + "learning_rate": 1.9395370069057907e-05, + "loss": 0.437, + "step": 10242 + }, + { + "epoch": 2.1056634803165792, + "grad_norm": 0.22844909131526947, + "learning_rate": 1.9387157047671467e-05, + "loss": 0.3974, + "step": 10243 + }, + { + "epoch": 2.105869051289958, + "grad_norm": 0.23056820034980774, + "learning_rate": 1.9378945288110086e-05, + "loss": 0.4101, + "step": 10244 + }, + { + "epoch": 2.1060746222633364, + "grad_norm": 0.3185328543186188, + "learning_rate": 1.937073479077831e-05, + "loss": 0.4083, + "step": 10245 + }, + { + "epoch": 2.106280193236715, + "grad_norm": 0.22201012074947357, + "learning_rate": 1.9362525556080648e-05, + "loss": 0.3922, + "step": 10246 + }, + { + "epoch": 2.1064857642100936, + "grad_norm": 0.2239248901605606, + "learning_rate": 1.935431758442152e-05, + "loss": 0.3834, + "step": 10247 + }, + { + "epoch": 2.106691335183472, + "grad_norm": 0.23866835236549377, + "learning_rate": 1.93461108762053e-05, + "loss": 0.4183, + "step": 10248 + }, + { + "epoch": 2.1068969061568508, + "grad_norm": 0.22566145658493042, + "learning_rate": 1.933790543183627e-05, + "loss": 0.3999, + "step": 10249 + }, + { + "epoch": 2.1071024771302294, + "grad_norm": 0.12605851888656616, + "learning_rate": 1.9329701251718715e-05, + "loss": 0.4435, + "step": 10250 + }, + { + "epoch": 2.107308048103608, + "grad_norm": 0.2505359947681427, + "learning_rate": 1.9321498336256792e-05, + "loss": 0.3997, + "step": 10251 + }, + { + "epoch": 2.1075136190769865, + "grad_norm": 0.22146162390708923, + "learning_rate": 1.9313296685854628e-05, + "loss": 0.3939, + "step": 10252 + }, + { + "epoch": 2.1077191900503647, + "grad_norm": 0.23050841689109802, + "learning_rate": 1.9305096300916266e-05, + "loss": 0.4322, + "step": 10253 + }, + { + "epoch": 2.1079247610237433, + "grad_norm": 0.12850402295589447, + "learning_rate": 1.929689718184572e-05, + "loss": 0.4712, + "step": 10254 + }, + { + "epoch": 2.108130331997122, + "grad_norm": 0.22386091947555542, + "learning_rate": 1.9288699329046917e-05, + "loss": 0.3985, + "step": 10255 + }, + { + "epoch": 2.1083359029705004, + "grad_norm": 0.21962010860443115, + "learning_rate": 1.9280502742923706e-05, + "loss": 0.3824, + "step": 10256 + }, + { + "epoch": 2.108541473943879, + "grad_norm": 0.2260153442621231, + "learning_rate": 1.927230742387993e-05, + "loss": 0.3941, + "step": 10257 + }, + { + "epoch": 2.1087470449172576, + "grad_norm": 0.22931505739688873, + "learning_rate": 1.926411337231932e-05, + "loss": 0.3826, + "step": 10258 + }, + { + "epoch": 2.108952615890636, + "grad_norm": 0.22665323317050934, + "learning_rate": 1.9255920588645544e-05, + "loss": 0.3905, + "step": 10259 + }, + { + "epoch": 2.109158186864015, + "grad_norm": 0.12365376204252243, + "learning_rate": 1.924772907326224e-05, + "loss": 0.4274, + "step": 10260 + }, + { + "epoch": 2.1093637578373934, + "grad_norm": 0.12186730653047562, + "learning_rate": 1.923953882657296e-05, + "loss": 0.4518, + "step": 10261 + }, + { + "epoch": 2.109569328810772, + "grad_norm": 0.2289813756942749, + "learning_rate": 1.9231349848981198e-05, + "loss": 0.4068, + "step": 10262 + }, + { + "epoch": 2.1097748997841506, + "grad_norm": 0.13003799319267273, + "learning_rate": 1.922316214089037e-05, + "loss": 0.4646, + "step": 10263 + }, + { + "epoch": 2.109980470757529, + "grad_norm": 0.22192876040935516, + "learning_rate": 1.921497570270388e-05, + "loss": 0.3899, + "step": 10264 + }, + { + "epoch": 2.1101860417309077, + "grad_norm": 0.24169708788394928, + "learning_rate": 1.9206790534825012e-05, + "loss": 0.3991, + "step": 10265 + }, + { + "epoch": 2.1103916127042863, + "grad_norm": 0.12277937680482864, + "learning_rate": 1.919860663765702e-05, + "loss": 0.4448, + "step": 10266 + }, + { + "epoch": 2.110597183677665, + "grad_norm": 0.22904759645462036, + "learning_rate": 1.919042401160309e-05, + "loss": 0.3916, + "step": 10267 + }, + { + "epoch": 2.1108027546510435, + "grad_norm": 0.22709167003631592, + "learning_rate": 1.9182242657066326e-05, + "loss": 0.3872, + "step": 10268 + }, + { + "epoch": 2.1110083256244216, + "grad_norm": 0.229196235537529, + "learning_rate": 1.9174062574449796e-05, + "loss": 0.4137, + "step": 10269 + }, + { + "epoch": 2.1112138965978002, + "grad_norm": 0.2226954847574234, + "learning_rate": 1.916588376415648e-05, + "loss": 0.3845, + "step": 10270 + }, + { + "epoch": 2.111419467571179, + "grad_norm": 0.21747919917106628, + "learning_rate": 1.915770622658934e-05, + "loss": 0.404, + "step": 10271 + }, + { + "epoch": 2.1116250385445574, + "grad_norm": 0.22931161522865295, + "learning_rate": 1.9149529962151223e-05, + "loss": 0.4024, + "step": 10272 + }, + { + "epoch": 2.111830609517936, + "grad_norm": 0.23304495215415955, + "learning_rate": 1.9141354971244945e-05, + "loss": 0.3922, + "step": 10273 + }, + { + "epoch": 2.1120361804913146, + "grad_norm": 0.2212096005678177, + "learning_rate": 1.9133181254273226e-05, + "loss": 0.4006, + "step": 10274 + }, + { + "epoch": 2.112241751464693, + "grad_norm": 0.13250161707401276, + "learning_rate": 1.912500881163878e-05, + "loss": 0.4599, + "step": 10275 + }, + { + "epoch": 2.1124473224380718, + "grad_norm": 0.22626225650310516, + "learning_rate": 1.911683764374421e-05, + "loss": 0.4085, + "step": 10276 + }, + { + "epoch": 2.1126528934114503, + "grad_norm": 0.12602867186069489, + "learning_rate": 1.9108667750992057e-05, + "loss": 0.4627, + "step": 10277 + }, + { + "epoch": 2.112858464384829, + "grad_norm": 0.23502740263938904, + "learning_rate": 1.9100499133784848e-05, + "loss": 0.4113, + "step": 10278 + }, + { + "epoch": 2.1130640353582075, + "grad_norm": 0.24214279651641846, + "learning_rate": 1.9092331792524986e-05, + "loss": 0.3842, + "step": 10279 + }, + { + "epoch": 2.113269606331586, + "grad_norm": 0.23367543518543243, + "learning_rate": 1.908416572761485e-05, + "loss": 0.3974, + "step": 10280 + }, + { + "epoch": 2.1134751773049647, + "grad_norm": 0.2227569818496704, + "learning_rate": 1.907600093945674e-05, + "loss": 0.4011, + "step": 10281 + }, + { + "epoch": 2.1136807482783433, + "grad_norm": 0.222117081284523, + "learning_rate": 1.906783742845289e-05, + "loss": 0.4013, + "step": 10282 + }, + { + "epoch": 2.113886319251722, + "grad_norm": 0.13070207834243774, + "learning_rate": 1.9059675195005468e-05, + "loss": 0.4754, + "step": 10283 + }, + { + "epoch": 2.1140918902251, + "grad_norm": 0.23519377410411835, + "learning_rate": 1.905151423951662e-05, + "loss": 0.4061, + "step": 10284 + }, + { + "epoch": 2.1142974611984786, + "grad_norm": 0.16713115572929382, + "learning_rate": 1.9043354562388385e-05, + "loss": 0.4556, + "step": 10285 + }, + { + "epoch": 2.114503032171857, + "grad_norm": 0.12903447449207306, + "learning_rate": 1.903519616402275e-05, + "loss": 0.4728, + "step": 10286 + }, + { + "epoch": 2.114708603145236, + "grad_norm": 0.22464367747306824, + "learning_rate": 1.9027039044821635e-05, + "loss": 0.4061, + "step": 10287 + }, + { + "epoch": 2.1149141741186144, + "grad_norm": 0.21755559742450714, + "learning_rate": 1.9018883205186913e-05, + "loss": 0.3932, + "step": 10288 + }, + { + "epoch": 2.115119745091993, + "grad_norm": 0.133756622672081, + "learning_rate": 1.901072864552038e-05, + "loss": 0.457, + "step": 10289 + }, + { + "epoch": 2.1153253160653716, + "grad_norm": 0.23208466172218323, + "learning_rate": 1.9002575366223756e-05, + "loss": 0.4064, + "step": 10290 + }, + { + "epoch": 2.11553088703875, + "grad_norm": 0.13155633211135864, + "learning_rate": 1.8994423367698753e-05, + "loss": 0.4419, + "step": 10291 + }, + { + "epoch": 2.1157364580121287, + "grad_norm": 0.2295832335948944, + "learning_rate": 1.8986272650346955e-05, + "loss": 0.3953, + "step": 10292 + }, + { + "epoch": 2.1159420289855073, + "grad_norm": 0.2247355431318283, + "learning_rate": 1.8978123214569915e-05, + "loss": 0.3978, + "step": 10293 + }, + { + "epoch": 2.116147599958886, + "grad_norm": 0.22480376064777374, + "learning_rate": 1.8969975060769123e-05, + "loss": 0.4201, + "step": 10294 + }, + { + "epoch": 2.1163531709322645, + "grad_norm": 0.12718500196933746, + "learning_rate": 1.896182818934598e-05, + "loss": 0.4484, + "step": 10295 + }, + { + "epoch": 2.116558741905643, + "grad_norm": 0.2338053286075592, + "learning_rate": 1.8953682600701873e-05, + "loss": 0.4009, + "step": 10296 + }, + { + "epoch": 2.1167643128790217, + "grad_norm": 0.23438557982444763, + "learning_rate": 1.894553829523808e-05, + "loss": 0.3935, + "step": 10297 + }, + { + "epoch": 2.1169698838524003, + "grad_norm": 0.2157134860754013, + "learning_rate": 1.8937395273355834e-05, + "loss": 0.3973, + "step": 10298 + }, + { + "epoch": 2.1171754548257784, + "grad_norm": 0.2266354262828827, + "learning_rate": 1.8929253535456313e-05, + "loss": 0.406, + "step": 10299 + }, + { + "epoch": 2.117381025799157, + "grad_norm": 0.2172161191701889, + "learning_rate": 1.8921113081940612e-05, + "loss": 0.3979, + "step": 10300 + }, + { + "epoch": 2.1175865967725356, + "grad_norm": 0.22894109785556793, + "learning_rate": 1.8912973913209784e-05, + "loss": 0.4039, + "step": 10301 + }, + { + "epoch": 2.117792167745914, + "grad_norm": 0.218611940741539, + "learning_rate": 1.8904836029664802e-05, + "loss": 0.3832, + "step": 10302 + }, + { + "epoch": 2.1179977387192928, + "grad_norm": 0.22846931219100952, + "learning_rate": 1.8896699431706573e-05, + "loss": 0.4059, + "step": 10303 + }, + { + "epoch": 2.1182033096926713, + "grad_norm": 0.22411483526229858, + "learning_rate": 1.888856411973595e-05, + "loss": 0.3933, + "step": 10304 + }, + { + "epoch": 2.11840888066605, + "grad_norm": 0.22474461793899536, + "learning_rate": 1.8880430094153738e-05, + "loss": 0.4027, + "step": 10305 + }, + { + "epoch": 2.1186144516394285, + "grad_norm": 0.22956325113773346, + "learning_rate": 1.8872297355360653e-05, + "loss": 0.397, + "step": 10306 + }, + { + "epoch": 2.118820022612807, + "grad_norm": 0.23198306560516357, + "learning_rate": 1.886416590375736e-05, + "loss": 0.41, + "step": 10307 + }, + { + "epoch": 2.1190255935861857, + "grad_norm": 0.22490225732326508, + "learning_rate": 1.8856035739744447e-05, + "loss": 0.396, + "step": 10308 + }, + { + "epoch": 2.1192311645595643, + "grad_norm": 0.23693934082984924, + "learning_rate": 1.8847906863722467e-05, + "loss": 0.4054, + "step": 10309 + }, + { + "epoch": 2.119436735532943, + "grad_norm": 0.22398188710212708, + "learning_rate": 1.8839779276091875e-05, + "loss": 0.399, + "step": 10310 + }, + { + "epoch": 2.1196423065063215, + "grad_norm": 0.23093253374099731, + "learning_rate": 1.883165297725307e-05, + "loss": 0.4094, + "step": 10311 + }, + { + "epoch": 2.1198478774797, + "grad_norm": 0.22496986389160156, + "learning_rate": 1.8823527967606428e-05, + "loss": 0.3819, + "step": 10312 + }, + { + "epoch": 2.1200534484530786, + "grad_norm": 0.22796480357646942, + "learning_rate": 1.8815404247552213e-05, + "loss": 0.3996, + "step": 10313 + }, + { + "epoch": 2.120259019426457, + "grad_norm": 0.22607813775539398, + "learning_rate": 1.8807281817490647e-05, + "loss": 0.3882, + "step": 10314 + }, + { + "epoch": 2.1204645903998354, + "grad_norm": 0.2205992192029953, + "learning_rate": 1.8799160677821882e-05, + "loss": 0.3846, + "step": 10315 + }, + { + "epoch": 2.120670161373214, + "grad_norm": 0.12466558814048767, + "learning_rate": 1.879104082894601e-05, + "loss": 0.4445, + "step": 10316 + }, + { + "epoch": 2.1208757323465925, + "grad_norm": 0.12291921675205231, + "learning_rate": 1.8782922271263033e-05, + "loss": 0.4429, + "step": 10317 + }, + { + "epoch": 2.121081303319971, + "grad_norm": 0.22178338468074799, + "learning_rate": 1.8774805005172958e-05, + "loss": 0.3842, + "step": 10318 + }, + { + "epoch": 2.1212868742933497, + "grad_norm": 0.22737297415733337, + "learning_rate": 1.8766689031075644e-05, + "loss": 0.3988, + "step": 10319 + }, + { + "epoch": 2.1214924452667283, + "grad_norm": 0.12307467311620712, + "learning_rate": 1.875857434937097e-05, + "loss": 0.4426, + "step": 10320 + }, + { + "epoch": 2.121698016240107, + "grad_norm": 0.21922807395458221, + "learning_rate": 1.8750460960458682e-05, + "loss": 0.4063, + "step": 10321 + }, + { + "epoch": 2.1219035872134855, + "grad_norm": 0.12798526883125305, + "learning_rate": 1.8742348864738494e-05, + "loss": 0.4517, + "step": 10322 + }, + { + "epoch": 2.122109158186864, + "grad_norm": 0.12603412568569183, + "learning_rate": 1.8734238062610044e-05, + "loss": 0.4614, + "step": 10323 + }, + { + "epoch": 2.1223147291602427, + "grad_norm": 0.22325001657009125, + "learning_rate": 1.8726128554472924e-05, + "loss": 0.3954, + "step": 10324 + }, + { + "epoch": 2.1225203001336213, + "grad_norm": 0.2292872816324234, + "learning_rate": 1.8718020340726634e-05, + "loss": 0.3985, + "step": 10325 + }, + { + "epoch": 2.122725871107, + "grad_norm": 0.23180240392684937, + "learning_rate": 1.8709913421770648e-05, + "loss": 0.4131, + "step": 10326 + }, + { + "epoch": 2.1229314420803784, + "grad_norm": 0.12431956827640533, + "learning_rate": 1.870180779800435e-05, + "loss": 0.4345, + "step": 10327 + }, + { + "epoch": 2.123137013053757, + "grad_norm": 0.13498254120349884, + "learning_rate": 1.8693703469827067e-05, + "loss": 0.4681, + "step": 10328 + }, + { + "epoch": 2.123342584027135, + "grad_norm": 0.12030383944511414, + "learning_rate": 1.8685600437638057e-05, + "loss": 0.4469, + "step": 10329 + }, + { + "epoch": 2.1235481550005137, + "grad_norm": 0.22829271852970123, + "learning_rate": 1.867749870183652e-05, + "loss": 0.3874, + "step": 10330 + }, + { + "epoch": 2.1237537259738923, + "grad_norm": 0.21957705914974213, + "learning_rate": 1.8669398262821593e-05, + "loss": 0.3904, + "step": 10331 + }, + { + "epoch": 2.123959296947271, + "grad_norm": 0.22618070244789124, + "learning_rate": 1.8661299120992332e-05, + "loss": 0.4029, + "step": 10332 + }, + { + "epoch": 2.1241648679206495, + "grad_norm": 0.2359391301870346, + "learning_rate": 1.8653201276747767e-05, + "loss": 0.4119, + "step": 10333 + }, + { + "epoch": 2.124370438894028, + "grad_norm": 0.21867458522319794, + "learning_rate": 1.8645104730486828e-05, + "loss": 0.3953, + "step": 10334 + }, + { + "epoch": 2.1245760098674067, + "grad_norm": 0.22511562705039978, + "learning_rate": 1.86370094826084e-05, + "loss": 0.3824, + "step": 10335 + }, + { + "epoch": 2.1247815808407853, + "grad_norm": 0.12738649547100067, + "learning_rate": 1.8628915533511296e-05, + "loss": 0.4281, + "step": 10336 + }, + { + "epoch": 2.124987151814164, + "grad_norm": 0.22026711702346802, + "learning_rate": 1.8620822883594267e-05, + "loss": 0.3925, + "step": 10337 + }, + { + "epoch": 2.1251927227875425, + "grad_norm": 0.22602379322052002, + "learning_rate": 1.8612731533255976e-05, + "loss": 0.3959, + "step": 10338 + }, + { + "epoch": 2.125398293760921, + "grad_norm": 0.22942064702510834, + "learning_rate": 1.860464148289509e-05, + "loss": 0.4084, + "step": 10339 + }, + { + "epoch": 2.1256038647342996, + "grad_norm": 0.22742587327957153, + "learning_rate": 1.8596552732910148e-05, + "loss": 0.4137, + "step": 10340 + }, + { + "epoch": 2.125809435707678, + "grad_norm": 0.12401507049798965, + "learning_rate": 1.8588465283699622e-05, + "loss": 0.434, + "step": 10341 + }, + { + "epoch": 2.126015006681057, + "grad_norm": 0.21955260634422302, + "learning_rate": 1.858037913566198e-05, + "loss": 0.4068, + "step": 10342 + }, + { + "epoch": 2.1262205776544354, + "grad_norm": 0.1239282488822937, + "learning_rate": 1.8572294289195576e-05, + "loss": 0.4364, + "step": 10343 + }, + { + "epoch": 2.1264261486278135, + "grad_norm": 0.2231699824333191, + "learning_rate": 1.8564210744698707e-05, + "loss": 0.3928, + "step": 10344 + }, + { + "epoch": 2.126631719601192, + "grad_norm": 0.12479789555072784, + "learning_rate": 1.8556128502569618e-05, + "loss": 0.4482, + "step": 10345 + }, + { + "epoch": 2.1268372905745707, + "grad_norm": 0.2382933497428894, + "learning_rate": 1.8548047563206465e-05, + "loss": 0.4012, + "step": 10346 + }, + { + "epoch": 2.1270428615479493, + "grad_norm": 0.23470161855220795, + "learning_rate": 1.853996792700738e-05, + "loss": 0.3967, + "step": 10347 + }, + { + "epoch": 2.127248432521328, + "grad_norm": 0.22285513579845428, + "learning_rate": 1.8531889594370406e-05, + "loss": 0.4076, + "step": 10348 + }, + { + "epoch": 2.1274540034947065, + "grad_norm": 0.23410557210445404, + "learning_rate": 1.8523812565693522e-05, + "loss": 0.4086, + "step": 10349 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.2240322232246399, + "learning_rate": 1.8515736841374643e-05, + "loss": 0.4091, + "step": 10350 + }, + { + "epoch": 2.1278651454414637, + "grad_norm": 0.22299052774906158, + "learning_rate": 1.8507662421811618e-05, + "loss": 0.3762, + "step": 10351 + }, + { + "epoch": 2.1280707164148422, + "grad_norm": 0.23107583820819855, + "learning_rate": 1.8499589307402244e-05, + "loss": 0.3983, + "step": 10352 + }, + { + "epoch": 2.128276287388221, + "grad_norm": 0.22548127174377441, + "learning_rate": 1.8491517498544227e-05, + "loss": 0.4028, + "step": 10353 + }, + { + "epoch": 2.1284818583615994, + "grad_norm": 0.2753831744194031, + "learning_rate": 1.848344699563526e-05, + "loss": 0.423, + "step": 10354 + }, + { + "epoch": 2.128687429334978, + "grad_norm": 0.22851170599460602, + "learning_rate": 1.847537779907292e-05, + "loss": 0.3987, + "step": 10355 + }, + { + "epoch": 2.1288930003083566, + "grad_norm": 0.2307175248861313, + "learning_rate": 1.8467309909254737e-05, + "loss": 0.4081, + "step": 10356 + }, + { + "epoch": 2.129098571281735, + "grad_norm": 0.2297874242067337, + "learning_rate": 1.8459243326578183e-05, + "loss": 0.406, + "step": 10357 + }, + { + "epoch": 2.1293041422551138, + "grad_norm": 0.12997567653656006, + "learning_rate": 1.845117805144066e-05, + "loss": 0.436, + "step": 10358 + }, + { + "epoch": 2.129509713228492, + "grad_norm": 0.133535698056221, + "learning_rate": 1.844311408423949e-05, + "loss": 0.4471, + "step": 10359 + }, + { + "epoch": 2.1297152842018705, + "grad_norm": 0.1276518702507019, + "learning_rate": 1.843505142537198e-05, + "loss": 0.4424, + "step": 10360 + }, + { + "epoch": 2.129920855175249, + "grad_norm": 0.12228768318891525, + "learning_rate": 1.842699007523532e-05, + "loss": 0.4467, + "step": 10361 + }, + { + "epoch": 2.1301264261486277, + "grad_norm": 0.2285146862268448, + "learning_rate": 1.841893003422664e-05, + "loss": 0.4019, + "step": 10362 + }, + { + "epoch": 2.1303319971220063, + "grad_norm": 0.22430342435836792, + "learning_rate": 1.8410871302743054e-05, + "loss": 0.4207, + "step": 10363 + }, + { + "epoch": 2.130537568095385, + "grad_norm": 0.214961439371109, + "learning_rate": 1.8402813881181563e-05, + "loss": 0.3986, + "step": 10364 + }, + { + "epoch": 2.1307431390687634, + "grad_norm": 0.23033976554870605, + "learning_rate": 1.8394757769939117e-05, + "loss": 0.3853, + "step": 10365 + }, + { + "epoch": 2.130948710042142, + "grad_norm": 0.12601739168167114, + "learning_rate": 1.8386702969412583e-05, + "loss": 0.438, + "step": 10366 + }, + { + "epoch": 2.1311542810155206, + "grad_norm": 0.23412902653217316, + "learning_rate": 1.8378649479998827e-05, + "loss": 0.3996, + "step": 10367 + }, + { + "epoch": 2.131359851988899, + "grad_norm": 0.2274925261735916, + "learning_rate": 1.8370597302094577e-05, + "loss": 0.388, + "step": 10368 + }, + { + "epoch": 2.131565422962278, + "grad_norm": 0.1298505961894989, + "learning_rate": 1.8362546436096537e-05, + "loss": 0.4471, + "step": 10369 + }, + { + "epoch": 2.1317709939356564, + "grad_norm": 0.23555243015289307, + "learning_rate": 1.8354496882401327e-05, + "loss": 0.3892, + "step": 10370 + }, + { + "epoch": 2.131976564909035, + "grad_norm": 0.2312725931406021, + "learning_rate": 1.8346448641405517e-05, + "loss": 0.386, + "step": 10371 + }, + { + "epoch": 2.1321821358824136, + "grad_norm": 0.22951969504356384, + "learning_rate": 1.8338401713505603e-05, + "loss": 0.407, + "step": 10372 + }, + { + "epoch": 2.132387706855792, + "grad_norm": 0.22569020092487335, + "learning_rate": 1.8330356099098006e-05, + "loss": 0.3961, + "step": 10373 + }, + { + "epoch": 2.1325932778291703, + "grad_norm": 0.2186949998140335, + "learning_rate": 1.8322311798579125e-05, + "loss": 0.3827, + "step": 10374 + }, + { + "epoch": 2.132798848802549, + "grad_norm": 0.22884011268615723, + "learning_rate": 1.8314268812345248e-05, + "loss": 0.3973, + "step": 10375 + }, + { + "epoch": 2.1330044197759275, + "grad_norm": 0.1293371617794037, + "learning_rate": 1.8306227140792622e-05, + "loss": 0.4564, + "step": 10376 + }, + { + "epoch": 2.133209990749306, + "grad_norm": 0.22477327287197113, + "learning_rate": 1.829818678431742e-05, + "loss": 0.3865, + "step": 10377 + }, + { + "epoch": 2.1334155617226847, + "grad_norm": 0.22367890179157257, + "learning_rate": 1.8290147743315746e-05, + "loss": 0.3733, + "step": 10378 + }, + { + "epoch": 2.1336211326960632, + "grad_norm": 0.23502875864505768, + "learning_rate": 1.8282110018183656e-05, + "loss": 0.4037, + "step": 10379 + }, + { + "epoch": 2.133826703669442, + "grad_norm": 0.12768757343292236, + "learning_rate": 1.8274073609317106e-05, + "loss": 0.4562, + "step": 10380 + }, + { + "epoch": 2.1340322746428204, + "grad_norm": 0.23585356771945953, + "learning_rate": 1.826603851711205e-05, + "loss": 0.3938, + "step": 10381 + }, + { + "epoch": 2.134237845616199, + "grad_norm": 0.23149564862251282, + "learning_rate": 1.825800474196432e-05, + "loss": 0.3848, + "step": 10382 + }, + { + "epoch": 2.1344434165895776, + "grad_norm": 0.23342165350914001, + "learning_rate": 1.824997228426969e-05, + "loss": 0.4179, + "step": 10383 + }, + { + "epoch": 2.134648987562956, + "grad_norm": 0.2237035036087036, + "learning_rate": 1.8241941144423916e-05, + "loss": 0.4023, + "step": 10384 + }, + { + "epoch": 2.1348545585363348, + "grad_norm": 0.2252335101366043, + "learning_rate": 1.8233911322822632e-05, + "loss": 0.3956, + "step": 10385 + }, + { + "epoch": 2.1350601295097134, + "grad_norm": 0.2148154377937317, + "learning_rate": 1.822588281986143e-05, + "loss": 0.3835, + "step": 10386 + }, + { + "epoch": 2.135265700483092, + "grad_norm": 0.11948797851800919, + "learning_rate": 1.8217855635935827e-05, + "loss": 0.4476, + "step": 10387 + }, + { + "epoch": 2.1354712714564705, + "grad_norm": 0.22916093468666077, + "learning_rate": 1.8209829771441314e-05, + "loss": 0.3903, + "step": 10388 + }, + { + "epoch": 2.1356768424298487, + "grad_norm": 0.21855413913726807, + "learning_rate": 1.820180522677327e-05, + "loss": 0.3972, + "step": 10389 + }, + { + "epoch": 2.1358824134032273, + "grad_norm": 0.13248126208782196, + "learning_rate": 1.819378200232703e-05, + "loss": 0.4453, + "step": 10390 + }, + { + "epoch": 2.136087984376606, + "grad_norm": 0.22880522906780243, + "learning_rate": 1.818576009849786e-05, + "loss": 0.3987, + "step": 10391 + }, + { + "epoch": 2.1362935553499844, + "grad_norm": 0.24837420880794525, + "learning_rate": 1.8177739515680953e-05, + "loss": 0.3857, + "step": 10392 + }, + { + "epoch": 2.136499126323363, + "grad_norm": 0.23082508146762848, + "learning_rate": 1.816972025427146e-05, + "loss": 0.421, + "step": 10393 + }, + { + "epoch": 2.1367046972967416, + "grad_norm": 0.1307905912399292, + "learning_rate": 1.8161702314664423e-05, + "loss": 0.4584, + "step": 10394 + }, + { + "epoch": 2.13691026827012, + "grad_norm": 0.23677071928977966, + "learning_rate": 1.815368569725489e-05, + "loss": 0.4082, + "step": 10395 + }, + { + "epoch": 2.137115839243499, + "grad_norm": 0.1245460957288742, + "learning_rate": 1.8145670402437787e-05, + "loss": 0.4332, + "step": 10396 + }, + { + "epoch": 2.1373214102168774, + "grad_norm": 0.2281726449728012, + "learning_rate": 1.8137656430607986e-05, + "loss": 0.3907, + "step": 10397 + }, + { + "epoch": 2.137526981190256, + "grad_norm": 0.11899819225072861, + "learning_rate": 1.8129643782160294e-05, + "loss": 0.449, + "step": 10398 + }, + { + "epoch": 2.1377325521636346, + "grad_norm": 0.23056533932685852, + "learning_rate": 1.8121632457489465e-05, + "loss": 0.4015, + "step": 10399 + }, + { + "epoch": 2.137938123137013, + "grad_norm": 0.2260628640651703, + "learning_rate": 1.8113622456990175e-05, + "loss": 0.3938, + "step": 10400 + }, + { + "epoch": 2.1381436941103917, + "grad_norm": 0.22494405508041382, + "learning_rate": 1.810561378105702e-05, + "loss": 0.3994, + "step": 10401 + }, + { + "epoch": 2.1383492650837703, + "grad_norm": 0.22264499962329865, + "learning_rate": 1.809760643008459e-05, + "loss": 0.3904, + "step": 10402 + }, + { + "epoch": 2.138554836057149, + "grad_norm": 0.2253665328025818, + "learning_rate": 1.808960040446735e-05, + "loss": 0.3998, + "step": 10403 + }, + { + "epoch": 2.138760407030527, + "grad_norm": 0.12751929461956024, + "learning_rate": 1.8081595704599718e-05, + "loss": 0.4584, + "step": 10404 + }, + { + "epoch": 2.1389659780039056, + "grad_norm": 0.1251654028892517, + "learning_rate": 1.8073592330876034e-05, + "loss": 0.4494, + "step": 10405 + }, + { + "epoch": 2.1391715489772842, + "grad_norm": 0.12770125269889832, + "learning_rate": 1.8065590283690614e-05, + "loss": 0.436, + "step": 10406 + }, + { + "epoch": 2.139377119950663, + "grad_norm": 0.22460491955280304, + "learning_rate": 1.8057589563437675e-05, + "loss": 0.3837, + "step": 10407 + }, + { + "epoch": 2.1395826909240414, + "grad_norm": 0.2189689576625824, + "learning_rate": 1.8049590170511354e-05, + "loss": 0.4027, + "step": 10408 + }, + { + "epoch": 2.13978826189742, + "grad_norm": 0.22947020828723907, + "learning_rate": 1.804159210530577e-05, + "loss": 0.3883, + "step": 10409 + }, + { + "epoch": 2.1399938328707986, + "grad_norm": 0.22392447292804718, + "learning_rate": 1.8033595368214945e-05, + "loss": 0.3933, + "step": 10410 + }, + { + "epoch": 2.140199403844177, + "grad_norm": 0.23469264805316925, + "learning_rate": 1.8025599959632835e-05, + "loss": 0.4153, + "step": 10411 + }, + { + "epoch": 2.1404049748175558, + "grad_norm": 0.2271226942539215, + "learning_rate": 1.8017605879953335e-05, + "loss": 0.396, + "step": 10412 + }, + { + "epoch": 2.1406105457909343, + "grad_norm": 0.2269534021615982, + "learning_rate": 1.8009613129570278e-05, + "loss": 0.401, + "step": 10413 + }, + { + "epoch": 2.140816116764313, + "grad_norm": 0.22716417908668518, + "learning_rate": 1.800162170887743e-05, + "loss": 0.3846, + "step": 10414 + }, + { + "epoch": 2.1410216877376915, + "grad_norm": 0.13274461030960083, + "learning_rate": 1.7993631618268472e-05, + "loss": 0.448, + "step": 10415 + }, + { + "epoch": 2.14122725871107, + "grad_norm": 0.22133229672908783, + "learning_rate": 1.7985642858137076e-05, + "loss": 0.3983, + "step": 10416 + }, + { + "epoch": 2.1414328296844487, + "grad_norm": 0.21587035059928894, + "learning_rate": 1.797765542887679e-05, + "loss": 0.3917, + "step": 10417 + }, + { + "epoch": 2.1416384006578273, + "grad_norm": 0.2158806473016739, + "learning_rate": 1.796966933088112e-05, + "loss": 0.3887, + "step": 10418 + }, + { + "epoch": 2.141843971631206, + "grad_norm": 0.23333343863487244, + "learning_rate": 1.7961684564543503e-05, + "loss": 0.393, + "step": 10419 + }, + { + "epoch": 2.1420495426045845, + "grad_norm": 0.21826335787773132, + "learning_rate": 1.7953701130257313e-05, + "loss": 0.3817, + "step": 10420 + }, + { + "epoch": 2.1422551135779626, + "grad_norm": 0.12297184020280838, + "learning_rate": 1.794571902841585e-05, + "loss": 0.4548, + "step": 10421 + }, + { + "epoch": 2.142460684551341, + "grad_norm": 0.12231001257896423, + "learning_rate": 1.793773825941234e-05, + "loss": 0.4505, + "step": 10422 + }, + { + "epoch": 2.14266625552472, + "grad_norm": 0.2218412458896637, + "learning_rate": 1.792975882364e-05, + "loss": 0.3939, + "step": 10423 + }, + { + "epoch": 2.1428718264980984, + "grad_norm": 0.1286546289920807, + "learning_rate": 1.7921780721491914e-05, + "loss": 0.4586, + "step": 10424 + }, + { + "epoch": 2.143077397471477, + "grad_norm": 0.22066746652126312, + "learning_rate": 1.7913803953361125e-05, + "loss": 0.3819, + "step": 10425 + }, + { + "epoch": 2.1432829684448556, + "grad_norm": 0.22369948029518127, + "learning_rate": 1.7905828519640602e-05, + "loss": 0.4186, + "step": 10426 + }, + { + "epoch": 2.143488539418234, + "grad_norm": 0.12636181712150574, + "learning_rate": 1.789785442072329e-05, + "loss": 0.4643, + "step": 10427 + }, + { + "epoch": 2.1436941103916127, + "grad_norm": 0.22555802762508392, + "learning_rate": 1.788988165700201e-05, + "loss": 0.3877, + "step": 10428 + }, + { + "epoch": 2.1438996813649913, + "grad_norm": 0.2376098334789276, + "learning_rate": 1.7881910228869535e-05, + "loss": 0.3993, + "step": 10429 + }, + { + "epoch": 2.14410525233837, + "grad_norm": 0.2282724678516388, + "learning_rate": 1.787394013671861e-05, + "loss": 0.3815, + "step": 10430 + }, + { + "epoch": 2.1443108233117485, + "grad_norm": 0.22976957261562347, + "learning_rate": 1.7865971380941866e-05, + "loss": 0.3869, + "step": 10431 + }, + { + "epoch": 2.144516394285127, + "grad_norm": 0.2277589738368988, + "learning_rate": 1.7858003961931885e-05, + "loss": 0.3927, + "step": 10432 + }, + { + "epoch": 2.1447219652585057, + "grad_norm": 0.21987488865852356, + "learning_rate": 1.785003788008119e-05, + "loss": 0.3971, + "step": 10433 + }, + { + "epoch": 2.1449275362318843, + "grad_norm": 0.22373713552951813, + "learning_rate": 1.784207313578223e-05, + "loss": 0.4124, + "step": 10434 + }, + { + "epoch": 2.145133107205263, + "grad_norm": 0.22595758736133575, + "learning_rate": 1.7834109729427376e-05, + "loss": 0.4053, + "step": 10435 + }, + { + "epoch": 2.145338678178641, + "grad_norm": 0.22213847935199738, + "learning_rate": 1.782614766140898e-05, + "loss": 0.3875, + "step": 10436 + }, + { + "epoch": 2.1455442491520196, + "grad_norm": 0.127987802028656, + "learning_rate": 1.7818186932119277e-05, + "loss": 0.4445, + "step": 10437 + }, + { + "epoch": 2.145749820125398, + "grad_norm": 0.22547675669193268, + "learning_rate": 1.781022754195045e-05, + "loss": 0.3897, + "step": 10438 + }, + { + "epoch": 2.1459553910987768, + "grad_norm": 0.23386697471141815, + "learning_rate": 1.780226949129464e-05, + "loss": 0.3906, + "step": 10439 + }, + { + "epoch": 2.1461609620721553, + "grad_norm": 0.22901882231235504, + "learning_rate": 1.7794312780543883e-05, + "loss": 0.3978, + "step": 10440 + }, + { + "epoch": 2.146366533045534, + "grad_norm": 0.22975675761699677, + "learning_rate": 1.7786357410090173e-05, + "loss": 0.3855, + "step": 10441 + }, + { + "epoch": 2.1465721040189125, + "grad_norm": 0.22928237915039062, + "learning_rate": 1.7778403380325427e-05, + "loss": 0.3919, + "step": 10442 + }, + { + "epoch": 2.146777674992291, + "grad_norm": 0.22319789230823517, + "learning_rate": 1.7770450691641526e-05, + "loss": 0.3921, + "step": 10443 + }, + { + "epoch": 2.1469832459656697, + "grad_norm": 0.23228733241558075, + "learning_rate": 1.7762499344430253e-05, + "loss": 0.395, + "step": 10444 + }, + { + "epoch": 2.1471888169390483, + "grad_norm": 0.22841905057430267, + "learning_rate": 1.7754549339083323e-05, + "loss": 0.4022, + "step": 10445 + }, + { + "epoch": 2.147394387912427, + "grad_norm": 0.1279844492673874, + "learning_rate": 1.7746600675992408e-05, + "loss": 0.4415, + "step": 10446 + }, + { + "epoch": 2.1475999588858055, + "grad_norm": 0.2246563881635666, + "learning_rate": 1.7738653355549078e-05, + "loss": 0.3858, + "step": 10447 + }, + { + "epoch": 2.147805529859184, + "grad_norm": 0.225599467754364, + "learning_rate": 1.773070737814489e-05, + "loss": 0.4025, + "step": 10448 + }, + { + "epoch": 2.1480111008325626, + "grad_norm": 0.2247907519340515, + "learning_rate": 1.7722762744171298e-05, + "loss": 0.4245, + "step": 10449 + }, + { + "epoch": 2.1482166718059412, + "grad_norm": 0.23618023097515106, + "learning_rate": 1.7714819454019672e-05, + "loss": 0.4155, + "step": 10450 + }, + { + "epoch": 2.1484222427793194, + "grad_norm": 0.12265011668205261, + "learning_rate": 1.770687750808138e-05, + "loss": 0.4512, + "step": 10451 + }, + { + "epoch": 2.148627813752698, + "grad_norm": 0.23683376610279083, + "learning_rate": 1.7698936906747665e-05, + "loss": 0.4045, + "step": 10452 + }, + { + "epoch": 2.1488333847260765, + "grad_norm": 0.2286202311515808, + "learning_rate": 1.7690997650409725e-05, + "loss": 0.401, + "step": 10453 + }, + { + "epoch": 2.149038955699455, + "grad_norm": 0.21446064114570618, + "learning_rate": 1.7683059739458683e-05, + "loss": 0.3898, + "step": 10454 + }, + { + "epoch": 2.1492445266728337, + "grad_norm": 0.12255129218101501, + "learning_rate": 1.7675123174285614e-05, + "loss": 0.46, + "step": 10455 + }, + { + "epoch": 2.1494500976462123, + "grad_norm": 0.22888119518756866, + "learning_rate": 1.766718795528149e-05, + "loss": 0.3708, + "step": 10456 + }, + { + "epoch": 2.149655668619591, + "grad_norm": 0.2274254858493805, + "learning_rate": 1.7659254082837288e-05, + "loss": 0.3951, + "step": 10457 + }, + { + "epoch": 2.1498612395929695, + "grad_norm": 0.12422723323106766, + "learning_rate": 1.7651321557343836e-05, + "loss": 0.4547, + "step": 10458 + }, + { + "epoch": 2.150066810566348, + "grad_norm": 0.23370634019374847, + "learning_rate": 1.7643390379191948e-05, + "loss": 0.3956, + "step": 10459 + }, + { + "epoch": 2.1502723815397267, + "grad_norm": 0.2372375875711441, + "learning_rate": 1.7635460548772353e-05, + "loss": 0.4031, + "step": 10460 + }, + { + "epoch": 2.1504779525131053, + "grad_norm": 0.23817671835422516, + "learning_rate": 1.762753206647571e-05, + "loss": 0.3945, + "step": 10461 + }, + { + "epoch": 2.150683523486484, + "grad_norm": 0.23152542114257812, + "learning_rate": 1.7619604932692628e-05, + "loss": 0.3837, + "step": 10462 + }, + { + "epoch": 2.1508890944598624, + "grad_norm": 0.21996726095676422, + "learning_rate": 1.7611679147813618e-05, + "loss": 0.3971, + "step": 10463 + }, + { + "epoch": 2.151094665433241, + "grad_norm": 0.22144795954227448, + "learning_rate": 1.760375471222918e-05, + "loss": 0.3999, + "step": 10464 + }, + { + "epoch": 2.1513002364066196, + "grad_norm": 0.23396509885787964, + "learning_rate": 1.7595831626329697e-05, + "loss": 0.3977, + "step": 10465 + }, + { + "epoch": 2.1515058073799977, + "grad_norm": 0.2290705144405365, + "learning_rate": 1.7587909890505503e-05, + "loss": 0.3953, + "step": 10466 + }, + { + "epoch": 2.1517113783533763, + "grad_norm": 0.22540703415870667, + "learning_rate": 1.7579989505146866e-05, + "loss": 0.3971, + "step": 10467 + }, + { + "epoch": 2.151916949326755, + "grad_norm": 0.12446384131908417, + "learning_rate": 1.7572070470643973e-05, + "loss": 0.4507, + "step": 10468 + }, + { + "epoch": 2.1521225203001335, + "grad_norm": 0.12616395950317383, + "learning_rate": 1.7564152787386977e-05, + "loss": 0.44, + "step": 10469 + }, + { + "epoch": 2.152328091273512, + "grad_norm": 0.23691080510616302, + "learning_rate": 1.7556236455765943e-05, + "loss": 0.3804, + "step": 10470 + }, + { + "epoch": 2.1525336622468907, + "grad_norm": 0.2261635661125183, + "learning_rate": 1.7548321476170854e-05, + "loss": 0.3727, + "step": 10471 + }, + { + "epoch": 2.1527392332202693, + "grad_norm": 0.22439588606357574, + "learning_rate": 1.7540407848991672e-05, + "loss": 0.3903, + "step": 10472 + }, + { + "epoch": 2.152944804193648, + "grad_norm": 0.13026651740074158, + "learning_rate": 1.7532495574618246e-05, + "loss": 0.4672, + "step": 10473 + }, + { + "epoch": 2.1531503751670265, + "grad_norm": 0.21984946727752686, + "learning_rate": 1.7524584653440377e-05, + "loss": 0.4064, + "step": 10474 + }, + { + "epoch": 2.153355946140405, + "grad_norm": 0.22405663132667542, + "learning_rate": 1.7516675085847812e-05, + "loss": 0.4067, + "step": 10475 + }, + { + "epoch": 2.1535615171137836, + "grad_norm": 0.22605964541435242, + "learning_rate": 1.75087668722302e-05, + "loss": 0.4045, + "step": 10476 + }, + { + "epoch": 2.153767088087162, + "grad_norm": 0.1273018717765808, + "learning_rate": 1.7500860012977142e-05, + "loss": 0.4456, + "step": 10477 + }, + { + "epoch": 2.153972659060541, + "grad_norm": 0.23210304975509644, + "learning_rate": 1.7492954508478192e-05, + "loss": 0.4067, + "step": 10478 + }, + { + "epoch": 2.1541782300339194, + "grad_norm": 0.2308957576751709, + "learning_rate": 1.7485050359122806e-05, + "loss": 0.4144, + "step": 10479 + }, + { + "epoch": 2.154383801007298, + "grad_norm": 0.2237699329853058, + "learning_rate": 1.7477147565300388e-05, + "loss": 0.3946, + "step": 10480 + }, + { + "epoch": 2.154589371980676, + "grad_norm": 0.12873926758766174, + "learning_rate": 1.7469246127400262e-05, + "loss": 0.4475, + "step": 10481 + }, + { + "epoch": 2.1547949429540547, + "grad_norm": 0.24316054582595825, + "learning_rate": 1.7461346045811703e-05, + "loss": 0.4043, + "step": 10482 + }, + { + "epoch": 2.1550005139274333, + "grad_norm": 0.21882621943950653, + "learning_rate": 1.7453447320923914e-05, + "loss": 0.4072, + "step": 10483 + }, + { + "epoch": 2.155206084900812, + "grad_norm": 0.2260080724954605, + "learning_rate": 1.7445549953126e-05, + "loss": 0.3984, + "step": 10484 + }, + { + "epoch": 2.1554116558741905, + "grad_norm": 0.22015734016895294, + "learning_rate": 1.743765394280707e-05, + "loss": 0.3975, + "step": 10485 + }, + { + "epoch": 2.155617226847569, + "grad_norm": 0.22426630556583405, + "learning_rate": 1.7429759290356103e-05, + "loss": 0.3925, + "step": 10486 + }, + { + "epoch": 2.1558227978209477, + "grad_norm": 0.23523494601249695, + "learning_rate": 1.7421865996162033e-05, + "loss": 0.4133, + "step": 10487 + }, + { + "epoch": 2.1560283687943262, + "grad_norm": 0.22726291418075562, + "learning_rate": 1.7413974060613727e-05, + "loss": 0.3988, + "step": 10488 + }, + { + "epoch": 2.156233939767705, + "grad_norm": 0.2152286171913147, + "learning_rate": 1.740608348409998e-05, + "loss": 0.3935, + "step": 10489 + }, + { + "epoch": 2.1564395107410834, + "grad_norm": 0.22603079676628113, + "learning_rate": 1.7398194267009514e-05, + "loss": 0.3965, + "step": 10490 + }, + { + "epoch": 2.156645081714462, + "grad_norm": 0.1339533030986786, + "learning_rate": 1.739030640973102e-05, + "loss": 0.435, + "step": 10491 + }, + { + "epoch": 2.1568506526878406, + "grad_norm": 0.23634931445121765, + "learning_rate": 1.7382419912653064e-05, + "loss": 0.4006, + "step": 10492 + }, + { + "epoch": 2.157056223661219, + "grad_norm": 0.23838773369789124, + "learning_rate": 1.7374534776164215e-05, + "loss": 0.4042, + "step": 10493 + }, + { + "epoch": 2.1572617946345978, + "grad_norm": 0.23160769045352936, + "learning_rate": 1.736665100065291e-05, + "loss": 0.3908, + "step": 10494 + }, + { + "epoch": 2.1574673656079764, + "grad_norm": 0.12931808829307556, + "learning_rate": 1.7358768586507557e-05, + "loss": 0.4381, + "step": 10495 + }, + { + "epoch": 2.1576729365813545, + "grad_norm": 0.2354772686958313, + "learning_rate": 1.735088753411648e-05, + "loss": 0.4097, + "step": 10496 + }, + { + "epoch": 2.157878507554733, + "grad_norm": 0.22520937025547028, + "learning_rate": 1.734300784386794e-05, + "loss": 0.4014, + "step": 10497 + }, + { + "epoch": 2.1580840785281117, + "grad_norm": 0.22981365025043488, + "learning_rate": 1.7335129516150123e-05, + "loss": 0.3952, + "step": 10498 + }, + { + "epoch": 2.1582896495014903, + "grad_norm": 0.2230282872915268, + "learning_rate": 1.7327252551351182e-05, + "loss": 0.405, + "step": 10499 + }, + { + "epoch": 2.158495220474869, + "grad_norm": 0.2350645661354065, + "learning_rate": 1.731937694985917e-05, + "loss": 0.3821, + "step": 10500 + }, + { + "epoch": 2.1587007914482474, + "grad_norm": 0.2205275148153305, + "learning_rate": 1.7311502712062073e-05, + "loss": 0.4014, + "step": 10501 + }, + { + "epoch": 2.158906362421626, + "grad_norm": 0.2229074090719223, + "learning_rate": 1.7303629838347825e-05, + "loss": 0.3965, + "step": 10502 + }, + { + "epoch": 2.1591119333950046, + "grad_norm": 0.2243238240480423, + "learning_rate": 1.7295758329104277e-05, + "loss": 0.3978, + "step": 10503 + }, + { + "epoch": 2.159317504368383, + "grad_norm": 0.22528594732284546, + "learning_rate": 1.728788818471923e-05, + "loss": 0.395, + "step": 10504 + }, + { + "epoch": 2.159523075341762, + "grad_norm": 0.22361469268798828, + "learning_rate": 1.7280019405580394e-05, + "loss": 0.3949, + "step": 10505 + }, + { + "epoch": 2.1597286463151404, + "grad_norm": 0.22868306934833527, + "learning_rate": 1.727215199207545e-05, + "loss": 0.396, + "step": 10506 + }, + { + "epoch": 2.159934217288519, + "grad_norm": 0.23044967651367188, + "learning_rate": 1.7264285944591975e-05, + "loss": 0.4099, + "step": 10507 + }, + { + "epoch": 2.1601397882618976, + "grad_norm": 0.2305765151977539, + "learning_rate": 1.7256421263517503e-05, + "loss": 0.3899, + "step": 10508 + }, + { + "epoch": 2.160345359235276, + "grad_norm": 0.21992215514183044, + "learning_rate": 1.724855794923948e-05, + "loss": 0.3854, + "step": 10509 + }, + { + "epoch": 2.1605509302086547, + "grad_norm": 0.21878063678741455, + "learning_rate": 1.7240696002145292e-05, + "loss": 0.3825, + "step": 10510 + }, + { + "epoch": 2.160756501182033, + "grad_norm": 0.12538020312786102, + "learning_rate": 1.7232835422622252e-05, + "loss": 0.4371, + "step": 10511 + }, + { + "epoch": 2.1609620721554115, + "grad_norm": 0.23171678185462952, + "learning_rate": 1.7224976211057645e-05, + "loss": 0.4239, + "step": 10512 + }, + { + "epoch": 2.16116764312879, + "grad_norm": 0.12217391282320023, + "learning_rate": 1.721711836783864e-05, + "loss": 0.4505, + "step": 10513 + }, + { + "epoch": 2.1613732141021686, + "grad_norm": 0.23179614543914795, + "learning_rate": 1.7209261893352335e-05, + "loss": 0.396, + "step": 10514 + }, + { + "epoch": 2.1615787850755472, + "grad_norm": 0.2259824126958847, + "learning_rate": 1.7201406787985824e-05, + "loss": 0.381, + "step": 10515 + }, + { + "epoch": 2.161784356048926, + "grad_norm": 0.2272365540266037, + "learning_rate": 1.719355305212607e-05, + "loss": 0.4012, + "step": 10516 + }, + { + "epoch": 2.1619899270223044, + "grad_norm": 0.2351997047662735, + "learning_rate": 1.718570068615999e-05, + "loss": 0.4049, + "step": 10517 + }, + { + "epoch": 2.162195497995683, + "grad_norm": 0.22571827471256256, + "learning_rate": 1.7177849690474415e-05, + "loss": 0.3954, + "step": 10518 + }, + { + "epoch": 2.1624010689690616, + "grad_norm": 0.22981050610542297, + "learning_rate": 1.7170000065456165e-05, + "loss": 0.3959, + "step": 10519 + }, + { + "epoch": 2.16260663994244, + "grad_norm": 0.2381727695465088, + "learning_rate": 1.7162151811491932e-05, + "loss": 0.3908, + "step": 10520 + }, + { + "epoch": 2.1628122109158188, + "grad_norm": 0.2317119836807251, + "learning_rate": 1.7154304928968366e-05, + "loss": 0.4135, + "step": 10521 + }, + { + "epoch": 2.1630177818891974, + "grad_norm": 0.2339845448732376, + "learning_rate": 1.714645941827205e-05, + "loss": 0.3687, + "step": 10522 + }, + { + "epoch": 2.163223352862576, + "grad_norm": 0.12437080591917038, + "learning_rate": 1.7138615279789484e-05, + "loss": 0.4476, + "step": 10523 + }, + { + "epoch": 2.1634289238359545, + "grad_norm": 0.12956155836582184, + "learning_rate": 1.7130772513907122e-05, + "loss": 0.4388, + "step": 10524 + }, + { + "epoch": 2.163634494809333, + "grad_norm": 0.22595298290252686, + "learning_rate": 1.7122931121011325e-05, + "loss": 0.3914, + "step": 10525 + }, + { + "epoch": 2.1638400657827113, + "grad_norm": 0.23524773120880127, + "learning_rate": 1.711509110148843e-05, + "loss": 0.394, + "step": 10526 + }, + { + "epoch": 2.16404563675609, + "grad_norm": 0.229460209608078, + "learning_rate": 1.7107252455724658e-05, + "loss": 0.3965, + "step": 10527 + }, + { + "epoch": 2.1642512077294684, + "grad_norm": 0.22869658470153809, + "learning_rate": 1.709941518410619e-05, + "loss": 0.3887, + "step": 10528 + }, + { + "epoch": 2.164456778702847, + "grad_norm": 0.2369028925895691, + "learning_rate": 1.7091579287019127e-05, + "loss": 0.4027, + "step": 10529 + }, + { + "epoch": 2.1646623496762256, + "grad_norm": 0.23322713375091553, + "learning_rate": 1.7083744764849512e-05, + "loss": 0.396, + "step": 10530 + }, + { + "epoch": 2.164867920649604, + "grad_norm": 0.23089557886123657, + "learning_rate": 1.707591161798331e-05, + "loss": 0.3945, + "step": 10531 + }, + { + "epoch": 2.165073491622983, + "grad_norm": 0.21757075190544128, + "learning_rate": 1.7068079846806413e-05, + "loss": 0.3796, + "step": 10532 + }, + { + "epoch": 2.1652790625963614, + "grad_norm": 0.2164604812860489, + "learning_rate": 1.706024945170468e-05, + "loss": 0.398, + "step": 10533 + }, + { + "epoch": 2.16548463356974, + "grad_norm": 0.2306961566209793, + "learning_rate": 1.705242043306387e-05, + "loss": 0.3956, + "step": 10534 + }, + { + "epoch": 2.1656902045431186, + "grad_norm": 0.2262311577796936, + "learning_rate": 1.704459279126966e-05, + "loss": 0.3937, + "step": 10535 + }, + { + "epoch": 2.165895775516497, + "grad_norm": 0.2339993417263031, + "learning_rate": 1.703676652670772e-05, + "loss": 0.4147, + "step": 10536 + }, + { + "epoch": 2.1661013464898757, + "grad_norm": 0.22700749337673187, + "learning_rate": 1.7028941639763586e-05, + "loss": 0.3932, + "step": 10537 + }, + { + "epoch": 2.1663069174632543, + "grad_norm": 0.22953462600708008, + "learning_rate": 1.7021118130822766e-05, + "loss": 0.3856, + "step": 10538 + }, + { + "epoch": 2.166512488436633, + "grad_norm": 0.12440577894449234, + "learning_rate": 1.7013296000270665e-05, + "loss": 0.4448, + "step": 10539 + }, + { + "epoch": 2.1667180594100115, + "grad_norm": 0.22885264456272125, + "learning_rate": 1.7005475248492677e-05, + "loss": 0.4023, + "step": 10540 + }, + { + "epoch": 2.1669236303833896, + "grad_norm": 0.22612909972667694, + "learning_rate": 1.6997655875874082e-05, + "loss": 0.3813, + "step": 10541 + }, + { + "epoch": 2.1671292013567682, + "grad_norm": 0.22638019919395447, + "learning_rate": 1.6989837882800095e-05, + "loss": 0.3978, + "step": 10542 + }, + { + "epoch": 2.167334772330147, + "grad_norm": 0.12233424931764603, + "learning_rate": 1.6982021269655878e-05, + "loss": 0.4485, + "step": 10543 + }, + { + "epoch": 2.1675403433035254, + "grad_norm": 0.12629348039627075, + "learning_rate": 1.6974206036826516e-05, + "loss": 0.4501, + "step": 10544 + }, + { + "epoch": 2.167745914276904, + "grad_norm": 0.12014532089233398, + "learning_rate": 1.696639218469703e-05, + "loss": 0.4594, + "step": 10545 + }, + { + "epoch": 2.1679514852502826, + "grad_norm": 0.2178095281124115, + "learning_rate": 1.6958579713652356e-05, + "loss": 0.4123, + "step": 10546 + }, + { + "epoch": 2.168157056223661, + "grad_norm": 0.22389446198940277, + "learning_rate": 1.6950768624077412e-05, + "loss": 0.3935, + "step": 10547 + }, + { + "epoch": 2.1683626271970398, + "grad_norm": 0.22835230827331543, + "learning_rate": 1.6942958916356995e-05, + "loss": 0.4019, + "step": 10548 + }, + { + "epoch": 2.1685681981704183, + "grad_norm": 0.2239934802055359, + "learning_rate": 1.6935150590875852e-05, + "loss": 0.4014, + "step": 10549 + }, + { + "epoch": 2.168773769143797, + "grad_norm": 0.22052869200706482, + "learning_rate": 1.6927343648018667e-05, + "loss": 0.3964, + "step": 10550 + }, + { + "epoch": 2.1689793401171755, + "grad_norm": 0.22106504440307617, + "learning_rate": 1.691953808817005e-05, + "loss": 0.3868, + "step": 10551 + }, + { + "epoch": 2.169184911090554, + "grad_norm": 0.12797969579696655, + "learning_rate": 1.6911733911714544e-05, + "loss": 0.4505, + "step": 10552 + }, + { + "epoch": 2.1693904820639327, + "grad_norm": 0.12730328738689423, + "learning_rate": 1.6903931119036607e-05, + "loss": 0.4535, + "step": 10553 + }, + { + "epoch": 2.1695960530373113, + "grad_norm": 0.22867700457572937, + "learning_rate": 1.6896129710520677e-05, + "loss": 0.4105, + "step": 10554 + }, + { + "epoch": 2.16980162401069, + "grad_norm": 0.22605451941490173, + "learning_rate": 1.688832968655108e-05, + "loss": 0.3941, + "step": 10555 + }, + { + "epoch": 2.170007194984068, + "grad_norm": 0.23293885588645935, + "learning_rate": 1.6880531047512074e-05, + "loss": 0.4083, + "step": 10556 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 0.11922682076692581, + "learning_rate": 1.6872733793787882e-05, + "loss": 0.449, + "step": 10557 + }, + { + "epoch": 2.170418336930825, + "grad_norm": 0.12665359675884247, + "learning_rate": 1.6864937925762637e-05, + "loss": 0.4587, + "step": 10558 + }, + { + "epoch": 2.170623907904204, + "grad_norm": 0.23081457614898682, + "learning_rate": 1.685714344382039e-05, + "loss": 0.3861, + "step": 10559 + }, + { + "epoch": 2.1708294788775824, + "grad_norm": 0.2365112155675888, + "learning_rate": 1.6849350348345137e-05, + "loss": 0.3958, + "step": 10560 + }, + { + "epoch": 2.171035049850961, + "grad_norm": 0.12257271260023117, + "learning_rate": 1.684155863972083e-05, + "loss": 0.46, + "step": 10561 + }, + { + "epoch": 2.1712406208243396, + "grad_norm": 0.2283942699432373, + "learning_rate": 1.6833768318331313e-05, + "loss": 0.388, + "step": 10562 + }, + { + "epoch": 2.171446191797718, + "grad_norm": 0.22442100942134857, + "learning_rate": 1.6825979384560385e-05, + "loss": 0.3916, + "step": 10563 + }, + { + "epoch": 2.1716517627710967, + "grad_norm": 0.12442784011363983, + "learning_rate": 1.681819183879177e-05, + "loss": 0.4635, + "step": 10564 + }, + { + "epoch": 2.1718573337444753, + "grad_norm": 0.22854554653167725, + "learning_rate": 1.681040568140912e-05, + "loss": 0.379, + "step": 10565 + }, + { + "epoch": 2.172062904717854, + "grad_norm": 0.12427257746458054, + "learning_rate": 1.680262091279602e-05, + "loss": 0.4719, + "step": 10566 + }, + { + "epoch": 2.1722684756912325, + "grad_norm": 0.22989091277122498, + "learning_rate": 1.6794837533335984e-05, + "loss": 0.4118, + "step": 10567 + }, + { + "epoch": 2.172474046664611, + "grad_norm": 0.23249632120132446, + "learning_rate": 1.6787055543412484e-05, + "loss": 0.3812, + "step": 10568 + }, + { + "epoch": 2.1726796176379897, + "grad_norm": 0.21678483486175537, + "learning_rate": 1.677927494340889e-05, + "loss": 0.4007, + "step": 10569 + }, + { + "epoch": 2.1728851886113683, + "grad_norm": 0.2254790961742401, + "learning_rate": 1.677149573370852e-05, + "loss": 0.395, + "step": 10570 + }, + { + "epoch": 2.1730907595847464, + "grad_norm": 0.2205883264541626, + "learning_rate": 1.6763717914694613e-05, + "loss": 0.3865, + "step": 10571 + }, + { + "epoch": 2.173296330558125, + "grad_norm": 0.12380865216255188, + "learning_rate": 1.675594148675035e-05, + "loss": 0.4542, + "step": 10572 + }, + { + "epoch": 2.1735019015315036, + "grad_norm": 0.22934816777706146, + "learning_rate": 1.6748166450258836e-05, + "loss": 0.3885, + "step": 10573 + }, + { + "epoch": 2.173707472504882, + "grad_norm": 0.2283497005701065, + "learning_rate": 1.6740392805603097e-05, + "loss": 0.385, + "step": 10574 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.22790871560573578, + "learning_rate": 1.6732620553166136e-05, + "loss": 0.3862, + "step": 10575 + }, + { + "epoch": 2.1741186144516393, + "grad_norm": 0.2244972586631775, + "learning_rate": 1.6724849693330837e-05, + "loss": 0.4012, + "step": 10576 + }, + { + "epoch": 2.174324185425018, + "grad_norm": 0.23788417875766754, + "learning_rate": 1.6717080226480034e-05, + "loss": 0.4071, + "step": 10577 + }, + { + "epoch": 2.1745297563983965, + "grad_norm": 0.22114843130111694, + "learning_rate": 1.6709312152996484e-05, + "loss": 0.3793, + "step": 10578 + }, + { + "epoch": 2.174735327371775, + "grad_norm": 0.23666070401668549, + "learning_rate": 1.6701545473262907e-05, + "loss": 0.4066, + "step": 10579 + }, + { + "epoch": 2.1749408983451537, + "grad_norm": 0.23616231977939606, + "learning_rate": 1.669378018766192e-05, + "loss": 0.4042, + "step": 10580 + }, + { + "epoch": 2.1751464693185323, + "grad_norm": 0.2265489399433136, + "learning_rate": 1.668601629657606e-05, + "loss": 0.3877, + "step": 10581 + }, + { + "epoch": 2.175352040291911, + "grad_norm": 0.223519966006279, + "learning_rate": 1.6678253800387857e-05, + "loss": 0.4095, + "step": 10582 + }, + { + "epoch": 2.1755576112652895, + "grad_norm": 0.12714464962482452, + "learning_rate": 1.6670492699479713e-05, + "loss": 0.4789, + "step": 10583 + }, + { + "epoch": 2.175763182238668, + "grad_norm": 0.22280433773994446, + "learning_rate": 1.6662732994233978e-05, + "loss": 0.3944, + "step": 10584 + }, + { + "epoch": 2.1759687532120466, + "grad_norm": 0.2261977344751358, + "learning_rate": 1.6654974685032947e-05, + "loss": 0.3955, + "step": 10585 + }, + { + "epoch": 2.1761743241854252, + "grad_norm": 0.23589631915092468, + "learning_rate": 1.6647217772258825e-05, + "loss": 0.3948, + "step": 10586 + }, + { + "epoch": 2.176379895158804, + "grad_norm": 0.1299065500497818, + "learning_rate": 1.6639462256293747e-05, + "loss": 0.4561, + "step": 10587 + }, + { + "epoch": 2.176585466132182, + "grad_norm": 0.24209356307983398, + "learning_rate": 1.6631708137519825e-05, + "loss": 0.4137, + "step": 10588 + }, + { + "epoch": 2.1767910371055605, + "grad_norm": 0.2254961133003235, + "learning_rate": 1.6623955416319047e-05, + "loss": 0.3962, + "step": 10589 + }, + { + "epoch": 2.176996608078939, + "grad_norm": 0.1276281327009201, + "learning_rate": 1.661620409307336e-05, + "loss": 0.4605, + "step": 10590 + }, + { + "epoch": 2.1772021790523177, + "grad_norm": 0.22398579120635986, + "learning_rate": 1.660845416816463e-05, + "loss": 0.396, + "step": 10591 + }, + { + "epoch": 2.1774077500256963, + "grad_norm": 0.22290287911891937, + "learning_rate": 1.660070564197466e-05, + "loss": 0.4096, + "step": 10592 + }, + { + "epoch": 2.177613320999075, + "grad_norm": 0.22636477649211884, + "learning_rate": 1.6592958514885183e-05, + "loss": 0.3942, + "step": 10593 + }, + { + "epoch": 2.1778188919724535, + "grad_norm": 0.21956631541252136, + "learning_rate": 1.6585212787277854e-05, + "loss": 0.4021, + "step": 10594 + }, + { + "epoch": 2.178024462945832, + "grad_norm": 0.2394167184829712, + "learning_rate": 1.6577468459534298e-05, + "loss": 0.397, + "step": 10595 + }, + { + "epoch": 2.1782300339192107, + "grad_norm": 0.22891393303871155, + "learning_rate": 1.656972553203602e-05, + "loss": 0.3938, + "step": 10596 + }, + { + "epoch": 2.1784356048925893, + "grad_norm": 0.2175266295671463, + "learning_rate": 1.6561984005164483e-05, + "loss": 0.3902, + "step": 10597 + }, + { + "epoch": 2.178641175865968, + "grad_norm": 0.22040759027004242, + "learning_rate": 1.6554243879301076e-05, + "loss": 0.3728, + "step": 10598 + }, + { + "epoch": 2.1788467468393464, + "grad_norm": 0.22119790315628052, + "learning_rate": 1.65465051548271e-05, + "loss": 0.4136, + "step": 10599 + }, + { + "epoch": 2.179052317812725, + "grad_norm": 0.22910022735595703, + "learning_rate": 1.6538767832123844e-05, + "loss": 0.4046, + "step": 10600 + }, + { + "epoch": 2.1792578887861036, + "grad_norm": 0.129209503531456, + "learning_rate": 1.653103191157247e-05, + "loss": 0.439, + "step": 10601 + }, + { + "epoch": 2.179463459759482, + "grad_norm": 0.23198646306991577, + "learning_rate": 1.6523297393554072e-05, + "loss": 0.4143, + "step": 10602 + }, + { + "epoch": 2.1796690307328603, + "grad_norm": 0.22791431844234467, + "learning_rate": 1.6515564278449728e-05, + "loss": 0.3833, + "step": 10603 + }, + { + "epoch": 2.179874601706239, + "grad_norm": 0.2255294919013977, + "learning_rate": 1.6507832566640392e-05, + "loss": 0.3928, + "step": 10604 + }, + { + "epoch": 2.1800801726796175, + "grad_norm": 0.23165516555309296, + "learning_rate": 1.6500102258506978e-05, + "loss": 0.3914, + "step": 10605 + }, + { + "epoch": 2.180285743652996, + "grad_norm": 0.2258346527814865, + "learning_rate": 1.6492373354430316e-05, + "loss": 0.3953, + "step": 10606 + }, + { + "epoch": 2.1804913146263747, + "grad_norm": 0.22352395951747894, + "learning_rate": 1.6484645854791174e-05, + "loss": 0.3852, + "step": 10607 + }, + { + "epoch": 2.1806968855997533, + "grad_norm": 0.22954273223876953, + "learning_rate": 1.6476919759970236e-05, + "loss": 0.4085, + "step": 10608 + }, + { + "epoch": 2.180902456573132, + "grad_norm": 0.22188891470432281, + "learning_rate": 1.6469195070348158e-05, + "loss": 0.3917, + "step": 10609 + }, + { + "epoch": 2.1811080275465105, + "grad_norm": 0.12909865379333496, + "learning_rate": 1.6461471786305488e-05, + "loss": 0.4633, + "step": 10610 + }, + { + "epoch": 2.181313598519889, + "grad_norm": 0.2231685221195221, + "learning_rate": 1.6453749908222718e-05, + "loss": 0.3876, + "step": 10611 + }, + { + "epoch": 2.1815191694932676, + "grad_norm": 0.22691339254379272, + "learning_rate": 1.6446029436480263e-05, + "loss": 0.3948, + "step": 10612 + }, + { + "epoch": 2.181724740466646, + "grad_norm": 0.23698212206363678, + "learning_rate": 1.643831037145847e-05, + "loss": 0.3962, + "step": 10613 + }, + { + "epoch": 2.181930311440025, + "grad_norm": 0.22960902750492096, + "learning_rate": 1.6430592713537634e-05, + "loss": 0.3989, + "step": 10614 + }, + { + "epoch": 2.1821358824134034, + "grad_norm": 0.2320588082075119, + "learning_rate": 1.642287646309795e-05, + "loss": 0.392, + "step": 10615 + }, + { + "epoch": 2.182341453386782, + "grad_norm": 0.560815155506134, + "learning_rate": 1.641516162051958e-05, + "loss": 0.3986, + "step": 10616 + }, + { + "epoch": 2.1825470243601606, + "grad_norm": 0.12423614412546158, + "learning_rate": 1.6407448186182598e-05, + "loss": 0.4408, + "step": 10617 + }, + { + "epoch": 2.1827525953335387, + "grad_norm": 0.2267366200685501, + "learning_rate": 1.6399736160467e-05, + "loss": 0.3849, + "step": 10618 + }, + { + "epoch": 2.1829581663069173, + "grad_norm": 0.2252301126718521, + "learning_rate": 1.6392025543752726e-05, + "loss": 0.3939, + "step": 10619 + }, + { + "epoch": 2.183163737280296, + "grad_norm": 0.1241535022854805, + "learning_rate": 1.6384316336419625e-05, + "loss": 0.4509, + "step": 10620 + }, + { + "epoch": 2.1833693082536745, + "grad_norm": 0.22740307450294495, + "learning_rate": 1.637660853884752e-05, + "loss": 0.4052, + "step": 10621 + }, + { + "epoch": 2.183574879227053, + "grad_norm": 0.2271934300661087, + "learning_rate": 1.6368902151416132e-05, + "loss": 0.3804, + "step": 10622 + }, + { + "epoch": 2.1837804502004317, + "grad_norm": 0.23072363436222076, + "learning_rate": 1.6361197174505098e-05, + "loss": 0.3939, + "step": 10623 + }, + { + "epoch": 2.1839860211738102, + "grad_norm": 0.2331043779850006, + "learning_rate": 1.6353493608494032e-05, + "loss": 0.3989, + "step": 10624 + }, + { + "epoch": 2.184191592147189, + "grad_norm": 0.12475959211587906, + "learning_rate": 1.634579145376245e-05, + "loss": 0.4525, + "step": 10625 + }, + { + "epoch": 2.1843971631205674, + "grad_norm": 0.22251753509044647, + "learning_rate": 1.633809071068979e-05, + "loss": 0.4049, + "step": 10626 + }, + { + "epoch": 2.184602734093946, + "grad_norm": 0.22629208862781525, + "learning_rate": 1.633039137965543e-05, + "loss": 0.4039, + "step": 10627 + }, + { + "epoch": 2.1848083050673246, + "grad_norm": 0.22912812232971191, + "learning_rate": 1.632269346103869e-05, + "loss": 0.4004, + "step": 10628 + }, + { + "epoch": 2.185013876040703, + "grad_norm": 0.2214146852493286, + "learning_rate": 1.6314996955218792e-05, + "loss": 0.3727, + "step": 10629 + }, + { + "epoch": 2.1852194470140818, + "grad_norm": 0.22701111435890198, + "learning_rate": 1.6307301862574933e-05, + "loss": 0.4044, + "step": 10630 + }, + { + "epoch": 2.1854250179874604, + "grad_norm": 0.22968102991580963, + "learning_rate": 1.6299608183486206e-05, + "loss": 0.399, + "step": 10631 + }, + { + "epoch": 2.185630588960839, + "grad_norm": 0.2261413037776947, + "learning_rate": 1.6291915918331637e-05, + "loss": 0.3978, + "step": 10632 + }, + { + "epoch": 2.185836159934217, + "grad_norm": 0.2443215698003769, + "learning_rate": 1.6284225067490187e-05, + "loss": 0.3938, + "step": 10633 + }, + { + "epoch": 2.1860417309075957, + "grad_norm": 0.1367214322090149, + "learning_rate": 1.6276535631340756e-05, + "loss": 0.459, + "step": 10634 + }, + { + "epoch": 2.1862473018809743, + "grad_norm": 0.1239805743098259, + "learning_rate": 1.6268847610262154e-05, + "loss": 0.445, + "step": 10635 + }, + { + "epoch": 2.186452872854353, + "grad_norm": 0.23008181154727936, + "learning_rate": 1.626116100463313e-05, + "loss": 0.3968, + "step": 10636 + }, + { + "epoch": 2.1866584438277314, + "grad_norm": 0.22786974906921387, + "learning_rate": 1.625347581483239e-05, + "loss": 0.3968, + "step": 10637 + }, + { + "epoch": 2.18686401480111, + "grad_norm": 0.2298787385225296, + "learning_rate": 1.6245792041238542e-05, + "loss": 0.3913, + "step": 10638 + }, + { + "epoch": 2.1870695857744886, + "grad_norm": 0.23194655776023865, + "learning_rate": 1.623810968423012e-05, + "loss": 0.3976, + "step": 10639 + }, + { + "epoch": 2.187275156747867, + "grad_norm": 0.23695392906665802, + "learning_rate": 1.62304287441856e-05, + "loss": 0.4161, + "step": 10640 + }, + { + "epoch": 2.187480727721246, + "grad_norm": 0.22045163810253143, + "learning_rate": 1.6222749221483375e-05, + "loss": 0.412, + "step": 10641 + }, + { + "epoch": 2.1876862986946244, + "grad_norm": 0.22696349024772644, + "learning_rate": 1.62150711165018e-05, + "loss": 0.3791, + "step": 10642 + }, + { + "epoch": 2.187891869668003, + "grad_norm": 0.23293721675872803, + "learning_rate": 1.6207394429619136e-05, + "loss": 0.4014, + "step": 10643 + }, + { + "epoch": 2.1880974406413816, + "grad_norm": 0.12806709110736847, + "learning_rate": 1.619971916121356e-05, + "loss": 0.449, + "step": 10644 + }, + { + "epoch": 2.18830301161476, + "grad_norm": 0.21958725154399872, + "learning_rate": 1.6192045311663218e-05, + "loss": 0.3836, + "step": 10645 + }, + { + "epoch": 2.1885085825881387, + "grad_norm": 0.22592249512672424, + "learning_rate": 1.6184372881346154e-05, + "loss": 0.3945, + "step": 10646 + }, + { + "epoch": 2.1887141535615173, + "grad_norm": 0.12806597352027893, + "learning_rate": 1.6176701870640362e-05, + "loss": 0.4394, + "step": 10647 + }, + { + "epoch": 2.1889197245348955, + "grad_norm": 0.2250743955373764, + "learning_rate": 1.616903227992374e-05, + "loss": 0.3952, + "step": 10648 + }, + { + "epoch": 2.189125295508274, + "grad_norm": 0.1263757050037384, + "learning_rate": 1.616136410957415e-05, + "loss": 0.4591, + "step": 10649 + }, + { + "epoch": 2.1893308664816526, + "grad_norm": 0.237161323428154, + "learning_rate": 1.6153697359969344e-05, + "loss": 0.4032, + "step": 10650 + }, + { + "epoch": 2.1895364374550312, + "grad_norm": 0.22208333015441895, + "learning_rate": 1.614603203148705e-05, + "loss": 0.3927, + "step": 10651 + }, + { + "epoch": 2.18974200842841, + "grad_norm": 0.22636909782886505, + "learning_rate": 1.61383681245049e-05, + "loss": 0.3784, + "step": 10652 + }, + { + "epoch": 2.1899475794017884, + "grad_norm": 0.23345516622066498, + "learning_rate": 1.6130705639400447e-05, + "loss": 0.4156, + "step": 10653 + }, + { + "epoch": 2.190153150375167, + "grad_norm": 0.2252190262079239, + "learning_rate": 1.6123044576551202e-05, + "loss": 0.3922, + "step": 10654 + }, + { + "epoch": 2.1903587213485456, + "grad_norm": 0.23159563541412354, + "learning_rate": 1.6115384936334575e-05, + "loss": 0.4089, + "step": 10655 + }, + { + "epoch": 2.190564292321924, + "grad_norm": 0.22487987577915192, + "learning_rate": 1.6107726719127926e-05, + "loss": 0.3992, + "step": 10656 + }, + { + "epoch": 2.1907698632953028, + "grad_norm": 0.23709611594676971, + "learning_rate": 1.6100069925308523e-05, + "loss": 0.4198, + "step": 10657 + }, + { + "epoch": 2.1909754342686814, + "grad_norm": 0.21871237456798553, + "learning_rate": 1.609241455525361e-05, + "loss": 0.4042, + "step": 10658 + }, + { + "epoch": 2.19118100524206, + "grad_norm": 0.2315407693386078, + "learning_rate": 1.6084760609340326e-05, + "loss": 0.4062, + "step": 10659 + }, + { + "epoch": 2.1913865762154385, + "grad_norm": 0.2263568639755249, + "learning_rate": 1.6077108087945734e-05, + "loss": 0.3908, + "step": 10660 + }, + { + "epoch": 2.191592147188817, + "grad_norm": 0.12639762461185455, + "learning_rate": 1.6069456991446842e-05, + "loss": 0.4546, + "step": 10661 + }, + { + "epoch": 2.1917977181621957, + "grad_norm": 0.2350437194108963, + "learning_rate": 1.606180732022058e-05, + "loss": 0.4115, + "step": 10662 + }, + { + "epoch": 2.192003289135574, + "grad_norm": 0.21677015721797943, + "learning_rate": 1.60541590746438e-05, + "loss": 0.3724, + "step": 10663 + }, + { + "epoch": 2.1922088601089524, + "grad_norm": 0.22756123542785645, + "learning_rate": 1.6046512255093326e-05, + "loss": 0.3916, + "step": 10664 + }, + { + "epoch": 2.192414431082331, + "grad_norm": 0.12300966680049896, + "learning_rate": 1.6038866861945847e-05, + "loss": 0.4532, + "step": 10665 + }, + { + "epoch": 2.1926200020557096, + "grad_norm": 0.23039010167121887, + "learning_rate": 1.6031222895578052e-05, + "loss": 0.3941, + "step": 10666 + }, + { + "epoch": 2.192825573029088, + "grad_norm": 0.2256508469581604, + "learning_rate": 1.6023580356366502e-05, + "loss": 0.4022, + "step": 10667 + }, + { + "epoch": 2.193031144002467, + "grad_norm": 0.21880964934825897, + "learning_rate": 1.6015939244687717e-05, + "loss": 0.3848, + "step": 10668 + }, + { + "epoch": 2.1932367149758454, + "grad_norm": 0.23204973340034485, + "learning_rate": 1.600829956091813e-05, + "loss": 0.3865, + "step": 10669 + }, + { + "epoch": 2.193442285949224, + "grad_norm": 0.24459494650363922, + "learning_rate": 1.6000661305434108e-05, + "loss": 0.3947, + "step": 10670 + }, + { + "epoch": 2.1936478569226026, + "grad_norm": 0.23136425018310547, + "learning_rate": 1.5993024478611972e-05, + "loss": 0.3957, + "step": 10671 + }, + { + "epoch": 2.193853427895981, + "grad_norm": 0.22914138436317444, + "learning_rate": 1.5985389080827937e-05, + "loss": 0.3889, + "step": 10672 + }, + { + "epoch": 2.1940589988693597, + "grad_norm": 0.22302468121051788, + "learning_rate": 1.5977755112458174e-05, + "loss": 0.385, + "step": 10673 + }, + { + "epoch": 2.1942645698427383, + "grad_norm": 0.2292277216911316, + "learning_rate": 1.5970122573878766e-05, + "loss": 0.4123, + "step": 10674 + }, + { + "epoch": 2.194470140816117, + "grad_norm": 0.2244681715965271, + "learning_rate": 1.5962491465465733e-05, + "loss": 0.3681, + "step": 10675 + }, + { + "epoch": 2.1946757117894955, + "grad_norm": 0.2233274132013321, + "learning_rate": 1.5954861787595024e-05, + "loss": 0.4046, + "step": 10676 + }, + { + "epoch": 2.194881282762874, + "grad_norm": 0.23008307814598083, + "learning_rate": 1.5947233540642505e-05, + "loss": 0.408, + "step": 10677 + }, + { + "epoch": 2.1950868537362522, + "grad_norm": 0.2235502302646637, + "learning_rate": 1.593960672498401e-05, + "loss": 0.3884, + "step": 10678 + }, + { + "epoch": 2.195292424709631, + "grad_norm": 0.12918898463249207, + "learning_rate": 1.5931981340995262e-05, + "loss": 0.4728, + "step": 10679 + }, + { + "epoch": 2.1954979956830094, + "grad_norm": 0.21759852766990662, + "learning_rate": 1.5924357389051935e-05, + "loss": 0.3975, + "step": 10680 + }, + { + "epoch": 2.195703566656388, + "grad_norm": 0.22451691329479218, + "learning_rate": 1.5916734869529616e-05, + "loss": 0.3896, + "step": 10681 + }, + { + "epoch": 2.1959091376297666, + "grad_norm": 0.13441641628742218, + "learning_rate": 1.5909113782803837e-05, + "loss": 0.4687, + "step": 10682 + }, + { + "epoch": 2.196114708603145, + "grad_norm": 0.23042891919612885, + "learning_rate": 1.5901494129250052e-05, + "loss": 0.3967, + "step": 10683 + }, + { + "epoch": 2.1963202795765238, + "grad_norm": 0.2289479672908783, + "learning_rate": 1.589387590924363e-05, + "loss": 0.3911, + "step": 10684 + }, + { + "epoch": 2.1965258505499023, + "grad_norm": 0.22492031753063202, + "learning_rate": 1.5886259123159917e-05, + "loss": 0.3867, + "step": 10685 + }, + { + "epoch": 2.196731421523281, + "grad_norm": 0.2289929836988449, + "learning_rate": 1.5878643771374133e-05, + "loss": 0.3915, + "step": 10686 + }, + { + "epoch": 2.1969369924966595, + "grad_norm": 0.12365361303091049, + "learning_rate": 1.5871029854261445e-05, + "loss": 0.4289, + "step": 10687 + }, + { + "epoch": 2.197142563470038, + "grad_norm": 0.21747228503227234, + "learning_rate": 1.5863417372196988e-05, + "loss": 0.401, + "step": 10688 + }, + { + "epoch": 2.1973481344434167, + "grad_norm": 0.21652854979038239, + "learning_rate": 1.585580632555577e-05, + "loss": 0.3908, + "step": 10689 + }, + { + "epoch": 2.1975537054167953, + "grad_norm": 0.22147879004478455, + "learning_rate": 1.584819671471275e-05, + "loss": 0.3968, + "step": 10690 + }, + { + "epoch": 2.197759276390174, + "grad_norm": 0.2206578552722931, + "learning_rate": 1.5840588540042816e-05, + "loss": 0.3972, + "step": 10691 + }, + { + "epoch": 2.1979648473635525, + "grad_norm": 0.23885060846805573, + "learning_rate": 1.5832981801920806e-05, + "loss": 0.385, + "step": 10692 + }, + { + "epoch": 2.1981704183369306, + "grad_norm": 0.23165802657604218, + "learning_rate": 1.582537650072145e-05, + "loss": 0.3954, + "step": 10693 + }, + { + "epoch": 2.198375989310309, + "grad_norm": 0.23803496360778809, + "learning_rate": 1.5817772636819437e-05, + "loss": 0.4089, + "step": 10694 + }, + { + "epoch": 2.198581560283688, + "grad_norm": 0.22591203451156616, + "learning_rate": 1.581017021058937e-05, + "loss": 0.3965, + "step": 10695 + }, + { + "epoch": 2.1987871312570664, + "grad_norm": 0.23487183451652527, + "learning_rate": 1.5802569222405785e-05, + "loss": 0.4041, + "step": 10696 + }, + { + "epoch": 2.198992702230445, + "grad_norm": 0.12291015684604645, + "learning_rate": 1.5794969672643143e-05, + "loss": 0.4483, + "step": 10697 + }, + { + "epoch": 2.1991982732038236, + "grad_norm": 0.2258739024400711, + "learning_rate": 1.5787371561675826e-05, + "loss": 0.3911, + "step": 10698 + }, + { + "epoch": 2.199403844177202, + "grad_norm": 0.2271280735731125, + "learning_rate": 1.5779774889878188e-05, + "loss": 0.39, + "step": 10699 + }, + { + "epoch": 2.1996094151505807, + "grad_norm": 0.12247934192419052, + "learning_rate": 1.5772179657624468e-05, + "loss": 0.4543, + "step": 10700 + }, + { + "epoch": 2.1998149861239593, + "grad_norm": 0.22866493463516235, + "learning_rate": 1.5764585865288846e-05, + "loss": 0.3903, + "step": 10701 + }, + { + "epoch": 2.200020557097338, + "grad_norm": 0.12255199253559113, + "learning_rate": 1.5756993513245428e-05, + "loss": 0.453, + "step": 10702 + }, + { + "epoch": 2.2002261280707165, + "grad_norm": 0.2146882563829422, + "learning_rate": 1.574940260186826e-05, + "loss": 0.3789, + "step": 10703 + }, + { + "epoch": 2.200431699044095, + "grad_norm": 0.23465701937675476, + "learning_rate": 1.5741813131531313e-05, + "loss": 0.3917, + "step": 10704 + }, + { + "epoch": 2.2006372700174737, + "grad_norm": 0.2412889301776886, + "learning_rate": 1.5734225102608464e-05, + "loss": 0.4213, + "step": 10705 + }, + { + "epoch": 2.2008428409908523, + "grad_norm": 0.22149762511253357, + "learning_rate": 1.5726638515473566e-05, + "loss": 0.3988, + "step": 10706 + }, + { + "epoch": 2.201048411964231, + "grad_norm": 0.23268526792526245, + "learning_rate": 1.571905337050037e-05, + "loss": 0.3857, + "step": 10707 + }, + { + "epoch": 2.201253982937609, + "grad_norm": 0.22317472100257874, + "learning_rate": 1.571146966806254e-05, + "loss": 0.3828, + "step": 10708 + }, + { + "epoch": 2.2014595539109876, + "grad_norm": 0.22195008397102356, + "learning_rate": 1.570388740853372e-05, + "loss": 0.4056, + "step": 10709 + }, + { + "epoch": 2.201665124884366, + "grad_norm": 0.21876020729541779, + "learning_rate": 1.569630659228744e-05, + "loss": 0.4002, + "step": 10710 + }, + { + "epoch": 2.2018706958577448, + "grad_norm": 0.2204761803150177, + "learning_rate": 1.5688727219697163e-05, + "loss": 0.3963, + "step": 10711 + }, + { + "epoch": 2.2020762668311233, + "grad_norm": 0.22541974484920502, + "learning_rate": 1.5681149291136285e-05, + "loss": 0.3829, + "step": 10712 + }, + { + "epoch": 2.202281837804502, + "grad_norm": 0.22481369972229004, + "learning_rate": 1.567357280697816e-05, + "loss": 0.3834, + "step": 10713 + }, + { + "epoch": 2.2024874087778805, + "grad_norm": 0.23171178996562958, + "learning_rate": 1.5665997767596033e-05, + "loss": 0.4008, + "step": 10714 + }, + { + "epoch": 2.202692979751259, + "grad_norm": 0.22620131075382233, + "learning_rate": 1.5658424173363085e-05, + "loss": 0.3997, + "step": 10715 + }, + { + "epoch": 2.2028985507246377, + "grad_norm": 0.22562332451343536, + "learning_rate": 1.5650852024652435e-05, + "loss": 0.4104, + "step": 10716 + }, + { + "epoch": 2.2031041216980163, + "grad_norm": 0.2276526838541031, + "learning_rate": 1.5643281321837135e-05, + "loss": 0.392, + "step": 10717 + }, + { + "epoch": 2.203309692671395, + "grad_norm": 0.12458810210227966, + "learning_rate": 1.5635712065290146e-05, + "loss": 0.4551, + "step": 10718 + }, + { + "epoch": 2.2035152636447735, + "grad_norm": 0.23165149986743927, + "learning_rate": 1.5628144255384365e-05, + "loss": 0.3855, + "step": 10719 + }, + { + "epoch": 2.203720834618152, + "grad_norm": 0.2240263819694519, + "learning_rate": 1.562057789249264e-05, + "loss": 0.3825, + "step": 10720 + }, + { + "epoch": 2.2039264055915306, + "grad_norm": 0.21997642517089844, + "learning_rate": 1.5613012976987728e-05, + "loss": 0.3813, + "step": 10721 + }, + { + "epoch": 2.2041319765649092, + "grad_norm": 1.4580494165420532, + "learning_rate": 1.5605449509242312e-05, + "loss": 0.408, + "step": 10722 + }, + { + "epoch": 2.2043375475382874, + "grad_norm": 0.23071999847888947, + "learning_rate": 1.5597887489629008e-05, + "loss": 0.3983, + "step": 10723 + }, + { + "epoch": 2.204543118511666, + "grad_norm": 0.22993268072605133, + "learning_rate": 1.559032691852036e-05, + "loss": 0.392, + "step": 10724 + }, + { + "epoch": 2.2047486894850445, + "grad_norm": 0.12808802723884583, + "learning_rate": 1.5582767796288852e-05, + "loss": 0.4491, + "step": 10725 + }, + { + "epoch": 2.204954260458423, + "grad_norm": 0.22585633397102356, + "learning_rate": 1.5575210123306855e-05, + "loss": 0.4, + "step": 10726 + }, + { + "epoch": 2.2051598314318017, + "grad_norm": 0.12611474096775055, + "learning_rate": 1.5567653899946745e-05, + "loss": 0.4577, + "step": 10727 + }, + { + "epoch": 2.2053654024051803, + "grad_norm": 0.17360465228557587, + "learning_rate": 1.5560099126580757e-05, + "loss": 0.4583, + "step": 10728 + }, + { + "epoch": 2.205570973378559, + "grad_norm": 0.23249217867851257, + "learning_rate": 1.5552545803581072e-05, + "loss": 0.3971, + "step": 10729 + }, + { + "epoch": 2.2057765443519375, + "grad_norm": 0.2386702597141266, + "learning_rate": 1.5544993931319832e-05, + "loss": 0.3891, + "step": 10730 + }, + { + "epoch": 2.205982115325316, + "grad_norm": 0.12809514999389648, + "learning_rate": 1.5537443510169068e-05, + "loss": 0.4534, + "step": 10731 + }, + { + "epoch": 2.2061876862986947, + "grad_norm": 0.2297258824110031, + "learning_rate": 1.5529894540500755e-05, + "loss": 0.3897, + "step": 10732 + }, + { + "epoch": 2.2063932572720732, + "grad_norm": 0.22300571203231812, + "learning_rate": 1.5522347022686782e-05, + "loss": 0.3961, + "step": 10733 + }, + { + "epoch": 2.206598828245452, + "grad_norm": 0.23077335953712463, + "learning_rate": 1.5514800957099003e-05, + "loss": 0.4094, + "step": 10734 + }, + { + "epoch": 2.2068043992188304, + "grad_norm": 0.22444140911102295, + "learning_rate": 1.550725634410917e-05, + "loss": 0.4009, + "step": 10735 + }, + { + "epoch": 2.207009970192209, + "grad_norm": 0.13065902888774872, + "learning_rate": 1.549971318408897e-05, + "loss": 0.4443, + "step": 10736 + }, + { + "epoch": 2.2072155411655876, + "grad_norm": 0.12475431710481644, + "learning_rate": 1.5492171477410013e-05, + "loss": 0.4383, + "step": 10737 + }, + { + "epoch": 2.2074211121389657, + "grad_norm": 0.23084284365177155, + "learning_rate": 1.5484631224443852e-05, + "loss": 0.4043, + "step": 10738 + }, + { + "epoch": 2.2076266831123443, + "grad_norm": 0.12472715973854065, + "learning_rate": 1.5477092425561953e-05, + "loss": 0.4307, + "step": 10739 + }, + { + "epoch": 2.207832254085723, + "grad_norm": 0.1253010481595993, + "learning_rate": 1.546955508113571e-05, + "loss": 0.4488, + "step": 10740 + }, + { + "epoch": 2.2080378250591015, + "grad_norm": 0.12054693698883057, + "learning_rate": 1.5462019191536478e-05, + "loss": 0.4402, + "step": 10741 + }, + { + "epoch": 2.20824339603248, + "grad_norm": 0.2258850783109665, + "learning_rate": 1.5454484757135496e-05, + "loss": 0.3804, + "step": 10742 + }, + { + "epoch": 2.2084489670058587, + "grad_norm": 0.23322363197803497, + "learning_rate": 1.5446951778303958e-05, + "loss": 0.4058, + "step": 10743 + }, + { + "epoch": 2.2086545379792373, + "grad_norm": 0.23911800980567932, + "learning_rate": 1.543942025541297e-05, + "loss": 0.3821, + "step": 10744 + }, + { + "epoch": 2.208860108952616, + "grad_norm": 0.22474057972431183, + "learning_rate": 1.5431890188833585e-05, + "loss": 0.3981, + "step": 10745 + }, + { + "epoch": 2.2090656799259945, + "grad_norm": 0.22120480239391327, + "learning_rate": 1.5424361578936754e-05, + "loss": 0.4036, + "step": 10746 + }, + { + "epoch": 2.209271250899373, + "grad_norm": 0.23113922774791718, + "learning_rate": 1.5416834426093406e-05, + "loss": 0.3996, + "step": 10747 + }, + { + "epoch": 2.2094768218727516, + "grad_norm": 0.23626331984996796, + "learning_rate": 1.5409308730674354e-05, + "loss": 0.409, + "step": 10748 + }, + { + "epoch": 2.20968239284613, + "grad_norm": 0.22344759106636047, + "learning_rate": 1.540178449305036e-05, + "loss": 0.3952, + "step": 10749 + }, + { + "epoch": 2.209887963819509, + "grad_norm": 0.23070107400417328, + "learning_rate": 1.5394261713592094e-05, + "loss": 0.3839, + "step": 10750 + }, + { + "epoch": 2.2100935347928874, + "grad_norm": 0.22357220947742462, + "learning_rate": 1.5386740392670165e-05, + "loss": 0.3963, + "step": 10751 + }, + { + "epoch": 2.210299105766266, + "grad_norm": 0.2235075831413269, + "learning_rate": 1.5379220530655138e-05, + "loss": 0.3847, + "step": 10752 + }, + { + "epoch": 2.2105046767396446, + "grad_norm": 0.2250668853521347, + "learning_rate": 1.5371702127917458e-05, + "loss": 0.3854, + "step": 10753 + }, + { + "epoch": 2.2107102477130227, + "grad_norm": 0.230119988322258, + "learning_rate": 1.5364185184827543e-05, + "loss": 0.3914, + "step": 10754 + }, + { + "epoch": 2.2109158186864013, + "grad_norm": 0.22010499238967896, + "learning_rate": 1.5356669701755708e-05, + "loss": 0.4028, + "step": 10755 + }, + { + "epoch": 2.21112138965978, + "grad_norm": 0.22333703935146332, + "learning_rate": 1.5349155679072205e-05, + "loss": 0.385, + "step": 10756 + }, + { + "epoch": 2.2113269606331585, + "grad_norm": 0.22866930067539215, + "learning_rate": 1.534164311714721e-05, + "loss": 0.4027, + "step": 10757 + }, + { + "epoch": 2.211532531606537, + "grad_norm": 0.22447089850902557, + "learning_rate": 1.533413201635084e-05, + "loss": 0.4108, + "step": 10758 + }, + { + "epoch": 2.2117381025799157, + "grad_norm": 0.23292423784732819, + "learning_rate": 1.5326622377053125e-05, + "loss": 0.4173, + "step": 10759 + }, + { + "epoch": 2.2119436735532942, + "grad_norm": 0.23067182302474976, + "learning_rate": 1.5319114199624018e-05, + "loss": 0.3871, + "step": 10760 + }, + { + "epoch": 2.212149244526673, + "grad_norm": 0.13341167569160461, + "learning_rate": 1.5311607484433443e-05, + "loss": 0.4604, + "step": 10761 + }, + { + "epoch": 2.2123548155000514, + "grad_norm": 0.2339571863412857, + "learning_rate": 1.53041022318512e-05, + "loss": 0.3879, + "step": 10762 + }, + { + "epoch": 2.21256038647343, + "grad_norm": 0.22482730448246002, + "learning_rate": 1.5296598442247045e-05, + "loss": 0.4002, + "step": 10763 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.2297281175851822, + "learning_rate": 1.5289096115990654e-05, + "loss": 0.4032, + "step": 10764 + }, + { + "epoch": 2.212971528420187, + "grad_norm": 0.12835589051246643, + "learning_rate": 1.5281595253451624e-05, + "loss": 0.4497, + "step": 10765 + }, + { + "epoch": 2.2131770993935658, + "grad_norm": 0.23261982202529907, + "learning_rate": 1.52740958549995e-05, + "loss": 0.4021, + "step": 10766 + }, + { + "epoch": 2.2133826703669444, + "grad_norm": 0.22967736423015594, + "learning_rate": 1.526659792100371e-05, + "loss": 0.3974, + "step": 10767 + }, + { + "epoch": 2.213588241340323, + "grad_norm": 0.1222897469997406, + "learning_rate": 1.5259101451833683e-05, + "loss": 0.454, + "step": 10768 + }, + { + "epoch": 2.2137938123137015, + "grad_norm": 0.22212044894695282, + "learning_rate": 1.5251606447858725e-05, + "loss": 0.3908, + "step": 10769 + }, + { + "epoch": 2.2139993832870797, + "grad_norm": 0.23276306688785553, + "learning_rate": 1.5244112909448069e-05, + "loss": 0.3877, + "step": 10770 + }, + { + "epoch": 2.2142049542604583, + "grad_norm": 0.12715481221675873, + "learning_rate": 1.5236620836970893e-05, + "loss": 0.4706, + "step": 10771 + }, + { + "epoch": 2.214410525233837, + "grad_norm": 0.22773075103759766, + "learning_rate": 1.5229130230796281e-05, + "loss": 0.4008, + "step": 10772 + }, + { + "epoch": 2.2146160962072154, + "grad_norm": 0.23511482775211334, + "learning_rate": 1.5221641091293283e-05, + "loss": 0.4078, + "step": 10773 + }, + { + "epoch": 2.214821667180594, + "grad_norm": 0.21598058938980103, + "learning_rate": 1.521415341883085e-05, + "loss": 0.3908, + "step": 10774 + }, + { + "epoch": 2.2150272381539726, + "grad_norm": 0.23073440790176392, + "learning_rate": 1.5206667213777846e-05, + "loss": 0.404, + "step": 10775 + }, + { + "epoch": 2.215232809127351, + "grad_norm": 0.22900259494781494, + "learning_rate": 1.5199182476503105e-05, + "loss": 0.3845, + "step": 10776 + }, + { + "epoch": 2.21543838010073, + "grad_norm": 0.26081186532974243, + "learning_rate": 1.519169920737536e-05, + "loss": 0.397, + "step": 10777 + }, + { + "epoch": 2.2156439510741084, + "grad_norm": 0.2252834439277649, + "learning_rate": 1.5184217406763266e-05, + "loss": 0.3678, + "step": 10778 + }, + { + "epoch": 2.215849522047487, + "grad_norm": 0.2190970927476883, + "learning_rate": 1.5176737075035423e-05, + "loss": 0.3733, + "step": 10779 + }, + { + "epoch": 2.2160550930208656, + "grad_norm": 0.23575487732887268, + "learning_rate": 1.5169258212560354e-05, + "loss": 0.4151, + "step": 10780 + }, + { + "epoch": 2.216260663994244, + "grad_norm": 0.22723565995693207, + "learning_rate": 1.5161780819706485e-05, + "loss": 0.382, + "step": 10781 + }, + { + "epoch": 2.2164662349676227, + "grad_norm": 0.23032769560813904, + "learning_rate": 1.5154304896842231e-05, + "loss": 0.3863, + "step": 10782 + }, + { + "epoch": 2.2166718059410013, + "grad_norm": 0.2345583289861679, + "learning_rate": 1.5146830444335872e-05, + "loss": 0.4049, + "step": 10783 + }, + { + "epoch": 2.21687737691438, + "grad_norm": 0.22362026572227478, + "learning_rate": 1.5139357462555645e-05, + "loss": 0.3943, + "step": 10784 + }, + { + "epoch": 2.217082947887758, + "grad_norm": 0.23059040307998657, + "learning_rate": 1.513188595186971e-05, + "loss": 0.4008, + "step": 10785 + }, + { + "epoch": 2.2172885188611366, + "grad_norm": 0.12331248074769974, + "learning_rate": 1.5124415912646149e-05, + "loss": 0.4494, + "step": 10786 + }, + { + "epoch": 2.2174940898345152, + "grad_norm": 0.23354892432689667, + "learning_rate": 1.5116947345252977e-05, + "loss": 0.4016, + "step": 10787 + }, + { + "epoch": 2.217699660807894, + "grad_norm": 0.232215017080307, + "learning_rate": 1.5109480250058124e-05, + "loss": 0.403, + "step": 10788 + }, + { + "epoch": 2.2179052317812724, + "grad_norm": 0.22965744137763977, + "learning_rate": 1.5102014627429483e-05, + "loss": 0.4111, + "step": 10789 + }, + { + "epoch": 2.218110802754651, + "grad_norm": 0.22863295674324036, + "learning_rate": 1.5094550477734838e-05, + "loss": 0.395, + "step": 10790 + }, + { + "epoch": 2.2183163737280296, + "grad_norm": 0.22686706483364105, + "learning_rate": 1.5087087801341914e-05, + "loss": 0.4058, + "step": 10791 + }, + { + "epoch": 2.218521944701408, + "grad_norm": 0.2347644418478012, + "learning_rate": 1.5079626598618362e-05, + "loss": 0.3953, + "step": 10792 + }, + { + "epoch": 2.2187275156747868, + "grad_norm": 0.23546837270259857, + "learning_rate": 1.5072166869931748e-05, + "loss": 0.4049, + "step": 10793 + }, + { + "epoch": 2.2189330866481654, + "grad_norm": 0.12171991914510727, + "learning_rate": 1.5064708615649601e-05, + "loss": 0.4516, + "step": 10794 + }, + { + "epoch": 2.219138657621544, + "grad_norm": 0.23397013545036316, + "learning_rate": 1.5057251836139343e-05, + "loss": 0.3816, + "step": 10795 + }, + { + "epoch": 2.2193442285949225, + "grad_norm": 0.22694621980190277, + "learning_rate": 1.5049796531768323e-05, + "loss": 0.3838, + "step": 10796 + }, + { + "epoch": 2.219549799568301, + "grad_norm": 0.234305739402771, + "learning_rate": 1.5042342702903859e-05, + "loss": 0.3874, + "step": 10797 + }, + { + "epoch": 2.2197553705416797, + "grad_norm": 0.2361372858285904, + "learning_rate": 1.5034890349913142e-05, + "loss": 0.3964, + "step": 10798 + }, + { + "epoch": 2.2199609415150583, + "grad_norm": 0.23526331782341003, + "learning_rate": 1.502743947316332e-05, + "loss": 0.3981, + "step": 10799 + }, + { + "epoch": 2.2201665124884364, + "grad_norm": 0.23586028814315796, + "learning_rate": 1.501999007302147e-05, + "loss": 0.4084, + "step": 10800 + }, + { + "epoch": 2.220372083461815, + "grad_norm": 0.2271769642829895, + "learning_rate": 1.5012542149854576e-05, + "loss": 0.3905, + "step": 10801 + }, + { + "epoch": 2.2205776544351936, + "grad_norm": 0.22880828380584717, + "learning_rate": 1.5005095704029562e-05, + "loss": 0.3896, + "step": 10802 + }, + { + "epoch": 2.220783225408572, + "grad_norm": 0.2337990701198578, + "learning_rate": 1.4997650735913297e-05, + "loss": 0.3984, + "step": 10803 + }, + { + "epoch": 2.220988796381951, + "grad_norm": 0.2161635160446167, + "learning_rate": 1.499020724587255e-05, + "loss": 0.4006, + "step": 10804 + }, + { + "epoch": 2.2211943673553294, + "grad_norm": 0.22818011045455933, + "learning_rate": 1.4982765234274027e-05, + "loss": 0.3912, + "step": 10805 + }, + { + "epoch": 2.221399938328708, + "grad_norm": 0.22331209480762482, + "learning_rate": 1.4975324701484358e-05, + "loss": 0.4113, + "step": 10806 + }, + { + "epoch": 2.2216055093020866, + "grad_norm": 0.21700911223888397, + "learning_rate": 1.4967885647870107e-05, + "loss": 0.3738, + "step": 10807 + }, + { + "epoch": 2.221811080275465, + "grad_norm": 0.12261340767145157, + "learning_rate": 1.4960448073797765e-05, + "loss": 0.4559, + "step": 10808 + }, + { + "epoch": 2.2220166512488437, + "grad_norm": 0.22570718824863434, + "learning_rate": 1.4953011979633725e-05, + "loss": 0.4089, + "step": 10809 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.22284522652626038, + "learning_rate": 1.4945577365744356e-05, + "loss": 0.406, + "step": 10810 + }, + { + "epoch": 2.222427793195601, + "grad_norm": 0.2190810590982437, + "learning_rate": 1.4938144232495923e-05, + "loss": 0.396, + "step": 10811 + }, + { + "epoch": 2.2226333641689795, + "grad_norm": 0.2320832461118698, + "learning_rate": 1.4930712580254612e-05, + "loss": 0.4115, + "step": 10812 + }, + { + "epoch": 2.222838935142358, + "grad_norm": 0.12574470043182373, + "learning_rate": 1.4923282409386543e-05, + "loss": 0.4488, + "step": 10813 + }, + { + "epoch": 2.2230445061157367, + "grad_norm": 0.21672125160694122, + "learning_rate": 1.4915853720257762e-05, + "loss": 0.4069, + "step": 10814 + }, + { + "epoch": 2.223250077089115, + "grad_norm": 0.2291223555803299, + "learning_rate": 1.490842651323427e-05, + "loss": 0.4088, + "step": 10815 + }, + { + "epoch": 2.2234556480624934, + "grad_norm": 0.23085300624370575, + "learning_rate": 1.4901000788681959e-05, + "loss": 0.3894, + "step": 10816 + }, + { + "epoch": 2.223661219035872, + "grad_norm": 0.11973418295383453, + "learning_rate": 1.489357654696664e-05, + "loss": 0.4637, + "step": 10817 + }, + { + "epoch": 2.2238667900092506, + "grad_norm": 0.2691250741481781, + "learning_rate": 1.4886153788454096e-05, + "loss": 0.4024, + "step": 10818 + }, + { + "epoch": 2.224072360982629, + "grad_norm": 0.12348726391792297, + "learning_rate": 1.4878732513510012e-05, + "loss": 0.4423, + "step": 10819 + }, + { + "epoch": 2.2242779319560078, + "grad_norm": 0.1290557086467743, + "learning_rate": 1.4871312722499987e-05, + "loss": 0.4628, + "step": 10820 + }, + { + "epoch": 2.2244835029293863, + "grad_norm": 0.2316775619983673, + "learning_rate": 1.4863894415789562e-05, + "loss": 0.3948, + "step": 10821 + }, + { + "epoch": 2.224689073902765, + "grad_norm": 0.2387668341398239, + "learning_rate": 1.4856477593744187e-05, + "loss": 0.379, + "step": 10822 + }, + { + "epoch": 2.2248946448761435, + "grad_norm": 0.22780825197696686, + "learning_rate": 1.4849062256729289e-05, + "loss": 0.3708, + "step": 10823 + }, + { + "epoch": 2.225100215849522, + "grad_norm": 0.22622719407081604, + "learning_rate": 1.484164840511017e-05, + "loss": 0.3871, + "step": 10824 + }, + { + "epoch": 2.2253057868229007, + "grad_norm": 0.22779934108257294, + "learning_rate": 1.4834236039252069e-05, + "loss": 0.3736, + "step": 10825 + }, + { + "epoch": 2.2255113577962793, + "grad_norm": 0.22025705873966217, + "learning_rate": 1.4826825159520165e-05, + "loss": 0.3883, + "step": 10826 + }, + { + "epoch": 2.225716928769658, + "grad_norm": 0.21935100853443146, + "learning_rate": 1.481941576627956e-05, + "loss": 0.3932, + "step": 10827 + }, + { + "epoch": 2.2259224997430365, + "grad_norm": 0.11909017711877823, + "learning_rate": 1.4812007859895275e-05, + "loss": 0.4316, + "step": 10828 + }, + { + "epoch": 2.226128070716415, + "grad_norm": 0.2229301780462265, + "learning_rate": 1.4804601440732245e-05, + "loss": 0.3889, + "step": 10829 + }, + { + "epoch": 2.226333641689793, + "grad_norm": 0.2314000278711319, + "learning_rate": 1.479719650915539e-05, + "loss": 0.4042, + "step": 10830 + }, + { + "epoch": 2.226539212663172, + "grad_norm": 0.23769402503967285, + "learning_rate": 1.4789793065529492e-05, + "loss": 0.4003, + "step": 10831 + }, + { + "epoch": 2.2267447836365504, + "grad_norm": 0.2327127605676651, + "learning_rate": 1.478239111021929e-05, + "loss": 0.3853, + "step": 10832 + }, + { + "epoch": 2.226950354609929, + "grad_norm": 0.23596766591072083, + "learning_rate": 1.4774990643589441e-05, + "loss": 0.4041, + "step": 10833 + }, + { + "epoch": 2.2271559255833075, + "grad_norm": 0.22967597842216492, + "learning_rate": 1.476759166600453e-05, + "loss": 0.413, + "step": 10834 + }, + { + "epoch": 2.227361496556686, + "grad_norm": 0.223694309592247, + "learning_rate": 1.476019417782907e-05, + "loss": 0.3922, + "step": 10835 + }, + { + "epoch": 2.2275670675300647, + "grad_norm": 0.22924546897411346, + "learning_rate": 1.4752798179427489e-05, + "loss": 0.3925, + "step": 10836 + }, + { + "epoch": 2.2277726385034433, + "grad_norm": 0.2322525531053543, + "learning_rate": 1.474540367116418e-05, + "loss": 0.4093, + "step": 10837 + }, + { + "epoch": 2.227978209476822, + "grad_norm": 0.22837835550308228, + "learning_rate": 1.4738010653403414e-05, + "loss": 0.3959, + "step": 10838 + }, + { + "epoch": 2.2281837804502005, + "grad_norm": 0.13115087151527405, + "learning_rate": 1.4730619126509427e-05, + "loss": 0.4592, + "step": 10839 + }, + { + "epoch": 2.228389351423579, + "grad_norm": 0.24123218655586243, + "learning_rate": 1.472322909084636e-05, + "loss": 0.389, + "step": 10840 + }, + { + "epoch": 2.2285949223969577, + "grad_norm": 0.24346770346164703, + "learning_rate": 1.4715840546778284e-05, + "loss": 0.419, + "step": 10841 + }, + { + "epoch": 2.2288004933703363, + "grad_norm": 0.2285340279340744, + "learning_rate": 1.4708453494669196e-05, + "loss": 0.4022, + "step": 10842 + }, + { + "epoch": 2.229006064343715, + "grad_norm": 0.22701993584632874, + "learning_rate": 1.4701067934883007e-05, + "loss": 0.3926, + "step": 10843 + }, + { + "epoch": 2.2292116353170934, + "grad_norm": 0.2268943190574646, + "learning_rate": 1.4693683867783597e-05, + "loss": 0.3891, + "step": 10844 + }, + { + "epoch": 2.2294172062904716, + "grad_norm": 0.23047508299350739, + "learning_rate": 1.468630129373473e-05, + "loss": 0.3973, + "step": 10845 + }, + { + "epoch": 2.22962277726385, + "grad_norm": 0.2280137687921524, + "learning_rate": 1.4678920213100116e-05, + "loss": 0.3851, + "step": 10846 + }, + { + "epoch": 2.2298283482372288, + "grad_norm": 0.2208314836025238, + "learning_rate": 1.4671540626243379e-05, + "loss": 0.3931, + "step": 10847 + }, + { + "epoch": 2.2300339192106073, + "grad_norm": 0.23788389563560486, + "learning_rate": 1.4664162533528081e-05, + "loss": 0.4042, + "step": 10848 + }, + { + "epoch": 2.230239490183986, + "grad_norm": 0.2255765050649643, + "learning_rate": 1.4656785935317708e-05, + "loss": 0.3875, + "step": 10849 + }, + { + "epoch": 2.2304450611573645, + "grad_norm": 0.22221685945987701, + "learning_rate": 1.4649410831975656e-05, + "loss": 0.3858, + "step": 10850 + }, + { + "epoch": 2.230650632130743, + "grad_norm": 0.22361934185028076, + "learning_rate": 1.4642037223865281e-05, + "loss": 0.3891, + "step": 10851 + }, + { + "epoch": 2.2308562031041217, + "grad_norm": 0.12343227863311768, + "learning_rate": 1.4634665111349843e-05, + "loss": 0.482, + "step": 10852 + }, + { + "epoch": 2.2310617740775003, + "grad_norm": 0.12411545217037201, + "learning_rate": 1.462729449479253e-05, + "loss": 0.4664, + "step": 10853 + }, + { + "epoch": 2.231267345050879, + "grad_norm": 0.2260737121105194, + "learning_rate": 1.4619925374556457e-05, + "loss": 0.392, + "step": 10854 + }, + { + "epoch": 2.2314729160242575, + "grad_norm": 0.2308768928050995, + "learning_rate": 1.461255775100466e-05, + "loss": 0.4033, + "step": 10855 + }, + { + "epoch": 2.231678486997636, + "grad_norm": 0.12042105197906494, + "learning_rate": 1.460519162450011e-05, + "loss": 0.4485, + "step": 10856 + }, + { + "epoch": 2.2318840579710146, + "grad_norm": 0.22707884013652802, + "learning_rate": 1.4597826995405697e-05, + "loss": 0.3747, + "step": 10857 + }, + { + "epoch": 2.2320896289443932, + "grad_norm": 0.23044802248477936, + "learning_rate": 1.4590463864084258e-05, + "loss": 0.3896, + "step": 10858 + }, + { + "epoch": 2.232295199917772, + "grad_norm": 0.2284078150987625, + "learning_rate": 1.458310223089853e-05, + "loss": 0.3806, + "step": 10859 + }, + { + "epoch": 2.23250077089115, + "grad_norm": 0.12638430297374725, + "learning_rate": 1.4575742096211172e-05, + "loss": 0.4579, + "step": 10860 + }, + { + "epoch": 2.2327063418645285, + "grad_norm": 0.12327645719051361, + "learning_rate": 1.4568383460384815e-05, + "loss": 0.4572, + "step": 10861 + }, + { + "epoch": 2.232911912837907, + "grad_norm": 0.22871337831020355, + "learning_rate": 1.4561026323781969e-05, + "loss": 0.3938, + "step": 10862 + }, + { + "epoch": 2.2331174838112857, + "grad_norm": 0.1175784319639206, + "learning_rate": 1.4553670686765082e-05, + "loss": 0.4228, + "step": 10863 + }, + { + "epoch": 2.2333230547846643, + "grad_norm": 0.23156176507472992, + "learning_rate": 1.4546316549696521e-05, + "loss": 0.3983, + "step": 10864 + }, + { + "epoch": 2.233528625758043, + "grad_norm": 0.22325018048286438, + "learning_rate": 1.453896391293862e-05, + "loss": 0.4036, + "step": 10865 + }, + { + "epoch": 2.2337341967314215, + "grad_norm": 0.2427932471036911, + "learning_rate": 1.4531612776853592e-05, + "loss": 0.3779, + "step": 10866 + }, + { + "epoch": 2.2339397677048, + "grad_norm": 0.12050554901361465, + "learning_rate": 1.452426314180359e-05, + "loss": 0.4408, + "step": 10867 + }, + { + "epoch": 2.2341453386781787, + "grad_norm": 0.2303098738193512, + "learning_rate": 1.4516915008150703e-05, + "loss": 0.3944, + "step": 10868 + }, + { + "epoch": 2.2343509096515572, + "grad_norm": 0.22475799918174744, + "learning_rate": 1.4509568376256933e-05, + "loss": 0.3911, + "step": 10869 + }, + { + "epoch": 2.234556480624936, + "grad_norm": 0.12232775241136551, + "learning_rate": 1.4502223246484222e-05, + "loss": 0.4503, + "step": 10870 + }, + { + "epoch": 2.2347620515983144, + "grad_norm": 0.23218752443790436, + "learning_rate": 1.4494879619194408e-05, + "loss": 0.3916, + "step": 10871 + }, + { + "epoch": 2.234967622571693, + "grad_norm": 0.22913837432861328, + "learning_rate": 1.4487537494749308e-05, + "loss": 0.3967, + "step": 10872 + }, + { + "epoch": 2.2351731935450716, + "grad_norm": 0.22640950977802277, + "learning_rate": 1.4480196873510623e-05, + "loss": 0.3938, + "step": 10873 + }, + { + "epoch": 2.23537876451845, + "grad_norm": 0.22983142733573914, + "learning_rate": 1.4472857755839987e-05, + "loss": 0.3957, + "step": 10874 + }, + { + "epoch": 2.2355843354918283, + "grad_norm": 0.13250325620174408, + "learning_rate": 1.4465520142098968e-05, + "loss": 0.4521, + "step": 10875 + }, + { + "epoch": 2.235789906465207, + "grad_norm": 0.12669454514980316, + "learning_rate": 1.4458184032649049e-05, + "loss": 0.4651, + "step": 10876 + }, + { + "epoch": 2.2359954774385855, + "grad_norm": 0.22359710931777954, + "learning_rate": 1.4450849427851654e-05, + "loss": 0.3771, + "step": 10877 + }, + { + "epoch": 2.236201048411964, + "grad_norm": 0.22868263721466064, + "learning_rate": 1.4443516328068107e-05, + "loss": 0.3723, + "step": 10878 + }, + { + "epoch": 2.2364066193853427, + "grad_norm": 0.2262980043888092, + "learning_rate": 1.4436184733659704e-05, + "loss": 0.3886, + "step": 10879 + }, + { + "epoch": 2.2366121903587213, + "grad_norm": 0.22829292714595795, + "learning_rate": 1.4428854644987623e-05, + "loss": 0.3879, + "step": 10880 + }, + { + "epoch": 2.2368177613321, + "grad_norm": 0.22236782312393188, + "learning_rate": 1.4421526062412972e-05, + "loss": 0.3716, + "step": 10881 + }, + { + "epoch": 2.2370233323054785, + "grad_norm": 0.2244395762681961, + "learning_rate": 1.4414198986296825e-05, + "loss": 0.3716, + "step": 10882 + }, + { + "epoch": 2.237228903278857, + "grad_norm": 0.23614956438541412, + "learning_rate": 1.4406873417000133e-05, + "loss": 0.4046, + "step": 10883 + }, + { + "epoch": 2.2374344742522356, + "grad_norm": 0.23262259364128113, + "learning_rate": 1.4399549354883795e-05, + "loss": 0.392, + "step": 10884 + }, + { + "epoch": 2.237640045225614, + "grad_norm": 0.23623405396938324, + "learning_rate": 1.439222680030862e-05, + "loss": 0.4101, + "step": 10885 + }, + { + "epoch": 2.237845616198993, + "grad_norm": 0.12626418471336365, + "learning_rate": 1.4384905753635388e-05, + "loss": 0.436, + "step": 10886 + }, + { + "epoch": 2.2380511871723714, + "grad_norm": 0.2217606157064438, + "learning_rate": 1.437758621522475e-05, + "loss": 0.3971, + "step": 10887 + }, + { + "epoch": 2.23825675814575, + "grad_norm": 0.22895729541778564, + "learning_rate": 1.4370268185437314e-05, + "loss": 0.4164, + "step": 10888 + }, + { + "epoch": 2.2384623291191286, + "grad_norm": 0.26154306530952454, + "learning_rate": 1.4362951664633601e-05, + "loss": 0.411, + "step": 10889 + }, + { + "epoch": 2.2386679000925067, + "grad_norm": 0.12071531265974045, + "learning_rate": 1.4355636653174064e-05, + "loss": 0.46, + "step": 10890 + }, + { + "epoch": 2.2388734710658853, + "grad_norm": 0.23138496279716492, + "learning_rate": 1.4348323151419076e-05, + "loss": 0.3929, + "step": 10891 + }, + { + "epoch": 2.239079042039264, + "grad_norm": 0.22143509984016418, + "learning_rate": 1.4341011159728923e-05, + "loss": 0.3937, + "step": 10892 + }, + { + "epoch": 2.2392846130126425, + "grad_norm": 0.23120230436325073, + "learning_rate": 1.433370067846387e-05, + "loss": 0.4061, + "step": 10893 + }, + { + "epoch": 2.239490183986021, + "grad_norm": 0.22361977398395538, + "learning_rate": 1.4326391707984047e-05, + "loss": 0.3993, + "step": 10894 + }, + { + "epoch": 2.2396957549593997, + "grad_norm": 0.1270783543586731, + "learning_rate": 1.431908424864954e-05, + "loss": 0.424, + "step": 10895 + }, + { + "epoch": 2.2399013259327782, + "grad_norm": 0.22819988429546356, + "learning_rate": 1.4311778300820347e-05, + "loss": 0.4009, + "step": 10896 + }, + { + "epoch": 2.240106896906157, + "grad_norm": 0.22298060357570648, + "learning_rate": 1.4304473864856404e-05, + "loss": 0.3959, + "step": 10897 + }, + { + "epoch": 2.2403124678795354, + "grad_norm": 0.22824987769126892, + "learning_rate": 1.4297170941117544e-05, + "loss": 0.4174, + "step": 10898 + }, + { + "epoch": 2.240518038852914, + "grad_norm": 0.1287529617547989, + "learning_rate": 1.4289869529963582e-05, + "loss": 0.4321, + "step": 10899 + }, + { + "epoch": 2.2407236098262926, + "grad_norm": 0.2339385449886322, + "learning_rate": 1.428256963175421e-05, + "loss": 0.4036, + "step": 10900 + }, + { + "epoch": 2.240929180799671, + "grad_norm": 0.22810976207256317, + "learning_rate": 1.4275271246849061e-05, + "loss": 0.4073, + "step": 10901 + }, + { + "epoch": 2.2411347517730498, + "grad_norm": 0.22102433443069458, + "learning_rate": 1.4267974375607675e-05, + "loss": 0.3761, + "step": 10902 + }, + { + "epoch": 2.2413403227464284, + "grad_norm": 0.2228943556547165, + "learning_rate": 1.4260679018389566e-05, + "loss": 0.3958, + "step": 10903 + }, + { + "epoch": 2.241545893719807, + "grad_norm": 0.22356650233268738, + "learning_rate": 1.4253385175554126e-05, + "loss": 0.3841, + "step": 10904 + }, + { + "epoch": 2.241751464693185, + "grad_norm": 0.1219724789261818, + "learning_rate": 1.4246092847460679e-05, + "loss": 0.4373, + "step": 10905 + }, + { + "epoch": 2.2419570356665637, + "grad_norm": 0.22389782965183258, + "learning_rate": 1.42388020344685e-05, + "loss": 0.3908, + "step": 10906 + }, + { + "epoch": 2.2421626066399423, + "grad_norm": 0.22778619825839996, + "learning_rate": 1.4231512736936774e-05, + "loss": 0.4086, + "step": 10907 + }, + { + "epoch": 2.242368177613321, + "grad_norm": 0.24095553159713745, + "learning_rate": 1.4224224955224604e-05, + "loss": 0.3859, + "step": 10908 + }, + { + "epoch": 2.2425737485866994, + "grad_norm": 0.2397175282239914, + "learning_rate": 1.4216938689691019e-05, + "loss": 0.4006, + "step": 10909 + }, + { + "epoch": 2.242779319560078, + "grad_norm": 0.22254031896591187, + "learning_rate": 1.4209653940694986e-05, + "loss": 0.4021, + "step": 10910 + }, + { + "epoch": 2.2429848905334566, + "grad_norm": 0.12882784008979797, + "learning_rate": 1.4202370708595396e-05, + "loss": 0.4369, + "step": 10911 + }, + { + "epoch": 2.243190461506835, + "grad_norm": 0.13095501065254211, + "learning_rate": 1.4195088993751034e-05, + "loss": 0.4539, + "step": 10912 + }, + { + "epoch": 2.243396032480214, + "grad_norm": 0.2357592135667801, + "learning_rate": 1.418780879652067e-05, + "loss": 0.3915, + "step": 10913 + }, + { + "epoch": 2.2436016034535924, + "grad_norm": 0.23308870196342468, + "learning_rate": 1.4180530117262953e-05, + "loss": 0.4003, + "step": 10914 + }, + { + "epoch": 2.243807174426971, + "grad_norm": 0.22599655389785767, + "learning_rate": 1.4173252956336463e-05, + "loss": 0.3978, + "step": 10915 + }, + { + "epoch": 2.2440127454003496, + "grad_norm": 0.23513002693653107, + "learning_rate": 1.416597731409972e-05, + "loss": 0.3943, + "step": 10916 + }, + { + "epoch": 2.244218316373728, + "grad_norm": 0.1267446130514145, + "learning_rate": 1.4158703190911157e-05, + "loss": 0.4464, + "step": 10917 + }, + { + "epoch": 2.2444238873471067, + "grad_norm": 0.22103582322597504, + "learning_rate": 1.4151430587129133e-05, + "loss": 0.3842, + "step": 10918 + }, + { + "epoch": 2.2446294583204853, + "grad_norm": 0.2322588562965393, + "learning_rate": 1.4144159503111928e-05, + "loss": 0.4096, + "step": 10919 + }, + { + "epoch": 2.2448350292938635, + "grad_norm": 0.1323188990354538, + "learning_rate": 1.4136889939217776e-05, + "loss": 0.4459, + "step": 10920 + }, + { + "epoch": 2.245040600267242, + "grad_norm": 0.2242937535047531, + "learning_rate": 1.41296218958048e-05, + "loss": 0.3859, + "step": 10921 + }, + { + "epoch": 2.2452461712406206, + "grad_norm": 0.22466784715652466, + "learning_rate": 1.4122355373231073e-05, + "loss": 0.3982, + "step": 10922 + }, + { + "epoch": 2.2454517422139992, + "grad_norm": 0.22480922937393188, + "learning_rate": 1.411509037185457e-05, + "loss": 0.4073, + "step": 10923 + }, + { + "epoch": 2.245657313187378, + "grad_norm": 0.12106183916330338, + "learning_rate": 1.4107826892033194e-05, + "loss": 0.4505, + "step": 10924 + }, + { + "epoch": 2.2458628841607564, + "grad_norm": 0.2291100174188614, + "learning_rate": 1.4100564934124812e-05, + "loss": 0.3902, + "step": 10925 + }, + { + "epoch": 2.246068455134135, + "grad_norm": 0.22419095039367676, + "learning_rate": 1.409330449848716e-05, + "loss": 0.3931, + "step": 10926 + }, + { + "epoch": 2.2462740261075136, + "grad_norm": 0.22613660991191864, + "learning_rate": 1.4086045585477947e-05, + "loss": 0.3922, + "step": 10927 + }, + { + "epoch": 2.246479597080892, + "grad_norm": 0.22982370853424072, + "learning_rate": 1.407878819545478e-05, + "loss": 0.399, + "step": 10928 + }, + { + "epoch": 2.2466851680542708, + "grad_norm": 0.23034709692001343, + "learning_rate": 1.4071532328775196e-05, + "loss": 0.3812, + "step": 10929 + }, + { + "epoch": 2.2468907390276494, + "grad_norm": 0.23110920190811157, + "learning_rate": 1.4064277985796652e-05, + "loss": 0.389, + "step": 10930 + }, + { + "epoch": 2.247096310001028, + "grad_norm": 0.2307683825492859, + "learning_rate": 1.4057025166876537e-05, + "loss": 0.4113, + "step": 10931 + }, + { + "epoch": 2.2473018809744065, + "grad_norm": 0.23556135594844818, + "learning_rate": 1.4049773872372172e-05, + "loss": 0.3884, + "step": 10932 + }, + { + "epoch": 2.247507451947785, + "grad_norm": 0.230165496468544, + "learning_rate": 1.4042524102640763e-05, + "loss": 0.3956, + "step": 10933 + }, + { + "epoch": 2.2477130229211637, + "grad_norm": 0.22927415370941162, + "learning_rate": 1.4035275858039516e-05, + "loss": 0.3868, + "step": 10934 + }, + { + "epoch": 2.2479185938945423, + "grad_norm": 0.22793439030647278, + "learning_rate": 1.4028029138925497e-05, + "loss": 0.3894, + "step": 10935 + }, + { + "epoch": 2.248124164867921, + "grad_norm": 0.2283446490764618, + "learning_rate": 1.4020783945655724e-05, + "loss": 0.3903, + "step": 10936 + }, + { + "epoch": 2.248329735841299, + "grad_norm": 0.22100144624710083, + "learning_rate": 1.4013540278587125e-05, + "loss": 0.3942, + "step": 10937 + }, + { + "epoch": 2.2485353068146776, + "grad_norm": 0.12830045819282532, + "learning_rate": 1.4006298138076567e-05, + "loss": 0.4512, + "step": 10938 + }, + { + "epoch": 2.248740877788056, + "grad_norm": 0.2236565202474594, + "learning_rate": 1.3999057524480838e-05, + "loss": 0.4032, + "step": 10939 + }, + { + "epoch": 2.248946448761435, + "grad_norm": 0.22065366804599762, + "learning_rate": 1.3991818438156628e-05, + "loss": 0.3844, + "step": 10940 + }, + { + "epoch": 2.2491520197348134, + "grad_norm": 0.12815195322036743, + "learning_rate": 1.3984580879460613e-05, + "loss": 0.4361, + "step": 10941 + }, + { + "epoch": 2.249357590708192, + "grad_norm": 0.23110713064670563, + "learning_rate": 1.3977344848749327e-05, + "loss": 0.3976, + "step": 10942 + }, + { + "epoch": 2.2495631616815706, + "grad_norm": 0.23048558831214905, + "learning_rate": 1.3970110346379258e-05, + "loss": 0.3893, + "step": 10943 + }, + { + "epoch": 2.249768732654949, + "grad_norm": 0.12720687687397003, + "learning_rate": 1.3962877372706823e-05, + "loss": 0.4534, + "step": 10944 + }, + { + "epoch": 2.2499743036283277, + "grad_norm": 0.2292504608631134, + "learning_rate": 1.3955645928088343e-05, + "loss": 0.4032, + "step": 10945 + }, + { + "epoch": 2.2501798746017063, + "grad_norm": 0.26804453134536743, + "learning_rate": 1.3948416012880095e-05, + "loss": 0.3896, + "step": 10946 + }, + { + "epoch": 2.250385445575085, + "grad_norm": 0.24208854138851166, + "learning_rate": 1.3941187627438255e-05, + "loss": 0.4036, + "step": 10947 + }, + { + "epoch": 2.2505910165484635, + "grad_norm": 0.21898695826530457, + "learning_rate": 1.393396077211892e-05, + "loss": 0.3847, + "step": 10948 + }, + { + "epoch": 2.250796587521842, + "grad_norm": 0.24147653579711914, + "learning_rate": 1.3926735447278149e-05, + "loss": 0.399, + "step": 10949 + }, + { + "epoch": 2.2510021584952202, + "grad_norm": 0.21761365234851837, + "learning_rate": 1.3919511653271885e-05, + "loss": 0.3977, + "step": 10950 + }, + { + "epoch": 2.2512077294685993, + "grad_norm": 0.23133422434329987, + "learning_rate": 1.3912289390456018e-05, + "loss": 0.3832, + "step": 10951 + }, + { + "epoch": 2.2514133004419774, + "grad_norm": 0.23142319917678833, + "learning_rate": 1.3905068659186345e-05, + "loss": 0.4152, + "step": 10952 + }, + { + "epoch": 2.251618871415356, + "grad_norm": 0.21739207208156586, + "learning_rate": 1.3897849459818602e-05, + "loss": 0.3866, + "step": 10953 + }, + { + "epoch": 2.2518244423887346, + "grad_norm": 0.2368880808353424, + "learning_rate": 1.389063179270843e-05, + "loss": 0.3975, + "step": 10954 + }, + { + "epoch": 2.252030013362113, + "grad_norm": 0.22230856120586395, + "learning_rate": 1.3883415658211439e-05, + "loss": 0.3897, + "step": 10955 + }, + { + "epoch": 2.2522355843354918, + "grad_norm": 0.2135685384273529, + "learning_rate": 1.387620105668312e-05, + "loss": 0.3953, + "step": 10956 + }, + { + "epoch": 2.2524411553088703, + "grad_norm": 0.22502809762954712, + "learning_rate": 1.3868987988478905e-05, + "loss": 0.3849, + "step": 10957 + }, + { + "epoch": 2.252646726282249, + "grad_norm": 0.12617872655391693, + "learning_rate": 1.3861776453954141e-05, + "loss": 0.4533, + "step": 10958 + }, + { + "epoch": 2.2528522972556275, + "grad_norm": 0.12221905589103699, + "learning_rate": 1.3854566453464114e-05, + "loss": 0.4514, + "step": 10959 + }, + { + "epoch": 2.253057868229006, + "grad_norm": 0.22371545433998108, + "learning_rate": 1.3847357987364026e-05, + "loss": 0.4013, + "step": 10960 + }, + { + "epoch": 2.2532634392023847, + "grad_norm": 0.22430896759033203, + "learning_rate": 1.3840151056008989e-05, + "loss": 0.3826, + "step": 10961 + }, + { + "epoch": 2.2534690101757633, + "grad_norm": 0.2251027673482895, + "learning_rate": 1.3832945659754084e-05, + "loss": 0.39, + "step": 10962 + }, + { + "epoch": 2.253674581149142, + "grad_norm": 0.21788759529590607, + "learning_rate": 1.3825741798954265e-05, + "loss": 0.3945, + "step": 10963 + }, + { + "epoch": 2.2538801521225205, + "grad_norm": 0.2384837120771408, + "learning_rate": 1.3818539473964443e-05, + "loss": 0.3972, + "step": 10964 + }, + { + "epoch": 2.254085723095899, + "grad_norm": 0.2365540862083435, + "learning_rate": 1.381133868513944e-05, + "loss": 0.4051, + "step": 10965 + }, + { + "epoch": 2.2542912940692776, + "grad_norm": 0.22459320724010468, + "learning_rate": 1.3804139432833994e-05, + "loss": 0.3933, + "step": 10966 + }, + { + "epoch": 2.254496865042656, + "grad_norm": 0.2330470085144043, + "learning_rate": 1.3796941717402797e-05, + "loss": 0.4029, + "step": 10967 + }, + { + "epoch": 2.2547024360160344, + "grad_norm": 0.2302565574645996, + "learning_rate": 1.3789745539200443e-05, + "loss": 0.3685, + "step": 10968 + }, + { + "epoch": 2.254908006989413, + "grad_norm": 0.12435781210660934, + "learning_rate": 1.3782550898581435e-05, + "loss": 0.465, + "step": 10969 + }, + { + "epoch": 2.2551135779627915, + "grad_norm": 0.22399941086769104, + "learning_rate": 1.377535779590025e-05, + "loss": 0.3946, + "step": 10970 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 0.2299404740333557, + "learning_rate": 1.3768166231511242e-05, + "loss": 0.3981, + "step": 10971 + }, + { + "epoch": 2.2555247199095487, + "grad_norm": 0.22755853831768036, + "learning_rate": 1.3760976205768704e-05, + "loss": 0.4128, + "step": 10972 + }, + { + "epoch": 2.2557302908829273, + "grad_norm": 0.23051007091999054, + "learning_rate": 1.3753787719026858e-05, + "loss": 0.4034, + "step": 10973 + }, + { + "epoch": 2.255935861856306, + "grad_norm": 0.11795416474342346, + "learning_rate": 1.3746600771639847e-05, + "loss": 0.4349, + "step": 10974 + }, + { + "epoch": 2.2561414328296845, + "grad_norm": 0.22369509935379028, + "learning_rate": 1.3739415363961725e-05, + "loss": 0.3958, + "step": 10975 + }, + { + "epoch": 2.256347003803063, + "grad_norm": 0.224918395280838, + "learning_rate": 1.3732231496346506e-05, + "loss": 0.4054, + "step": 10976 + }, + { + "epoch": 2.2565525747764417, + "grad_norm": 0.22502835094928741, + "learning_rate": 1.3725049169148101e-05, + "loss": 0.3986, + "step": 10977 + }, + { + "epoch": 2.2567581457498203, + "grad_norm": 0.2298583686351776, + "learning_rate": 1.3717868382720342e-05, + "loss": 0.4023, + "step": 10978 + }, + { + "epoch": 2.256963716723199, + "grad_norm": 0.2239440232515335, + "learning_rate": 1.3710689137417002e-05, + "loss": 0.3776, + "step": 10979 + }, + { + "epoch": 2.2571692876965774, + "grad_norm": 0.12783947587013245, + "learning_rate": 1.3703511433591756e-05, + "loss": 0.4592, + "step": 10980 + }, + { + "epoch": 2.257374858669956, + "grad_norm": 0.23055274784564972, + "learning_rate": 1.3696335271598206e-05, + "loss": 0.3805, + "step": 10981 + }, + { + "epoch": 2.257580429643334, + "grad_norm": 0.22777009010314941, + "learning_rate": 1.3689160651789923e-05, + "loss": 0.3927, + "step": 10982 + }, + { + "epoch": 2.2577860006167128, + "grad_norm": 0.2232956886291504, + "learning_rate": 1.3681987574520346e-05, + "loss": 0.3783, + "step": 10983 + }, + { + "epoch": 2.2579915715900913, + "grad_norm": 0.2353593409061432, + "learning_rate": 1.3674816040142864e-05, + "loss": 0.4053, + "step": 10984 + }, + { + "epoch": 2.25819714256347, + "grad_norm": 0.12569645047187805, + "learning_rate": 1.3667646049010782e-05, + "loss": 0.4533, + "step": 10985 + }, + { + "epoch": 2.2584027135368485, + "grad_norm": 0.22515416145324707, + "learning_rate": 1.3660477601477328e-05, + "loss": 0.3757, + "step": 10986 + }, + { + "epoch": 2.258608284510227, + "grad_norm": 0.13127067685127258, + "learning_rate": 1.3653310697895652e-05, + "loss": 0.4595, + "step": 10987 + }, + { + "epoch": 2.2588138554836057, + "grad_norm": 0.22975093126296997, + "learning_rate": 1.3646145338618855e-05, + "loss": 0.3877, + "step": 10988 + }, + { + "epoch": 2.2590194264569843, + "grad_norm": 0.22624441981315613, + "learning_rate": 1.3638981523999929e-05, + "loss": 0.379, + "step": 10989 + }, + { + "epoch": 2.259224997430363, + "grad_norm": 0.12386941909790039, + "learning_rate": 1.3631819254391793e-05, + "loss": 0.4457, + "step": 10990 + }, + { + "epoch": 2.2594305684037415, + "grad_norm": 0.2416963428258896, + "learning_rate": 1.3624658530147319e-05, + "loss": 0.3763, + "step": 10991 + }, + { + "epoch": 2.25963613937712, + "grad_norm": 0.22425812482833862, + "learning_rate": 1.3617499351619269e-05, + "loss": 0.3828, + "step": 10992 + }, + { + "epoch": 2.2598417103504986, + "grad_norm": 0.13300848007202148, + "learning_rate": 1.3610341719160347e-05, + "loss": 0.4532, + "step": 10993 + }, + { + "epoch": 2.260047281323877, + "grad_norm": 0.22609826922416687, + "learning_rate": 1.3603185633123177e-05, + "loss": 0.3796, + "step": 10994 + }, + { + "epoch": 2.260252852297256, + "grad_norm": 0.22295403480529785, + "learning_rate": 1.3596031093860283e-05, + "loss": 0.4128, + "step": 10995 + }, + { + "epoch": 2.2604584232706344, + "grad_norm": 0.22617916762828827, + "learning_rate": 1.3588878101724169e-05, + "loss": 0.4004, + "step": 10996 + }, + { + "epoch": 2.2606639942440125, + "grad_norm": 0.23671671748161316, + "learning_rate": 1.3581726657067217e-05, + "loss": 0.3947, + "step": 10997 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.2252146303653717, + "learning_rate": 1.357457676024175e-05, + "loss": 0.3923, + "step": 10998 + }, + { + "epoch": 2.2610751361907697, + "grad_norm": 0.2305798977613449, + "learning_rate": 1.3567428411599997e-05, + "loss": 0.4119, + "step": 10999 + }, + { + "epoch": 2.2612807071641483, + "grad_norm": 0.23965519666671753, + "learning_rate": 1.3560281611494131e-05, + "loss": 0.3992, + "step": 11000 + }, + { + "epoch": 2.261486278137527, + "grad_norm": 0.22159597277641296, + "learning_rate": 1.355313636027624e-05, + "loss": 0.3947, + "step": 11001 + }, + { + "epoch": 2.2616918491109055, + "grad_norm": 0.23163023591041565, + "learning_rate": 1.3545992658298328e-05, + "loss": 0.3794, + "step": 11002 + }, + { + "epoch": 2.261897420084284, + "grad_norm": 0.2376321256160736, + "learning_rate": 1.3538850505912354e-05, + "loss": 0.3868, + "step": 11003 + }, + { + "epoch": 2.2621029910576627, + "grad_norm": 0.22760237753391266, + "learning_rate": 1.3531709903470169e-05, + "loss": 0.3917, + "step": 11004 + }, + { + "epoch": 2.2623085620310412, + "grad_norm": 0.22676926851272583, + "learning_rate": 1.3524570851323556e-05, + "loss": 0.3942, + "step": 11005 + }, + { + "epoch": 2.26251413300442, + "grad_norm": 0.22704067826271057, + "learning_rate": 1.351743334982422e-05, + "loss": 0.3709, + "step": 11006 + }, + { + "epoch": 2.2627197039777984, + "grad_norm": 0.24701926112174988, + "learning_rate": 1.3510297399323792e-05, + "loss": 0.3939, + "step": 11007 + }, + { + "epoch": 2.262925274951177, + "grad_norm": 0.2252301573753357, + "learning_rate": 1.3503163000173827e-05, + "loss": 0.373, + "step": 11008 + }, + { + "epoch": 2.2631308459245556, + "grad_norm": 0.2303270697593689, + "learning_rate": 1.3496030152725793e-05, + "loss": 0.4049, + "step": 11009 + }, + { + "epoch": 2.263336416897934, + "grad_norm": 0.22634254395961761, + "learning_rate": 1.3488898857331116e-05, + "loss": 0.3793, + "step": 11010 + }, + { + "epoch": 2.2635419878713128, + "grad_norm": 0.231819748878479, + "learning_rate": 1.3481769114341098e-05, + "loss": 0.3854, + "step": 11011 + }, + { + "epoch": 2.263747558844691, + "grad_norm": 0.12441035360097885, + "learning_rate": 1.3474640924107014e-05, + "loss": 0.4482, + "step": 11012 + }, + { + "epoch": 2.2639531298180695, + "grad_norm": 0.23297782242298126, + "learning_rate": 1.3467514286980024e-05, + "loss": 0.3978, + "step": 11013 + }, + { + "epoch": 2.264158700791448, + "grad_norm": 0.23407147824764252, + "learning_rate": 1.346038920331122e-05, + "loss": 0.3915, + "step": 11014 + }, + { + "epoch": 2.2643642717648267, + "grad_norm": 0.22615815699100494, + "learning_rate": 1.3453265673451623e-05, + "loss": 0.3919, + "step": 11015 + }, + { + "epoch": 2.2645698427382053, + "grad_norm": 0.23967291414737701, + "learning_rate": 1.3446143697752166e-05, + "loss": 0.3988, + "step": 11016 + }, + { + "epoch": 2.264775413711584, + "grad_norm": 0.2341252863407135, + "learning_rate": 1.3439023276563739e-05, + "loss": 0.363, + "step": 11017 + }, + { + "epoch": 2.2649809846849625, + "grad_norm": 0.22647178173065186, + "learning_rate": 1.3431904410237122e-05, + "loss": 0.3922, + "step": 11018 + }, + { + "epoch": 2.265186555658341, + "grad_norm": 0.2393738180398941, + "learning_rate": 1.3424787099123023e-05, + "loss": 0.3874, + "step": 11019 + }, + { + "epoch": 2.2653921266317196, + "grad_norm": 0.23167793452739716, + "learning_rate": 1.3417671343572087e-05, + "loss": 0.3921, + "step": 11020 + }, + { + "epoch": 2.265597697605098, + "grad_norm": 0.2206806093454361, + "learning_rate": 1.3410557143934864e-05, + "loss": 0.3988, + "step": 11021 + }, + { + "epoch": 2.265803268578477, + "grad_norm": 0.22465433180332184, + "learning_rate": 1.340344450056184e-05, + "loss": 0.3896, + "step": 11022 + }, + { + "epoch": 2.2660088395518554, + "grad_norm": 0.22498202323913574, + "learning_rate": 1.3396333413803412e-05, + "loss": 0.3902, + "step": 11023 + }, + { + "epoch": 2.266214410525234, + "grad_norm": 0.23176932334899902, + "learning_rate": 1.3389223884009937e-05, + "loss": 0.4043, + "step": 11024 + }, + { + "epoch": 2.2664199814986126, + "grad_norm": 0.22066771984100342, + "learning_rate": 1.3382115911531653e-05, + "loss": 0.3588, + "step": 11025 + }, + { + "epoch": 2.266625552471991, + "grad_norm": 0.23479969799518585, + "learning_rate": 1.3375009496718729e-05, + "loss": 0.4034, + "step": 11026 + }, + { + "epoch": 2.2668311234453693, + "grad_norm": 0.21714085340499878, + "learning_rate": 1.336790463992128e-05, + "loss": 0.4034, + "step": 11027 + }, + { + "epoch": 2.267036694418748, + "grad_norm": 0.22929847240447998, + "learning_rate": 1.336080134148932e-05, + "loss": 0.4047, + "step": 11028 + }, + { + "epoch": 2.2672422653921265, + "grad_norm": 0.23881329596042633, + "learning_rate": 1.3353699601772797e-05, + "loss": 0.3813, + "step": 11029 + }, + { + "epoch": 2.267447836365505, + "grad_norm": 0.22318050265312195, + "learning_rate": 1.3346599421121562e-05, + "loss": 0.4027, + "step": 11030 + }, + { + "epoch": 2.2676534073388837, + "grad_norm": 0.21505969762802124, + "learning_rate": 1.3339500799885443e-05, + "loss": 0.3957, + "step": 11031 + }, + { + "epoch": 2.2678589783122622, + "grad_norm": 0.22498784959316254, + "learning_rate": 1.3332403738414138e-05, + "loss": 0.3994, + "step": 11032 + }, + { + "epoch": 2.268064549285641, + "grad_norm": 0.23193588852882385, + "learning_rate": 1.3325308237057274e-05, + "loss": 0.3767, + "step": 11033 + }, + { + "epoch": 2.2682701202590194, + "grad_norm": 0.2315264791250229, + "learning_rate": 1.3318214296164444e-05, + "loss": 0.4012, + "step": 11034 + }, + { + "epoch": 2.268475691232398, + "grad_norm": 0.2320316731929779, + "learning_rate": 1.3311121916085105e-05, + "loss": 0.3979, + "step": 11035 + }, + { + "epoch": 2.2686812622057766, + "grad_norm": 0.22784501314163208, + "learning_rate": 1.3304031097168684e-05, + "loss": 0.3942, + "step": 11036 + }, + { + "epoch": 2.268886833179155, + "grad_norm": 0.22963948547840118, + "learning_rate": 1.329694183976449e-05, + "loss": 0.3872, + "step": 11037 + }, + { + "epoch": 2.2690924041525338, + "grad_norm": 0.2397637516260147, + "learning_rate": 1.32898541442218e-05, + "loss": 0.4042, + "step": 11038 + }, + { + "epoch": 2.2692979751259124, + "grad_norm": 0.22877174615859985, + "learning_rate": 1.3282768010889788e-05, + "loss": 0.39, + "step": 11039 + }, + { + "epoch": 2.269503546099291, + "grad_norm": 0.21806636452674866, + "learning_rate": 1.3275683440117551e-05, + "loss": 0.3721, + "step": 11040 + }, + { + "epoch": 2.2697091170726695, + "grad_norm": 0.22859534621238708, + "learning_rate": 1.3268600432254108e-05, + "loss": 0.4001, + "step": 11041 + }, + { + "epoch": 2.2699146880460477, + "grad_norm": 0.22555097937583923, + "learning_rate": 1.3261518987648413e-05, + "loss": 0.3969, + "step": 11042 + }, + { + "epoch": 2.2701202590194263, + "grad_norm": 0.22480298578739166, + "learning_rate": 1.3254439106649332e-05, + "loss": 0.3929, + "step": 11043 + }, + { + "epoch": 2.270325829992805, + "grad_norm": 0.13393786549568176, + "learning_rate": 1.324736078960564e-05, + "loss": 0.4585, + "step": 11044 + }, + { + "epoch": 2.2705314009661834, + "grad_norm": 0.22970856726169586, + "learning_rate": 1.324028403686609e-05, + "loss": 0.4069, + "step": 11045 + }, + { + "epoch": 2.270736971939562, + "grad_norm": 0.22466929256916046, + "learning_rate": 1.3233208848779298e-05, + "loss": 0.3929, + "step": 11046 + }, + { + "epoch": 2.2709425429129406, + "grad_norm": 0.12328503280878067, + "learning_rate": 1.3226135225693829e-05, + "loss": 0.4301, + "step": 11047 + }, + { + "epoch": 2.271148113886319, + "grad_norm": 0.2344934195280075, + "learning_rate": 1.3219063167958165e-05, + "loss": 0.3806, + "step": 11048 + }, + { + "epoch": 2.271353684859698, + "grad_norm": 0.23457783460617065, + "learning_rate": 1.3211992675920716e-05, + "loss": 0.3918, + "step": 11049 + }, + { + "epoch": 2.2715592558330764, + "grad_norm": 0.12788406014442444, + "learning_rate": 1.3204923749929811e-05, + "loss": 0.4623, + "step": 11050 + }, + { + "epoch": 2.271764826806455, + "grad_norm": 0.12366097420454025, + "learning_rate": 1.319785639033369e-05, + "loss": 0.431, + "step": 11051 + }, + { + "epoch": 2.2719703977798336, + "grad_norm": 0.22478674352169037, + "learning_rate": 1.3190790597480558e-05, + "loss": 0.4044, + "step": 11052 + }, + { + "epoch": 2.272175968753212, + "grad_norm": 0.2239609956741333, + "learning_rate": 1.3183726371718493e-05, + "loss": 0.3959, + "step": 11053 + }, + { + "epoch": 2.2723815397265907, + "grad_norm": 0.22685250639915466, + "learning_rate": 1.3176663713395506e-05, + "loss": 0.4002, + "step": 11054 + }, + { + "epoch": 2.2725871106999693, + "grad_norm": 0.2281496375799179, + "learning_rate": 1.3169602622859576e-05, + "loss": 0.3986, + "step": 11055 + }, + { + "epoch": 2.272792681673348, + "grad_norm": 0.23187507688999176, + "learning_rate": 1.3162543100458542e-05, + "loss": 0.4239, + "step": 11056 + }, + { + "epoch": 2.272998252646726, + "grad_norm": 0.2259424477815628, + "learning_rate": 1.3155485146540192e-05, + "loss": 0.381, + "step": 11057 + }, + { + "epoch": 2.273203823620105, + "grad_norm": 0.23765668272972107, + "learning_rate": 1.3148428761452263e-05, + "loss": 0.4185, + "step": 11058 + }, + { + "epoch": 2.2734093945934832, + "grad_norm": 0.23085662722587585, + "learning_rate": 1.3141373945542375e-05, + "loss": 0.4, + "step": 11059 + }, + { + "epoch": 2.273614965566862, + "grad_norm": 0.22228921949863434, + "learning_rate": 1.3134320699158083e-05, + "loss": 0.3736, + "step": 11060 + }, + { + "epoch": 2.2738205365402404, + "grad_norm": 0.21951285004615784, + "learning_rate": 1.3127269022646872e-05, + "loss": 0.3928, + "step": 11061 + }, + { + "epoch": 2.274026107513619, + "grad_norm": 0.1213352307677269, + "learning_rate": 1.3120218916356144e-05, + "loss": 0.4417, + "step": 11062 + }, + { + "epoch": 2.2742316784869976, + "grad_norm": 0.23710954189300537, + "learning_rate": 1.3113170380633223e-05, + "loss": 0.3963, + "step": 11063 + }, + { + "epoch": 2.274437249460376, + "grad_norm": 0.23138689994812012, + "learning_rate": 1.310612341582535e-05, + "loss": 0.3926, + "step": 11064 + }, + { + "epoch": 2.2746428204337548, + "grad_norm": 0.12516102194786072, + "learning_rate": 1.309907802227971e-05, + "loss": 0.4632, + "step": 11065 + }, + { + "epoch": 2.2748483914071334, + "grad_norm": 0.1229373887181282, + "learning_rate": 1.3092034200343395e-05, + "loss": 0.4587, + "step": 11066 + }, + { + "epoch": 2.275053962380512, + "grad_norm": 0.12089274078607559, + "learning_rate": 1.308499195036342e-05, + "loss": 0.4485, + "step": 11067 + }, + { + "epoch": 2.2752595333538905, + "grad_norm": 0.23402529954910278, + "learning_rate": 1.3077951272686716e-05, + "loss": 0.4031, + "step": 11068 + }, + { + "epoch": 2.275465104327269, + "grad_norm": 0.12246517091989517, + "learning_rate": 1.3070912167660153e-05, + "loss": 0.4518, + "step": 11069 + }, + { + "epoch": 2.2756706753006477, + "grad_norm": 0.22479888796806335, + "learning_rate": 1.3063874635630514e-05, + "loss": 0.4006, + "step": 11070 + }, + { + "epoch": 2.2758762462740263, + "grad_norm": 0.2248338758945465, + "learning_rate": 1.3056838676944483e-05, + "loss": 0.3937, + "step": 11071 + }, + { + "epoch": 2.2760818172474044, + "grad_norm": 0.23100706934928894, + "learning_rate": 1.3049804291948727e-05, + "loss": 0.3983, + "step": 11072 + }, + { + "epoch": 2.2762873882207835, + "grad_norm": 0.23669414222240448, + "learning_rate": 1.3042771480989777e-05, + "loss": 0.4027, + "step": 11073 + }, + { + "epoch": 2.2764929591941616, + "grad_norm": 0.1265943944454193, + "learning_rate": 1.303574024441411e-05, + "loss": 0.4579, + "step": 11074 + }, + { + "epoch": 2.27669853016754, + "grad_norm": 0.23661333322525024, + "learning_rate": 1.3028710582568104e-05, + "loss": 0.3944, + "step": 11075 + }, + { + "epoch": 2.276904101140919, + "grad_norm": 0.1238350123167038, + "learning_rate": 1.3021682495798108e-05, + "loss": 0.4527, + "step": 11076 + }, + { + "epoch": 2.2771096721142974, + "grad_norm": 0.23075202107429504, + "learning_rate": 1.3014655984450351e-05, + "loss": 0.4139, + "step": 11077 + }, + { + "epoch": 2.277315243087676, + "grad_norm": 0.23109117150306702, + "learning_rate": 1.300763104887098e-05, + "loss": 0.3795, + "step": 11078 + }, + { + "epoch": 2.2775208140610546, + "grad_norm": 0.13491906225681305, + "learning_rate": 1.300060768940611e-05, + "loss": 0.4503, + "step": 11079 + }, + { + "epoch": 2.277726385034433, + "grad_norm": 0.22590011358261108, + "learning_rate": 1.2993585906401735e-05, + "loss": 0.3878, + "step": 11080 + }, + { + "epoch": 2.2779319560078117, + "grad_norm": 0.23638883233070374, + "learning_rate": 1.2986565700203778e-05, + "loss": 0.3989, + "step": 11081 + }, + { + "epoch": 2.2781375269811903, + "grad_norm": 0.2324167639017105, + "learning_rate": 1.2979547071158106e-05, + "loss": 0.3983, + "step": 11082 + }, + { + "epoch": 2.278343097954569, + "grad_norm": 0.22499267756938934, + "learning_rate": 1.2972530019610482e-05, + "loss": 0.3917, + "step": 11083 + }, + { + "epoch": 2.2785486689279475, + "grad_norm": 0.23397715389728546, + "learning_rate": 1.2965514545906612e-05, + "loss": 0.4039, + "step": 11084 + }, + { + "epoch": 2.278754239901326, + "grad_norm": 0.12136294692754745, + "learning_rate": 1.2958500650392098e-05, + "loss": 0.4592, + "step": 11085 + }, + { + "epoch": 2.2789598108747047, + "grad_norm": 0.23275341093540192, + "learning_rate": 1.2951488333412505e-05, + "loss": 0.3907, + "step": 11086 + }, + { + "epoch": 2.279165381848083, + "grad_norm": 0.23098520934581757, + "learning_rate": 1.294447759531329e-05, + "loss": 0.3933, + "step": 11087 + }, + { + "epoch": 2.279370952821462, + "grad_norm": 0.2239454835653305, + "learning_rate": 1.2937468436439835e-05, + "loss": 0.3851, + "step": 11088 + }, + { + "epoch": 2.27957652379484, + "grad_norm": 0.23332616686820984, + "learning_rate": 1.2930460857137452e-05, + "loss": 0.4186, + "step": 11089 + }, + { + "epoch": 2.2797820947682186, + "grad_norm": 0.22289900481700897, + "learning_rate": 1.2923454857751368e-05, + "loss": 0.3918, + "step": 11090 + }, + { + "epoch": 2.279987665741597, + "grad_norm": 0.11850762367248535, + "learning_rate": 1.2916450438626742e-05, + "loss": 0.4475, + "step": 11091 + }, + { + "epoch": 2.2801932367149758, + "grad_norm": 0.22523003816604614, + "learning_rate": 1.2909447600108626e-05, + "loss": 0.3886, + "step": 11092 + }, + { + "epoch": 2.2803988076883543, + "grad_norm": 0.23885266482830048, + "learning_rate": 1.2902446342542053e-05, + "loss": 0.4051, + "step": 11093 + }, + { + "epoch": 2.280604378661733, + "grad_norm": 0.2248595505952835, + "learning_rate": 1.2895446666271926e-05, + "loss": 0.3843, + "step": 11094 + }, + { + "epoch": 2.2808099496351115, + "grad_norm": 0.23855264484882355, + "learning_rate": 1.2888448571643081e-05, + "loss": 0.3936, + "step": 11095 + }, + { + "epoch": 2.28101552060849, + "grad_norm": 0.2420293390750885, + "learning_rate": 1.2881452059000287e-05, + "loss": 0.3967, + "step": 11096 + }, + { + "epoch": 2.2812210915818687, + "grad_norm": 0.22361691296100616, + "learning_rate": 1.2874457128688216e-05, + "loss": 0.3815, + "step": 11097 + }, + { + "epoch": 2.2814266625552473, + "grad_norm": 0.13447174429893494, + "learning_rate": 1.28674637810515e-05, + "loss": 0.4621, + "step": 11098 + }, + { + "epoch": 2.281632233528626, + "grad_norm": 0.23001371324062347, + "learning_rate": 1.2860472016434645e-05, + "loss": 0.3698, + "step": 11099 + }, + { + "epoch": 2.2818378045020045, + "grad_norm": 0.2274404913187027, + "learning_rate": 1.2853481835182129e-05, + "loss": 0.3959, + "step": 11100 + }, + { + "epoch": 2.282043375475383, + "grad_norm": 0.23622088134288788, + "learning_rate": 1.2846493237638308e-05, + "loss": 0.4038, + "step": 11101 + }, + { + "epoch": 2.282248946448761, + "grad_norm": 0.11896710842847824, + "learning_rate": 1.283950622414748e-05, + "loss": 0.4503, + "step": 11102 + }, + { + "epoch": 2.2824545174221402, + "grad_norm": 0.23470290005207062, + "learning_rate": 1.2832520795053865e-05, + "loss": 0.3857, + "step": 11103 + }, + { + "epoch": 2.2826600883955184, + "grad_norm": 0.2171606570482254, + "learning_rate": 1.2825536950701594e-05, + "loss": 0.4002, + "step": 11104 + }, + { + "epoch": 2.282865659368897, + "grad_norm": 0.23823009431362152, + "learning_rate": 1.281855469143474e-05, + "loss": 0.3899, + "step": 11105 + }, + { + "epoch": 2.2830712303422755, + "grad_norm": 0.22637523710727692, + "learning_rate": 1.2811574017597265e-05, + "loss": 0.3961, + "step": 11106 + }, + { + "epoch": 2.283276801315654, + "grad_norm": 0.23832228779792786, + "learning_rate": 1.2804594929533107e-05, + "loss": 0.4002, + "step": 11107 + }, + { + "epoch": 2.2834823722890327, + "grad_norm": 0.22340717911720276, + "learning_rate": 1.2797617427586071e-05, + "loss": 0.3843, + "step": 11108 + }, + { + "epoch": 2.2836879432624113, + "grad_norm": 0.2311078906059265, + "learning_rate": 1.2790641512099914e-05, + "loss": 0.3848, + "step": 11109 + }, + { + "epoch": 2.28389351423579, + "grad_norm": 0.1308235377073288, + "learning_rate": 1.2783667183418299e-05, + "loss": 0.4372, + "step": 11110 + }, + { + "epoch": 2.2840990852091685, + "grad_norm": 0.22774946689605713, + "learning_rate": 1.2776694441884828e-05, + "loss": 0.4162, + "step": 11111 + }, + { + "epoch": 2.284304656182547, + "grad_norm": 0.23029407858848572, + "learning_rate": 1.2769723287843009e-05, + "loss": 0.4024, + "step": 11112 + }, + { + "epoch": 2.2845102271559257, + "grad_norm": 0.126814067363739, + "learning_rate": 1.2762753721636263e-05, + "loss": 0.4453, + "step": 11113 + }, + { + "epoch": 2.2847157981293043, + "grad_norm": 0.1285434365272522, + "learning_rate": 1.2755785743607981e-05, + "loss": 0.4571, + "step": 11114 + }, + { + "epoch": 2.284921369102683, + "grad_norm": 0.22413338720798492, + "learning_rate": 1.2748819354101428e-05, + "loss": 0.4142, + "step": 11115 + }, + { + "epoch": 2.2851269400760614, + "grad_norm": 0.2274656891822815, + "learning_rate": 1.2741854553459801e-05, + "loss": 0.3934, + "step": 11116 + }, + { + "epoch": 2.2853325110494396, + "grad_norm": 0.2260764241218567, + "learning_rate": 1.2734891342026228e-05, + "loss": 0.3912, + "step": 11117 + }, + { + "epoch": 2.2855380820228186, + "grad_norm": 0.24936430156230927, + "learning_rate": 1.2727929720143737e-05, + "loss": 0.3797, + "step": 11118 + }, + { + "epoch": 2.2857436529961968, + "grad_norm": 0.12210172414779663, + "learning_rate": 1.2720969688155326e-05, + "loss": 0.4556, + "step": 11119 + }, + { + "epoch": 2.2859492239695753, + "grad_norm": 0.23101243376731873, + "learning_rate": 1.2714011246403862e-05, + "loss": 0.3901, + "step": 11120 + }, + { + "epoch": 2.286154794942954, + "grad_norm": 0.22702264785766602, + "learning_rate": 1.2707054395232148e-05, + "loss": 0.4061, + "step": 11121 + }, + { + "epoch": 2.2863603659163325, + "grad_norm": 0.12117066979408264, + "learning_rate": 1.270009913498294e-05, + "loss": 0.4418, + "step": 11122 + }, + { + "epoch": 2.286565936889711, + "grad_norm": 0.12678340077400208, + "learning_rate": 1.2693145465998878e-05, + "loss": 0.462, + "step": 11123 + }, + { + "epoch": 2.2867715078630897, + "grad_norm": 0.1255645453929901, + "learning_rate": 1.2686193388622541e-05, + "loss": 0.4692, + "step": 11124 + }, + { + "epoch": 2.2869770788364683, + "grad_norm": 0.2327447086572647, + "learning_rate": 1.2679242903196418e-05, + "loss": 0.4108, + "step": 11125 + }, + { + "epoch": 2.287182649809847, + "grad_norm": 0.23680876195430756, + "learning_rate": 1.267229401006293e-05, + "loss": 0.3892, + "step": 11126 + }, + { + "epoch": 2.2873882207832255, + "grad_norm": 0.22818145155906677, + "learning_rate": 1.2665346709564407e-05, + "loss": 0.4014, + "step": 11127 + }, + { + "epoch": 2.287593791756604, + "grad_norm": 0.2357787936925888, + "learning_rate": 1.2658401002043128e-05, + "loss": 0.3958, + "step": 11128 + }, + { + "epoch": 2.2877993627299826, + "grad_norm": 0.12954148650169373, + "learning_rate": 1.2651456887841272e-05, + "loss": 0.4567, + "step": 11129 + }, + { + "epoch": 2.288004933703361, + "grad_norm": 0.23145915567874908, + "learning_rate": 1.2644514367300932e-05, + "loss": 0.4028, + "step": 11130 + }, + { + "epoch": 2.28821050467674, + "grad_norm": 0.22589780390262604, + "learning_rate": 1.2637573440764148e-05, + "loss": 0.3977, + "step": 11131 + }, + { + "epoch": 2.2884160756501184, + "grad_norm": 0.23484013974666595, + "learning_rate": 1.2630634108572853e-05, + "loss": 0.3964, + "step": 11132 + }, + { + "epoch": 2.288621646623497, + "grad_norm": 0.23270565271377563, + "learning_rate": 1.2623696371068912e-05, + "loss": 0.3953, + "step": 11133 + }, + { + "epoch": 2.288827217596875, + "grad_norm": 0.12677009403705597, + "learning_rate": 1.2616760228594133e-05, + "loss": 0.4461, + "step": 11134 + }, + { + "epoch": 2.2890327885702537, + "grad_norm": 0.22877991199493408, + "learning_rate": 1.2609825681490221e-05, + "loss": 0.3859, + "step": 11135 + }, + { + "epoch": 2.2892383595436323, + "grad_norm": 0.23278361558914185, + "learning_rate": 1.260289273009881e-05, + "loss": 0.3986, + "step": 11136 + }, + { + "epoch": 2.289443930517011, + "grad_norm": 0.2246071696281433, + "learning_rate": 1.2595961374761448e-05, + "loss": 0.3715, + "step": 11137 + }, + { + "epoch": 2.2896495014903895, + "grad_norm": 0.23304541409015656, + "learning_rate": 1.2589031615819613e-05, + "loss": 0.3874, + "step": 11138 + }, + { + "epoch": 2.289855072463768, + "grad_norm": 0.2341768443584442, + "learning_rate": 1.2582103453614684e-05, + "loss": 0.3995, + "step": 11139 + }, + { + "epoch": 2.2900606434371467, + "grad_norm": 0.22343499958515167, + "learning_rate": 1.2575176888488016e-05, + "loss": 0.3997, + "step": 11140 + }, + { + "epoch": 2.2902662144105252, + "grad_norm": 0.22474630177021027, + "learning_rate": 1.2568251920780829e-05, + "loss": 0.4096, + "step": 11141 + }, + { + "epoch": 2.290471785383904, + "grad_norm": 0.1266659051179886, + "learning_rate": 1.2561328550834265e-05, + "loss": 0.4552, + "step": 11142 + }, + { + "epoch": 2.2906773563572824, + "grad_norm": 0.2366304099559784, + "learning_rate": 1.2554406778989448e-05, + "loss": 0.3886, + "step": 11143 + }, + { + "epoch": 2.290882927330661, + "grad_norm": 0.23987746238708496, + "learning_rate": 1.2547486605587354e-05, + "loss": 0.4198, + "step": 11144 + }, + { + "epoch": 2.2910884983040396, + "grad_norm": 0.12243471294641495, + "learning_rate": 1.2540568030968911e-05, + "loss": 0.4459, + "step": 11145 + }, + { + "epoch": 2.291294069277418, + "grad_norm": 0.12086188048124313, + "learning_rate": 1.2533651055474965e-05, + "loss": 0.4536, + "step": 11146 + }, + { + "epoch": 2.2914996402507968, + "grad_norm": 0.23374128341674805, + "learning_rate": 1.2526735679446273e-05, + "loss": 0.3984, + "step": 11147 + }, + { + "epoch": 2.2917052112241754, + "grad_norm": 0.23066291213035583, + "learning_rate": 1.2519821903223552e-05, + "loss": 0.4043, + "step": 11148 + }, + { + "epoch": 2.2919107821975535, + "grad_norm": 0.227426216006279, + "learning_rate": 1.2512909727147388e-05, + "loss": 0.4083, + "step": 11149 + }, + { + "epoch": 2.292116353170932, + "grad_norm": 0.22349144518375397, + "learning_rate": 1.2505999151558319e-05, + "loss": 0.4062, + "step": 11150 + }, + { + "epoch": 2.2923219241443107, + "grad_norm": 0.22015713155269623, + "learning_rate": 1.2499090176796794e-05, + "loss": 0.3929, + "step": 11151 + }, + { + "epoch": 2.2925274951176893, + "grad_norm": 0.22965404391288757, + "learning_rate": 1.2492182803203188e-05, + "loss": 0.3723, + "step": 11152 + }, + { + "epoch": 2.292733066091068, + "grad_norm": 0.22359246015548706, + "learning_rate": 1.24852770311178e-05, + "loss": 0.399, + "step": 11153 + }, + { + "epoch": 2.2929386370644464, + "grad_norm": 0.2246733158826828, + "learning_rate": 1.2478372860880819e-05, + "loss": 0.4153, + "step": 11154 + }, + { + "epoch": 2.293144208037825, + "grad_norm": 0.23003293573856354, + "learning_rate": 1.2471470292832414e-05, + "loss": 0.4202, + "step": 11155 + }, + { + "epoch": 2.2933497790112036, + "grad_norm": 0.22609424591064453, + "learning_rate": 1.2464569327312634e-05, + "loss": 0.3861, + "step": 11156 + }, + { + "epoch": 2.293555349984582, + "grad_norm": 0.233436718583107, + "learning_rate": 1.2457669964661447e-05, + "loss": 0.4113, + "step": 11157 + }, + { + "epoch": 2.293760920957961, + "grad_norm": 0.2230585813522339, + "learning_rate": 1.2450772205218768e-05, + "loss": 0.3785, + "step": 11158 + }, + { + "epoch": 2.2939664919313394, + "grad_norm": 0.13363520801067352, + "learning_rate": 1.2443876049324401e-05, + "loss": 0.4589, + "step": 11159 + }, + { + "epoch": 2.294172062904718, + "grad_norm": 0.23311814665794373, + "learning_rate": 1.2436981497318081e-05, + "loss": 0.398, + "step": 11160 + }, + { + "epoch": 2.2943776338780966, + "grad_norm": 0.23788057267665863, + "learning_rate": 1.2430088549539498e-05, + "loss": 0.3656, + "step": 11161 + }, + { + "epoch": 2.294583204851475, + "grad_norm": 0.23247785866260529, + "learning_rate": 1.2423197206328219e-05, + "loss": 0.416, + "step": 11162 + }, + { + "epoch": 2.2947887758248537, + "grad_norm": 0.12585797905921936, + "learning_rate": 1.2416307468023738e-05, + "loss": 0.4245, + "step": 11163 + }, + { + "epoch": 2.294994346798232, + "grad_norm": 0.11753173917531967, + "learning_rate": 1.2409419334965507e-05, + "loss": 0.4366, + "step": 11164 + }, + { + "epoch": 2.2951999177716105, + "grad_norm": 0.11819145828485489, + "learning_rate": 1.2402532807492854e-05, + "loss": 0.4381, + "step": 11165 + }, + { + "epoch": 2.295405488744989, + "grad_norm": 0.2348855584859848, + "learning_rate": 1.2395647885945055e-05, + "loss": 0.3894, + "step": 11166 + }, + { + "epoch": 2.2956110597183677, + "grad_norm": 0.11923953890800476, + "learning_rate": 1.238876457066129e-05, + "loss": 0.4363, + "step": 11167 + }, + { + "epoch": 2.2958166306917462, + "grad_norm": 0.23349328339099884, + "learning_rate": 1.2381882861980653e-05, + "loss": 0.3905, + "step": 11168 + }, + { + "epoch": 2.296022201665125, + "grad_norm": 0.2256205677986145, + "learning_rate": 1.2375002760242207e-05, + "loss": 0.385, + "step": 11169 + }, + { + "epoch": 2.2962277726385034, + "grad_norm": 0.23128965497016907, + "learning_rate": 1.2368124265784888e-05, + "loss": 0.3942, + "step": 11170 + }, + { + "epoch": 2.296433343611882, + "grad_norm": 0.12350024282932281, + "learning_rate": 1.2361247378947561e-05, + "loss": 0.4333, + "step": 11171 + }, + { + "epoch": 2.2966389145852606, + "grad_norm": 0.23417676985263824, + "learning_rate": 1.2354372100069026e-05, + "loss": 0.3891, + "step": 11172 + }, + { + "epoch": 2.296844485558639, + "grad_norm": 0.22731667757034302, + "learning_rate": 1.2347498429487991e-05, + "loss": 0.3977, + "step": 11173 + }, + { + "epoch": 2.2970500565320178, + "grad_norm": 0.2296586036682129, + "learning_rate": 1.2340626367543091e-05, + "loss": 0.4054, + "step": 11174 + }, + { + "epoch": 2.2972556275053964, + "grad_norm": 0.13354873657226562, + "learning_rate": 1.2333755914572868e-05, + "loss": 0.4622, + "step": 11175 + }, + { + "epoch": 2.297461198478775, + "grad_norm": 0.22536778450012207, + "learning_rate": 1.2326887070915823e-05, + "loss": 0.3746, + "step": 11176 + }, + { + "epoch": 2.2976667694521535, + "grad_norm": 0.22419311106204987, + "learning_rate": 1.2320019836910335e-05, + "loss": 0.4029, + "step": 11177 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.2210252434015274, + "learning_rate": 1.231315421289473e-05, + "loss": 0.3709, + "step": 11178 + }, + { + "epoch": 2.2980779113989103, + "grad_norm": 0.22239845991134644, + "learning_rate": 1.2306290199207233e-05, + "loss": 0.3892, + "step": 11179 + }, + { + "epoch": 2.298283482372289, + "grad_norm": 0.22236813604831696, + "learning_rate": 1.2299427796186008e-05, + "loss": 0.4075, + "step": 11180 + }, + { + "epoch": 2.2984890533456674, + "grad_norm": 0.22609713673591614, + "learning_rate": 1.229256700416914e-05, + "loss": 0.3968, + "step": 11181 + }, + { + "epoch": 2.298694624319046, + "grad_norm": 0.23106250166893005, + "learning_rate": 1.2285707823494599e-05, + "loss": 0.3792, + "step": 11182 + }, + { + "epoch": 2.2989001952924246, + "grad_norm": 0.22286170721054077, + "learning_rate": 1.2278850254500348e-05, + "loss": 0.3835, + "step": 11183 + }, + { + "epoch": 2.299105766265803, + "grad_norm": 0.229881152510643, + "learning_rate": 1.227199429752419e-05, + "loss": 0.3851, + "step": 11184 + }, + { + "epoch": 2.299311337239182, + "grad_norm": 0.1258445382118225, + "learning_rate": 1.2265139952903916e-05, + "loss": 0.4364, + "step": 11185 + }, + { + "epoch": 2.2995169082125604, + "grad_norm": 0.22773106396198273, + "learning_rate": 1.2258287220977196e-05, + "loss": 0.4042, + "step": 11186 + }, + { + "epoch": 2.299722479185939, + "grad_norm": 0.22230634093284607, + "learning_rate": 1.225143610208163e-05, + "loss": 0.3832, + "step": 11187 + }, + { + "epoch": 2.2999280501593176, + "grad_norm": 0.23126055300235748, + "learning_rate": 1.2244586596554739e-05, + "loss": 0.3922, + "step": 11188 + }, + { + "epoch": 2.300133621132696, + "grad_norm": 0.2347308248281479, + "learning_rate": 1.2237738704733954e-05, + "loss": 0.3671, + "step": 11189 + }, + { + "epoch": 2.3003391921060747, + "grad_norm": 0.12365079671144485, + "learning_rate": 1.2230892426956669e-05, + "loss": 0.4378, + "step": 11190 + }, + { + "epoch": 2.3005447630794533, + "grad_norm": 0.22160682082176208, + "learning_rate": 1.222404776356015e-05, + "loss": 0.388, + "step": 11191 + }, + { + "epoch": 2.300750334052832, + "grad_norm": 0.22561746835708618, + "learning_rate": 1.2217204714881603e-05, + "loss": 0.3529, + "step": 11192 + }, + { + "epoch": 2.3009559050262105, + "grad_norm": 0.27136144042015076, + "learning_rate": 1.2210363281258155e-05, + "loss": 0.3885, + "step": 11193 + }, + { + "epoch": 2.3011614759995886, + "grad_norm": 0.22475385665893555, + "learning_rate": 1.220352346302685e-05, + "loss": 0.3874, + "step": 11194 + }, + { + "epoch": 2.3013670469729672, + "grad_norm": 0.23630446195602417, + "learning_rate": 1.2196685260524648e-05, + "loss": 0.3871, + "step": 11195 + }, + { + "epoch": 2.301572617946346, + "grad_norm": 0.12092158198356628, + "learning_rate": 1.2189848674088433e-05, + "loss": 0.4375, + "step": 11196 + }, + { + "epoch": 2.3017781889197244, + "grad_norm": 0.23177292943000793, + "learning_rate": 1.2183013704055033e-05, + "loss": 0.4025, + "step": 11197 + }, + { + "epoch": 2.301983759893103, + "grad_norm": 0.12416423112154007, + "learning_rate": 1.2176180350761157e-05, + "loss": 0.4473, + "step": 11198 + }, + { + "epoch": 2.3021893308664816, + "grad_norm": 0.12276289612054825, + "learning_rate": 1.2169348614543464e-05, + "loss": 0.4537, + "step": 11199 + }, + { + "epoch": 2.30239490183986, + "grad_norm": 0.22835765779018402, + "learning_rate": 1.216251849573851e-05, + "loss": 0.3937, + "step": 11200 + }, + { + "epoch": 2.3026004728132388, + "grad_norm": 0.22718718647956848, + "learning_rate": 1.2155689994682788e-05, + "loss": 0.3896, + "step": 11201 + }, + { + "epoch": 2.3028060437866174, + "grad_norm": 0.1231781542301178, + "learning_rate": 1.2148863111712704e-05, + "loss": 0.447, + "step": 11202 + }, + { + "epoch": 2.303011614759996, + "grad_norm": 0.23988062143325806, + "learning_rate": 1.214203784716458e-05, + "loss": 0.3919, + "step": 11203 + }, + { + "epoch": 2.3032171857333745, + "grad_norm": 0.21849578619003296, + "learning_rate": 1.2135214201374685e-05, + "loss": 0.3758, + "step": 11204 + }, + { + "epoch": 2.303422756706753, + "grad_norm": 0.2158803790807724, + "learning_rate": 1.2128392174679179e-05, + "loss": 0.3704, + "step": 11205 + }, + { + "epoch": 2.3036283276801317, + "grad_norm": 0.22733426094055176, + "learning_rate": 1.212157176741413e-05, + "loss": 0.3694, + "step": 11206 + }, + { + "epoch": 2.3038338986535103, + "grad_norm": 0.23298750817775726, + "learning_rate": 1.2114752979915584e-05, + "loss": 0.3798, + "step": 11207 + }, + { + "epoch": 2.304039469626889, + "grad_norm": 0.22814899682998657, + "learning_rate": 1.210793581251945e-05, + "loss": 0.3811, + "step": 11208 + }, + { + "epoch": 2.304245040600267, + "grad_norm": 0.23419663310050964, + "learning_rate": 1.2101120265561585e-05, + "loss": 0.3799, + "step": 11209 + }, + { + "epoch": 2.3044506115736456, + "grad_norm": 0.12921544909477234, + "learning_rate": 1.2094306339377743e-05, + "loss": 0.4378, + "step": 11210 + }, + { + "epoch": 2.304656182547024, + "grad_norm": 0.22787374258041382, + "learning_rate": 1.208749403430364e-05, + "loss": 0.4039, + "step": 11211 + }, + { + "epoch": 2.304861753520403, + "grad_norm": 0.2288065403699875, + "learning_rate": 1.2080683350674869e-05, + "loss": 0.3922, + "step": 11212 + }, + { + "epoch": 2.3050673244937814, + "grad_norm": 0.23211759328842163, + "learning_rate": 1.2073874288826966e-05, + "loss": 0.3804, + "step": 11213 + }, + { + "epoch": 2.30527289546716, + "grad_norm": 0.23307380080223083, + "learning_rate": 1.2067066849095386e-05, + "loss": 0.3883, + "step": 11214 + }, + { + "epoch": 2.3054784664405386, + "grad_norm": 0.22233398258686066, + "learning_rate": 1.206026103181549e-05, + "loss": 0.3948, + "step": 11215 + }, + { + "epoch": 2.305684037413917, + "grad_norm": 0.22807008028030396, + "learning_rate": 1.2053456837322557e-05, + "loss": 0.396, + "step": 11216 + }, + { + "epoch": 2.3058896083872957, + "grad_norm": 0.23228740692138672, + "learning_rate": 1.204665426595183e-05, + "loss": 0.4057, + "step": 11217 + }, + { + "epoch": 2.3060951793606743, + "grad_norm": 0.2424495369195938, + "learning_rate": 1.2039853318038428e-05, + "loss": 0.4068, + "step": 11218 + }, + { + "epoch": 2.306300750334053, + "grad_norm": 0.23171810805797577, + "learning_rate": 1.2033053993917391e-05, + "loss": 0.4152, + "step": 11219 + }, + { + "epoch": 2.3065063213074315, + "grad_norm": 0.2335965633392334, + "learning_rate": 1.2026256293923702e-05, + "loss": 0.3733, + "step": 11220 + }, + { + "epoch": 2.30671189228081, + "grad_norm": 0.12516964972019196, + "learning_rate": 1.2019460218392243e-05, + "loss": 0.4496, + "step": 11221 + }, + { + "epoch": 2.3069174632541887, + "grad_norm": 0.2288234382867813, + "learning_rate": 1.2012665767657825e-05, + "loss": 0.3842, + "step": 11222 + }, + { + "epoch": 2.3071230342275673, + "grad_norm": 0.23571978509426117, + "learning_rate": 1.2005872942055177e-05, + "loss": 0.4029, + "step": 11223 + }, + { + "epoch": 2.3073286052009454, + "grad_norm": 0.23239515721797943, + "learning_rate": 1.1999081741918965e-05, + "loss": 0.4028, + "step": 11224 + }, + { + "epoch": 2.307534176174324, + "grad_norm": 0.23048000037670135, + "learning_rate": 1.1992292167583748e-05, + "loss": 0.3883, + "step": 11225 + }, + { + "epoch": 2.3077397471477026, + "grad_norm": 0.1262623518705368, + "learning_rate": 1.198550421938402e-05, + "loss": 0.4509, + "step": 11226 + }, + { + "epoch": 2.307945318121081, + "grad_norm": 0.2399047166109085, + "learning_rate": 1.1978717897654171e-05, + "loss": 0.4162, + "step": 11227 + }, + { + "epoch": 2.3081508890944598, + "grad_norm": 0.22697141766548157, + "learning_rate": 1.197193320272857e-05, + "loss": 0.3845, + "step": 11228 + }, + { + "epoch": 2.3083564600678383, + "grad_norm": 0.2281046062707901, + "learning_rate": 1.1965150134941447e-05, + "loss": 0.3835, + "step": 11229 + }, + { + "epoch": 2.308562031041217, + "grad_norm": 0.12404376268386841, + "learning_rate": 1.1958368694626956e-05, + "loss": 0.4376, + "step": 11230 + }, + { + "epoch": 2.3087676020145955, + "grad_norm": 0.12131867557764053, + "learning_rate": 1.195158888211922e-05, + "loss": 0.4545, + "step": 11231 + }, + { + "epoch": 2.308973172987974, + "grad_norm": 0.22881445288658142, + "learning_rate": 1.194481069775223e-05, + "loss": 0.4063, + "step": 11232 + }, + { + "epoch": 2.3091787439613527, + "grad_norm": 0.22988468408584595, + "learning_rate": 1.1938034141859915e-05, + "loss": 0.4105, + "step": 11233 + }, + { + "epoch": 2.3093843149347313, + "grad_norm": 0.23098687827587128, + "learning_rate": 1.1931259214776129e-05, + "loss": 0.3975, + "step": 11234 + }, + { + "epoch": 2.30958988590811, + "grad_norm": 0.12407363951206207, + "learning_rate": 1.1924485916834638e-05, + "loss": 0.4472, + "step": 11235 + }, + { + "epoch": 2.3097954568814885, + "grad_norm": 0.12328176200389862, + "learning_rate": 1.1917714248369133e-05, + "loss": 0.4449, + "step": 11236 + }, + { + "epoch": 2.310001027854867, + "grad_norm": 0.22142189741134644, + "learning_rate": 1.1910944209713205e-05, + "loss": 0.3997, + "step": 11237 + }, + { + "epoch": 2.3102065988282456, + "grad_norm": 0.2281443476676941, + "learning_rate": 1.1904175801200417e-05, + "loss": 0.3818, + "step": 11238 + }, + { + "epoch": 2.310412169801624, + "grad_norm": 0.22729991376399994, + "learning_rate": 1.1897409023164191e-05, + "loss": 0.3928, + "step": 11239 + }, + { + "epoch": 2.310617740775003, + "grad_norm": 0.12084699422121048, + "learning_rate": 1.1890643875937904e-05, + "loss": 0.4569, + "step": 11240 + }, + { + "epoch": 2.310823311748381, + "grad_norm": 0.12548977136611938, + "learning_rate": 1.1883880359854836e-05, + "loss": 0.4437, + "step": 11241 + }, + { + "epoch": 2.3110288827217595, + "grad_norm": 0.22213564813137054, + "learning_rate": 1.1877118475248204e-05, + "loss": 0.4011, + "step": 11242 + }, + { + "epoch": 2.311234453695138, + "grad_norm": 0.2207585573196411, + "learning_rate": 1.1870358222451127e-05, + "loss": 0.4, + "step": 11243 + }, + { + "epoch": 2.3114400246685167, + "grad_norm": 0.2309262752532959, + "learning_rate": 1.1863599601796638e-05, + "loss": 0.384, + "step": 11244 + }, + { + "epoch": 2.3116455956418953, + "grad_norm": 0.22863119840621948, + "learning_rate": 1.1856842613617734e-05, + "loss": 0.3985, + "step": 11245 + }, + { + "epoch": 2.311851166615274, + "grad_norm": 0.22216136753559113, + "learning_rate": 1.1850087258247282e-05, + "loss": 0.3878, + "step": 11246 + }, + { + "epoch": 2.3120567375886525, + "grad_norm": 0.23234418034553528, + "learning_rate": 1.1843333536018088e-05, + "loss": 0.3844, + "step": 11247 + }, + { + "epoch": 2.312262308562031, + "grad_norm": 0.22549466788768768, + "learning_rate": 1.1836581447262865e-05, + "loss": 0.3844, + "step": 11248 + }, + { + "epoch": 2.3124678795354097, + "grad_norm": 0.2254628688097, + "learning_rate": 1.1829830992314282e-05, + "loss": 0.38, + "step": 11249 + }, + { + "epoch": 2.3126734505087883, + "grad_norm": 0.23794369399547577, + "learning_rate": 1.1823082171504888e-05, + "loss": 0.38, + "step": 11250 + }, + { + "epoch": 2.312879021482167, + "grad_norm": 0.1556072235107422, + "learning_rate": 1.1816334985167152e-05, + "loss": 0.4545, + "step": 11251 + }, + { + "epoch": 2.3130845924555454, + "grad_norm": 0.23473793268203735, + "learning_rate": 1.1809589433633507e-05, + "loss": 0.4154, + "step": 11252 + }, + { + "epoch": 2.313290163428924, + "grad_norm": 0.22591789066791534, + "learning_rate": 1.1802845517236261e-05, + "loss": 0.3782, + "step": 11253 + }, + { + "epoch": 2.313495734402302, + "grad_norm": 0.22409707307815552, + "learning_rate": 1.1796103236307647e-05, + "loss": 0.3871, + "step": 11254 + }, + { + "epoch": 2.313701305375681, + "grad_norm": 0.12136626243591309, + "learning_rate": 1.1789362591179836e-05, + "loss": 0.4417, + "step": 11255 + }, + { + "epoch": 2.3139068763490593, + "grad_norm": 0.23068110644817352, + "learning_rate": 1.1782623582184907e-05, + "loss": 0.3921, + "step": 11256 + }, + { + "epoch": 2.314112447322438, + "grad_norm": 0.22606144845485687, + "learning_rate": 1.1775886209654853e-05, + "loss": 0.4033, + "step": 11257 + }, + { + "epoch": 2.3143180182958165, + "grad_norm": 0.23773600161075592, + "learning_rate": 1.1769150473921582e-05, + "loss": 0.4094, + "step": 11258 + }, + { + "epoch": 2.314523589269195, + "grad_norm": 0.23489652574062347, + "learning_rate": 1.1762416375316958e-05, + "loss": 0.3755, + "step": 11259 + }, + { + "epoch": 2.3147291602425737, + "grad_norm": 0.12201520800590515, + "learning_rate": 1.1755683914172731e-05, + "loss": 0.4488, + "step": 11260 + }, + { + "epoch": 2.3149347312159523, + "grad_norm": 0.22625313699245453, + "learning_rate": 1.1748953090820572e-05, + "loss": 0.382, + "step": 11261 + }, + { + "epoch": 2.315140302189331, + "grad_norm": 0.21789546310901642, + "learning_rate": 1.1742223905592084e-05, + "loss": 0.3877, + "step": 11262 + }, + { + "epoch": 2.3153458731627095, + "grad_norm": 0.2211894392967224, + "learning_rate": 1.1735496358818773e-05, + "loss": 0.3978, + "step": 11263 + }, + { + "epoch": 2.315551444136088, + "grad_norm": 0.22544537484645844, + "learning_rate": 1.1728770450832078e-05, + "loss": 0.3777, + "step": 11264 + }, + { + "epoch": 2.3157570151094666, + "grad_norm": 0.23240074515342712, + "learning_rate": 1.1722046181963344e-05, + "loss": 0.3894, + "step": 11265 + }, + { + "epoch": 2.315962586082845, + "grad_norm": 0.22723515331745148, + "learning_rate": 1.1715323552543861e-05, + "loss": 0.3761, + "step": 11266 + }, + { + "epoch": 2.316168157056224, + "grad_norm": 0.2265399843454361, + "learning_rate": 1.170860256290482e-05, + "loss": 0.3725, + "step": 11267 + }, + { + "epoch": 2.3163737280296024, + "grad_norm": 0.22929410636425018, + "learning_rate": 1.1701883213377327e-05, + "loss": 0.4007, + "step": 11268 + }, + { + "epoch": 2.3165792990029805, + "grad_norm": 0.2396460622549057, + "learning_rate": 1.1695165504292409e-05, + "loss": 0.386, + "step": 11269 + }, + { + "epoch": 2.3167848699763596, + "grad_norm": 0.23619569838047028, + "learning_rate": 1.168844943598101e-05, + "loss": 0.3854, + "step": 11270 + }, + { + "epoch": 2.3169904409497377, + "grad_norm": 0.22975857555866241, + "learning_rate": 1.168173500877402e-05, + "loss": 0.3851, + "step": 11271 + }, + { + "epoch": 2.3171960119231163, + "grad_norm": 0.23731692135334015, + "learning_rate": 1.167502222300221e-05, + "loss": 0.3812, + "step": 11272 + }, + { + "epoch": 2.317401582896495, + "grad_norm": 0.22858087718486786, + "learning_rate": 1.1668311078996303e-05, + "loss": 0.387, + "step": 11273 + }, + { + "epoch": 2.3176071538698735, + "grad_norm": 0.22912317514419556, + "learning_rate": 1.1661601577086916e-05, + "loss": 0.4138, + "step": 11274 + }, + { + "epoch": 2.317812724843252, + "grad_norm": 0.2295382171869278, + "learning_rate": 1.1654893717604597e-05, + "loss": 0.4013, + "step": 11275 + }, + { + "epoch": 2.3180182958166307, + "grad_norm": 0.1292608678340912, + "learning_rate": 1.1648187500879812e-05, + "loss": 0.4512, + "step": 11276 + }, + { + "epoch": 2.3182238667900092, + "grad_norm": 0.23045098781585693, + "learning_rate": 1.1641482927242945e-05, + "loss": 0.4034, + "step": 11277 + }, + { + "epoch": 2.318429437763388, + "grad_norm": 0.22682234644889832, + "learning_rate": 1.1634779997024293e-05, + "loss": 0.3821, + "step": 11278 + }, + { + "epoch": 2.3186350087367664, + "grad_norm": 0.2304777354001999, + "learning_rate": 1.1628078710554069e-05, + "loss": 0.3779, + "step": 11279 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.2295672744512558, + "learning_rate": 1.1621379068162438e-05, + "loss": 0.3924, + "step": 11280 + }, + { + "epoch": 2.3190461506835236, + "grad_norm": 0.23286469280719757, + "learning_rate": 1.161468107017945e-05, + "loss": 0.3817, + "step": 11281 + }, + { + "epoch": 2.319251721656902, + "grad_norm": 0.12597419321537018, + "learning_rate": 1.1607984716935084e-05, + "loss": 0.4553, + "step": 11282 + }, + { + "epoch": 2.3194572926302808, + "grad_norm": 0.2292589247226715, + "learning_rate": 1.160129000875924e-05, + "loss": 0.3939, + "step": 11283 + }, + { + "epoch": 2.319662863603659, + "grad_norm": 0.2388840913772583, + "learning_rate": 1.1594596945981732e-05, + "loss": 0.3885, + "step": 11284 + }, + { + "epoch": 2.319868434577038, + "grad_norm": 0.22787928581237793, + "learning_rate": 1.1587905528932294e-05, + "loss": 0.3977, + "step": 11285 + }, + { + "epoch": 2.320074005550416, + "grad_norm": 0.23008286952972412, + "learning_rate": 1.1581215757940565e-05, + "loss": 0.3862, + "step": 11286 + }, + { + "epoch": 2.3202795765237947, + "grad_norm": 0.22636668384075165, + "learning_rate": 1.1574527633336158e-05, + "loss": 0.4, + "step": 11287 + }, + { + "epoch": 2.3204851474971733, + "grad_norm": 0.12164843082427979, + "learning_rate": 1.1567841155448539e-05, + "loss": 0.4519, + "step": 11288 + }, + { + "epoch": 2.320690718470552, + "grad_norm": 0.22811272740364075, + "learning_rate": 1.1561156324607123e-05, + "loss": 0.3912, + "step": 11289 + }, + { + "epoch": 2.3208962894439304, + "grad_norm": 0.2221514880657196, + "learning_rate": 1.1554473141141244e-05, + "loss": 0.3612, + "step": 11290 + }, + { + "epoch": 2.321101860417309, + "grad_norm": 0.23008368909358978, + "learning_rate": 1.154779160538014e-05, + "loss": 0.3888, + "step": 11291 + }, + { + "epoch": 2.3213074313906876, + "grad_norm": 0.23193509876728058, + "learning_rate": 1.1541111717653002e-05, + "loss": 0.3793, + "step": 11292 + }, + { + "epoch": 2.321513002364066, + "grad_norm": 0.22582639753818512, + "learning_rate": 1.1534433478288896e-05, + "loss": 0.4062, + "step": 11293 + }, + { + "epoch": 2.321718573337445, + "grad_norm": 0.5882457494735718, + "learning_rate": 1.1527756887616828e-05, + "loss": 0.4089, + "step": 11294 + }, + { + "epoch": 2.3219241443108234, + "grad_norm": 0.23613165318965912, + "learning_rate": 1.152108194596574e-05, + "loss": 0.3803, + "step": 11295 + }, + { + "epoch": 2.322129715284202, + "grad_norm": 0.24490775167942047, + "learning_rate": 1.1514408653664464e-05, + "loss": 0.4217, + "step": 11296 + }, + { + "epoch": 2.3223352862575806, + "grad_norm": 0.2295404076576233, + "learning_rate": 1.1507737011041767e-05, + "loss": 0.3876, + "step": 11297 + }, + { + "epoch": 2.322540857230959, + "grad_norm": 0.22926199436187744, + "learning_rate": 1.150106701842632e-05, + "loss": 0.4045, + "step": 11298 + }, + { + "epoch": 2.3227464282043373, + "grad_norm": 0.23146659135818481, + "learning_rate": 1.1494398676146716e-05, + "loss": 0.3973, + "step": 11299 + }, + { + "epoch": 2.3229519991777163, + "grad_norm": 0.2279983013868332, + "learning_rate": 1.1487731984531497e-05, + "loss": 0.3856, + "step": 11300 + }, + { + "epoch": 2.3231575701510945, + "grad_norm": 0.22734786570072174, + "learning_rate": 1.1481066943909086e-05, + "loss": 0.395, + "step": 11301 + }, + { + "epoch": 2.323363141124473, + "grad_norm": 0.12357629090547562, + "learning_rate": 1.147440355460784e-05, + "loss": 0.4503, + "step": 11302 + }, + { + "epoch": 2.3235687120978517, + "grad_norm": 0.22878186404705048, + "learning_rate": 1.1467741816956036e-05, + "loss": 0.3805, + "step": 11303 + }, + { + "epoch": 2.3237742830712302, + "grad_norm": 0.22551243007183075, + "learning_rate": 1.1461081731281857e-05, + "loss": 0.3962, + "step": 11304 + }, + { + "epoch": 2.323979854044609, + "grad_norm": 0.22322127223014832, + "learning_rate": 1.1454423297913425e-05, + "loss": 0.3839, + "step": 11305 + }, + { + "epoch": 2.3241854250179874, + "grad_norm": 0.12337585538625717, + "learning_rate": 1.1447766517178752e-05, + "loss": 0.4513, + "step": 11306 + }, + { + "epoch": 2.324390995991366, + "grad_norm": 0.22409552335739136, + "learning_rate": 1.1441111389405813e-05, + "loss": 0.3851, + "step": 11307 + }, + { + "epoch": 2.3245965669647446, + "grad_norm": 0.2322671264410019, + "learning_rate": 1.1434457914922463e-05, + "loss": 0.4114, + "step": 11308 + }, + { + "epoch": 2.324802137938123, + "grad_norm": 0.23481951653957367, + "learning_rate": 1.1427806094056486e-05, + "loss": 0.4041, + "step": 11309 + }, + { + "epoch": 2.3250077089115018, + "grad_norm": 0.2358068972826004, + "learning_rate": 1.1421155927135584e-05, + "loss": 0.404, + "step": 11310 + }, + { + "epoch": 2.3252132798848804, + "grad_norm": 0.24007724225521088, + "learning_rate": 1.1414507414487383e-05, + "loss": 0.3907, + "step": 11311 + }, + { + "epoch": 2.325418850858259, + "grad_norm": 0.2249882072210312, + "learning_rate": 1.1407860556439413e-05, + "loss": 0.4018, + "step": 11312 + }, + { + "epoch": 2.3256244218316375, + "grad_norm": 0.21669505536556244, + "learning_rate": 1.1401215353319158e-05, + "loss": 0.3996, + "step": 11313 + }, + { + "epoch": 2.325829992805016, + "grad_norm": 0.2299477905035019, + "learning_rate": 1.139457180545398e-05, + "loss": 0.3819, + "step": 11314 + }, + { + "epoch": 2.3260355637783947, + "grad_norm": 0.22735092043876648, + "learning_rate": 1.1387929913171164e-05, + "loss": 0.3832, + "step": 11315 + }, + { + "epoch": 2.326241134751773, + "grad_norm": 0.22514750063419342, + "learning_rate": 1.1381289676797953e-05, + "loss": 0.3827, + "step": 11316 + }, + { + "epoch": 2.3264467057251514, + "grad_norm": 0.23412209749221802, + "learning_rate": 1.1374651096661464e-05, + "loss": 0.4225, + "step": 11317 + }, + { + "epoch": 2.32665227669853, + "grad_norm": 0.23634769022464752, + "learning_rate": 1.1368014173088757e-05, + "loss": 0.412, + "step": 11318 + }, + { + "epoch": 2.3268578476719086, + "grad_norm": 0.2300824671983719, + "learning_rate": 1.136137890640679e-05, + "loss": 0.3749, + "step": 11319 + }, + { + "epoch": 2.327063418645287, + "grad_norm": 0.2358069270849228, + "learning_rate": 1.135474529694245e-05, + "loss": 0.4009, + "step": 11320 + }, + { + "epoch": 2.327268989618666, + "grad_norm": 0.23068921267986298, + "learning_rate": 1.134811334502256e-05, + "loss": 0.3985, + "step": 11321 + }, + { + "epoch": 2.3274745605920444, + "grad_norm": 0.22651554644107819, + "learning_rate": 1.1341483050973838e-05, + "loss": 0.38, + "step": 11322 + }, + { + "epoch": 2.327680131565423, + "grad_norm": 0.22414909303188324, + "learning_rate": 1.1334854415122924e-05, + "loss": 0.3884, + "step": 11323 + }, + { + "epoch": 2.3278857025388016, + "grad_norm": 0.21925905346870422, + "learning_rate": 1.1328227437796389e-05, + "loss": 0.3742, + "step": 11324 + }, + { + "epoch": 2.32809127351218, + "grad_norm": 0.23087939620018005, + "learning_rate": 1.1321602119320704e-05, + "loss": 0.3872, + "step": 11325 + }, + { + "epoch": 2.3282968444855587, + "grad_norm": 0.2237529307603836, + "learning_rate": 1.131497846002227e-05, + "loss": 0.3848, + "step": 11326 + }, + { + "epoch": 2.3285024154589373, + "grad_norm": 0.22944872081279755, + "learning_rate": 1.1308356460227386e-05, + "loss": 0.4088, + "step": 11327 + }, + { + "epoch": 2.328707986432316, + "grad_norm": 0.1283191293478012, + "learning_rate": 1.1301736120262326e-05, + "loss": 0.47, + "step": 11328 + }, + { + "epoch": 2.3289135574056945, + "grad_norm": 0.22146999835968018, + "learning_rate": 1.1295117440453219e-05, + "loss": 0.3917, + "step": 11329 + }, + { + "epoch": 2.329119128379073, + "grad_norm": 0.22980590164661407, + "learning_rate": 1.1288500421126137e-05, + "loss": 0.3876, + "step": 11330 + }, + { + "epoch": 2.3293246993524512, + "grad_norm": 0.22274045646190643, + "learning_rate": 1.1281885062607072e-05, + "loss": 0.3849, + "step": 11331 + }, + { + "epoch": 2.32953027032583, + "grad_norm": 0.22919537127017975, + "learning_rate": 1.1275271365221938e-05, + "loss": 0.3906, + "step": 11332 + }, + { + "epoch": 2.3297358412992084, + "grad_norm": 0.1261204034090042, + "learning_rate": 1.1268659329296534e-05, + "loss": 0.444, + "step": 11333 + }, + { + "epoch": 2.329941412272587, + "grad_norm": 0.2240409255027771, + "learning_rate": 1.1262048955156643e-05, + "loss": 0.3987, + "step": 11334 + }, + { + "epoch": 2.3301469832459656, + "grad_norm": 0.125702366232872, + "learning_rate": 1.1255440243127906e-05, + "loss": 0.4473, + "step": 11335 + }, + { + "epoch": 2.330352554219344, + "grad_norm": 0.22843293845653534, + "learning_rate": 1.1248833193535898e-05, + "loss": 0.4213, + "step": 11336 + }, + { + "epoch": 2.3305581251927228, + "grad_norm": 0.23132173717021942, + "learning_rate": 1.1242227806706137e-05, + "loss": 0.3878, + "step": 11337 + }, + { + "epoch": 2.3307636961661014, + "grad_norm": 0.23673327267169952, + "learning_rate": 1.1235624082964025e-05, + "loss": 0.3987, + "step": 11338 + }, + { + "epoch": 2.33096926713948, + "grad_norm": 0.23916591703891754, + "learning_rate": 1.1229022022634903e-05, + "loss": 0.4045, + "step": 11339 + }, + { + "epoch": 2.3311748381128585, + "grad_norm": 0.12463133037090302, + "learning_rate": 1.122242162604402e-05, + "loss": 0.4453, + "step": 11340 + }, + { + "epoch": 2.331380409086237, + "grad_norm": 0.23358865082263947, + "learning_rate": 1.1215822893516539e-05, + "loss": 0.3772, + "step": 11341 + }, + { + "epoch": 2.3315859800596157, + "grad_norm": 0.2250611037015915, + "learning_rate": 1.1209225825377565e-05, + "loss": 0.4015, + "step": 11342 + }, + { + "epoch": 2.3317915510329943, + "grad_norm": 0.12012235075235367, + "learning_rate": 1.1202630421952097e-05, + "loss": 0.446, + "step": 11343 + }, + { + "epoch": 2.331997122006373, + "grad_norm": 0.12716658413410187, + "learning_rate": 1.1196036683565063e-05, + "loss": 0.4522, + "step": 11344 + }, + { + "epoch": 2.3322026929797515, + "grad_norm": 0.12125218659639359, + "learning_rate": 1.11894446105413e-05, + "loss": 0.4634, + "step": 11345 + }, + { + "epoch": 2.3324082639531296, + "grad_norm": 0.23313722014427185, + "learning_rate": 1.1182854203205569e-05, + "loss": 0.4123, + "step": 11346 + }, + { + "epoch": 2.332613834926508, + "grad_norm": 0.22456228733062744, + "learning_rate": 1.1176265461882556e-05, + "loss": 0.3851, + "step": 11347 + }, + { + "epoch": 2.332819405899887, + "grad_norm": 0.22414372861385345, + "learning_rate": 1.1169678386896833e-05, + "loss": 0.4027, + "step": 11348 + }, + { + "epoch": 2.3330249768732654, + "grad_norm": 0.2482268065214157, + "learning_rate": 1.116309297857295e-05, + "loss": 0.3893, + "step": 11349 + }, + { + "epoch": 2.333230547846644, + "grad_norm": 0.2372516393661499, + "learning_rate": 1.1156509237235325e-05, + "loss": 0.3884, + "step": 11350 + }, + { + "epoch": 2.3334361188200226, + "grad_norm": 0.23063679039478302, + "learning_rate": 1.1149927163208297e-05, + "loss": 0.3853, + "step": 11351 + }, + { + "epoch": 2.333641689793401, + "grad_norm": 0.12314844876527786, + "learning_rate": 1.114334675681615e-05, + "loss": 0.4468, + "step": 11352 + }, + { + "epoch": 2.3338472607667797, + "grad_norm": 0.22128140926361084, + "learning_rate": 1.1136768018383064e-05, + "loss": 0.3851, + "step": 11353 + }, + { + "epoch": 2.3340528317401583, + "grad_norm": 0.22692500054836273, + "learning_rate": 1.1130190948233133e-05, + "loss": 0.3878, + "step": 11354 + }, + { + "epoch": 2.334258402713537, + "grad_norm": 0.2241378277540207, + "learning_rate": 1.1123615546690383e-05, + "loss": 0.3838, + "step": 11355 + }, + { + "epoch": 2.3344639736869155, + "grad_norm": 0.22740109264850616, + "learning_rate": 1.1117041814078769e-05, + "loss": 0.3741, + "step": 11356 + }, + { + "epoch": 2.334669544660294, + "grad_norm": 0.25140267610549927, + "learning_rate": 1.1110469750722118e-05, + "loss": 0.3816, + "step": 11357 + }, + { + "epoch": 2.3348751156336727, + "grad_norm": 0.22210964560508728, + "learning_rate": 1.1103899356944239e-05, + "loss": 0.3815, + "step": 11358 + }, + { + "epoch": 2.3350806866070513, + "grad_norm": 0.2357717901468277, + "learning_rate": 1.1097330633068806e-05, + "loss": 0.3867, + "step": 11359 + }, + { + "epoch": 2.33528625758043, + "grad_norm": 0.23202987015247345, + "learning_rate": 1.1090763579419436e-05, + "loss": 0.4003, + "step": 11360 + }, + { + "epoch": 2.335491828553808, + "grad_norm": 0.2323846071958542, + "learning_rate": 1.1084198196319653e-05, + "loss": 0.3845, + "step": 11361 + }, + { + "epoch": 2.3356973995271866, + "grad_norm": 0.22971893846988678, + "learning_rate": 1.1077634484092887e-05, + "loss": 0.3897, + "step": 11362 + }, + { + "epoch": 2.335902970500565, + "grad_norm": 0.23653818666934967, + "learning_rate": 1.1071072443062531e-05, + "loss": 0.416, + "step": 11363 + }, + { + "epoch": 2.3361085414739438, + "grad_norm": 0.21813298761844635, + "learning_rate": 1.1064512073551854e-05, + "loss": 0.3926, + "step": 11364 + }, + { + "epoch": 2.3363141124473223, + "grad_norm": 0.24081604182720184, + "learning_rate": 1.1057953375884053e-05, + "loss": 0.3823, + "step": 11365 + }, + { + "epoch": 2.336519683420701, + "grad_norm": 0.12533682584762573, + "learning_rate": 1.1051396350382246e-05, + "loss": 0.4627, + "step": 11366 + }, + { + "epoch": 2.3367252543940795, + "grad_norm": 0.23893719911575317, + "learning_rate": 1.104484099736946e-05, + "loss": 0.3698, + "step": 11367 + }, + { + "epoch": 2.336930825367458, + "grad_norm": 0.1185644194483757, + "learning_rate": 1.1038287317168643e-05, + "loss": 0.4715, + "step": 11368 + }, + { + "epoch": 2.3371363963408367, + "grad_norm": 0.22912783920764923, + "learning_rate": 1.1031735310102686e-05, + "loss": 0.3963, + "step": 11369 + }, + { + "epoch": 2.3373419673142153, + "grad_norm": 0.23992134630680084, + "learning_rate": 1.1025184976494363e-05, + "loss": 0.3906, + "step": 11370 + }, + { + "epoch": 2.337547538287594, + "grad_norm": 0.2348276525735855, + "learning_rate": 1.1018636316666378e-05, + "loss": 0.4119, + "step": 11371 + }, + { + "epoch": 2.3377531092609725, + "grad_norm": 0.23046445846557617, + "learning_rate": 1.101208933094135e-05, + "loss": 0.3736, + "step": 11372 + }, + { + "epoch": 2.337958680234351, + "grad_norm": 0.22680574655532837, + "learning_rate": 1.1005544019641824e-05, + "loss": 0.3675, + "step": 11373 + }, + { + "epoch": 2.3381642512077296, + "grad_norm": 0.22704631090164185, + "learning_rate": 1.0999000383090255e-05, + "loss": 0.4037, + "step": 11374 + }, + { + "epoch": 2.3383698221811082, + "grad_norm": 0.23311007022857666, + "learning_rate": 1.0992458421609007e-05, + "loss": 0.3913, + "step": 11375 + }, + { + "epoch": 2.3385753931544864, + "grad_norm": 0.23383252322673798, + "learning_rate": 1.098591813552039e-05, + "loss": 0.3879, + "step": 11376 + }, + { + "epoch": 2.338780964127865, + "grad_norm": 0.2401203066110611, + "learning_rate": 1.0979379525146603e-05, + "loss": 0.4057, + "step": 11377 + }, + { + "epoch": 2.3389865351012435, + "grad_norm": 0.23543764650821686, + "learning_rate": 1.0972842590809783e-05, + "loss": 0.3725, + "step": 11378 + }, + { + "epoch": 2.339192106074622, + "grad_norm": 0.22404974699020386, + "learning_rate": 1.0966307332831947e-05, + "loss": 0.3833, + "step": 11379 + }, + { + "epoch": 2.3393976770480007, + "grad_norm": 0.23188042640686035, + "learning_rate": 1.0959773751535091e-05, + "loss": 0.3922, + "step": 11380 + }, + { + "epoch": 2.3396032480213793, + "grad_norm": 0.23337653279304504, + "learning_rate": 1.0953241847241078e-05, + "loss": 0.3864, + "step": 11381 + }, + { + "epoch": 2.339808818994758, + "grad_norm": 0.2359674870967865, + "learning_rate": 1.0946711620271692e-05, + "loss": 0.4073, + "step": 11382 + }, + { + "epoch": 2.3400143899681365, + "grad_norm": 0.22280322015285492, + "learning_rate": 1.0940183070948668e-05, + "loss": 0.3692, + "step": 11383 + }, + { + "epoch": 2.340219960941515, + "grad_norm": 0.2298697531223297, + "learning_rate": 1.0933656199593635e-05, + "loss": 0.3965, + "step": 11384 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 0.12579971551895142, + "learning_rate": 1.0927131006528134e-05, + "loss": 0.4416, + "step": 11385 + }, + { + "epoch": 2.3406311028882723, + "grad_norm": 0.22117015719413757, + "learning_rate": 1.0920607492073632e-05, + "loss": 0.3884, + "step": 11386 + }, + { + "epoch": 2.340836673861651, + "grad_norm": 0.22283059358596802, + "learning_rate": 1.0914085656551514e-05, + "loss": 0.3971, + "step": 11387 + }, + { + "epoch": 2.3410422448350294, + "grad_norm": 0.2289050966501236, + "learning_rate": 1.0907565500283078e-05, + "loss": 0.4027, + "step": 11388 + }, + { + "epoch": 2.341247815808408, + "grad_norm": 0.22611112892627716, + "learning_rate": 1.0901047023589525e-05, + "loss": 0.4097, + "step": 11389 + }, + { + "epoch": 2.3414533867817866, + "grad_norm": 0.23010249435901642, + "learning_rate": 1.0894530226792024e-05, + "loss": 0.3971, + "step": 11390 + }, + { + "epoch": 2.3416589577551647, + "grad_norm": 0.2295684963464737, + "learning_rate": 1.088801511021161e-05, + "loss": 0.371, + "step": 11391 + }, + { + "epoch": 2.3418645287285433, + "grad_norm": 0.221123605966568, + "learning_rate": 1.0881501674169247e-05, + "loss": 0.3816, + "step": 11392 + }, + { + "epoch": 2.342070099701922, + "grad_norm": 0.12064526975154877, + "learning_rate": 1.0874989918985833e-05, + "loss": 0.4318, + "step": 11393 + }, + { + "epoch": 2.3422756706753005, + "grad_norm": 0.23293597996234894, + "learning_rate": 1.0868479844982164e-05, + "loss": 0.3857, + "step": 11394 + }, + { + "epoch": 2.342481241648679, + "grad_norm": 0.22393792867660522, + "learning_rate": 1.0861971452478966e-05, + "loss": 0.3969, + "step": 11395 + }, + { + "epoch": 2.3426868126220577, + "grad_norm": 0.12383504956960678, + "learning_rate": 1.0855464741796857e-05, + "loss": 0.4518, + "step": 11396 + }, + { + "epoch": 2.3428923835954363, + "grad_norm": 0.2288213074207306, + "learning_rate": 1.0848959713256421e-05, + "loss": 0.3848, + "step": 11397 + }, + { + "epoch": 2.343097954568815, + "grad_norm": 0.23577377200126648, + "learning_rate": 1.0842456367178123e-05, + "loss": 0.4115, + "step": 11398 + }, + { + "epoch": 2.3433035255421935, + "grad_norm": 0.22047261893749237, + "learning_rate": 1.0835954703882345e-05, + "loss": 0.3738, + "step": 11399 + }, + { + "epoch": 2.343509096515572, + "grad_norm": 0.22211310267448425, + "learning_rate": 1.0829454723689383e-05, + "loss": 0.4006, + "step": 11400 + }, + { + "epoch": 2.3437146674889506, + "grad_norm": 0.23019267618656158, + "learning_rate": 1.0822956426919487e-05, + "loss": 0.3988, + "step": 11401 + }, + { + "epoch": 2.343920238462329, + "grad_norm": 0.23312908411026, + "learning_rate": 1.0816459813892787e-05, + "loss": 0.3799, + "step": 11402 + }, + { + "epoch": 2.344125809435708, + "grad_norm": 0.2296217679977417, + "learning_rate": 1.0809964884929325e-05, + "loss": 0.3731, + "step": 11403 + }, + { + "epoch": 2.3443313804090864, + "grad_norm": 0.12692473828792572, + "learning_rate": 1.08034716403491e-05, + "loss": 0.4605, + "step": 11404 + }, + { + "epoch": 2.344536951382465, + "grad_norm": 0.2210485190153122, + "learning_rate": 1.0796980080471993e-05, + "loss": 0.3822, + "step": 11405 + }, + { + "epoch": 2.344742522355843, + "grad_norm": 0.22621271014213562, + "learning_rate": 1.0790490205617812e-05, + "loss": 0.3743, + "step": 11406 + }, + { + "epoch": 2.344948093329222, + "grad_norm": 0.22742587327957153, + "learning_rate": 1.0784002016106287e-05, + "loss": 0.4062, + "step": 11407 + }, + { + "epoch": 2.3451536643026003, + "grad_norm": 0.2238548845052719, + "learning_rate": 1.0777515512257057e-05, + "loss": 0.3738, + "step": 11408 + }, + { + "epoch": 2.345359235275979, + "grad_norm": 0.2274450659751892, + "learning_rate": 1.077103069438968e-05, + "loss": 0.4024, + "step": 11409 + }, + { + "epoch": 2.3455648062493575, + "grad_norm": 0.2332809865474701, + "learning_rate": 1.0764547562823627e-05, + "loss": 0.4046, + "step": 11410 + }, + { + "epoch": 2.345770377222736, + "grad_norm": 0.1246822252869606, + "learning_rate": 1.0758066117878307e-05, + "loss": 0.4457, + "step": 11411 + }, + { + "epoch": 2.3459759481961147, + "grad_norm": 0.2296641618013382, + "learning_rate": 1.0751586359873026e-05, + "loss": 0.394, + "step": 11412 + }, + { + "epoch": 2.3461815191694932, + "grad_norm": 0.2302107959985733, + "learning_rate": 1.0745108289127006e-05, + "loss": 0.4005, + "step": 11413 + }, + { + "epoch": 2.346387090142872, + "grad_norm": 0.12304381281137466, + "learning_rate": 1.0738631905959397e-05, + "loss": 0.4551, + "step": 11414 + }, + { + "epoch": 2.3465926611162504, + "grad_norm": 0.23445133864879608, + "learning_rate": 1.0732157210689257e-05, + "loss": 0.3921, + "step": 11415 + }, + { + "epoch": 2.346798232089629, + "grad_norm": 0.22406600415706635, + "learning_rate": 1.0725684203635556e-05, + "loss": 0.3952, + "step": 11416 + }, + { + "epoch": 2.3470038030630076, + "grad_norm": 0.2265467792749405, + "learning_rate": 1.0719212885117194e-05, + "loss": 0.3897, + "step": 11417 + }, + { + "epoch": 2.347209374036386, + "grad_norm": 0.22809205949306488, + "learning_rate": 1.0712743255452993e-05, + "loss": 0.3919, + "step": 11418 + }, + { + "epoch": 2.3474149450097648, + "grad_norm": 0.12264318019151688, + "learning_rate": 1.0706275314961672e-05, + "loss": 0.4388, + "step": 11419 + }, + { + "epoch": 2.3476205159831434, + "grad_norm": 0.26397988200187683, + "learning_rate": 1.0699809063961879e-05, + "loss": 0.3855, + "step": 11420 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.2346579134464264, + "learning_rate": 1.0693344502772162e-05, + "loss": 0.397, + "step": 11421 + }, + { + "epoch": 2.3480316579299005, + "grad_norm": 0.23680004477500916, + "learning_rate": 1.0686881631711023e-05, + "loss": 0.4192, + "step": 11422 + }, + { + "epoch": 2.3482372289032787, + "grad_norm": 0.22599753737449646, + "learning_rate": 1.0680420451096852e-05, + "loss": 0.4073, + "step": 11423 + }, + { + "epoch": 2.3484427998766573, + "grad_norm": 0.23660646378993988, + "learning_rate": 1.0673960961247943e-05, + "loss": 0.3879, + "step": 11424 + }, + { + "epoch": 2.348648370850036, + "grad_norm": 0.26729151606559753, + "learning_rate": 1.0667503162482548e-05, + "loss": 0.3812, + "step": 11425 + }, + { + "epoch": 2.3488539418234144, + "grad_norm": 0.22443972527980804, + "learning_rate": 1.06610470551188e-05, + "loss": 0.4038, + "step": 11426 + }, + { + "epoch": 2.349059512796793, + "grad_norm": 0.22883421182632446, + "learning_rate": 1.0654592639474768e-05, + "loss": 0.3976, + "step": 11427 + }, + { + "epoch": 2.3492650837701716, + "grad_norm": 0.23285576701164246, + "learning_rate": 1.0648139915868425e-05, + "loss": 0.3958, + "step": 11428 + }, + { + "epoch": 2.34947065474355, + "grad_norm": 0.22203494608402252, + "learning_rate": 1.0641688884617673e-05, + "loss": 0.391, + "step": 11429 + }, + { + "epoch": 2.349676225716929, + "grad_norm": 0.21967896819114685, + "learning_rate": 1.0635239546040312e-05, + "loss": 0.3793, + "step": 11430 + }, + { + "epoch": 2.3498817966903074, + "grad_norm": 0.23787444829940796, + "learning_rate": 1.062879190045407e-05, + "loss": 0.3829, + "step": 11431 + }, + { + "epoch": 2.350087367663686, + "grad_norm": 0.2242104560136795, + "learning_rate": 1.0622345948176609e-05, + "loss": 0.3986, + "step": 11432 + }, + { + "epoch": 2.3502929386370646, + "grad_norm": 0.12349691241979599, + "learning_rate": 1.0615901689525487e-05, + "loss": 0.4521, + "step": 11433 + }, + { + "epoch": 2.350498509610443, + "grad_norm": 0.2340456247329712, + "learning_rate": 1.0609459124818177e-05, + "loss": 0.419, + "step": 11434 + }, + { + "epoch": 2.3507040805838217, + "grad_norm": 0.12247911095619202, + "learning_rate": 1.0603018254372072e-05, + "loss": 0.4609, + "step": 11435 + }, + { + "epoch": 2.3509096515572, + "grad_norm": 0.2385515421628952, + "learning_rate": 1.0596579078504486e-05, + "loss": 0.3997, + "step": 11436 + }, + { + "epoch": 2.351115222530579, + "grad_norm": 0.12545520067214966, + "learning_rate": 1.0590141597532653e-05, + "loss": 0.4411, + "step": 11437 + }, + { + "epoch": 2.351320793503957, + "grad_norm": 0.23046888411045074, + "learning_rate": 1.0583705811773695e-05, + "loss": 0.3795, + "step": 11438 + }, + { + "epoch": 2.3515263644773357, + "grad_norm": 0.12221966683864594, + "learning_rate": 1.0577271721544703e-05, + "loss": 0.4572, + "step": 11439 + }, + { + "epoch": 2.3517319354507142, + "grad_norm": 0.22688139975070953, + "learning_rate": 1.0570839327162644e-05, + "loss": 0.3925, + "step": 11440 + }, + { + "epoch": 2.351937506424093, + "grad_norm": 0.23011426627635956, + "learning_rate": 1.056440862894441e-05, + "loss": 0.3921, + "step": 11441 + }, + { + "epoch": 2.3521430773974714, + "grad_norm": 0.2639561891555786, + "learning_rate": 1.0557979627206812e-05, + "loss": 0.3734, + "step": 11442 + }, + { + "epoch": 2.35234864837085, + "grad_norm": 0.2354530692100525, + "learning_rate": 1.055155232226656e-05, + "loss": 0.3819, + "step": 11443 + }, + { + "epoch": 2.3525542193442286, + "grad_norm": 0.23552900552749634, + "learning_rate": 1.0545126714440329e-05, + "loss": 0.3951, + "step": 11444 + }, + { + "epoch": 2.352759790317607, + "grad_norm": 0.12986932694911957, + "learning_rate": 1.0538702804044648e-05, + "loss": 0.4338, + "step": 11445 + }, + { + "epoch": 2.3529653612909858, + "grad_norm": 0.22587868571281433, + "learning_rate": 1.0532280591396021e-05, + "loss": 0.388, + "step": 11446 + }, + { + "epoch": 2.3531709322643644, + "grad_norm": 0.22547636926174164, + "learning_rate": 1.0525860076810829e-05, + "loss": 0.3929, + "step": 11447 + }, + { + "epoch": 2.353376503237743, + "grad_norm": 0.24222803115844727, + "learning_rate": 1.0519441260605384e-05, + "loss": 0.3973, + "step": 11448 + }, + { + "epoch": 2.3535820742111215, + "grad_norm": 0.2281145453453064, + "learning_rate": 1.0513024143095896e-05, + "loss": 0.3693, + "step": 11449 + }, + { + "epoch": 2.3537876451845, + "grad_norm": 0.22498665750026703, + "learning_rate": 1.0506608724598525e-05, + "loss": 0.3781, + "step": 11450 + }, + { + "epoch": 2.3539932161578783, + "grad_norm": 0.12150565534830093, + "learning_rate": 1.0500195005429303e-05, + "loss": 0.4532, + "step": 11451 + }, + { + "epoch": 2.3541987871312573, + "grad_norm": 0.23014621436595917, + "learning_rate": 1.0493782985904235e-05, + "loss": 0.3878, + "step": 11452 + }, + { + "epoch": 2.3544043581046354, + "grad_norm": 0.2346828430891037, + "learning_rate": 1.04873726663392e-05, + "loss": 0.4009, + "step": 11453 + }, + { + "epoch": 2.354609929078014, + "grad_norm": 0.21988657116889954, + "learning_rate": 1.0480964047050002e-05, + "loss": 0.3942, + "step": 11454 + }, + { + "epoch": 2.3548155000513926, + "grad_norm": 0.12439004331827164, + "learning_rate": 1.0474557128352365e-05, + "loss": 0.4566, + "step": 11455 + }, + { + "epoch": 2.355021071024771, + "grad_norm": 0.12461668252944946, + "learning_rate": 1.0468151910561923e-05, + "loss": 0.4609, + "step": 11456 + }, + { + "epoch": 2.35522664199815, + "grad_norm": 0.11804953217506409, + "learning_rate": 1.0461748393994234e-05, + "loss": 0.4588, + "step": 11457 + }, + { + "epoch": 2.3554322129715284, + "grad_norm": 0.2295227199792862, + "learning_rate": 1.045534657896476e-05, + "loss": 0.3971, + "step": 11458 + }, + { + "epoch": 2.355637783944907, + "grad_norm": 0.22749020159244537, + "learning_rate": 1.0448946465788915e-05, + "loss": 0.4247, + "step": 11459 + }, + { + "epoch": 2.3558433549182856, + "grad_norm": 0.22229236364364624, + "learning_rate": 1.044254805478198e-05, + "loss": 0.3964, + "step": 11460 + }, + { + "epoch": 2.356048925891664, + "grad_norm": 0.2296680361032486, + "learning_rate": 1.0436151346259184e-05, + "loss": 0.402, + "step": 11461 + }, + { + "epoch": 2.3562544968650427, + "grad_norm": 0.12308470159769058, + "learning_rate": 1.0429756340535659e-05, + "loss": 0.4583, + "step": 11462 + }, + { + "epoch": 2.3564600678384213, + "grad_norm": 0.12049432843923569, + "learning_rate": 1.0423363037926464e-05, + "loss": 0.4624, + "step": 11463 + }, + { + "epoch": 2.3566656388118, + "grad_norm": 0.12415426224470139, + "learning_rate": 1.0416971438746542e-05, + "loss": 0.4517, + "step": 11464 + }, + { + "epoch": 2.3568712097851785, + "grad_norm": 0.2221984714269638, + "learning_rate": 1.041058154331081e-05, + "loss": 0.3924, + "step": 11465 + }, + { + "epoch": 2.3570767807585566, + "grad_norm": 0.22418946027755737, + "learning_rate": 1.0404193351934057e-05, + "loss": 0.3781, + "step": 11466 + }, + { + "epoch": 2.3572823517319357, + "grad_norm": 0.2208791971206665, + "learning_rate": 1.0397806864930983e-05, + "loss": 0.3731, + "step": 11467 + }, + { + "epoch": 2.357487922705314, + "grad_norm": 0.23673607409000397, + "learning_rate": 1.0391422082616247e-05, + "loss": 0.3809, + "step": 11468 + }, + { + "epoch": 2.3576934936786924, + "grad_norm": 0.22379258275032043, + "learning_rate": 1.0385039005304386e-05, + "loss": 0.401, + "step": 11469 + }, + { + "epoch": 2.357899064652071, + "grad_norm": 0.2308909147977829, + "learning_rate": 1.0378657633309862e-05, + "loss": 0.3777, + "step": 11470 + }, + { + "epoch": 2.3581046356254496, + "grad_norm": 0.12026369571685791, + "learning_rate": 1.0372277966947059e-05, + "loss": 0.4592, + "step": 11471 + }, + { + "epoch": 2.358310206598828, + "grad_norm": 0.12578755617141724, + "learning_rate": 1.036590000653026e-05, + "loss": 0.4422, + "step": 11472 + }, + { + "epoch": 2.3585157775722068, + "grad_norm": 0.23081423342227936, + "learning_rate": 1.0359523752373694e-05, + "loss": 0.3895, + "step": 11473 + }, + { + "epoch": 2.3587213485455854, + "grad_norm": 0.1233346238732338, + "learning_rate": 1.035314920479149e-05, + "loss": 0.4362, + "step": 11474 + }, + { + "epoch": 2.358926919518964, + "grad_norm": 0.23306210339069366, + "learning_rate": 1.0346776364097683e-05, + "loss": 0.3826, + "step": 11475 + }, + { + "epoch": 2.3591324904923425, + "grad_norm": 0.23711657524108887, + "learning_rate": 1.0340405230606235e-05, + "loss": 0.3861, + "step": 11476 + }, + { + "epoch": 2.359338061465721, + "grad_norm": 0.24400153756141663, + "learning_rate": 1.0334035804631026e-05, + "loss": 0.3896, + "step": 11477 + }, + { + "epoch": 2.3595436324390997, + "grad_norm": 0.1271253228187561, + "learning_rate": 1.0327668086485842e-05, + "loss": 0.4421, + "step": 11478 + }, + { + "epoch": 2.3597492034124783, + "grad_norm": 0.23349100351333618, + "learning_rate": 1.0321302076484381e-05, + "loss": 0.3748, + "step": 11479 + }, + { + "epoch": 2.359954774385857, + "grad_norm": 0.22339333593845367, + "learning_rate": 1.031493777494029e-05, + "loss": 0.392, + "step": 11480 + }, + { + "epoch": 2.3601603453592355, + "grad_norm": 0.23393899202346802, + "learning_rate": 1.03085751821671e-05, + "loss": 0.3857, + "step": 11481 + }, + { + "epoch": 2.360365916332614, + "grad_norm": 0.22653932869434357, + "learning_rate": 1.0302214298478262e-05, + "loss": 0.3752, + "step": 11482 + }, + { + "epoch": 2.360571487305992, + "grad_norm": 0.2276255041360855, + "learning_rate": 1.0295855124187149e-05, + "loss": 0.3894, + "step": 11483 + }, + { + "epoch": 2.360777058279371, + "grad_norm": 0.22504010796546936, + "learning_rate": 1.0289497659607049e-05, + "loss": 0.355, + "step": 11484 + }, + { + "epoch": 2.3609826292527494, + "grad_norm": 0.2319696694612503, + "learning_rate": 1.0283141905051145e-05, + "loss": 0.4006, + "step": 11485 + }, + { + "epoch": 2.361188200226128, + "grad_norm": 0.23021776974201202, + "learning_rate": 1.0276787860832589e-05, + "loss": 0.3885, + "step": 11486 + }, + { + "epoch": 2.3613937711995066, + "grad_norm": 0.22840525209903717, + "learning_rate": 1.0270435527264398e-05, + "loss": 0.3885, + "step": 11487 + }, + { + "epoch": 2.361599342172885, + "grad_norm": 0.23946824669837952, + "learning_rate": 1.0264084904659514e-05, + "loss": 0.3887, + "step": 11488 + }, + { + "epoch": 2.3618049131462637, + "grad_norm": 0.23394089937210083, + "learning_rate": 1.025773599333082e-05, + "loss": 0.387, + "step": 11489 + }, + { + "epoch": 2.3620104841196423, + "grad_norm": 0.2347833514213562, + "learning_rate": 1.0251388793591093e-05, + "loss": 0.3909, + "step": 11490 + }, + { + "epoch": 2.362216055093021, + "grad_norm": 0.24539333581924438, + "learning_rate": 1.024504330575302e-05, + "loss": 0.3911, + "step": 11491 + }, + { + "epoch": 2.3624216260663995, + "grad_norm": 0.2272724211215973, + "learning_rate": 1.0238699530129222e-05, + "loss": 0.3899, + "step": 11492 + }, + { + "epoch": 2.362627197039778, + "grad_norm": 0.1279131919145584, + "learning_rate": 1.0232357467032217e-05, + "loss": 0.4453, + "step": 11493 + }, + { + "epoch": 2.3628327680131567, + "grad_norm": 0.21736563742160797, + "learning_rate": 1.0226017116774459e-05, + "loss": 0.3957, + "step": 11494 + }, + { + "epoch": 2.3630383389865353, + "grad_norm": 0.22350220382213593, + "learning_rate": 1.0219678479668308e-05, + "loss": 0.38, + "step": 11495 + }, + { + "epoch": 2.363243909959914, + "grad_norm": 0.22701016068458557, + "learning_rate": 1.0213341556026038e-05, + "loss": 0.3937, + "step": 11496 + }, + { + "epoch": 2.3634494809332924, + "grad_norm": 0.23441599309444427, + "learning_rate": 1.0207006346159835e-05, + "loss": 0.3887, + "step": 11497 + }, + { + "epoch": 2.3636550519066706, + "grad_norm": 0.2203342318534851, + "learning_rate": 1.0200672850381808e-05, + "loss": 0.3824, + "step": 11498 + }, + { + "epoch": 2.363860622880049, + "grad_norm": 0.12455693632364273, + "learning_rate": 1.0194341069003977e-05, + "loss": 0.4432, + "step": 11499 + }, + { + "epoch": 2.3640661938534278, + "grad_norm": 0.22522957623004913, + "learning_rate": 1.0188011002338268e-05, + "loss": 0.376, + "step": 11500 + }, + { + "epoch": 2.3642717648268063, + "grad_norm": 0.23828500509262085, + "learning_rate": 1.0181682650696563e-05, + "loss": 0.394, + "step": 11501 + }, + { + "epoch": 2.364477335800185, + "grad_norm": 0.1210595965385437, + "learning_rate": 1.0175356014390606e-05, + "loss": 0.4444, + "step": 11502 + }, + { + "epoch": 2.3646829067735635, + "grad_norm": 0.22295325994491577, + "learning_rate": 1.0169031093732092e-05, + "loss": 0.3968, + "step": 11503 + }, + { + "epoch": 2.364888477746942, + "grad_norm": 0.21954037249088287, + "learning_rate": 1.016270788903262e-05, + "loss": 0.3821, + "step": 11504 + }, + { + "epoch": 2.3650940487203207, + "grad_norm": 0.23098480701446533, + "learning_rate": 1.0156386400603697e-05, + "loss": 0.3838, + "step": 11505 + }, + { + "epoch": 2.3652996196936993, + "grad_norm": 0.2265908420085907, + "learning_rate": 1.0150066628756741e-05, + "loss": 0.4052, + "step": 11506 + }, + { + "epoch": 2.365505190667078, + "grad_norm": 0.2330310344696045, + "learning_rate": 1.0143748573803133e-05, + "loss": 0.4044, + "step": 11507 + }, + { + "epoch": 2.3657107616404565, + "grad_norm": 0.22517040371894836, + "learning_rate": 1.0137432236054111e-05, + "loss": 0.4007, + "step": 11508 + }, + { + "epoch": 2.365916332613835, + "grad_norm": 0.22092685103416443, + "learning_rate": 1.0131117615820847e-05, + "loss": 0.395, + "step": 11509 + }, + { + "epoch": 2.3661219035872136, + "grad_norm": 0.22711274027824402, + "learning_rate": 1.0124804713414453e-05, + "loss": 0.3662, + "step": 11510 + }, + { + "epoch": 2.3663274745605922, + "grad_norm": 0.2373218983411789, + "learning_rate": 1.011849352914592e-05, + "loss": 0.3841, + "step": 11511 + }, + { + "epoch": 2.366533045533971, + "grad_norm": 0.23521727323532104, + "learning_rate": 1.011218406332618e-05, + "loss": 0.3878, + "step": 11512 + }, + { + "epoch": 2.366738616507349, + "grad_norm": 0.12808705866336823, + "learning_rate": 1.0105876316266065e-05, + "loss": 0.4659, + "step": 11513 + }, + { + "epoch": 2.3669441874807275, + "grad_norm": 0.11950518935918808, + "learning_rate": 1.0099570288276317e-05, + "loss": 0.4365, + "step": 11514 + }, + { + "epoch": 2.367149758454106, + "grad_norm": 0.24219125509262085, + "learning_rate": 1.0093265979667625e-05, + "loss": 0.3793, + "step": 11515 + }, + { + "epoch": 2.3673553294274847, + "grad_norm": 0.22846874594688416, + "learning_rate": 1.0086963390750568e-05, + "loss": 0.3735, + "step": 11516 + }, + { + "epoch": 2.3675609004008633, + "grad_norm": 0.23134097456932068, + "learning_rate": 1.0080662521835643e-05, + "loss": 0.3869, + "step": 11517 + }, + { + "epoch": 2.367766471374242, + "grad_norm": 0.21544456481933594, + "learning_rate": 1.0074363373233259e-05, + "loss": 0.393, + "step": 11518 + }, + { + "epoch": 2.3679720423476205, + "grad_norm": 0.22806456685066223, + "learning_rate": 1.0068065945253753e-05, + "loss": 0.3971, + "step": 11519 + }, + { + "epoch": 2.368177613320999, + "grad_norm": 0.22958514094352722, + "learning_rate": 1.0061770238207364e-05, + "loss": 0.4065, + "step": 11520 + }, + { + "epoch": 2.3683831842943777, + "grad_norm": 0.229364275932312, + "learning_rate": 1.0055476252404244e-05, + "loss": 0.394, + "step": 11521 + }, + { + "epoch": 2.3685887552677563, + "grad_norm": 0.23312440514564514, + "learning_rate": 1.0049183988154493e-05, + "loss": 0.4033, + "step": 11522 + }, + { + "epoch": 2.368794326241135, + "grad_norm": 0.22797515988349915, + "learning_rate": 1.0042893445768084e-05, + "loss": 0.3912, + "step": 11523 + }, + { + "epoch": 2.3689998972145134, + "grad_norm": 0.23262247443199158, + "learning_rate": 1.0036604625554923e-05, + "loss": 0.3907, + "step": 11524 + }, + { + "epoch": 2.369205468187892, + "grad_norm": 0.22848542034626007, + "learning_rate": 1.003031752782484e-05, + "loss": 0.3972, + "step": 11525 + }, + { + "epoch": 2.3694110391612706, + "grad_norm": 0.21729277074337006, + "learning_rate": 1.002403215288756e-05, + "loss": 0.4045, + "step": 11526 + }, + { + "epoch": 2.369616610134649, + "grad_norm": 0.22861436009407043, + "learning_rate": 1.001774850105273e-05, + "loss": 0.4129, + "step": 11527 + }, + { + "epoch": 2.3698221811080273, + "grad_norm": 0.22693173587322235, + "learning_rate": 1.0011466572629933e-05, + "loss": 0.3786, + "step": 11528 + }, + { + "epoch": 2.370027752081406, + "grad_norm": 0.23766165971755981, + "learning_rate": 1.0005186367928648e-05, + "loss": 0.406, + "step": 11529 + }, + { + "epoch": 2.3702333230547845, + "grad_norm": 0.12284702807664871, + "learning_rate": 9.998907887258245e-06, + "loss": 0.4393, + "step": 11530 + }, + { + "epoch": 2.370438894028163, + "grad_norm": 0.12830850481987, + "learning_rate": 9.992631130928073e-06, + "loss": 0.4596, + "step": 11531 + }, + { + "epoch": 2.3706444650015417, + "grad_norm": 0.21804697811603546, + "learning_rate": 9.986356099247343e-06, + "loss": 0.3676, + "step": 11532 + }, + { + "epoch": 2.3708500359749203, + "grad_norm": 0.12258250266313553, + "learning_rate": 9.98008279252519e-06, + "loss": 0.4617, + "step": 11533 + }, + { + "epoch": 2.371055606948299, + "grad_norm": 0.22385314106941223, + "learning_rate": 9.973811211070666e-06, + "loss": 0.3938, + "step": 11534 + }, + { + "epoch": 2.3712611779216775, + "grad_norm": 0.12500827014446259, + "learning_rate": 9.967541355192763e-06, + "loss": 0.4385, + "step": 11535 + }, + { + "epoch": 2.371466748895056, + "grad_norm": 0.23873549699783325, + "learning_rate": 9.961273225200353e-06, + "loss": 0.3857, + "step": 11536 + }, + { + "epoch": 2.3716723198684346, + "grad_norm": 0.23228701949119568, + "learning_rate": 9.955006821402244e-06, + "loss": 0.3898, + "step": 11537 + }, + { + "epoch": 2.371877890841813, + "grad_norm": 0.12291909754276276, + "learning_rate": 9.948742144107149e-06, + "loss": 0.4612, + "step": 11538 + }, + { + "epoch": 2.372083461815192, + "grad_norm": 0.23434710502624512, + "learning_rate": 9.942479193623696e-06, + "loss": 0.3871, + "step": 11539 + }, + { + "epoch": 2.3722890327885704, + "grad_norm": 0.23191601037979126, + "learning_rate": 9.936217970260437e-06, + "loss": 0.4079, + "step": 11540 + }, + { + "epoch": 2.372494603761949, + "grad_norm": 0.21654024720191956, + "learning_rate": 9.929958474325821e-06, + "loss": 0.387, + "step": 11541 + }, + { + "epoch": 2.3727001747353276, + "grad_norm": 0.2176521271467209, + "learning_rate": 9.923700706128245e-06, + "loss": 0.4028, + "step": 11542 + }, + { + "epoch": 2.3729057457087057, + "grad_norm": 0.22292236983776093, + "learning_rate": 9.917444665975987e-06, + "loss": 0.3789, + "step": 11543 + }, + { + "epoch": 2.3731113166820843, + "grad_norm": 0.23066085577011108, + "learning_rate": 9.911190354177257e-06, + "loss": 0.3781, + "step": 11544 + }, + { + "epoch": 2.373316887655463, + "grad_norm": 0.1312110424041748, + "learning_rate": 9.904937771040172e-06, + "loss": 0.4353, + "step": 11545 + }, + { + "epoch": 2.3735224586288415, + "grad_norm": 0.22334595024585724, + "learning_rate": 9.89868691687277e-06, + "loss": 0.3894, + "step": 11546 + }, + { + "epoch": 2.37372802960222, + "grad_norm": 0.2200348973274231, + "learning_rate": 9.892437791983002e-06, + "loss": 0.3792, + "step": 11547 + }, + { + "epoch": 2.3739336005755987, + "grad_norm": 0.2263760268688202, + "learning_rate": 9.886190396678715e-06, + "loss": 0.3948, + "step": 11548 + }, + { + "epoch": 2.3741391715489772, + "grad_norm": 0.22932201623916626, + "learning_rate": 9.879944731267723e-06, + "loss": 0.3927, + "step": 11549 + }, + { + "epoch": 2.374344742522356, + "grad_norm": 0.22899407148361206, + "learning_rate": 9.873700796057702e-06, + "loss": 0.4045, + "step": 11550 + }, + { + "epoch": 2.3745503134957344, + "grad_norm": 0.23349648714065552, + "learning_rate": 9.867458591356262e-06, + "loss": 0.3858, + "step": 11551 + }, + { + "epoch": 2.374755884469113, + "grad_norm": 0.2280297577381134, + "learning_rate": 9.861218117470914e-06, + "loss": 0.3987, + "step": 11552 + }, + { + "epoch": 2.3749614554424916, + "grad_norm": 0.11944809556007385, + "learning_rate": 9.854979374709125e-06, + "loss": 0.44, + "step": 11553 + }, + { + "epoch": 2.37516702641587, + "grad_norm": 0.2443980574607849, + "learning_rate": 9.848742363378233e-06, + "loss": 0.3749, + "step": 11554 + }, + { + "epoch": 2.3753725973892488, + "grad_norm": 0.224415123462677, + "learning_rate": 9.8425070837855e-06, + "loss": 0.4007, + "step": 11555 + }, + { + "epoch": 2.3755781683626274, + "grad_norm": 0.23538914322853088, + "learning_rate": 9.836273536238125e-06, + "loss": 0.4024, + "step": 11556 + }, + { + "epoch": 2.375783739336006, + "grad_norm": 0.2267664521932602, + "learning_rate": 9.830041721043201e-06, + "loss": 0.3676, + "step": 11557 + }, + { + "epoch": 2.375989310309384, + "grad_norm": 0.2350446581840515, + "learning_rate": 9.823811638507738e-06, + "loss": 0.3737, + "step": 11558 + }, + { + "epoch": 2.3761948812827627, + "grad_norm": 0.23056869208812714, + "learning_rate": 9.81758328893866e-06, + "loss": 0.3897, + "step": 11559 + }, + { + "epoch": 2.3764004522561413, + "grad_norm": 0.22713732719421387, + "learning_rate": 9.811356672642816e-06, + "loss": 0.3669, + "step": 11560 + }, + { + "epoch": 2.37660602322952, + "grad_norm": 0.22514687478542328, + "learning_rate": 9.805131789926953e-06, + "loss": 0.3922, + "step": 11561 + }, + { + "epoch": 2.3768115942028984, + "grad_norm": 0.2302553504705429, + "learning_rate": 9.798908641097734e-06, + "loss": 0.3878, + "step": 11562 + }, + { + "epoch": 2.377017165176277, + "grad_norm": 0.22958478331565857, + "learning_rate": 9.792687226461768e-06, + "loss": 0.3946, + "step": 11563 + }, + { + "epoch": 2.3772227361496556, + "grad_norm": 0.2399352788925171, + "learning_rate": 9.786467546325548e-06, + "loss": 0.3835, + "step": 11564 + }, + { + "epoch": 2.377428307123034, + "grad_norm": 0.22785188257694244, + "learning_rate": 9.780249600995484e-06, + "loss": 0.383, + "step": 11565 + }, + { + "epoch": 2.377633878096413, + "grad_norm": 0.23024681210517883, + "learning_rate": 9.774033390777902e-06, + "loss": 0.379, + "step": 11566 + }, + { + "epoch": 2.3778394490697914, + "grad_norm": 0.12375470250844955, + "learning_rate": 9.767818915979052e-06, + "loss": 0.4333, + "step": 11567 + }, + { + "epoch": 2.37804502004317, + "grad_norm": 0.23087210953235626, + "learning_rate": 9.761606176905089e-06, + "loss": 0.3899, + "step": 11568 + }, + { + "epoch": 2.3782505910165486, + "grad_norm": 0.12501628696918488, + "learning_rate": 9.755395173862072e-06, + "loss": 0.4761, + "step": 11569 + }, + { + "epoch": 2.378456161989927, + "grad_norm": 0.23227210342884064, + "learning_rate": 9.749185907156014e-06, + "loss": 0.3867, + "step": 11570 + }, + { + "epoch": 2.3786617329633057, + "grad_norm": 0.11980535089969635, + "learning_rate": 9.742978377092805e-06, + "loss": 0.4406, + "step": 11571 + }, + { + "epoch": 2.3788673039366843, + "grad_norm": 0.23125259578227997, + "learning_rate": 9.736772583978261e-06, + "loss": 0.3782, + "step": 11572 + }, + { + "epoch": 2.3790728749100625, + "grad_norm": 0.237775981426239, + "learning_rate": 9.730568528118097e-06, + "loss": 0.4088, + "step": 11573 + }, + { + "epoch": 2.3792784458834415, + "grad_norm": 0.22310465574264526, + "learning_rate": 9.724366209817991e-06, + "loss": 0.3875, + "step": 11574 + }, + { + "epoch": 2.3794840168568197, + "grad_norm": 0.23160605132579803, + "learning_rate": 9.71816562938348e-06, + "loss": 0.3908, + "step": 11575 + }, + { + "epoch": 2.3796895878301982, + "grad_norm": 0.21605071425437927, + "learning_rate": 9.711966787120025e-06, + "loss": 0.3931, + "step": 11576 + }, + { + "epoch": 2.379895158803577, + "grad_norm": 0.23142650723457336, + "learning_rate": 9.705769683333049e-06, + "loss": 0.3814, + "step": 11577 + }, + { + "epoch": 2.3801007297769554, + "grad_norm": 0.2322172224521637, + "learning_rate": 9.699574318327836e-06, + "loss": 0.4077, + "step": 11578 + }, + { + "epoch": 2.380306300750334, + "grad_norm": 0.23128941655158997, + "learning_rate": 9.693380692409598e-06, + "loss": 0.4085, + "step": 11579 + }, + { + "epoch": 2.3805118717237126, + "grad_norm": 0.22898712754249573, + "learning_rate": 9.687188805883475e-06, + "loss": 0.3729, + "step": 11580 + }, + { + "epoch": 2.380717442697091, + "grad_norm": 0.2282625287771225, + "learning_rate": 9.680998659054504e-06, + "loss": 0.3726, + "step": 11581 + }, + { + "epoch": 2.3809230136704698, + "grad_norm": 0.2424619495868683, + "learning_rate": 9.674810252227655e-06, + "loss": 0.4017, + "step": 11582 + }, + { + "epoch": 2.3811285846438484, + "grad_norm": 0.12431478500366211, + "learning_rate": 9.668623585707774e-06, + "loss": 0.4515, + "step": 11583 + }, + { + "epoch": 2.381334155617227, + "grad_norm": 0.2225092202425003, + "learning_rate": 9.662438659799689e-06, + "loss": 0.3965, + "step": 11584 + }, + { + "epoch": 2.3815397265906055, + "grad_norm": 0.2349206954240799, + "learning_rate": 9.656255474808082e-06, + "loss": 0.3851, + "step": 11585 + }, + { + "epoch": 2.381745297563984, + "grad_norm": 0.22471074759960175, + "learning_rate": 9.650074031037576e-06, + "loss": 0.396, + "step": 11586 + }, + { + "epoch": 2.3819508685373627, + "grad_norm": 0.13233044743537903, + "learning_rate": 9.643894328792692e-06, + "loss": 0.4617, + "step": 11587 + }, + { + "epoch": 2.382156439510741, + "grad_norm": 0.11851814389228821, + "learning_rate": 9.637716368377883e-06, + "loss": 0.4364, + "step": 11588 + }, + { + "epoch": 2.38236201048412, + "grad_norm": 0.22143539786338806, + "learning_rate": 9.631540150097501e-06, + "loss": 0.4004, + "step": 11589 + }, + { + "epoch": 2.382567581457498, + "grad_norm": 0.2365567684173584, + "learning_rate": 9.625365674255817e-06, + "loss": 0.4103, + "step": 11590 + }, + { + "epoch": 2.3827731524308766, + "grad_norm": 0.22856760025024414, + "learning_rate": 9.619192941157033e-06, + "loss": 0.3897, + "step": 11591 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.12115172296762466, + "learning_rate": 9.613021951105246e-06, + "loss": 0.456, + "step": 11592 + }, + { + "epoch": 2.383184294377634, + "grad_norm": 0.2271842509508133, + "learning_rate": 9.606852704404472e-06, + "loss": 0.3896, + "step": 11593 + }, + { + "epoch": 2.3833898653510124, + "grad_norm": 0.232135608792305, + "learning_rate": 9.600685201358626e-06, + "loss": 0.3863, + "step": 11594 + }, + { + "epoch": 2.383595436324391, + "grad_norm": 0.23168498277664185, + "learning_rate": 9.594519442271568e-06, + "loss": 0.4031, + "step": 11595 + }, + { + "epoch": 2.3838010072977696, + "grad_norm": 0.22451357543468475, + "learning_rate": 9.588355427447062e-06, + "loss": 0.3845, + "step": 11596 + }, + { + "epoch": 2.384006578271148, + "grad_norm": 0.23108120262622833, + "learning_rate": 9.582193157188753e-06, + "loss": 0.3817, + "step": 11597 + }, + { + "epoch": 2.3842121492445267, + "grad_norm": 0.2306068241596222, + "learning_rate": 9.576032631800258e-06, + "loss": 0.3839, + "step": 11598 + }, + { + "epoch": 2.3844177202179053, + "grad_norm": 0.23344826698303223, + "learning_rate": 9.569873851585067e-06, + "loss": 0.3873, + "step": 11599 + }, + { + "epoch": 2.384623291191284, + "grad_norm": 0.12073783576488495, + "learning_rate": 9.563716816846585e-06, + "loss": 0.4482, + "step": 11600 + }, + { + "epoch": 2.3848288621646625, + "grad_norm": 0.23118554055690765, + "learning_rate": 9.557561527888153e-06, + "loss": 0.3992, + "step": 11601 + }, + { + "epoch": 2.385034433138041, + "grad_norm": 0.24189937114715576, + "learning_rate": 9.551407985013004e-06, + "loss": 0.3896, + "step": 11602 + }, + { + "epoch": 2.3852400041114192, + "grad_norm": 0.22828659415245056, + "learning_rate": 9.545256188524287e-06, + "loss": 0.3812, + "step": 11603 + }, + { + "epoch": 2.3854455750847983, + "grad_norm": 0.22598852217197418, + "learning_rate": 9.53910613872509e-06, + "loss": 0.3918, + "step": 11604 + }, + { + "epoch": 2.3856511460581764, + "grad_norm": 0.2214164286851883, + "learning_rate": 9.532957835918392e-06, + "loss": 0.3615, + "step": 11605 + }, + { + "epoch": 2.385856717031555, + "grad_norm": 0.2324512004852295, + "learning_rate": 9.526811280407091e-06, + "loss": 0.3832, + "step": 11606 + }, + { + "epoch": 2.3860622880049336, + "grad_norm": 0.22195158898830414, + "learning_rate": 9.520666472493996e-06, + "loss": 0.3767, + "step": 11607 + }, + { + "epoch": 2.386267858978312, + "grad_norm": 0.23884356021881104, + "learning_rate": 9.514523412481835e-06, + "loss": 0.3979, + "step": 11608 + }, + { + "epoch": 2.3864734299516908, + "grad_norm": 0.2285103052854538, + "learning_rate": 9.508382100673247e-06, + "loss": 0.3877, + "step": 11609 + }, + { + "epoch": 2.3866790009250693, + "grad_norm": 0.24297171831130981, + "learning_rate": 9.502242537370767e-06, + "loss": 0.3847, + "step": 11610 + }, + { + "epoch": 2.386884571898448, + "grad_norm": 0.23993346095085144, + "learning_rate": 9.4961047228769e-06, + "loss": 0.3909, + "step": 11611 + }, + { + "epoch": 2.3870901428718265, + "grad_norm": 0.24006116390228271, + "learning_rate": 9.489968657494006e-06, + "loss": 0.3865, + "step": 11612 + }, + { + "epoch": 2.387295713845205, + "grad_norm": 0.23091156780719757, + "learning_rate": 9.483834341524384e-06, + "loss": 0.3936, + "step": 11613 + }, + { + "epoch": 2.3875012848185837, + "grad_norm": 0.12236663699150085, + "learning_rate": 9.477701775270241e-06, + "loss": 0.4518, + "step": 11614 + }, + { + "epoch": 2.3877068557919623, + "grad_norm": 0.2414701133966446, + "learning_rate": 9.471570959033699e-06, + "loss": 0.3928, + "step": 11615 + }, + { + "epoch": 2.387912426765341, + "grad_norm": 0.22857315838336945, + "learning_rate": 9.465441893116786e-06, + "loss": 0.3743, + "step": 11616 + }, + { + "epoch": 2.3881179977387195, + "grad_norm": 0.2300974428653717, + "learning_rate": 9.459314577821475e-06, + "loss": 0.3847, + "step": 11617 + }, + { + "epoch": 2.3883235687120976, + "grad_norm": 0.12099748104810715, + "learning_rate": 9.453189013449605e-06, + "loss": 0.4291, + "step": 11618 + }, + { + "epoch": 2.3885291396854766, + "grad_norm": 0.1314156949520111, + "learning_rate": 9.44706520030298e-06, + "loss": 0.4537, + "step": 11619 + }, + { + "epoch": 2.388734710658855, + "grad_norm": 0.22321221232414246, + "learning_rate": 9.44094313868328e-06, + "loss": 0.3879, + "step": 11620 + }, + { + "epoch": 2.3889402816322334, + "grad_norm": 0.22418095171451569, + "learning_rate": 9.434822828892105e-06, + "loss": 0.37, + "step": 11621 + }, + { + "epoch": 2.389145852605612, + "grad_norm": 0.23237618803977966, + "learning_rate": 9.428704271230982e-06, + "loss": 0.4108, + "step": 11622 + }, + { + "epoch": 2.3893514235789906, + "grad_norm": 0.235540971159935, + "learning_rate": 9.42258746600134e-06, + "loss": 0.3878, + "step": 11623 + }, + { + "epoch": 2.389556994552369, + "grad_norm": 0.2259136289358139, + "learning_rate": 9.41647241350451e-06, + "loss": 0.385, + "step": 11624 + }, + { + "epoch": 2.3897625655257477, + "grad_norm": 0.2294631153345108, + "learning_rate": 9.41035911404178e-06, + "loss": 0.3939, + "step": 11625 + }, + { + "epoch": 2.3899681364991263, + "grad_norm": 0.23754870891571045, + "learning_rate": 9.404247567914311e-06, + "loss": 0.3749, + "step": 11626 + }, + { + "epoch": 2.390173707472505, + "grad_norm": 0.2304736226797104, + "learning_rate": 9.398137775423193e-06, + "loss": 0.4073, + "step": 11627 + }, + { + "epoch": 2.3903792784458835, + "grad_norm": 0.22400328516960144, + "learning_rate": 9.392029736869421e-06, + "loss": 0.4066, + "step": 11628 + }, + { + "epoch": 2.390584849419262, + "grad_norm": 0.2297855019569397, + "learning_rate": 9.385923452553912e-06, + "loss": 0.3995, + "step": 11629 + }, + { + "epoch": 2.3907904203926407, + "grad_norm": 0.22708038985729218, + "learning_rate": 9.379818922777499e-06, + "loss": 0.3896, + "step": 11630 + }, + { + "epoch": 2.3909959913660193, + "grad_norm": 0.22796861827373505, + "learning_rate": 9.373716147840904e-06, + "loss": 0.3939, + "step": 11631 + }, + { + "epoch": 2.391201562339398, + "grad_norm": 0.2315075695514679, + "learning_rate": 9.367615128044811e-06, + "loss": 0.3848, + "step": 11632 + }, + { + "epoch": 2.391407133312776, + "grad_norm": 0.23707285523414612, + "learning_rate": 9.361515863689775e-06, + "loss": 0.3923, + "step": 11633 + }, + { + "epoch": 2.391612704286155, + "grad_norm": 0.2384757399559021, + "learning_rate": 9.355418355076277e-06, + "loss": 0.362, + "step": 11634 + }, + { + "epoch": 2.391818275259533, + "grad_norm": 0.23415617644786835, + "learning_rate": 9.349322602504717e-06, + "loss": 0.4033, + "step": 11635 + }, + { + "epoch": 2.3920238462329118, + "grad_norm": 0.23738761246204376, + "learning_rate": 9.343228606275398e-06, + "loss": 0.389, + "step": 11636 + }, + { + "epoch": 2.3922294172062903, + "grad_norm": 0.2381700575351715, + "learning_rate": 9.337136366688534e-06, + "loss": 0.396, + "step": 11637 + }, + { + "epoch": 2.392434988179669, + "grad_norm": 0.22899046540260315, + "learning_rate": 9.331045884044288e-06, + "loss": 0.3902, + "step": 11638 + }, + { + "epoch": 2.3926405591530475, + "grad_norm": 0.24088416993618011, + "learning_rate": 9.324957158642698e-06, + "loss": 0.4191, + "step": 11639 + }, + { + "epoch": 2.392846130126426, + "grad_norm": 0.12892726063728333, + "learning_rate": 9.318870190783708e-06, + "loss": 0.4628, + "step": 11640 + }, + { + "epoch": 2.3930517010998047, + "grad_norm": 0.23511195182800293, + "learning_rate": 9.312784980767221e-06, + "loss": 0.4036, + "step": 11641 + }, + { + "epoch": 2.3932572720731833, + "grad_norm": 0.1235305592417717, + "learning_rate": 9.306701528893022e-06, + "loss": 0.4505, + "step": 11642 + }, + { + "epoch": 2.393462843046562, + "grad_norm": 0.33878177404403687, + "learning_rate": 9.300619835460804e-06, + "loss": 0.3857, + "step": 11643 + }, + { + "epoch": 2.3936684140199405, + "grad_norm": 0.1223718672990799, + "learning_rate": 9.294539900770187e-06, + "loss": 0.4886, + "step": 11644 + }, + { + "epoch": 2.393873984993319, + "grad_norm": 0.2304789125919342, + "learning_rate": 9.288461725120694e-06, + "loss": 0.3925, + "step": 11645 + }, + { + "epoch": 2.3940795559666976, + "grad_norm": 0.2310042679309845, + "learning_rate": 9.282385308811784e-06, + "loss": 0.3862, + "step": 11646 + }, + { + "epoch": 2.3942851269400762, + "grad_norm": 0.22798366844654083, + "learning_rate": 9.276310652142813e-06, + "loss": 0.3814, + "step": 11647 + }, + { + "epoch": 2.394490697913455, + "grad_norm": 0.23074646294116974, + "learning_rate": 9.270237755413042e-06, + "loss": 0.3983, + "step": 11648 + }, + { + "epoch": 2.3946962688868334, + "grad_norm": 0.11851920187473297, + "learning_rate": 9.264166618921649e-06, + "loss": 0.4514, + "step": 11649 + }, + { + "epoch": 2.3949018398602115, + "grad_norm": 0.22583618760108948, + "learning_rate": 9.258097242967744e-06, + "loss": 0.3941, + "step": 11650 + }, + { + "epoch": 2.39510741083359, + "grad_norm": 0.23350371420383453, + "learning_rate": 9.252029627850334e-06, + "loss": 0.3911, + "step": 11651 + }, + { + "epoch": 2.3953129818069687, + "grad_norm": 0.22866030037403107, + "learning_rate": 9.245963773868321e-06, + "loss": 0.3851, + "step": 11652 + }, + { + "epoch": 2.3955185527803473, + "grad_norm": 0.23153471946716309, + "learning_rate": 9.239899681320573e-06, + "loss": 0.3953, + "step": 11653 + }, + { + "epoch": 2.395724123753726, + "grad_norm": 0.24449722468852997, + "learning_rate": 9.233837350505824e-06, + "loss": 0.3887, + "step": 11654 + }, + { + "epoch": 2.3959296947271045, + "grad_norm": 0.23732249438762665, + "learning_rate": 9.22777678172274e-06, + "loss": 0.3858, + "step": 11655 + }, + { + "epoch": 2.396135265700483, + "grad_norm": 0.231268510222435, + "learning_rate": 9.221717975269895e-06, + "loss": 0.3985, + "step": 11656 + }, + { + "epoch": 2.3963408366738617, + "grad_norm": 0.23087145388126373, + "learning_rate": 9.215660931445777e-06, + "loss": 0.4104, + "step": 11657 + }, + { + "epoch": 2.3965464076472403, + "grad_norm": 0.12480619549751282, + "learning_rate": 9.209605650548777e-06, + "loss": 0.4454, + "step": 11658 + }, + { + "epoch": 2.396751978620619, + "grad_norm": 0.2369321882724762, + "learning_rate": 9.203552132877233e-06, + "loss": 0.3862, + "step": 11659 + }, + { + "epoch": 2.3969575495939974, + "grad_norm": 0.22652901709079742, + "learning_rate": 9.197500378729366e-06, + "loss": 0.3744, + "step": 11660 + }, + { + "epoch": 2.397163120567376, + "grad_norm": 0.2313791662454605, + "learning_rate": 9.191450388403304e-06, + "loss": 0.3994, + "step": 11661 + }, + { + "epoch": 2.3973686915407546, + "grad_norm": 0.22537241876125336, + "learning_rate": 9.18540216219712e-06, + "loss": 0.3834, + "step": 11662 + }, + { + "epoch": 2.397574262514133, + "grad_norm": 0.12685616314411163, + "learning_rate": 9.17935570040878e-06, + "loss": 0.4568, + "step": 11663 + }, + { + "epoch": 2.397779833487512, + "grad_norm": 0.24072742462158203, + "learning_rate": 9.173311003336157e-06, + "loss": 0.3874, + "step": 11664 + }, + { + "epoch": 2.39798540446089, + "grad_norm": 0.2216685265302658, + "learning_rate": 9.167268071277045e-06, + "loss": 0.4017, + "step": 11665 + }, + { + "epoch": 2.3981909754342685, + "grad_norm": 0.1218583807349205, + "learning_rate": 9.161226904529145e-06, + "loss": 0.4435, + "step": 11666 + }, + { + "epoch": 2.398396546407647, + "grad_norm": 0.23101285099983215, + "learning_rate": 9.155187503390094e-06, + "loss": 0.3781, + "step": 11667 + }, + { + "epoch": 2.3986021173810257, + "grad_norm": 0.23491719365119934, + "learning_rate": 9.14914986815742e-06, + "loss": 0.394, + "step": 11668 + }, + { + "epoch": 2.3988076883544043, + "grad_norm": 0.23189514875411987, + "learning_rate": 9.143113999128563e-06, + "loss": 0.3847, + "step": 11669 + }, + { + "epoch": 2.399013259327783, + "grad_norm": 0.2177298218011856, + "learning_rate": 9.137079896600887e-06, + "loss": 0.3886, + "step": 11670 + }, + { + "epoch": 2.3992188303011615, + "grad_norm": 0.1185031533241272, + "learning_rate": 9.131047560871658e-06, + "loss": 0.4323, + "step": 11671 + }, + { + "epoch": 2.39942440127454, + "grad_norm": 0.22408519685268402, + "learning_rate": 9.12501699223807e-06, + "loss": 0.3679, + "step": 11672 + }, + { + "epoch": 2.3996299722479186, + "grad_norm": 0.23200219869613647, + "learning_rate": 9.118988190997197e-06, + "loss": 0.3909, + "step": 11673 + }, + { + "epoch": 2.399835543221297, + "grad_norm": 0.22250621020793915, + "learning_rate": 9.112961157446087e-06, + "loss": 0.3789, + "step": 11674 + }, + { + "epoch": 2.400041114194676, + "grad_norm": 0.2219180166721344, + "learning_rate": 9.106935891881641e-06, + "loss": 0.3725, + "step": 11675 + }, + { + "epoch": 2.4002466851680544, + "grad_norm": 0.2245936095714569, + "learning_rate": 9.1009123946007e-06, + "loss": 0.401, + "step": 11676 + }, + { + "epoch": 2.400452256141433, + "grad_norm": 0.2297823131084442, + "learning_rate": 9.094890665900018e-06, + "loss": 0.3871, + "step": 11677 + }, + { + "epoch": 2.4006578271148116, + "grad_norm": 0.2330087423324585, + "learning_rate": 9.088870706076245e-06, + "loss": 0.4198, + "step": 11678 + }, + { + "epoch": 2.40086339808819, + "grad_norm": 0.23439383506774902, + "learning_rate": 9.08285251542596e-06, + "loss": 0.3966, + "step": 11679 + }, + { + "epoch": 2.4010689690615683, + "grad_norm": 0.12889879941940308, + "learning_rate": 9.076836094245659e-06, + "loss": 0.4475, + "step": 11680 + }, + { + "epoch": 2.401274540034947, + "grad_norm": 0.22724612057209015, + "learning_rate": 9.070821442831747e-06, + "loss": 0.3952, + "step": 11681 + }, + { + "epoch": 2.4014801110083255, + "grad_norm": 0.22570443153381348, + "learning_rate": 9.064808561480513e-06, + "loss": 0.3949, + "step": 11682 + }, + { + "epoch": 2.401685681981704, + "grad_norm": 0.22554244101047516, + "learning_rate": 9.058797450488212e-06, + "loss": 0.4023, + "step": 11683 + }, + { + "epoch": 2.4018912529550827, + "grad_norm": 0.12734529376029968, + "learning_rate": 9.052788110150975e-06, + "loss": 0.4305, + "step": 11684 + }, + { + "epoch": 2.4020968239284612, + "grad_norm": 0.23667073249816895, + "learning_rate": 9.046780540764853e-06, + "loss": 0.3961, + "step": 11685 + }, + { + "epoch": 2.40230239490184, + "grad_norm": 0.12144028395414352, + "learning_rate": 9.040774742625795e-06, + "loss": 0.4524, + "step": 11686 + }, + { + "epoch": 2.4025079658752184, + "grad_norm": 0.2276497334241867, + "learning_rate": 9.034770716029703e-06, + "loss": 0.3837, + "step": 11687 + }, + { + "epoch": 2.402713536848597, + "grad_norm": 0.23129193484783173, + "learning_rate": 9.028768461272352e-06, + "loss": 0.384, + "step": 11688 + }, + { + "epoch": 2.4029191078219756, + "grad_norm": 0.21576765179634094, + "learning_rate": 9.022767978649457e-06, + "loss": 0.4049, + "step": 11689 + }, + { + "epoch": 2.403124678795354, + "grad_norm": 0.2269795835018158, + "learning_rate": 9.016769268456623e-06, + "loss": 0.3741, + "step": 11690 + }, + { + "epoch": 2.4033302497687328, + "grad_norm": 0.22810319066047668, + "learning_rate": 9.010772330989387e-06, + "loss": 0.4111, + "step": 11691 + }, + { + "epoch": 2.4035358207421114, + "grad_norm": 0.23659124970436096, + "learning_rate": 9.00477716654318e-06, + "loss": 0.4142, + "step": 11692 + }, + { + "epoch": 2.40374139171549, + "grad_norm": 0.21605411171913147, + "learning_rate": 8.998783775413351e-06, + "loss": 0.3838, + "step": 11693 + }, + { + "epoch": 2.4039469626888685, + "grad_norm": 0.23164892196655273, + "learning_rate": 8.992792157895186e-06, + "loss": 0.3911, + "step": 11694 + }, + { + "epoch": 2.4041525336622467, + "grad_norm": 0.23304125666618347, + "learning_rate": 8.986802314283856e-06, + "loss": 0.3949, + "step": 11695 + }, + { + "epoch": 2.4043581046356253, + "grad_norm": 0.2246290147304535, + "learning_rate": 8.980814244874447e-06, + "loss": 0.373, + "step": 11696 + }, + { + "epoch": 2.404563675609004, + "grad_norm": 0.23660001158714294, + "learning_rate": 8.974827949961973e-06, + "loss": 0.3805, + "step": 11697 + }, + { + "epoch": 2.4047692465823824, + "grad_norm": 0.2283889651298523, + "learning_rate": 8.968843429841342e-06, + "loss": 0.3934, + "step": 11698 + }, + { + "epoch": 2.404974817555761, + "grad_norm": 0.22905899584293365, + "learning_rate": 8.962860684807384e-06, + "loss": 0.3994, + "step": 11699 + }, + { + "epoch": 2.4051803885291396, + "grad_norm": 0.21978145837783813, + "learning_rate": 8.956879715154832e-06, + "loss": 0.3818, + "step": 11700 + }, + { + "epoch": 2.405385959502518, + "grad_norm": 0.2412233203649521, + "learning_rate": 8.950900521178367e-06, + "loss": 0.3827, + "step": 11701 + }, + { + "epoch": 2.405591530475897, + "grad_norm": 0.2382228821516037, + "learning_rate": 8.944923103172537e-06, + "loss": 0.3949, + "step": 11702 + }, + { + "epoch": 2.4057971014492754, + "grad_norm": 0.24121670424938202, + "learning_rate": 8.938947461431813e-06, + "loss": 0.3916, + "step": 11703 + }, + { + "epoch": 2.406002672422654, + "grad_norm": 0.12582828104496002, + "learning_rate": 8.932973596250607e-06, + "loss": 0.4566, + "step": 11704 + }, + { + "epoch": 2.4062082433960326, + "grad_norm": 0.11934048682451248, + "learning_rate": 8.927001507923221e-06, + "loss": 0.456, + "step": 11705 + }, + { + "epoch": 2.406413814369411, + "grad_norm": 0.22901131212711334, + "learning_rate": 8.921031196743864e-06, + "loss": 0.374, + "step": 11706 + }, + { + "epoch": 2.4066193853427897, + "grad_norm": 0.23406758904457092, + "learning_rate": 8.915062663006655e-06, + "loss": 0.3698, + "step": 11707 + }, + { + "epoch": 2.4068249563161683, + "grad_norm": 0.23848240077495575, + "learning_rate": 8.909095907005659e-06, + "loss": 0.3978, + "step": 11708 + }, + { + "epoch": 2.407030527289547, + "grad_norm": 0.22418878972530365, + "learning_rate": 8.903130929034822e-06, + "loss": 0.3848, + "step": 11709 + }, + { + "epoch": 2.407236098262925, + "grad_norm": 0.22299997508525848, + "learning_rate": 8.897167729388002e-06, + "loss": 0.3901, + "step": 11710 + }, + { + "epoch": 2.4074416692363036, + "grad_norm": 0.22833383083343506, + "learning_rate": 8.89120630835899e-06, + "loss": 0.3744, + "step": 11711 + }, + { + "epoch": 2.4076472402096822, + "grad_norm": 0.2442595660686493, + "learning_rate": 8.885246666241468e-06, + "loss": 0.3829, + "step": 11712 + }, + { + "epoch": 2.407852811183061, + "grad_norm": 0.23331138491630554, + "learning_rate": 8.879288803329043e-06, + "loss": 0.4022, + "step": 11713 + }, + { + "epoch": 2.4080583821564394, + "grad_norm": 0.22748929262161255, + "learning_rate": 8.87333271991522e-06, + "loss": 0.4032, + "step": 11714 + }, + { + "epoch": 2.408263953129818, + "grad_norm": 0.23111633956432343, + "learning_rate": 8.867378416293447e-06, + "loss": 0.3815, + "step": 11715 + }, + { + "epoch": 2.4084695241031966, + "grad_norm": 0.23724834620952606, + "learning_rate": 8.861425892757058e-06, + "loss": 0.384, + "step": 11716 + }, + { + "epoch": 2.408675095076575, + "grad_norm": 0.22605331242084503, + "learning_rate": 8.855475149599309e-06, + "loss": 0.3709, + "step": 11717 + }, + { + "epoch": 2.4088806660499538, + "grad_norm": 0.2561459541320801, + "learning_rate": 8.849526187113354e-06, + "loss": 0.3945, + "step": 11718 + }, + { + "epoch": 2.4090862370233324, + "grad_norm": 0.2261964976787567, + "learning_rate": 8.843579005592281e-06, + "loss": 0.399, + "step": 11719 + }, + { + "epoch": 2.409291807996711, + "grad_norm": 0.23060794174671173, + "learning_rate": 8.837633605329074e-06, + "loss": 0.4068, + "step": 11720 + }, + { + "epoch": 2.4094973789700895, + "grad_norm": 0.2191411554813385, + "learning_rate": 8.831689986616623e-06, + "loss": 0.3823, + "step": 11721 + }, + { + "epoch": 2.409702949943468, + "grad_norm": 0.2240157574415207, + "learning_rate": 8.82574814974777e-06, + "loss": 0.3942, + "step": 11722 + }, + { + "epoch": 2.4099085209168467, + "grad_norm": 0.22615095973014832, + "learning_rate": 8.819808095015225e-06, + "loss": 0.3915, + "step": 11723 + }, + { + "epoch": 2.4101140918902253, + "grad_norm": 0.12168576568365097, + "learning_rate": 8.81386982271163e-06, + "loss": 0.4526, + "step": 11724 + }, + { + "epoch": 2.4103196628636034, + "grad_norm": 0.12334515154361725, + "learning_rate": 8.807933333129526e-06, + "loss": 0.4541, + "step": 11725 + }, + { + "epoch": 2.410525233836982, + "grad_norm": 0.2267734259366989, + "learning_rate": 8.801998626561397e-06, + "loss": 0.3867, + "step": 11726 + }, + { + "epoch": 2.4107308048103606, + "grad_norm": 0.23022069036960602, + "learning_rate": 8.796065703299608e-06, + "loss": 0.4002, + "step": 11727 + }, + { + "epoch": 2.410936375783739, + "grad_norm": 0.2284584641456604, + "learning_rate": 8.79013456363643e-06, + "loss": 0.3759, + "step": 11728 + }, + { + "epoch": 2.411141946757118, + "grad_norm": 0.24320749938488007, + "learning_rate": 8.78420520786409e-06, + "loss": 0.3935, + "step": 11729 + }, + { + "epoch": 2.4113475177304964, + "grad_norm": 0.2366839051246643, + "learning_rate": 8.778277636274688e-06, + "loss": 0.399, + "step": 11730 + }, + { + "epoch": 2.411553088703875, + "grad_norm": 0.22736315429210663, + "learning_rate": 8.772351849160245e-06, + "loss": 0.3755, + "step": 11731 + }, + { + "epoch": 2.4117586596772536, + "grad_norm": 0.23666198551654816, + "learning_rate": 8.766427846812702e-06, + "loss": 0.3967, + "step": 11732 + }, + { + "epoch": 2.411964230650632, + "grad_norm": 0.2277197241783142, + "learning_rate": 8.760505629523901e-06, + "loss": 0.3715, + "step": 11733 + }, + { + "epoch": 2.4121698016240107, + "grad_norm": 0.21670396625995636, + "learning_rate": 8.754585197585605e-06, + "loss": 0.3729, + "step": 11734 + }, + { + "epoch": 2.4123753725973893, + "grad_norm": 0.22383198142051697, + "learning_rate": 8.748666551289474e-06, + "loss": 0.39, + "step": 11735 + }, + { + "epoch": 2.412580943570768, + "grad_norm": 0.232547789812088, + "learning_rate": 8.742749690927115e-06, + "loss": 0.3888, + "step": 11736 + }, + { + "epoch": 2.4127865145441465, + "grad_norm": 0.23317821323871613, + "learning_rate": 8.736834616790018e-06, + "loss": 0.4036, + "step": 11737 + }, + { + "epoch": 2.412992085517525, + "grad_norm": 0.12473565340042114, + "learning_rate": 8.73092132916958e-06, + "loss": 0.4348, + "step": 11738 + }, + { + "epoch": 2.4131976564909037, + "grad_norm": 0.22561731934547424, + "learning_rate": 8.72500982835713e-06, + "loss": 0.3938, + "step": 11739 + }, + { + "epoch": 2.413403227464282, + "grad_norm": 0.22302691638469696, + "learning_rate": 8.719100114643891e-06, + "loss": 0.3842, + "step": 11740 + }, + { + "epoch": 2.413608798437661, + "grad_norm": 0.2331441342830658, + "learning_rate": 8.71319218832102e-06, + "loss": 0.3891, + "step": 11741 + }, + { + "epoch": 2.413814369411039, + "grad_norm": 0.22362032532691956, + "learning_rate": 8.70728604967955e-06, + "loss": 0.3858, + "step": 11742 + }, + { + "epoch": 2.4140199403844176, + "grad_norm": 0.23315522074699402, + "learning_rate": 8.701381699010476e-06, + "loss": 0.3939, + "step": 11743 + }, + { + "epoch": 2.414225511357796, + "grad_norm": 0.11996540427207947, + "learning_rate": 8.69547913660467e-06, + "loss": 0.4391, + "step": 11744 + }, + { + "epoch": 2.4144310823311748, + "grad_norm": 0.12286810576915741, + "learning_rate": 8.689578362752919e-06, + "loss": 0.4379, + "step": 11745 + }, + { + "epoch": 2.4146366533045533, + "grad_norm": 0.229908749461174, + "learning_rate": 8.683679377745915e-06, + "loss": 0.3843, + "step": 11746 + }, + { + "epoch": 2.414842224277932, + "grad_norm": 0.2293166071176529, + "learning_rate": 8.677782181874295e-06, + "loss": 0.3845, + "step": 11747 + }, + { + "epoch": 2.4150477952513105, + "grad_norm": 0.22157427668571472, + "learning_rate": 8.671886775428584e-06, + "loss": 0.3857, + "step": 11748 + }, + { + "epoch": 2.415253366224689, + "grad_norm": 0.22539031505584717, + "learning_rate": 8.665993158699197e-06, + "loss": 0.3803, + "step": 11749 + }, + { + "epoch": 2.4154589371980677, + "grad_norm": 0.23554009199142456, + "learning_rate": 8.660101331976515e-06, + "loss": 0.3964, + "step": 11750 + }, + { + "epoch": 2.4156645081714463, + "grad_norm": 0.11748301237821579, + "learning_rate": 8.654211295550791e-06, + "loss": 0.4473, + "step": 11751 + }, + { + "epoch": 2.415870079144825, + "grad_norm": 0.12662889063358307, + "learning_rate": 8.648323049712192e-06, + "loss": 0.4615, + "step": 11752 + }, + { + "epoch": 2.4160756501182035, + "grad_norm": 0.23614639043807983, + "learning_rate": 8.642436594750813e-06, + "loss": 0.3832, + "step": 11753 + }, + { + "epoch": 2.416281221091582, + "grad_norm": 0.23256917297840118, + "learning_rate": 8.636551930956645e-06, + "loss": 0.4061, + "step": 11754 + }, + { + "epoch": 2.41648679206496, + "grad_norm": 0.23401497304439545, + "learning_rate": 8.630669058619595e-06, + "loss": 0.4095, + "step": 11755 + }, + { + "epoch": 2.4166923630383392, + "grad_norm": 0.12618698179721832, + "learning_rate": 8.624787978029495e-06, + "loss": 0.4405, + "step": 11756 + }, + { + "epoch": 2.4168979340117174, + "grad_norm": 0.22936862707138062, + "learning_rate": 8.61890868947608e-06, + "loss": 0.391, + "step": 11757 + }, + { + "epoch": 2.417103504985096, + "grad_norm": 0.2273116260766983, + "learning_rate": 8.613031193248985e-06, + "loss": 0.4034, + "step": 11758 + }, + { + "epoch": 2.4173090759584746, + "grad_norm": 0.23834945261478424, + "learning_rate": 8.607155489637773e-06, + "loss": 0.3938, + "step": 11759 + }, + { + "epoch": 2.417514646931853, + "grad_norm": 0.22730384767055511, + "learning_rate": 8.601281578931908e-06, + "loss": 0.4146, + "step": 11760 + }, + { + "epoch": 2.4177202179052317, + "grad_norm": 0.23497353494167328, + "learning_rate": 8.595409461420778e-06, + "loss": 0.3847, + "step": 11761 + }, + { + "epoch": 2.4179257888786103, + "grad_norm": 0.23128505051136017, + "learning_rate": 8.589539137393653e-06, + "loss": 0.3937, + "step": 11762 + }, + { + "epoch": 2.418131359851989, + "grad_norm": 0.22396472096443176, + "learning_rate": 8.583670607139764e-06, + "loss": 0.3887, + "step": 11763 + }, + { + "epoch": 2.4183369308253675, + "grad_norm": 0.2318245768547058, + "learning_rate": 8.577803870948217e-06, + "loss": 0.3752, + "step": 11764 + }, + { + "epoch": 2.418542501798746, + "grad_norm": 0.12530356645584106, + "learning_rate": 8.571938929108033e-06, + "loss": 0.4542, + "step": 11765 + }, + { + "epoch": 2.4187480727721247, + "grad_norm": 0.23986156284809113, + "learning_rate": 8.566075781908158e-06, + "loss": 0.3791, + "step": 11766 + }, + { + "epoch": 2.4189536437455033, + "grad_norm": 0.2401699423789978, + "learning_rate": 8.56021442963742e-06, + "loss": 0.3889, + "step": 11767 + }, + { + "epoch": 2.419159214718882, + "grad_norm": 0.12486173957586288, + "learning_rate": 8.554354872584612e-06, + "loss": 0.4482, + "step": 11768 + }, + { + "epoch": 2.4193647856922604, + "grad_norm": 0.12108970433473587, + "learning_rate": 8.5484971110384e-06, + "loss": 0.4339, + "step": 11769 + }, + { + "epoch": 2.4195703566656386, + "grad_norm": 0.2209288775920868, + "learning_rate": 8.542641145287342e-06, + "loss": 0.3695, + "step": 11770 + }, + { + "epoch": 2.4197759276390176, + "grad_norm": 0.22829124331474304, + "learning_rate": 8.536786975619966e-06, + "loss": 0.3876, + "step": 11771 + }, + { + "epoch": 2.4199814986123958, + "grad_norm": 0.24268139898777008, + "learning_rate": 8.53093460232467e-06, + "loss": 0.3802, + "step": 11772 + }, + { + "epoch": 2.4201870695857743, + "grad_norm": 0.23681510984897614, + "learning_rate": 8.525084025689766e-06, + "loss": 0.3856, + "step": 11773 + }, + { + "epoch": 2.420392640559153, + "grad_norm": 0.23241069912910461, + "learning_rate": 8.519235246003491e-06, + "loss": 0.3781, + "step": 11774 + }, + { + "epoch": 2.4205982115325315, + "grad_norm": 0.21853965520858765, + "learning_rate": 8.513388263553982e-06, + "loss": 0.3835, + "step": 11775 + }, + { + "epoch": 2.42080378250591, + "grad_norm": 0.23458018898963928, + "learning_rate": 8.507543078629288e-06, + "loss": 0.3982, + "step": 11776 + }, + { + "epoch": 2.4210093534792887, + "grad_norm": 0.23409396409988403, + "learning_rate": 8.501699691517392e-06, + "loss": 0.3817, + "step": 11777 + }, + { + "epoch": 2.4212149244526673, + "grad_norm": 0.23286281526088715, + "learning_rate": 8.49585810250616e-06, + "loss": 0.4137, + "step": 11778 + }, + { + "epoch": 2.421420495426046, + "grad_norm": 0.11904696375131607, + "learning_rate": 8.49001831188338e-06, + "loss": 0.453, + "step": 11779 + }, + { + "epoch": 2.4216260663994245, + "grad_norm": 0.24150028824806213, + "learning_rate": 8.484180319936748e-06, + "loss": 0.3943, + "step": 11780 + }, + { + "epoch": 2.421831637372803, + "grad_norm": 0.2359628528356552, + "learning_rate": 8.478344126953874e-06, + "loss": 0.3806, + "step": 11781 + }, + { + "epoch": 2.4220372083461816, + "grad_norm": 0.12654449045658112, + "learning_rate": 8.472509733222289e-06, + "loss": 0.4553, + "step": 11782 + }, + { + "epoch": 2.4222427793195602, + "grad_norm": 0.23700331151485443, + "learning_rate": 8.466677139029405e-06, + "loss": 0.4043, + "step": 11783 + }, + { + "epoch": 2.422448350292939, + "grad_norm": 0.24000297486782074, + "learning_rate": 8.460846344662597e-06, + "loss": 0.396, + "step": 11784 + }, + { + "epoch": 2.422653921266317, + "grad_norm": 0.12040732055902481, + "learning_rate": 8.455017350409105e-06, + "loss": 0.4522, + "step": 11785 + }, + { + "epoch": 2.422859492239696, + "grad_norm": 0.21814100444316864, + "learning_rate": 8.449190156556098e-06, + "loss": 0.3766, + "step": 11786 + }, + { + "epoch": 2.423065063213074, + "grad_norm": 0.12481694668531418, + "learning_rate": 8.443364763390649e-06, + "loss": 0.4527, + "step": 11787 + }, + { + "epoch": 2.4232706341864527, + "grad_norm": 0.22663375735282898, + "learning_rate": 8.43754117119976e-06, + "loss": 0.383, + "step": 11788 + }, + { + "epoch": 2.4234762051598313, + "grad_norm": 0.12208539247512817, + "learning_rate": 8.431719380270307e-06, + "loss": 0.4564, + "step": 11789 + }, + { + "epoch": 2.42368177613321, + "grad_norm": 0.22577068209648132, + "learning_rate": 8.425899390889138e-06, + "loss": 0.3758, + "step": 11790 + }, + { + "epoch": 2.4238873471065885, + "grad_norm": 0.12725965678691864, + "learning_rate": 8.420081203342941e-06, + "loss": 0.435, + "step": 11791 + }, + { + "epoch": 2.424092918079967, + "grad_norm": 0.23309412598609924, + "learning_rate": 8.414264817918385e-06, + "loss": 0.3846, + "step": 11792 + }, + { + "epoch": 2.4242984890533457, + "grad_norm": 0.2279675304889679, + "learning_rate": 8.408450234901998e-06, + "loss": 0.3934, + "step": 11793 + }, + { + "epoch": 2.4245040600267243, + "grad_norm": 0.22592322528362274, + "learning_rate": 8.402637454580244e-06, + "loss": 0.3864, + "step": 11794 + }, + { + "epoch": 2.424709631000103, + "grad_norm": 0.22809530794620514, + "learning_rate": 8.396826477239479e-06, + "loss": 0.3911, + "step": 11795 + }, + { + "epoch": 2.4249152019734814, + "grad_norm": 0.23382043838500977, + "learning_rate": 8.391017303165995e-06, + "loss": 0.392, + "step": 11796 + }, + { + "epoch": 2.42512077294686, + "grad_norm": 0.23308755457401276, + "learning_rate": 8.38520993264597e-06, + "loss": 0.4044, + "step": 11797 + }, + { + "epoch": 2.4253263439202386, + "grad_norm": 0.22026486694812775, + "learning_rate": 8.379404365965524e-06, + "loss": 0.3994, + "step": 11798 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 0.12114302068948746, + "learning_rate": 8.373600603410658e-06, + "loss": 0.4553, + "step": 11799 + }, + { + "epoch": 2.4257374858669953, + "grad_norm": 0.23082832992076874, + "learning_rate": 8.367798645267303e-06, + "loss": 0.3775, + "step": 11800 + }, + { + "epoch": 2.4259430568403744, + "grad_norm": 0.2422942817211151, + "learning_rate": 8.361998491821289e-06, + "loss": 0.3988, + "step": 11801 + }, + { + "epoch": 2.4261486278137525, + "grad_norm": 0.23066774010658264, + "learning_rate": 8.356200143358363e-06, + "loss": 0.3964, + "step": 11802 + }, + { + "epoch": 2.426354198787131, + "grad_norm": 0.1255854219198227, + "learning_rate": 8.35040360016418e-06, + "loss": 0.4471, + "step": 11803 + }, + { + "epoch": 2.4265597697605097, + "grad_norm": 0.23167705535888672, + "learning_rate": 8.344608862524306e-06, + "loss": 0.3935, + "step": 11804 + }, + { + "epoch": 2.4267653407338883, + "grad_norm": 0.22956189513206482, + "learning_rate": 8.338815930724234e-06, + "loss": 0.3887, + "step": 11805 + }, + { + "epoch": 2.426970911707267, + "grad_norm": 0.23048321902751923, + "learning_rate": 8.33302480504935e-06, + "loss": 0.3993, + "step": 11806 + }, + { + "epoch": 2.4271764826806455, + "grad_norm": 0.22636932134628296, + "learning_rate": 8.327235485784948e-06, + "loss": 0.3955, + "step": 11807 + }, + { + "epoch": 2.427382053654024, + "grad_norm": 0.2231477051973343, + "learning_rate": 8.321447973216248e-06, + "loss": 0.3885, + "step": 11808 + }, + { + "epoch": 2.4275876246274026, + "grad_norm": 0.23407144844532013, + "learning_rate": 8.315662267628374e-06, + "loss": 0.3875, + "step": 11809 + }, + { + "epoch": 2.427793195600781, + "grad_norm": 0.22613434493541718, + "learning_rate": 8.309878369306348e-06, + "loss": 0.3699, + "step": 11810 + }, + { + "epoch": 2.42799876657416, + "grad_norm": 0.244913712143898, + "learning_rate": 8.30409627853513e-06, + "loss": 0.3858, + "step": 11811 + }, + { + "epoch": 2.4282043375475384, + "grad_norm": 0.24289999902248383, + "learning_rate": 8.298315995599578e-06, + "loss": 0.3877, + "step": 11812 + }, + { + "epoch": 2.428409908520917, + "grad_norm": 0.2354183942079544, + "learning_rate": 8.292537520784438e-06, + "loss": 0.3713, + "step": 11813 + }, + { + "epoch": 2.4286154794942956, + "grad_norm": 0.22426529228687286, + "learning_rate": 8.286760854374421e-06, + "loss": 0.374, + "step": 11814 + }, + { + "epoch": 2.428821050467674, + "grad_norm": 0.12690819799900055, + "learning_rate": 8.280985996654097e-06, + "loss": 0.4512, + "step": 11815 + }, + { + "epoch": 2.4290266214410527, + "grad_norm": 0.12370403110980988, + "learning_rate": 8.275212947907967e-06, + "loss": 0.4472, + "step": 11816 + }, + { + "epoch": 2.429232192414431, + "grad_norm": 0.237684965133667, + "learning_rate": 8.26944170842044e-06, + "loss": 0.3942, + "step": 11817 + }, + { + "epoch": 2.4294377633878095, + "grad_norm": 0.23901338875293732, + "learning_rate": 8.26367227847584e-06, + "loss": 0.3923, + "step": 11818 + }, + { + "epoch": 2.429643334361188, + "grad_norm": 0.22691085934638977, + "learning_rate": 8.257904658358407e-06, + "loss": 0.3927, + "step": 11819 + }, + { + "epoch": 2.4298489053345667, + "grad_norm": 0.2229507714509964, + "learning_rate": 8.25213884835228e-06, + "loss": 0.3897, + "step": 11820 + }, + { + "epoch": 2.4300544763079452, + "grad_norm": 0.2375117689371109, + "learning_rate": 8.246374848741511e-06, + "loss": 0.3892, + "step": 11821 + }, + { + "epoch": 2.430260047281324, + "grad_norm": 0.23138779401779175, + "learning_rate": 8.24061265981007e-06, + "loss": 0.3968, + "step": 11822 + }, + { + "epoch": 2.4304656182547024, + "grad_norm": 0.23814985156059265, + "learning_rate": 8.234852281841833e-06, + "loss": 0.3955, + "step": 11823 + }, + { + "epoch": 2.430671189228081, + "grad_norm": 0.2250833660364151, + "learning_rate": 8.229093715120578e-06, + "loss": 0.37, + "step": 11824 + }, + { + "epoch": 2.4308767602014596, + "grad_norm": 0.11755650490522385, + "learning_rate": 8.223336959930003e-06, + "loss": 0.4526, + "step": 11825 + }, + { + "epoch": 2.431082331174838, + "grad_norm": 0.11656010895967484, + "learning_rate": 8.217582016553732e-06, + "loss": 0.427, + "step": 11826 + }, + { + "epoch": 2.4312879021482168, + "grad_norm": 0.2219466120004654, + "learning_rate": 8.211828885275272e-06, + "loss": 0.3825, + "step": 11827 + }, + { + "epoch": 2.4314934731215954, + "grad_norm": 0.22404515743255615, + "learning_rate": 8.206077566378058e-06, + "loss": 0.3639, + "step": 11828 + }, + { + "epoch": 2.431699044094974, + "grad_norm": 0.22220522165298462, + "learning_rate": 8.200328060145428e-06, + "loss": 0.3856, + "step": 11829 + }, + { + "epoch": 2.4319046150683525, + "grad_norm": 0.22566857933998108, + "learning_rate": 8.194580366860628e-06, + "loss": 0.394, + "step": 11830 + }, + { + "epoch": 2.432110186041731, + "grad_norm": 0.2224518358707428, + "learning_rate": 8.18883448680682e-06, + "loss": 0.3692, + "step": 11831 + }, + { + "epoch": 2.4323157570151093, + "grad_norm": 0.23694801330566406, + "learning_rate": 8.18309042026709e-06, + "loss": 0.4063, + "step": 11832 + }, + { + "epoch": 2.432521327988488, + "grad_norm": 0.1398681104183197, + "learning_rate": 8.177348167524418e-06, + "loss": 0.4508, + "step": 11833 + }, + { + "epoch": 2.4327268989618664, + "grad_norm": 0.2483135610818863, + "learning_rate": 8.171607728861677e-06, + "loss": 0.3977, + "step": 11834 + }, + { + "epoch": 2.432932469935245, + "grad_norm": 0.23512189090251923, + "learning_rate": 8.165869104561702e-06, + "loss": 0.3918, + "step": 11835 + }, + { + "epoch": 2.4331380409086236, + "grad_norm": 0.11742374300956726, + "learning_rate": 8.16013229490719e-06, + "loss": 0.4327, + "step": 11836 + }, + { + "epoch": 2.433343611882002, + "grad_norm": 0.24561573565006256, + "learning_rate": 8.154397300180771e-06, + "loss": 0.409, + "step": 11837 + }, + { + "epoch": 2.433549182855381, + "grad_norm": 0.22359338402748108, + "learning_rate": 8.148664120664973e-06, + "loss": 0.3741, + "step": 11838 + }, + { + "epoch": 2.4337547538287594, + "grad_norm": 0.22402852773666382, + "learning_rate": 8.142932756642262e-06, + "loss": 0.3976, + "step": 11839 + }, + { + "epoch": 2.433960324802138, + "grad_norm": 0.22858619689941406, + "learning_rate": 8.137203208394986e-06, + "loss": 0.3971, + "step": 11840 + }, + { + "epoch": 2.4341658957755166, + "grad_norm": 0.1260390430688858, + "learning_rate": 8.13147547620541e-06, + "loss": 0.449, + "step": 11841 + }, + { + "epoch": 2.434371466748895, + "grad_norm": 0.2319212555885315, + "learning_rate": 8.12574956035571e-06, + "loss": 0.3938, + "step": 11842 + }, + { + "epoch": 2.4345770377222737, + "grad_norm": 0.2238619327545166, + "learning_rate": 8.120025461127984e-06, + "loss": 0.3789, + "step": 11843 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.2239915281534195, + "learning_rate": 8.114303178804226e-06, + "loss": 0.3881, + "step": 11844 + }, + { + "epoch": 2.434988179669031, + "grad_norm": 0.2295527458190918, + "learning_rate": 8.108582713666335e-06, + "loss": 0.3872, + "step": 11845 + }, + { + "epoch": 2.4351937506424095, + "grad_norm": 0.22697387635707855, + "learning_rate": 8.102864065996159e-06, + "loss": 0.3928, + "step": 11846 + }, + { + "epoch": 2.4353993216157876, + "grad_norm": 0.12275702506303787, + "learning_rate": 8.09714723607541e-06, + "loss": 0.4449, + "step": 11847 + }, + { + "epoch": 2.4356048925891662, + "grad_norm": 0.2424585521221161, + "learning_rate": 8.09143222418573e-06, + "loss": 0.4016, + "step": 11848 + }, + { + "epoch": 2.435810463562545, + "grad_norm": 0.23178981244564056, + "learning_rate": 8.085719030608682e-06, + "loss": 0.3917, + "step": 11849 + }, + { + "epoch": 2.4360160345359234, + "grad_norm": 0.23368723690509796, + "learning_rate": 8.080007655625715e-06, + "loss": 0.3883, + "step": 11850 + }, + { + "epoch": 2.436221605509302, + "grad_norm": 0.23010197281837463, + "learning_rate": 8.074298099518207e-06, + "loss": 0.4075, + "step": 11851 + }, + { + "epoch": 2.4364271764826806, + "grad_norm": 0.23444652557373047, + "learning_rate": 8.068590362567436e-06, + "loss": 0.3887, + "step": 11852 + }, + { + "epoch": 2.436632747456059, + "grad_norm": 0.22641274333000183, + "learning_rate": 8.062884445054602e-06, + "loss": 0.3826, + "step": 11853 + }, + { + "epoch": 2.4368383184294378, + "grad_norm": 0.12579554319381714, + "learning_rate": 8.057180347260816e-06, + "loss": 0.4397, + "step": 11854 + }, + { + "epoch": 2.4370438894028164, + "grad_norm": 0.2325548529624939, + "learning_rate": 8.05147806946707e-06, + "loss": 0.3948, + "step": 11855 + }, + { + "epoch": 2.437249460376195, + "grad_norm": 0.22521813213825226, + "learning_rate": 8.045777611954315e-06, + "loss": 0.3773, + "step": 11856 + }, + { + "epoch": 2.4374550313495735, + "grad_norm": 0.22666728496551514, + "learning_rate": 8.040078975003372e-06, + "loss": 0.3918, + "step": 11857 + }, + { + "epoch": 2.437660602322952, + "grad_norm": 0.2205967754125595, + "learning_rate": 8.03438215889499e-06, + "loss": 0.3929, + "step": 11858 + }, + { + "epoch": 2.4378661732963307, + "grad_norm": 0.23035195469856262, + "learning_rate": 8.028687163909804e-06, + "loss": 0.3795, + "step": 11859 + }, + { + "epoch": 2.4380717442697093, + "grad_norm": 0.22747023403644562, + "learning_rate": 8.022993990328418e-06, + "loss": 0.3908, + "step": 11860 + }, + { + "epoch": 2.438277315243088, + "grad_norm": 0.2318742722272873, + "learning_rate": 8.017302638431285e-06, + "loss": 0.3972, + "step": 11861 + }, + { + "epoch": 2.438482886216466, + "grad_norm": 0.22795268893241882, + "learning_rate": 8.011613108498795e-06, + "loss": 0.3828, + "step": 11862 + }, + { + "epoch": 2.4386884571898446, + "grad_norm": 0.23046202957630157, + "learning_rate": 8.00592540081124e-06, + "loss": 0.391, + "step": 11863 + }, + { + "epoch": 2.438894028163223, + "grad_norm": 0.23023991286754608, + "learning_rate": 8.000239515648832e-06, + "loss": 0.3984, + "step": 11864 + }, + { + "epoch": 2.439099599136602, + "grad_norm": 0.2348402440547943, + "learning_rate": 7.994555453291689e-06, + "loss": 0.4067, + "step": 11865 + }, + { + "epoch": 2.4393051701099804, + "grad_norm": 0.22935132682323456, + "learning_rate": 7.98887321401982e-06, + "loss": 0.3785, + "step": 11866 + }, + { + "epoch": 2.439510741083359, + "grad_norm": 0.23405125737190247, + "learning_rate": 7.983192798113195e-06, + "loss": 0.3775, + "step": 11867 + }, + { + "epoch": 2.4397163120567376, + "grad_norm": 0.22010092437267303, + "learning_rate": 7.977514205851645e-06, + "loss": 0.3812, + "step": 11868 + }, + { + "epoch": 2.439921883030116, + "grad_norm": 0.22667670249938965, + "learning_rate": 7.97183743751492e-06, + "loss": 0.3858, + "step": 11869 + }, + { + "epoch": 2.4401274540034947, + "grad_norm": 0.22953353822231293, + "learning_rate": 7.966162493382703e-06, + "loss": 0.3841, + "step": 11870 + }, + { + "epoch": 2.4403330249768733, + "grad_norm": 0.2351302206516266, + "learning_rate": 7.960489373734561e-06, + "loss": 0.3691, + "step": 11871 + }, + { + "epoch": 2.440538595950252, + "grad_norm": 0.22747184336185455, + "learning_rate": 7.954818078849988e-06, + "loss": 0.3671, + "step": 11872 + }, + { + "epoch": 2.4407441669236305, + "grad_norm": 0.2345331311225891, + "learning_rate": 7.949148609008362e-06, + "loss": 0.3826, + "step": 11873 + }, + { + "epoch": 2.440949737897009, + "grad_norm": 0.23388345539569855, + "learning_rate": 7.943480964489024e-06, + "loss": 0.3909, + "step": 11874 + }, + { + "epoch": 2.4411553088703877, + "grad_norm": 0.2370399385690689, + "learning_rate": 7.937815145571177e-06, + "loss": 0.384, + "step": 11875 + }, + { + "epoch": 2.4413608798437663, + "grad_norm": 0.23738206923007965, + "learning_rate": 7.93215115253394e-06, + "loss": 0.4001, + "step": 11876 + }, + { + "epoch": 2.4415664508171444, + "grad_norm": 0.2355763465166092, + "learning_rate": 7.926488985656372e-06, + "loss": 0.3872, + "step": 11877 + }, + { + "epoch": 2.441772021790523, + "grad_norm": 0.2362237423658371, + "learning_rate": 7.920828645217405e-06, + "loss": 0.3833, + "step": 11878 + }, + { + "epoch": 2.4419775927639016, + "grad_norm": 0.22331978380680084, + "learning_rate": 7.915170131495912e-06, + "loss": 0.3734, + "step": 11879 + }, + { + "epoch": 2.44218316373728, + "grad_norm": 0.23111921548843384, + "learning_rate": 7.909513444770636e-06, + "loss": 0.3911, + "step": 11880 + }, + { + "epoch": 2.4423887347106588, + "grad_norm": 0.12303854525089264, + "learning_rate": 7.90385858532028e-06, + "loss": 0.4474, + "step": 11881 + }, + { + "epoch": 2.4425943056840373, + "grad_norm": 0.23098498582839966, + "learning_rate": 7.89820555342343e-06, + "loss": 0.3952, + "step": 11882 + }, + { + "epoch": 2.442799876657416, + "grad_norm": 0.2414843589067459, + "learning_rate": 7.89255434935858e-06, + "loss": 0.3926, + "step": 11883 + }, + { + "epoch": 2.4430054476307945, + "grad_norm": 0.22785574197769165, + "learning_rate": 7.886904973404134e-06, + "loss": 0.3836, + "step": 11884 + }, + { + "epoch": 2.443211018604173, + "grad_norm": 0.13417033851146698, + "learning_rate": 7.881257425838412e-06, + "loss": 0.4613, + "step": 11885 + }, + { + "epoch": 2.4434165895775517, + "grad_norm": 0.11953188478946686, + "learning_rate": 7.875611706939649e-06, + "loss": 0.4594, + "step": 11886 + }, + { + "epoch": 2.4436221605509303, + "grad_norm": 0.22670340538024902, + "learning_rate": 7.869967816985965e-06, + "loss": 0.3894, + "step": 11887 + }, + { + "epoch": 2.443827731524309, + "grad_norm": 0.24815067648887634, + "learning_rate": 7.86432575625543e-06, + "loss": 0.3958, + "step": 11888 + }, + { + "epoch": 2.4440333024976875, + "grad_norm": 0.22556784749031067, + "learning_rate": 7.858685525025997e-06, + "loss": 0.3895, + "step": 11889 + }, + { + "epoch": 2.444238873471066, + "grad_norm": 0.2190685123205185, + "learning_rate": 7.85304712357553e-06, + "loss": 0.3572, + "step": 11890 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.22397378087043762, + "learning_rate": 7.847410552181804e-06, + "loss": 0.3832, + "step": 11891 + }, + { + "epoch": 2.444650015417823, + "grad_norm": 0.22663743793964386, + "learning_rate": 7.841775811122514e-06, + "loss": 0.3838, + "step": 11892 + }, + { + "epoch": 2.4448555863912014, + "grad_norm": 0.22418825328350067, + "learning_rate": 7.83614290067525e-06, + "loss": 0.4074, + "step": 11893 + }, + { + "epoch": 2.44506115736458, + "grad_norm": 0.2320103645324707, + "learning_rate": 7.83051182111751e-06, + "loss": 0.393, + "step": 11894 + }, + { + "epoch": 2.4452667283379586, + "grad_norm": 0.2413313090801239, + "learning_rate": 7.824882572726734e-06, + "loss": 0.3944, + "step": 11895 + }, + { + "epoch": 2.445472299311337, + "grad_norm": 0.24407535791397095, + "learning_rate": 7.81925515578024e-06, + "loss": 0.4004, + "step": 11896 + }, + { + "epoch": 2.4456778702847157, + "grad_norm": 0.24596747756004333, + "learning_rate": 7.81362957055526e-06, + "loss": 0.4299, + "step": 11897 + }, + { + "epoch": 2.4458834412580943, + "grad_norm": 0.23562337458133698, + "learning_rate": 7.808005817328927e-06, + "loss": 0.4011, + "step": 11898 + }, + { + "epoch": 2.446089012231473, + "grad_norm": 0.1234961450099945, + "learning_rate": 7.80238389637833e-06, + "loss": 0.4573, + "step": 11899 + }, + { + "epoch": 2.4462945832048515, + "grad_norm": 0.22207041084766388, + "learning_rate": 7.796763807980414e-06, + "loss": 0.3856, + "step": 11900 + }, + { + "epoch": 2.44650015417823, + "grad_norm": 0.23153123259544373, + "learning_rate": 7.79114555241205e-06, + "loss": 0.3985, + "step": 11901 + }, + { + "epoch": 2.4467057251516087, + "grad_norm": 0.2310320883989334, + "learning_rate": 7.785529129950038e-06, + "loss": 0.379, + "step": 11902 + }, + { + "epoch": 2.4469112961249873, + "grad_norm": 0.23394078016281128, + "learning_rate": 7.779914540871065e-06, + "loss": 0.3878, + "step": 11903 + }, + { + "epoch": 2.447116867098366, + "grad_norm": 0.24129053950309753, + "learning_rate": 7.774301785451743e-06, + "loss": 0.4019, + "step": 11904 + }, + { + "epoch": 2.4473224380717444, + "grad_norm": 0.23060935735702515, + "learning_rate": 7.768690863968575e-06, + "loss": 0.383, + "step": 11905 + }, + { + "epoch": 2.447528009045123, + "grad_norm": 0.2250318080186844, + "learning_rate": 7.763081776697986e-06, + "loss": 0.3917, + "step": 11906 + }, + { + "epoch": 2.447733580018501, + "grad_norm": 0.2283366620540619, + "learning_rate": 7.75747452391632e-06, + "loss": 0.3753, + "step": 11907 + }, + { + "epoch": 2.4479391509918798, + "grad_norm": 0.12568168342113495, + "learning_rate": 7.751869105899797e-06, + "loss": 0.4482, + "step": 11908 + }, + { + "epoch": 2.4481447219652583, + "grad_norm": 0.2254650741815567, + "learning_rate": 7.746265522924599e-06, + "loss": 0.3705, + "step": 11909 + }, + { + "epoch": 2.448350292938637, + "grad_norm": 0.22300590574741364, + "learning_rate": 7.740663775266774e-06, + "loss": 0.3809, + "step": 11910 + }, + { + "epoch": 2.4485558639120155, + "grad_norm": 0.12381377071142197, + "learning_rate": 7.735063863202297e-06, + "loss": 0.4679, + "step": 11911 + }, + { + "epoch": 2.448761434885394, + "grad_norm": 0.2367285043001175, + "learning_rate": 7.729465787007045e-06, + "loss": 0.4062, + "step": 11912 + }, + { + "epoch": 2.4489670058587727, + "grad_norm": 0.23108406364917755, + "learning_rate": 7.723869546956815e-06, + "loss": 0.3886, + "step": 11913 + }, + { + "epoch": 2.4491725768321513, + "grad_norm": 0.1295919418334961, + "learning_rate": 7.71827514332729e-06, + "loss": 0.4521, + "step": 11914 + }, + { + "epoch": 2.44937814780553, + "grad_norm": 0.21976915001869202, + "learning_rate": 7.71268257639411e-06, + "loss": 0.3668, + "step": 11915 + }, + { + "epoch": 2.4495837187789085, + "grad_norm": 0.22710269689559937, + "learning_rate": 7.707091846432775e-06, + "loss": 0.4069, + "step": 11916 + }, + { + "epoch": 2.449789289752287, + "grad_norm": 0.23662005364894867, + "learning_rate": 7.70150295371872e-06, + "loss": 0.3951, + "step": 11917 + }, + { + "epoch": 2.4499948607256656, + "grad_norm": 0.2241106480360031, + "learning_rate": 7.695915898527278e-06, + "loss": 0.3844, + "step": 11918 + }, + { + "epoch": 2.4502004316990442, + "grad_norm": 0.22867663204669952, + "learning_rate": 7.690330681133695e-06, + "loss": 0.398, + "step": 11919 + }, + { + "epoch": 2.450406002672423, + "grad_norm": 0.22373969852924347, + "learning_rate": 7.684747301813141e-06, + "loss": 0.3871, + "step": 11920 + }, + { + "epoch": 2.4506115736458014, + "grad_norm": 0.21344764530658722, + "learning_rate": 7.679165760840676e-06, + "loss": 0.3814, + "step": 11921 + }, + { + "epoch": 2.4508171446191795, + "grad_norm": 0.2301592379808426, + "learning_rate": 7.67358605849127e-06, + "loss": 0.3842, + "step": 11922 + }, + { + "epoch": 2.4510227155925586, + "grad_norm": 0.23212045431137085, + "learning_rate": 7.668008195039828e-06, + "loss": 0.4049, + "step": 11923 + }, + { + "epoch": 2.4512282865659367, + "grad_norm": 0.2373504489660263, + "learning_rate": 7.662432170761128e-06, + "loss": 0.371, + "step": 11924 + }, + { + "epoch": 2.4514338575393153, + "grad_norm": 0.22908884286880493, + "learning_rate": 7.65685798592988e-06, + "loss": 0.3902, + "step": 11925 + }, + { + "epoch": 2.451639428512694, + "grad_norm": 0.22210195660591125, + "learning_rate": 7.6512856408207e-06, + "loss": 0.4091, + "step": 11926 + }, + { + "epoch": 2.4518449994860725, + "grad_norm": 0.22548379004001617, + "learning_rate": 7.645715135708107e-06, + "loss": 0.3848, + "step": 11927 + }, + { + "epoch": 2.452050570459451, + "grad_norm": 0.23502108454704285, + "learning_rate": 7.640146470866528e-06, + "loss": 0.4022, + "step": 11928 + }, + { + "epoch": 2.4522561414328297, + "grad_norm": 0.23006172478199005, + "learning_rate": 7.634579646570319e-06, + "loss": 0.3856, + "step": 11929 + }, + { + "epoch": 2.4524617124062082, + "grad_norm": 0.23742365837097168, + "learning_rate": 7.629014663093729e-06, + "loss": 0.3902, + "step": 11930 + }, + { + "epoch": 2.452667283379587, + "grad_norm": 0.1288265436887741, + "learning_rate": 7.623451520710911e-06, + "loss": 0.4406, + "step": 11931 + }, + { + "epoch": 2.4528728543529654, + "grad_norm": 0.23163823783397675, + "learning_rate": 7.617890219695945e-06, + "loss": 0.4061, + "step": 11932 + }, + { + "epoch": 2.453078425326344, + "grad_norm": 0.23114512860774994, + "learning_rate": 7.612330760322799e-06, + "loss": 0.3849, + "step": 11933 + }, + { + "epoch": 2.4532839962997226, + "grad_norm": 0.2422575205564499, + "learning_rate": 7.606773142865368e-06, + "loss": 0.4076, + "step": 11934 + }, + { + "epoch": 2.453489567273101, + "grad_norm": 0.23896224796772003, + "learning_rate": 7.601217367597442e-06, + "loss": 0.3913, + "step": 11935 + }, + { + "epoch": 2.45369513824648, + "grad_norm": 0.22641202807426453, + "learning_rate": 7.595663434792739e-06, + "loss": 0.3782, + "step": 11936 + }, + { + "epoch": 2.453900709219858, + "grad_norm": 0.2328871637582779, + "learning_rate": 7.590111344724879e-06, + "loss": 0.3799, + "step": 11937 + }, + { + "epoch": 2.454106280193237, + "grad_norm": 0.22412602603435516, + "learning_rate": 7.584561097667373e-06, + "loss": 0.3826, + "step": 11938 + }, + { + "epoch": 2.454311851166615, + "grad_norm": 0.21876847743988037, + "learning_rate": 7.579012693893668e-06, + "loss": 0.3916, + "step": 11939 + }, + { + "epoch": 2.4545174221399937, + "grad_norm": 0.11912833899259567, + "learning_rate": 7.5734661336770845e-06, + "loss": 0.4674, + "step": 11940 + }, + { + "epoch": 2.4547229931133723, + "grad_norm": 0.2307884842157364, + "learning_rate": 7.56792141729091e-06, + "loss": 0.3924, + "step": 11941 + }, + { + "epoch": 2.454928564086751, + "grad_norm": 0.24826078116893768, + "learning_rate": 7.562378545008289e-06, + "loss": 0.3996, + "step": 11942 + }, + { + "epoch": 2.4551341350601295, + "grad_norm": 0.22318950295448303, + "learning_rate": 7.556837517102281e-06, + "loss": 0.3761, + "step": 11943 + }, + { + "epoch": 2.455339706033508, + "grad_norm": 0.22398589551448822, + "learning_rate": 7.55129833384589e-06, + "loss": 0.4049, + "step": 11944 + }, + { + "epoch": 2.4555452770068866, + "grad_norm": 0.2238241583108902, + "learning_rate": 7.545760995512e-06, + "loss": 0.3946, + "step": 11945 + }, + { + "epoch": 2.455750847980265, + "grad_norm": 0.23885828256607056, + "learning_rate": 7.540225502373406e-06, + "loss": 0.374, + "step": 11946 + }, + { + "epoch": 2.455956418953644, + "grad_norm": 0.22806625068187714, + "learning_rate": 7.53469185470281e-06, + "loss": 0.3872, + "step": 11947 + }, + { + "epoch": 2.4561619899270224, + "grad_norm": 0.2306099683046341, + "learning_rate": 7.529160052772834e-06, + "loss": 0.3904, + "step": 11948 + }, + { + "epoch": 2.456367560900401, + "grad_norm": 0.22906504571437836, + "learning_rate": 7.523630096855996e-06, + "loss": 0.3872, + "step": 11949 + }, + { + "epoch": 2.4565731318737796, + "grad_norm": 0.2300928682088852, + "learning_rate": 7.518101987224747e-06, + "loss": 0.3774, + "step": 11950 + }, + { + "epoch": 2.456778702847158, + "grad_norm": 0.22766615450382233, + "learning_rate": 7.512575724151425e-06, + "loss": 0.375, + "step": 11951 + }, + { + "epoch": 2.4569842738205363, + "grad_norm": 0.2307845950126648, + "learning_rate": 7.507051307908282e-06, + "loss": 0.4087, + "step": 11952 + }, + { + "epoch": 2.4571898447939153, + "grad_norm": 0.23094025254249573, + "learning_rate": 7.5015287387674745e-06, + "loss": 0.4023, + "step": 11953 + }, + { + "epoch": 2.4573954157672935, + "grad_norm": 0.25428444147109985, + "learning_rate": 7.4960080170010855e-06, + "loss": 0.3832, + "step": 11954 + }, + { + "epoch": 2.457600986740672, + "grad_norm": 0.2268911600112915, + "learning_rate": 7.490489142881082e-06, + "loss": 0.3697, + "step": 11955 + }, + { + "epoch": 2.4578065577140507, + "grad_norm": 0.23061802983283997, + "learning_rate": 7.484972116679353e-06, + "loss": 0.3872, + "step": 11956 + }, + { + "epoch": 2.4580121286874292, + "grad_norm": 0.22644907236099243, + "learning_rate": 7.479456938667715e-06, + "loss": 0.4041, + "step": 11957 + }, + { + "epoch": 2.458217699660808, + "grad_norm": 0.22928078472614288, + "learning_rate": 7.473943609117859e-06, + "loss": 0.3757, + "step": 11958 + }, + { + "epoch": 2.4584232706341864, + "grad_norm": 0.23315031826496124, + "learning_rate": 7.468432128301406e-06, + "loss": 0.3962, + "step": 11959 + }, + { + "epoch": 2.458628841607565, + "grad_norm": 0.23016057908535004, + "learning_rate": 7.462922496489881e-06, + "loss": 0.3948, + "step": 11960 + }, + { + "epoch": 2.4588344125809436, + "grad_norm": 0.23658603429794312, + "learning_rate": 7.457414713954714e-06, + "loss": 0.358, + "step": 11961 + }, + { + "epoch": 2.459039983554322, + "grad_norm": 0.23030193150043488, + "learning_rate": 7.451908780967242e-06, + "loss": 0.3848, + "step": 11962 + }, + { + "epoch": 2.4592455545277008, + "grad_norm": 0.23462505638599396, + "learning_rate": 7.446404697798738e-06, + "loss": 0.3856, + "step": 11963 + }, + { + "epoch": 2.4594511255010794, + "grad_norm": 0.23800687491893768, + "learning_rate": 7.4409024647203344e-06, + "loss": 0.3833, + "step": 11964 + }, + { + "epoch": 2.459656696474458, + "grad_norm": 0.22501428425312042, + "learning_rate": 7.43540208200313e-06, + "loss": 0.4078, + "step": 11965 + }, + { + "epoch": 2.4598622674478365, + "grad_norm": 0.23494026064872742, + "learning_rate": 7.429903549918089e-06, + "loss": 0.382, + "step": 11966 + }, + { + "epoch": 2.4600678384212147, + "grad_norm": 0.23382548987865448, + "learning_rate": 7.424406868736093e-06, + "loss": 0.3714, + "step": 11967 + }, + { + "epoch": 2.4602734093945937, + "grad_norm": 0.2336670309305191, + "learning_rate": 7.418912038727947e-06, + "loss": 0.386, + "step": 11968 + }, + { + "epoch": 2.460478980367972, + "grad_norm": 0.1252935528755188, + "learning_rate": 7.413419060164348e-06, + "loss": 0.4512, + "step": 11969 + }, + { + "epoch": 2.4606845513413504, + "grad_norm": 0.12696446478366852, + "learning_rate": 7.4079279333159054e-06, + "loss": 0.4467, + "step": 11970 + }, + { + "epoch": 2.460890122314729, + "grad_norm": 0.23147273063659668, + "learning_rate": 7.4024386584531574e-06, + "loss": 0.3854, + "step": 11971 + }, + { + "epoch": 2.4610956932881076, + "grad_norm": 0.23505190014839172, + "learning_rate": 7.396951235846528e-06, + "loss": 0.4034, + "step": 11972 + }, + { + "epoch": 2.461301264261486, + "grad_norm": 0.22706139087677002, + "learning_rate": 7.391465665766351e-06, + "loss": 0.3958, + "step": 11973 + }, + { + "epoch": 2.461506835234865, + "grad_norm": 0.2269429713487625, + "learning_rate": 7.385981948482885e-06, + "loss": 0.3912, + "step": 11974 + }, + { + "epoch": 2.4617124062082434, + "grad_norm": 0.24472972750663757, + "learning_rate": 7.380500084266274e-06, + "loss": 0.4132, + "step": 11975 + }, + { + "epoch": 2.461917977181622, + "grad_norm": 0.2306082546710968, + "learning_rate": 7.375020073386597e-06, + "loss": 0.3853, + "step": 11976 + }, + { + "epoch": 2.4621235481550006, + "grad_norm": 0.23048275709152222, + "learning_rate": 7.369541916113808e-06, + "loss": 0.3962, + "step": 11977 + }, + { + "epoch": 2.462329119128379, + "grad_norm": 0.22634848952293396, + "learning_rate": 7.364065612717816e-06, + "loss": 0.3774, + "step": 11978 + }, + { + "epoch": 2.4625346901017577, + "grad_norm": 0.23332750797271729, + "learning_rate": 7.3585911634684e-06, + "loss": 0.4043, + "step": 11979 + }, + { + "epoch": 2.4627402610751363, + "grad_norm": 0.22785905003547668, + "learning_rate": 7.353118568635265e-06, + "loss": 0.3758, + "step": 11980 + }, + { + "epoch": 2.462945832048515, + "grad_norm": 0.23111788928508759, + "learning_rate": 7.347647828488015e-06, + "loss": 0.3824, + "step": 11981 + }, + { + "epoch": 2.463151403021893, + "grad_norm": 0.12065623700618744, + "learning_rate": 7.342178943296169e-06, + "loss": 0.4432, + "step": 11982 + }, + { + "epoch": 2.463356973995272, + "grad_norm": 0.22295540571212769, + "learning_rate": 7.336711913329146e-06, + "loss": 0.3759, + "step": 11983 + }, + { + "epoch": 2.4635625449686502, + "grad_norm": 0.2377004474401474, + "learning_rate": 7.331246738856297e-06, + "loss": 0.3769, + "step": 11984 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.23187273740768433, + "learning_rate": 7.325783420146861e-06, + "loss": 0.3655, + "step": 11985 + }, + { + "epoch": 2.4639736869154074, + "grad_norm": 0.12513786554336548, + "learning_rate": 7.320321957469973e-06, + "loss": 0.4312, + "step": 11986 + }, + { + "epoch": 2.464179257888786, + "grad_norm": 0.23012107610702515, + "learning_rate": 7.3148623510947215e-06, + "loss": 0.3841, + "step": 11987 + }, + { + "epoch": 2.4643848288621646, + "grad_norm": 0.23000621795654297, + "learning_rate": 7.309404601290058e-06, + "loss": 0.3997, + "step": 11988 + }, + { + "epoch": 2.464590399835543, + "grad_norm": 0.23168058693408966, + "learning_rate": 7.3039487083248665e-06, + "loss": 0.3756, + "step": 11989 + }, + { + "epoch": 2.4647959708089218, + "grad_norm": 0.2302679568529129, + "learning_rate": 7.298494672467922e-06, + "loss": 0.3849, + "step": 11990 + }, + { + "epoch": 2.4650015417823004, + "grad_norm": 0.23086369037628174, + "learning_rate": 7.2930424939879405e-06, + "loss": 0.3872, + "step": 11991 + }, + { + "epoch": 2.465207112755679, + "grad_norm": 0.2280319184064865, + "learning_rate": 7.28759217315351e-06, + "loss": 0.3777, + "step": 11992 + }, + { + "epoch": 2.4654126837290575, + "grad_norm": 0.2329222559928894, + "learning_rate": 7.282143710233148e-06, + "loss": 0.3905, + "step": 11993 + }, + { + "epoch": 2.465618254702436, + "grad_norm": 0.22960495948791504, + "learning_rate": 7.276697105495274e-06, + "loss": 0.3776, + "step": 11994 + }, + { + "epoch": 2.4658238256758147, + "grad_norm": 0.22651077806949615, + "learning_rate": 7.271252359208212e-06, + "loss": 0.3902, + "step": 11995 + }, + { + "epoch": 2.4660293966491933, + "grad_norm": 0.23243005573749542, + "learning_rate": 7.2658094716402e-06, + "loss": 0.3618, + "step": 11996 + }, + { + "epoch": 2.466234967622572, + "grad_norm": 0.2251901775598526, + "learning_rate": 7.260368443059382e-06, + "loss": 0.3856, + "step": 11997 + }, + { + "epoch": 2.4664405385959505, + "grad_norm": 0.22453376650810242, + "learning_rate": 7.254929273733824e-06, + "loss": 0.4079, + "step": 11998 + }, + { + "epoch": 2.4666461095693286, + "grad_norm": 0.2299884408712387, + "learning_rate": 7.249491963931481e-06, + "loss": 0.3801, + "step": 11999 + }, + { + "epoch": 2.466851680542707, + "grad_norm": 0.34996315836906433, + "learning_rate": 7.244056513920224e-06, + "loss": 0.4597, + "step": 12000 + }, + { + "epoch": 2.467057251516086, + "grad_norm": 0.22876065969467163, + "learning_rate": 7.238622923967829e-06, + "loss": 0.3884, + "step": 12001 + }, + { + "epoch": 2.4672628224894644, + "grad_norm": 0.22702383995056152, + "learning_rate": 7.233191194341992e-06, + "loss": 0.3792, + "step": 12002 + }, + { + "epoch": 2.467468393462843, + "grad_norm": 0.23241770267486572, + "learning_rate": 7.2277613253102985e-06, + "loss": 0.394, + "step": 12003 + }, + { + "epoch": 2.4676739644362216, + "grad_norm": 0.124427430331707, + "learning_rate": 7.222333317140245e-06, + "loss": 0.4528, + "step": 12004 + }, + { + "epoch": 2.4678795354096, + "grad_norm": 0.2252800464630127, + "learning_rate": 7.216907170099272e-06, + "loss": 0.373, + "step": 12005 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.22926414012908936, + "learning_rate": 7.211482884454681e-06, + "loss": 0.3816, + "step": 12006 + }, + { + "epoch": 2.4682906773563573, + "grad_norm": 0.1216835305094719, + "learning_rate": 7.206060460473699e-06, + "loss": 0.4481, + "step": 12007 + }, + { + "epoch": 2.468496248329736, + "grad_norm": 0.22425177693367004, + "learning_rate": 7.200639898423476e-06, + "loss": 0.3763, + "step": 12008 + }, + { + "epoch": 2.4687018193031145, + "grad_norm": 0.22090640664100647, + "learning_rate": 7.195221198571054e-06, + "loss": 0.3812, + "step": 12009 + }, + { + "epoch": 2.468907390276493, + "grad_norm": 0.23358182609081268, + "learning_rate": 7.1898043611833845e-06, + "loss": 0.3889, + "step": 12010 + }, + { + "epoch": 2.4691129612498717, + "grad_norm": 0.2351762056350708, + "learning_rate": 7.184389386527319e-06, + "loss": 0.4039, + "step": 12011 + }, + { + "epoch": 2.4693185322232503, + "grad_norm": 0.22724005579948425, + "learning_rate": 7.178976274869649e-06, + "loss": 0.4057, + "step": 12012 + }, + { + "epoch": 2.469524103196629, + "grad_norm": 0.22213922441005707, + "learning_rate": 7.173565026477041e-06, + "loss": 0.3853, + "step": 12013 + }, + { + "epoch": 2.469729674170007, + "grad_norm": 0.22167488932609558, + "learning_rate": 7.1681556416160875e-06, + "loss": 0.4068, + "step": 12014 + }, + { + "epoch": 2.4699352451433856, + "grad_norm": 0.23194730281829834, + "learning_rate": 7.1627481205532795e-06, + "loss": 0.3975, + "step": 12015 + }, + { + "epoch": 2.470140816116764, + "grad_norm": 0.22849147021770477, + "learning_rate": 7.157342463555019e-06, + "loss": 0.398, + "step": 12016 + }, + { + "epoch": 2.4703463870901428, + "grad_norm": 0.22663573920726776, + "learning_rate": 7.1519386708876185e-06, + "loss": 0.3678, + "step": 12017 + }, + { + "epoch": 2.4705519580635213, + "grad_norm": 0.5367224216461182, + "learning_rate": 7.14653674281729e-06, + "loss": 0.3891, + "step": 12018 + }, + { + "epoch": 2.4707575290369, + "grad_norm": 0.232307568192482, + "learning_rate": 7.1411366796101795e-06, + "loss": 0.3862, + "step": 12019 + }, + { + "epoch": 2.4709631000102785, + "grad_norm": 0.23023979365825653, + "learning_rate": 7.135738481532311e-06, + "loss": 0.3982, + "step": 12020 + }, + { + "epoch": 2.471168670983657, + "grad_norm": 0.2264028787612915, + "learning_rate": 7.13034214884963e-06, + "loss": 0.3724, + "step": 12021 + }, + { + "epoch": 2.4713742419570357, + "grad_norm": 0.2289603054523468, + "learning_rate": 7.124947681827991e-06, + "loss": 0.3857, + "step": 12022 + }, + { + "epoch": 2.4715798129304143, + "grad_norm": 0.22829623520374298, + "learning_rate": 7.119555080733154e-06, + "loss": 0.3647, + "step": 12023 + }, + { + "epoch": 2.471785383903793, + "grad_norm": 0.22573639452457428, + "learning_rate": 7.114164345830782e-06, + "loss": 0.405, + "step": 12024 + }, + { + "epoch": 2.4719909548771715, + "grad_norm": 0.2383507937192917, + "learning_rate": 7.108775477386444e-06, + "loss": 0.3742, + "step": 12025 + }, + { + "epoch": 2.47219652585055, + "grad_norm": 0.2287752777338028, + "learning_rate": 7.103388475665647e-06, + "loss": 0.3927, + "step": 12026 + }, + { + "epoch": 2.4724020968239286, + "grad_norm": 0.2402697503566742, + "learning_rate": 7.098003340933773e-06, + "loss": 0.3899, + "step": 12027 + }, + { + "epoch": 2.4726076677973072, + "grad_norm": 0.1256427764892578, + "learning_rate": 7.09262007345611e-06, + "loss": 0.4296, + "step": 12028 + }, + { + "epoch": 2.4728132387706854, + "grad_norm": 0.2302490770816803, + "learning_rate": 7.0872386734978865e-06, + "loss": 0.3799, + "step": 12029 + }, + { + "epoch": 2.473018809744064, + "grad_norm": 0.12290017306804657, + "learning_rate": 7.08185914132421e-06, + "loss": 0.4441, + "step": 12030 + }, + { + "epoch": 2.4732243807174425, + "grad_norm": 0.24090375006198883, + "learning_rate": 7.0764814772001035e-06, + "loss": 0.3726, + "step": 12031 + }, + { + "epoch": 2.473429951690821, + "grad_norm": 0.23742975294589996, + "learning_rate": 7.071105681390495e-06, + "loss": 0.3915, + "step": 12032 + }, + { + "epoch": 2.4736355226641997, + "grad_norm": 0.12602561712265015, + "learning_rate": 7.065731754160233e-06, + "loss": 0.429, + "step": 12033 + }, + { + "epoch": 2.4738410936375783, + "grad_norm": 0.2322402000427246, + "learning_rate": 7.06035969577407e-06, + "loss": 0.3945, + "step": 12034 + }, + { + "epoch": 2.474046664610957, + "grad_norm": 0.22450955212116241, + "learning_rate": 7.05498950649665e-06, + "loss": 0.3684, + "step": 12035 + }, + { + "epoch": 2.4742522355843355, + "grad_norm": 0.23430559039115906, + "learning_rate": 7.049621186592546e-06, + "loss": 0.378, + "step": 12036 + }, + { + "epoch": 2.474457806557714, + "grad_norm": 0.2268606424331665, + "learning_rate": 7.044254736326227e-06, + "loss": 0.3944, + "step": 12037 + }, + { + "epoch": 2.4746633775310927, + "grad_norm": 0.22248311340808868, + "learning_rate": 7.038890155962071e-06, + "loss": 0.3941, + "step": 12038 + }, + { + "epoch": 2.4748689485044713, + "grad_norm": 0.12058508396148682, + "learning_rate": 7.033527445764357e-06, + "loss": 0.4526, + "step": 12039 + }, + { + "epoch": 2.47507451947785, + "grad_norm": 0.12435080856084824, + "learning_rate": 7.028166605997302e-06, + "loss": 0.4443, + "step": 12040 + }, + { + "epoch": 2.4752800904512284, + "grad_norm": 0.22780485451221466, + "learning_rate": 7.022807636924997e-06, + "loss": 0.3832, + "step": 12041 + }, + { + "epoch": 2.475485661424607, + "grad_norm": 0.22483399510383606, + "learning_rate": 7.017450538811455e-06, + "loss": 0.4114, + "step": 12042 + }, + { + "epoch": 2.4756912323979856, + "grad_norm": 0.22376932203769684, + "learning_rate": 7.012095311920595e-06, + "loss": 0.365, + "step": 12043 + }, + { + "epoch": 2.4758968033713638, + "grad_norm": 0.22798992693424225, + "learning_rate": 7.006741956516246e-06, + "loss": 0.3874, + "step": 12044 + }, + { + "epoch": 2.4761023743447423, + "grad_norm": 0.23297694325447083, + "learning_rate": 7.001390472862141e-06, + "loss": 0.3908, + "step": 12045 + }, + { + "epoch": 2.476307945318121, + "grad_norm": 0.22531673312187195, + "learning_rate": 6.99604086122191e-06, + "loss": 0.3905, + "step": 12046 + }, + { + "epoch": 2.4765135162914995, + "grad_norm": 0.22847385704517365, + "learning_rate": 6.990693121859122e-06, + "loss": 0.3764, + "step": 12047 + }, + { + "epoch": 2.476719087264878, + "grad_norm": 0.12156729400157928, + "learning_rate": 6.985347255037237e-06, + "loss": 0.4623, + "step": 12048 + }, + { + "epoch": 2.4769246582382567, + "grad_norm": 0.12409412860870361, + "learning_rate": 6.980003261019599e-06, + "loss": 0.4559, + "step": 12049 + }, + { + "epoch": 2.4771302292116353, + "grad_norm": 0.2400665581226349, + "learning_rate": 6.974661140069501e-06, + "loss": 0.3763, + "step": 12050 + }, + { + "epoch": 2.477335800185014, + "grad_norm": 0.22779475152492523, + "learning_rate": 6.969320892450124e-06, + "loss": 0.3765, + "step": 12051 + }, + { + "epoch": 2.4775413711583925, + "grad_norm": 0.22973057627677917, + "learning_rate": 6.9639825184245524e-06, + "loss": 0.3799, + "step": 12052 + }, + { + "epoch": 2.477746942131771, + "grad_norm": 0.23675696551799774, + "learning_rate": 6.9586460182557705e-06, + "loss": 0.399, + "step": 12053 + }, + { + "epoch": 2.4779525131051496, + "grad_norm": 0.23592104017734528, + "learning_rate": 6.953311392206702e-06, + "loss": 0.3764, + "step": 12054 + }, + { + "epoch": 2.4781580840785282, + "grad_norm": 0.12474309653043747, + "learning_rate": 6.947978640540154e-06, + "loss": 0.436, + "step": 12055 + }, + { + "epoch": 2.478363655051907, + "grad_norm": 0.23557905852794647, + "learning_rate": 6.942647763518844e-06, + "loss": 0.3961, + "step": 12056 + }, + { + "epoch": 2.4785692260252854, + "grad_norm": 0.23335106670856476, + "learning_rate": 6.937318761405399e-06, + "loss": 0.39, + "step": 12057 + }, + { + "epoch": 2.478774796998664, + "grad_norm": 1.1866546869277954, + "learning_rate": 6.931991634462352e-06, + "loss": 0.4177, + "step": 12058 + }, + { + "epoch": 2.478980367972042, + "grad_norm": 0.12616188824176788, + "learning_rate": 6.926666382952149e-06, + "loss": 0.4414, + "step": 12059 + }, + { + "epoch": 2.4791859389454207, + "grad_norm": 0.22257229685783386, + "learning_rate": 6.921343007137131e-06, + "loss": 0.3853, + "step": 12060 + }, + { + "epoch": 2.4793915099187993, + "grad_norm": 0.2297201305627823, + "learning_rate": 6.916021507279572e-06, + "loss": 0.3891, + "step": 12061 + }, + { + "epoch": 2.479597080892178, + "grad_norm": 0.12035045772790909, + "learning_rate": 6.910701883641627e-06, + "loss": 0.4512, + "step": 12062 + }, + { + "epoch": 2.4798026518655565, + "grad_norm": 0.11942754685878754, + "learning_rate": 6.905384136485374e-06, + "loss": 0.4546, + "step": 12063 + }, + { + "epoch": 2.480008222838935, + "grad_norm": 0.12709856033325195, + "learning_rate": 6.900068266072795e-06, + "loss": 0.4667, + "step": 12064 + }, + { + "epoch": 2.4802137938123137, + "grad_norm": 0.22888512909412384, + "learning_rate": 6.894754272665767e-06, + "loss": 0.3852, + "step": 12065 + }, + { + "epoch": 2.4804193647856922, + "grad_norm": 0.22018122673034668, + "learning_rate": 6.889442156526085e-06, + "loss": 0.3962, + "step": 12066 + }, + { + "epoch": 2.480624935759071, + "grad_norm": 0.23357877135276794, + "learning_rate": 6.884131917915471e-06, + "loss": 0.3871, + "step": 12067 + }, + { + "epoch": 2.4808305067324494, + "grad_norm": 0.22664080560207367, + "learning_rate": 6.87882355709552e-06, + "loss": 0.3931, + "step": 12068 + }, + { + "epoch": 2.481036077705828, + "grad_norm": 0.22483284771442413, + "learning_rate": 6.873517074327758e-06, + "loss": 0.3701, + "step": 12069 + }, + { + "epoch": 2.4812416486792066, + "grad_norm": 0.12439465522766113, + "learning_rate": 6.868212469873605e-06, + "loss": 0.4436, + "step": 12070 + }, + { + "epoch": 2.481447219652585, + "grad_norm": 0.12237696349620819, + "learning_rate": 6.862909743994388e-06, + "loss": 0.4515, + "step": 12071 + }, + { + "epoch": 2.481652790625964, + "grad_norm": 0.22126199305057526, + "learning_rate": 6.857608896951367e-06, + "loss": 0.3588, + "step": 12072 + }, + { + "epoch": 2.4818583615993424, + "grad_norm": 0.23091398179531097, + "learning_rate": 6.8523099290056645e-06, + "loss": 0.3856, + "step": 12073 + }, + { + "epoch": 2.4820639325727205, + "grad_norm": 0.22415180504322052, + "learning_rate": 6.847012840418361e-06, + "loss": 0.397, + "step": 12074 + }, + { + "epoch": 2.482269503546099, + "grad_norm": 0.12421949952840805, + "learning_rate": 6.8417176314504125e-06, + "loss": 0.4434, + "step": 12075 + }, + { + "epoch": 2.4824750745194777, + "grad_norm": 0.12139065563678741, + "learning_rate": 6.83642430236268e-06, + "loss": 0.4701, + "step": 12076 + }, + { + "epoch": 2.4826806454928563, + "grad_norm": 0.24218404293060303, + "learning_rate": 6.831132853415946e-06, + "loss": 0.4046, + "step": 12077 + }, + { + "epoch": 2.482886216466235, + "grad_norm": 0.23166660964488983, + "learning_rate": 6.825843284870901e-06, + "loss": 0.3861, + "step": 12078 + }, + { + "epoch": 2.4830917874396135, + "grad_norm": 0.2387050986289978, + "learning_rate": 6.820555596988127e-06, + "loss": 0.3854, + "step": 12079 + }, + { + "epoch": 2.483297358412992, + "grad_norm": 0.23468570411205292, + "learning_rate": 6.81526979002812e-06, + "loss": 0.3764, + "step": 12080 + }, + { + "epoch": 2.4835029293863706, + "grad_norm": 0.23246009647846222, + "learning_rate": 6.809985864251303e-06, + "loss": 0.3896, + "step": 12081 + }, + { + "epoch": 2.483708500359749, + "grad_norm": 0.24410288035869598, + "learning_rate": 6.804703819917987e-06, + "loss": 0.3876, + "step": 12082 + }, + { + "epoch": 2.483914071333128, + "grad_norm": 0.2310299128293991, + "learning_rate": 6.799423657288384e-06, + "loss": 0.3816, + "step": 12083 + }, + { + "epoch": 2.4841196423065064, + "grad_norm": 0.22626370191574097, + "learning_rate": 6.794145376622635e-06, + "loss": 0.3851, + "step": 12084 + }, + { + "epoch": 2.484325213279885, + "grad_norm": 0.2305128276348114, + "learning_rate": 6.788868978180763e-06, + "loss": 0.4095, + "step": 12085 + }, + { + "epoch": 2.4845307842532636, + "grad_norm": 0.22715520858764648, + "learning_rate": 6.78359446222272e-06, + "loss": 0.397, + "step": 12086 + }, + { + "epoch": 2.484736355226642, + "grad_norm": 0.12447824329137802, + "learning_rate": 6.778321829008348e-06, + "loss": 0.4611, + "step": 12087 + }, + { + "epoch": 2.4849419262000207, + "grad_norm": 0.12171711772680283, + "learning_rate": 6.773051078797419e-06, + "loss": 0.4459, + "step": 12088 + }, + { + "epoch": 2.485147497173399, + "grad_norm": 0.12131594866514206, + "learning_rate": 6.767782211849591e-06, + "loss": 0.4644, + "step": 12089 + }, + { + "epoch": 2.485353068146778, + "grad_norm": 0.12281377613544464, + "learning_rate": 6.7625152284244395e-06, + "loss": 0.4399, + "step": 12090 + }, + { + "epoch": 2.485558639120156, + "grad_norm": 0.2290441393852234, + "learning_rate": 6.75725012878144e-06, + "loss": 0.3939, + "step": 12091 + }, + { + "epoch": 2.4857642100935347, + "grad_norm": 0.22904446721076965, + "learning_rate": 6.751986913179967e-06, + "loss": 0.3833, + "step": 12092 + }, + { + "epoch": 2.4859697810669132, + "grad_norm": 0.23602800071239471, + "learning_rate": 6.746725581879339e-06, + "loss": 0.3835, + "step": 12093 + }, + { + "epoch": 2.486175352040292, + "grad_norm": 0.2316070944070816, + "learning_rate": 6.74146613513875e-06, + "loss": 0.3902, + "step": 12094 + }, + { + "epoch": 2.4863809230136704, + "grad_norm": 0.22582808136940002, + "learning_rate": 6.736208573217292e-06, + "loss": 0.4079, + "step": 12095 + }, + { + "epoch": 2.486586493987049, + "grad_norm": 0.23117490112781525, + "learning_rate": 6.730952896374002e-06, + "loss": 0.3945, + "step": 12096 + }, + { + "epoch": 2.4867920649604276, + "grad_norm": 0.22690841555595398, + "learning_rate": 6.725699104867799e-06, + "loss": 0.3927, + "step": 12097 + }, + { + "epoch": 2.486997635933806, + "grad_norm": 0.23165901005268097, + "learning_rate": 6.7204471989575e-06, + "loss": 0.4029, + "step": 12098 + }, + { + "epoch": 2.4872032069071848, + "grad_norm": 0.22219586372375488, + "learning_rate": 6.715197178901853e-06, + "loss": 0.3776, + "step": 12099 + }, + { + "epoch": 2.4874087778805634, + "grad_norm": 0.2293098270893097, + "learning_rate": 6.709949044959502e-06, + "loss": 0.3988, + "step": 12100 + }, + { + "epoch": 2.487614348853942, + "grad_norm": 0.23303751647472382, + "learning_rate": 6.70470279738898e-06, + "loss": 0.3915, + "step": 12101 + }, + { + "epoch": 2.4878199198273205, + "grad_norm": 0.1233496144413948, + "learning_rate": 6.6994584364487695e-06, + "loss": 0.4614, + "step": 12102 + }, + { + "epoch": 2.488025490800699, + "grad_norm": 0.23316849768161774, + "learning_rate": 6.694215962397225e-06, + "loss": 0.3868, + "step": 12103 + }, + { + "epoch": 2.4882310617740773, + "grad_norm": 0.22257505357265472, + "learning_rate": 6.688975375492618e-06, + "loss": 0.374, + "step": 12104 + }, + { + "epoch": 2.4884366327474563, + "grad_norm": 0.12211709469556808, + "learning_rate": 6.6837366759931345e-06, + "loss": 0.4395, + "step": 12105 + }, + { + "epoch": 2.4886422037208344, + "grad_norm": 0.22621026635169983, + "learning_rate": 6.678499864156851e-06, + "loss": 0.3922, + "step": 12106 + }, + { + "epoch": 2.488847774694213, + "grad_norm": 0.2442169040441513, + "learning_rate": 6.673264940241767e-06, + "loss": 0.3831, + "step": 12107 + }, + { + "epoch": 2.4890533456675916, + "grad_norm": 0.22115904092788696, + "learning_rate": 6.668031904505771e-06, + "loss": 0.39, + "step": 12108 + }, + { + "epoch": 2.48925891664097, + "grad_norm": 0.12432961910963058, + "learning_rate": 6.662800757206687e-06, + "loss": 0.4369, + "step": 12109 + }, + { + "epoch": 2.489464487614349, + "grad_norm": 0.23481737077236176, + "learning_rate": 6.657571498602224e-06, + "loss": 0.3807, + "step": 12110 + }, + { + "epoch": 2.4896700585877274, + "grad_norm": 0.24072937667369843, + "learning_rate": 6.65234412895e-06, + "loss": 0.3857, + "step": 12111 + }, + { + "epoch": 2.489875629561106, + "grad_norm": 0.2299319952726364, + "learning_rate": 6.647118648507545e-06, + "loss": 0.3725, + "step": 12112 + }, + { + "epoch": 2.4900812005344846, + "grad_norm": 0.2447563111782074, + "learning_rate": 6.641895057532282e-06, + "loss": 0.3858, + "step": 12113 + }, + { + "epoch": 2.490286771507863, + "grad_norm": 0.22545365989208221, + "learning_rate": 6.636673356281577e-06, + "loss": 0.3783, + "step": 12114 + }, + { + "epoch": 2.4904923424812417, + "grad_norm": 0.2212546318769455, + "learning_rate": 6.631453545012663e-06, + "loss": 0.3906, + "step": 12115 + }, + { + "epoch": 2.4906979134546203, + "grad_norm": 0.2362491488456726, + "learning_rate": 6.626235623982693e-06, + "loss": 0.4016, + "step": 12116 + }, + { + "epoch": 2.490903484427999, + "grad_norm": 0.11933384835720062, + "learning_rate": 6.6210195934487395e-06, + "loss": 0.4647, + "step": 12117 + }, + { + "epoch": 2.4911090554013775, + "grad_norm": 0.12910796701908112, + "learning_rate": 6.615805453667774e-06, + "loss": 0.4296, + "step": 12118 + }, + { + "epoch": 2.4913146263747556, + "grad_norm": 0.22228464484214783, + "learning_rate": 6.6105932048966625e-06, + "loss": 0.3975, + "step": 12119 + }, + { + "epoch": 2.4915201973481347, + "grad_norm": 0.11906154453754425, + "learning_rate": 6.6053828473921945e-06, + "loss": 0.4488, + "step": 12120 + }, + { + "epoch": 2.491725768321513, + "grad_norm": 0.11698108166456223, + "learning_rate": 6.600174381411054e-06, + "loss": 0.467, + "step": 12121 + }, + { + "epoch": 2.4919313392948914, + "grad_norm": 0.23555971682071686, + "learning_rate": 6.594967807209831e-06, + "loss": 0.3887, + "step": 12122 + }, + { + "epoch": 2.49213691026827, + "grad_norm": 0.23438353836536407, + "learning_rate": 6.589763125045056e-06, + "loss": 0.3863, + "step": 12123 + }, + { + "epoch": 2.4923424812416486, + "grad_norm": 0.22644414007663727, + "learning_rate": 6.584560335173119e-06, + "loss": 0.3941, + "step": 12124 + }, + { + "epoch": 2.492548052215027, + "grad_norm": 0.11747743934392929, + "learning_rate": 6.579359437850339e-06, + "loss": 0.4527, + "step": 12125 + }, + { + "epoch": 2.4927536231884058, + "grad_norm": 0.23047557473182678, + "learning_rate": 6.574160433332946e-06, + "loss": 0.4062, + "step": 12126 + }, + { + "epoch": 2.4929591941617844, + "grad_norm": 0.22950156033039093, + "learning_rate": 6.568963321877061e-06, + "loss": 0.3833, + "step": 12127 + }, + { + "epoch": 2.493164765135163, + "grad_norm": 0.21891199052333832, + "learning_rate": 6.563768103738734e-06, + "loss": 0.3736, + "step": 12128 + }, + { + "epoch": 2.4933703361085415, + "grad_norm": 0.22695685923099518, + "learning_rate": 6.558574779173884e-06, + "loss": 0.3752, + "step": 12129 + }, + { + "epoch": 2.49357590708192, + "grad_norm": 0.12211208045482635, + "learning_rate": 6.553383348438398e-06, + "loss": 0.4442, + "step": 12130 + }, + { + "epoch": 2.4937814780552987, + "grad_norm": 0.22641681134700775, + "learning_rate": 6.548193811788011e-06, + "loss": 0.3864, + "step": 12131 + }, + { + "epoch": 2.4939870490286773, + "grad_norm": 0.11796488612890244, + "learning_rate": 6.543006169478392e-06, + "loss": 0.4571, + "step": 12132 + }, + { + "epoch": 2.494192620002056, + "grad_norm": 0.226291224360466, + "learning_rate": 6.537820421765109e-06, + "loss": 0.38, + "step": 12133 + }, + { + "epoch": 2.494398190975434, + "grad_norm": 0.22466683387756348, + "learning_rate": 6.5326365689036465e-06, + "loss": 0.4094, + "step": 12134 + }, + { + "epoch": 2.494603761948813, + "grad_norm": 0.23120231926441193, + "learning_rate": 6.5274546111493696e-06, + "loss": 0.3899, + "step": 12135 + }, + { + "epoch": 2.494809332922191, + "grad_norm": 0.23374420404434204, + "learning_rate": 6.5222745487576e-06, + "loss": 0.3821, + "step": 12136 + }, + { + "epoch": 2.49501490389557, + "grad_norm": 0.22625453770160675, + "learning_rate": 6.517096381983503e-06, + "loss": 0.3882, + "step": 12137 + }, + { + "epoch": 2.4952204748689484, + "grad_norm": 0.12417057901620865, + "learning_rate": 6.51192011108221e-06, + "loss": 0.4423, + "step": 12138 + }, + { + "epoch": 2.495426045842327, + "grad_norm": 0.2231971025466919, + "learning_rate": 6.506745736308721e-06, + "loss": 0.3984, + "step": 12139 + }, + { + "epoch": 2.4956316168157056, + "grad_norm": 0.2350044548511505, + "learning_rate": 6.501573257917954e-06, + "loss": 0.3884, + "step": 12140 + }, + { + "epoch": 2.495837187789084, + "grad_norm": 0.23853430151939392, + "learning_rate": 6.496402676164734e-06, + "loss": 0.3903, + "step": 12141 + }, + { + "epoch": 2.4960427587624627, + "grad_norm": 0.23373542726039886, + "learning_rate": 6.4912339913037815e-06, + "loss": 0.3925, + "step": 12142 + }, + { + "epoch": 2.4962483297358413, + "grad_norm": 0.2317272126674652, + "learning_rate": 6.486067203589738e-06, + "loss": 0.4034, + "step": 12143 + }, + { + "epoch": 2.49645390070922, + "grad_norm": 0.22617876529693604, + "learning_rate": 6.480902313277152e-06, + "loss": 0.3891, + "step": 12144 + }, + { + "epoch": 2.4966594716825985, + "grad_norm": 0.22388514876365662, + "learning_rate": 6.475739320620478e-06, + "loss": 0.3823, + "step": 12145 + }, + { + "epoch": 2.496865042655977, + "grad_norm": 0.12233025580644608, + "learning_rate": 6.470578225874062e-06, + "loss": 0.459, + "step": 12146 + }, + { + "epoch": 2.4970706136293557, + "grad_norm": 0.2257211059331894, + "learning_rate": 6.4654190292921724e-06, + "loss": 0.3908, + "step": 12147 + }, + { + "epoch": 2.4972761846027343, + "grad_norm": 0.2302434891462326, + "learning_rate": 6.460261731128975e-06, + "loss": 0.3994, + "step": 12148 + }, + { + "epoch": 2.4974817555761124, + "grad_norm": 0.2282235473394394, + "learning_rate": 6.455106331638541e-06, + "loss": 0.3751, + "step": 12149 + }, + { + "epoch": 2.4976873265494914, + "grad_norm": 0.23330600559711456, + "learning_rate": 6.449952831074869e-06, + "loss": 0.3851, + "step": 12150 + }, + { + "epoch": 2.4978928975228696, + "grad_norm": 0.22312867641448975, + "learning_rate": 6.4448012296918385e-06, + "loss": 0.3799, + "step": 12151 + }, + { + "epoch": 2.498098468496248, + "grad_norm": 0.22371982038021088, + "learning_rate": 6.439651527743244e-06, + "loss": 0.386, + "step": 12152 + }, + { + "epoch": 2.4983040394696268, + "grad_norm": 0.2417476773262024, + "learning_rate": 6.434503725482785e-06, + "loss": 0.3929, + "step": 12153 + }, + { + "epoch": 2.4985096104430053, + "grad_norm": 0.23515672981739044, + "learning_rate": 6.429357823164076e-06, + "loss": 0.3886, + "step": 12154 + }, + { + "epoch": 2.498715181416384, + "grad_norm": 0.22999493777751923, + "learning_rate": 6.424213821040627e-06, + "loss": 0.3596, + "step": 12155 + }, + { + "epoch": 2.4989207523897625, + "grad_norm": 0.2299181967973709, + "learning_rate": 6.419071719365853e-06, + "loss": 0.3789, + "step": 12156 + }, + { + "epoch": 2.499126323363141, + "grad_norm": 0.23717856407165527, + "learning_rate": 6.4139315183930986e-06, + "loss": 0.3868, + "step": 12157 + }, + { + "epoch": 2.4993318943365197, + "grad_norm": 0.22513870894908905, + "learning_rate": 6.408793218375587e-06, + "loss": 0.3657, + "step": 12158 + }, + { + "epoch": 2.4995374653098983, + "grad_norm": 0.22355802357196808, + "learning_rate": 6.403656819566447e-06, + "loss": 0.3665, + "step": 12159 + }, + { + "epoch": 2.499743036283277, + "grad_norm": 0.23084700107574463, + "learning_rate": 6.3985223222187455e-06, + "loss": 0.3808, + "step": 12160 + }, + { + "epoch": 2.4999486072566555, + "grad_norm": 0.22430096566677094, + "learning_rate": 6.393389726585429e-06, + "loss": 0.3874, + "step": 12161 + }, + { + "epoch": 2.500154178230034, + "grad_norm": 0.23496957123279572, + "learning_rate": 6.388259032919352e-06, + "loss": 0.4068, + "step": 12162 + }, + { + "epoch": 2.5003597492034126, + "grad_norm": 0.22846169769763947, + "learning_rate": 6.383130241473271e-06, + "loss": 0.3625, + "step": 12163 + }, + { + "epoch": 2.500565320176791, + "grad_norm": 0.23542927205562592, + "learning_rate": 6.37800335249988e-06, + "loss": 0.4062, + "step": 12164 + }, + { + "epoch": 2.50077089115017, + "grad_norm": 0.22982755303382874, + "learning_rate": 6.372878366251746e-06, + "loss": 0.3788, + "step": 12165 + }, + { + "epoch": 2.500976462123548, + "grad_norm": 0.2346840351819992, + "learning_rate": 6.3677552829813525e-06, + "loss": 0.3856, + "step": 12166 + }, + { + "epoch": 2.5011820330969265, + "grad_norm": 0.23400172591209412, + "learning_rate": 6.362634102941088e-06, + "loss": 0.3948, + "step": 12167 + }, + { + "epoch": 2.501387604070305, + "grad_norm": 0.2556484639644623, + "learning_rate": 6.357514826383249e-06, + "loss": 0.4074, + "step": 12168 + }, + { + "epoch": 2.5015931750436837, + "grad_norm": 0.23373647034168243, + "learning_rate": 6.352397453560041e-06, + "loss": 0.3774, + "step": 12169 + }, + { + "epoch": 2.5017987460170623, + "grad_norm": 0.23084743320941925, + "learning_rate": 6.347281984723565e-06, + "loss": 0.378, + "step": 12170 + }, + { + "epoch": 2.502004316990441, + "grad_norm": 0.22970278561115265, + "learning_rate": 6.342168420125852e-06, + "loss": 0.3945, + "step": 12171 + }, + { + "epoch": 2.5022098879638195, + "grad_norm": 0.22761283814907074, + "learning_rate": 6.337056760018814e-06, + "loss": 0.393, + "step": 12172 + }, + { + "epoch": 2.502415458937198, + "grad_norm": 0.2262086719274521, + "learning_rate": 6.331947004654279e-06, + "loss": 0.4013, + "step": 12173 + }, + { + "epoch": 2.5026210299105767, + "grad_norm": 0.22546137869358063, + "learning_rate": 6.326839154283977e-06, + "loss": 0.3821, + "step": 12174 + }, + { + "epoch": 2.5028266008839553, + "grad_norm": 0.12685376405715942, + "learning_rate": 6.321733209159555e-06, + "loss": 0.4521, + "step": 12175 + }, + { + "epoch": 2.503032171857334, + "grad_norm": 0.1270647794008255, + "learning_rate": 6.316629169532559e-06, + "loss": 0.443, + "step": 12176 + }, + { + "epoch": 2.5032377428307124, + "grad_norm": 0.23198673129081726, + "learning_rate": 6.3115270356544265e-06, + "loss": 0.3716, + "step": 12177 + }, + { + "epoch": 2.503443313804091, + "grad_norm": 0.22710855305194855, + "learning_rate": 6.306426807776537e-06, + "loss": 0.3858, + "step": 12178 + }, + { + "epoch": 2.503648884777469, + "grad_norm": 0.222482368350029, + "learning_rate": 6.301328486150148e-06, + "loss": 0.3927, + "step": 12179 + }, + { + "epoch": 2.503854455750848, + "grad_norm": 0.23889537155628204, + "learning_rate": 6.2962320710264155e-06, + "loss": 0.4017, + "step": 12180 + }, + { + "epoch": 2.5040600267242263, + "grad_norm": 0.22771425545215607, + "learning_rate": 6.291137562656433e-06, + "loss": 0.3956, + "step": 12181 + }, + { + "epoch": 2.5042655976976054, + "grad_norm": 0.23738330602645874, + "learning_rate": 6.286044961291184e-06, + "loss": 0.3685, + "step": 12182 + }, + { + "epoch": 2.5044711686709835, + "grad_norm": 0.23009170591831207, + "learning_rate": 6.2809542671815495e-06, + "loss": 0.3939, + "step": 12183 + }, + { + "epoch": 2.504676739644362, + "grad_norm": 0.2258174568414688, + "learning_rate": 6.275865480578317e-06, + "loss": 0.3827, + "step": 12184 + }, + { + "epoch": 2.5048823106177407, + "grad_norm": 0.21696443855762482, + "learning_rate": 6.2707786017322066e-06, + "loss": 0.3773, + "step": 12185 + }, + { + "epoch": 2.5050878815911193, + "grad_norm": 0.23282679915428162, + "learning_rate": 6.265693630893814e-06, + "loss": 0.4009, + "step": 12186 + }, + { + "epoch": 2.505293452564498, + "grad_norm": 0.2228788286447525, + "learning_rate": 6.260610568313647e-06, + "loss": 0.3716, + "step": 12187 + }, + { + "epoch": 2.5054990235378765, + "grad_norm": 0.2226657122373581, + "learning_rate": 6.255529414242136e-06, + "loss": 0.3872, + "step": 12188 + }, + { + "epoch": 2.505704594511255, + "grad_norm": 0.22949428856372833, + "learning_rate": 6.250450168929597e-06, + "loss": 0.3995, + "step": 12189 + }, + { + "epoch": 2.5059101654846336, + "grad_norm": 2.063056707382202, + "learning_rate": 6.2453728326262674e-06, + "loss": 0.4019, + "step": 12190 + }, + { + "epoch": 2.506115736458012, + "grad_norm": 0.23003017902374268, + "learning_rate": 6.240297405582264e-06, + "loss": 0.3975, + "step": 12191 + }, + { + "epoch": 2.506321307431391, + "grad_norm": 0.23214492201805115, + "learning_rate": 6.235223888047661e-06, + "loss": 0.3863, + "step": 12192 + }, + { + "epoch": 2.5065268784047694, + "grad_norm": 0.2411757856607437, + "learning_rate": 6.2301522802723835e-06, + "loss": 0.3888, + "step": 12193 + }, + { + "epoch": 2.5067324493781475, + "grad_norm": 0.22843949496746063, + "learning_rate": 6.2250825825062975e-06, + "loss": 0.4066, + "step": 12194 + }, + { + "epoch": 2.5069380203515266, + "grad_norm": 0.2403997927904129, + "learning_rate": 6.2200147949991624e-06, + "loss": 0.3949, + "step": 12195 + }, + { + "epoch": 2.5071435913249047, + "grad_norm": 0.23729455471038818, + "learning_rate": 6.214948918000638e-06, + "loss": 0.3915, + "step": 12196 + }, + { + "epoch": 2.5073491622982838, + "grad_norm": 0.21809126436710358, + "learning_rate": 6.209884951760296e-06, + "loss": 0.3535, + "step": 12197 + }, + { + "epoch": 2.507554733271662, + "grad_norm": 0.22672174870967865, + "learning_rate": 6.20482289652761e-06, + "loss": 0.3854, + "step": 12198 + }, + { + "epoch": 2.5077603042450405, + "grad_norm": 0.22650691866874695, + "learning_rate": 6.199762752551988e-06, + "loss": 0.3908, + "step": 12199 + }, + { + "epoch": 2.507965875218419, + "grad_norm": 0.22250515222549438, + "learning_rate": 6.194704520082694e-06, + "loss": 0.3765, + "step": 12200 + }, + { + "epoch": 2.5081714461917977, + "grad_norm": 0.2284214347600937, + "learning_rate": 6.189648199368929e-06, + "loss": 0.3919, + "step": 12201 + }, + { + "epoch": 2.5083770171651762, + "grad_norm": 0.23341004550457, + "learning_rate": 6.184593790659807e-06, + "loss": 0.3923, + "step": 12202 + }, + { + "epoch": 2.508582588138555, + "grad_norm": 0.24929523468017578, + "learning_rate": 6.179541294204327e-06, + "loss": 0.3788, + "step": 12203 + }, + { + "epoch": 2.5087881591119334, + "grad_norm": 0.23570400476455688, + "learning_rate": 6.174490710251398e-06, + "loss": 0.3904, + "step": 12204 + }, + { + "epoch": 2.508993730085312, + "grad_norm": 0.22578248381614685, + "learning_rate": 6.169442039049831e-06, + "loss": 0.4045, + "step": 12205 + }, + { + "epoch": 2.5091993010586906, + "grad_norm": 0.2417832911014557, + "learning_rate": 6.1643952808483726e-06, + "loss": 0.3621, + "step": 12206 + }, + { + "epoch": 2.509404872032069, + "grad_norm": 0.2352113127708435, + "learning_rate": 6.159350435895643e-06, + "loss": 0.3799, + "step": 12207 + }, + { + "epoch": 2.5096104430054478, + "grad_norm": 0.1250247210264206, + "learning_rate": 6.154307504440175e-06, + "loss": 0.4474, + "step": 12208 + }, + { + "epoch": 2.509816013978826, + "grad_norm": 0.22554874420166016, + "learning_rate": 6.149266486730414e-06, + "loss": 0.3857, + "step": 12209 + }, + { + "epoch": 2.510021584952205, + "grad_norm": 0.23448492586612701, + "learning_rate": 6.144227383014705e-06, + "loss": 0.3939, + "step": 12210 + }, + { + "epoch": 2.510227155925583, + "grad_norm": 0.23547668755054474, + "learning_rate": 6.139190193541301e-06, + "loss": 0.4062, + "step": 12211 + }, + { + "epoch": 2.510432726898962, + "grad_norm": 0.2341381311416626, + "learning_rate": 6.1341549185583495e-06, + "loss": 0.3777, + "step": 12212 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 0.23182830214500427, + "learning_rate": 6.129121558313939e-06, + "loss": 0.3878, + "step": 12213 + }, + { + "epoch": 2.510843868845719, + "grad_norm": 0.23118196427822113, + "learning_rate": 6.124090113056029e-06, + "loss": 0.3822, + "step": 12214 + }, + { + "epoch": 2.5110494398190975, + "grad_norm": 0.2306642383337021, + "learning_rate": 6.11906058303249e-06, + "loss": 0.3742, + "step": 12215 + }, + { + "epoch": 2.511255010792476, + "grad_norm": 0.2363126426935196, + "learning_rate": 6.114032968491108e-06, + "loss": 0.3679, + "step": 12216 + }, + { + "epoch": 2.5114605817658546, + "grad_norm": 0.22103947401046753, + "learning_rate": 6.109007269679567e-06, + "loss": 0.378, + "step": 12217 + }, + { + "epoch": 2.511666152739233, + "grad_norm": 0.23410068452358246, + "learning_rate": 6.1039834868454676e-06, + "loss": 0.3637, + "step": 12218 + }, + { + "epoch": 2.511871723712612, + "grad_norm": 0.2275317907333374, + "learning_rate": 6.098961620236286e-06, + "loss": 0.3731, + "step": 12219 + }, + { + "epoch": 2.5120772946859904, + "grad_norm": 0.2327854335308075, + "learning_rate": 6.093941670099456e-06, + "loss": 0.3812, + "step": 12220 + }, + { + "epoch": 2.512282865659369, + "grad_norm": 0.11918573826551437, + "learning_rate": 6.088923636682273e-06, + "loss": 0.4619, + "step": 12221 + }, + { + "epoch": 2.5124884366327476, + "grad_norm": 0.11901576071977615, + "learning_rate": 6.083907520231941e-06, + "loss": 0.4478, + "step": 12222 + }, + { + "epoch": 2.512694007606126, + "grad_norm": 0.23713438212871552, + "learning_rate": 6.0788933209956015e-06, + "loss": 0.3682, + "step": 12223 + }, + { + "epoch": 2.5128995785795043, + "grad_norm": 0.22688627243041992, + "learning_rate": 6.0738810392202725e-06, + "loss": 0.3878, + "step": 12224 + }, + { + "epoch": 2.5131051495528833, + "grad_norm": 0.22531628608703613, + "learning_rate": 6.068870675152875e-06, + "loss": 0.3921, + "step": 12225 + }, + { + "epoch": 2.5133107205262615, + "grad_norm": 0.23456744849681854, + "learning_rate": 6.063862229040268e-06, + "loss": 0.3799, + "step": 12226 + }, + { + "epoch": 2.5135162914996405, + "grad_norm": 0.22432486712932587, + "learning_rate": 6.058855701129178e-06, + "loss": 0.386, + "step": 12227 + }, + { + "epoch": 2.5137218624730187, + "grad_norm": 0.23045605421066284, + "learning_rate": 6.0538510916662595e-06, + "loss": 0.3704, + "step": 12228 + }, + { + "epoch": 2.5139274334463972, + "grad_norm": 0.23640716075897217, + "learning_rate": 6.048848400898063e-06, + "loss": 0.3814, + "step": 12229 + }, + { + "epoch": 2.514133004419776, + "grad_norm": 0.23872627317905426, + "learning_rate": 6.043847629071049e-06, + "loss": 0.3968, + "step": 12230 + }, + { + "epoch": 2.5143385753931544, + "grad_norm": 0.2490765005350113, + "learning_rate": 6.038848776431582e-06, + "loss": 0.396, + "step": 12231 + }, + { + "epoch": 2.514544146366533, + "grad_norm": 0.24178048968315125, + "learning_rate": 6.033851843225918e-06, + "loss": 0.393, + "step": 12232 + }, + { + "epoch": 2.5147497173399116, + "grad_norm": 0.12222810834646225, + "learning_rate": 6.028856829700258e-06, + "loss": 0.4281, + "step": 12233 + }, + { + "epoch": 2.51495528831329, + "grad_norm": 0.23273582756519318, + "learning_rate": 6.023863736100677e-06, + "loss": 0.3911, + "step": 12234 + }, + { + "epoch": 2.5151608592866688, + "grad_norm": 0.23268084228038788, + "learning_rate": 6.0188725626731475e-06, + "loss": 0.3771, + "step": 12235 + }, + { + "epoch": 2.5153664302600474, + "grad_norm": 0.22554205358028412, + "learning_rate": 6.013883309663577e-06, + "loss": 0.3792, + "step": 12236 + }, + { + "epoch": 2.515572001233426, + "grad_norm": 0.12636855244636536, + "learning_rate": 6.00889597731775e-06, + "loss": 0.4423, + "step": 12237 + }, + { + "epoch": 2.5157775722068045, + "grad_norm": 0.22077181935310364, + "learning_rate": 6.0039105658813745e-06, + "loss": 0.4052, + "step": 12238 + }, + { + "epoch": 2.515983143180183, + "grad_norm": 0.2312643826007843, + "learning_rate": 5.998927075600054e-06, + "loss": 0.3937, + "step": 12239 + }, + { + "epoch": 2.5161887141535617, + "grad_norm": 0.22247177362442017, + "learning_rate": 5.993945506719307e-06, + "loss": 0.3851, + "step": 12240 + }, + { + "epoch": 2.51639428512694, + "grad_norm": 0.12727056443691254, + "learning_rate": 5.988965859484558e-06, + "loss": 0.4577, + "step": 12241 + }, + { + "epoch": 2.516599856100319, + "grad_norm": 0.2271554172039032, + "learning_rate": 5.9839881341411235e-06, + "loss": 0.3691, + "step": 12242 + }, + { + "epoch": 2.516805427073697, + "grad_norm": 0.22784371674060822, + "learning_rate": 5.97901233093423e-06, + "loss": 0.3829, + "step": 12243 + }, + { + "epoch": 2.5170109980470756, + "grad_norm": 0.235183447599411, + "learning_rate": 5.974038450109005e-06, + "loss": 0.399, + "step": 12244 + }, + { + "epoch": 2.517216569020454, + "grad_norm": 0.22664892673492432, + "learning_rate": 5.969066491910514e-06, + "loss": 0.3783, + "step": 12245 + }, + { + "epoch": 2.517422139993833, + "grad_norm": 0.2340896725654602, + "learning_rate": 5.9640964565836684e-06, + "loss": 0.3887, + "step": 12246 + }, + { + "epoch": 2.5176277109672114, + "grad_norm": 0.23754270374774933, + "learning_rate": 5.959128344373354e-06, + "loss": 0.3782, + "step": 12247 + }, + { + "epoch": 2.51783328194059, + "grad_norm": 0.2322191596031189, + "learning_rate": 5.9541621555243055e-06, + "loss": 0.3946, + "step": 12248 + }, + { + "epoch": 2.5180388529139686, + "grad_norm": 0.24042516946792603, + "learning_rate": 5.9491978902811915e-06, + "loss": 0.405, + "step": 12249 + }, + { + "epoch": 2.518244423887347, + "grad_norm": 0.23382358253002167, + "learning_rate": 5.944235548888571e-06, + "loss": 0.3944, + "step": 12250 + }, + { + "epoch": 2.5184499948607257, + "grad_norm": 0.22752118110656738, + "learning_rate": 5.939275131590924e-06, + "loss": 0.3723, + "step": 12251 + }, + { + "epoch": 2.5186555658341043, + "grad_norm": 0.23085589706897736, + "learning_rate": 5.934316638632615e-06, + "loss": 0.3916, + "step": 12252 + }, + { + "epoch": 2.518861136807483, + "grad_norm": 0.23417700827121735, + "learning_rate": 5.929360070257928e-06, + "loss": 0.3699, + "step": 12253 + }, + { + "epoch": 2.5190667077808615, + "grad_norm": 0.23297809064388275, + "learning_rate": 5.924405426711064e-06, + "loss": 0.3863, + "step": 12254 + }, + { + "epoch": 2.51927227875424, + "grad_norm": 0.3042377531528473, + "learning_rate": 5.919452708236101e-06, + "loss": 0.3995, + "step": 12255 + }, + { + "epoch": 2.5194778497276182, + "grad_norm": 0.3225111663341522, + "learning_rate": 5.914501915077045e-06, + "loss": 0.387, + "step": 12256 + }, + { + "epoch": 2.5196834207009973, + "grad_norm": 0.12028972804546356, + "learning_rate": 5.909553047477796e-06, + "loss": 0.447, + "step": 12257 + }, + { + "epoch": 2.5198889916743754, + "grad_norm": 0.22156624495983124, + "learning_rate": 5.904606105682159e-06, + "loss": 0.3813, + "step": 12258 + }, + { + "epoch": 2.520094562647754, + "grad_norm": 0.23767083883285522, + "learning_rate": 5.899661089933842e-06, + "loss": 0.3754, + "step": 12259 + }, + { + "epoch": 2.5203001336211326, + "grad_norm": 0.24094241857528687, + "learning_rate": 5.894718000476468e-06, + "loss": 0.3879, + "step": 12260 + }, + { + "epoch": 2.520505704594511, + "grad_norm": 0.23110216856002808, + "learning_rate": 5.889776837553565e-06, + "loss": 0.384, + "step": 12261 + }, + { + "epoch": 2.5207112755678898, + "grad_norm": 0.23392094671726227, + "learning_rate": 5.884837601408556e-06, + "loss": 0.3925, + "step": 12262 + }, + { + "epoch": 2.5209168465412684, + "grad_norm": 0.23152020573616028, + "learning_rate": 5.879900292284778e-06, + "loss": 0.391, + "step": 12263 + }, + { + "epoch": 2.521122417514647, + "grad_norm": 0.2314983457326889, + "learning_rate": 5.8749649104254634e-06, + "loss": 0.3918, + "step": 12264 + }, + { + "epoch": 2.5213279884880255, + "grad_norm": 0.2333018183708191, + "learning_rate": 5.870031456073747e-06, + "loss": 0.3686, + "step": 12265 + }, + { + "epoch": 2.521533559461404, + "grad_norm": 0.22460217773914337, + "learning_rate": 5.8650999294727e-06, + "loss": 0.38, + "step": 12266 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.2324889600276947, + "learning_rate": 5.8601703308652585e-06, + "loss": 0.3957, + "step": 12267 + }, + { + "epoch": 2.5219447014081613, + "grad_norm": 0.22676560282707214, + "learning_rate": 5.8552426604942814e-06, + "loss": 0.3589, + "step": 12268 + }, + { + "epoch": 2.52215027238154, + "grad_norm": 0.22790227830410004, + "learning_rate": 5.8503169186025465e-06, + "loss": 0.3892, + "step": 12269 + }, + { + "epoch": 2.5223558433549185, + "grad_norm": 0.23212410509586334, + "learning_rate": 5.845393105432708e-06, + "loss": 0.3854, + "step": 12270 + }, + { + "epoch": 2.5225614143282966, + "grad_norm": 0.23569580912590027, + "learning_rate": 5.8404712212273436e-06, + "loss": 0.3756, + "step": 12271 + }, + { + "epoch": 2.5227669853016756, + "grad_norm": 0.22882294654846191, + "learning_rate": 5.835551266228932e-06, + "loss": 0.3866, + "step": 12272 + }, + { + "epoch": 2.522972556275054, + "grad_norm": 0.22856424748897552, + "learning_rate": 5.8306332406798574e-06, + "loss": 0.3792, + "step": 12273 + }, + { + "epoch": 2.5231781272484324, + "grad_norm": 0.240891695022583, + "learning_rate": 5.825717144822393e-06, + "loss": 0.3868, + "step": 12274 + }, + { + "epoch": 2.523383698221811, + "grad_norm": 0.23485086858272552, + "learning_rate": 5.820802978898757e-06, + "loss": 0.3834, + "step": 12275 + }, + { + "epoch": 2.5235892691951896, + "grad_norm": 0.23131176829338074, + "learning_rate": 5.81589074315103e-06, + "loss": 0.3774, + "step": 12276 + }, + { + "epoch": 2.523794840168568, + "grad_norm": 0.24229222536087036, + "learning_rate": 5.810980437821223e-06, + "loss": 0.4105, + "step": 12277 + }, + { + "epoch": 2.5240004111419467, + "grad_norm": 0.23501570522785187, + "learning_rate": 5.806072063151243e-06, + "loss": 0.3863, + "step": 12278 + }, + { + "epoch": 2.5242059821153253, + "grad_norm": 0.22483564913272858, + "learning_rate": 5.801165619382897e-06, + "loss": 0.3781, + "step": 12279 + }, + { + "epoch": 2.524411553088704, + "grad_norm": 0.2277892529964447, + "learning_rate": 5.7962611067579116e-06, + "loss": 0.3625, + "step": 12280 + }, + { + "epoch": 2.5246171240620825, + "grad_norm": 0.2191118448972702, + "learning_rate": 5.791358525517887e-06, + "loss": 0.3664, + "step": 12281 + }, + { + "epoch": 2.524822695035461, + "grad_norm": 0.12358484417200089, + "learning_rate": 5.786457875904382e-06, + "loss": 0.4396, + "step": 12282 + }, + { + "epoch": 2.5250282660088397, + "grad_norm": 0.23930969834327698, + "learning_rate": 5.781559158158813e-06, + "loss": 0.3858, + "step": 12283 + }, + { + "epoch": 2.5252338369822183, + "grad_norm": 0.231824591755867, + "learning_rate": 5.776662372522516e-06, + "loss": 0.3838, + "step": 12284 + }, + { + "epoch": 2.525439407955597, + "grad_norm": 0.1248823031783104, + "learning_rate": 5.771767519236734e-06, + "loss": 0.4319, + "step": 12285 + }, + { + "epoch": 2.525644978928975, + "grad_norm": 0.22508475184440613, + "learning_rate": 5.766874598542609e-06, + "loss": 0.3756, + "step": 12286 + }, + { + "epoch": 2.525850549902354, + "grad_norm": 0.22717183828353882, + "learning_rate": 5.761983610681201e-06, + "loss": 0.3744, + "step": 12287 + }, + { + "epoch": 2.526056120875732, + "grad_norm": 0.22601410746574402, + "learning_rate": 5.757094555893466e-06, + "loss": 0.3717, + "step": 12288 + }, + { + "epoch": 2.5262616918491108, + "grad_norm": 0.2239820659160614, + "learning_rate": 5.752207434420249e-06, + "loss": 0.3665, + "step": 12289 + }, + { + "epoch": 2.5264672628224893, + "grad_norm": 0.23698551952838898, + "learning_rate": 5.747322246502343e-06, + "loss": 0.4048, + "step": 12290 + }, + { + "epoch": 2.526672833795868, + "grad_norm": 0.22845512628555298, + "learning_rate": 5.742438992380399e-06, + "loss": 0.3882, + "step": 12291 + }, + { + "epoch": 2.5268784047692465, + "grad_norm": 0.232425257563591, + "learning_rate": 5.7375576722949975e-06, + "loss": 0.3715, + "step": 12292 + }, + { + "epoch": 2.527083975742625, + "grad_norm": 0.21982984244823456, + "learning_rate": 5.732678286486614e-06, + "loss": 0.3603, + "step": 12293 + }, + { + "epoch": 2.5272895467160037, + "grad_norm": 0.22463706135749817, + "learning_rate": 5.727800835195642e-06, + "loss": 0.3708, + "step": 12294 + }, + { + "epoch": 2.5274951176893823, + "grad_norm": 0.12560081481933594, + "learning_rate": 5.722925318662354e-06, + "loss": 0.4498, + "step": 12295 + }, + { + "epoch": 2.527700688662761, + "grad_norm": 0.2263532429933548, + "learning_rate": 5.718051737126963e-06, + "loss": 0.3865, + "step": 12296 + }, + { + "epoch": 2.5279062596361395, + "grad_norm": 0.23695407807826996, + "learning_rate": 5.713180090829561e-06, + "loss": 0.3791, + "step": 12297 + }, + { + "epoch": 2.528111830609518, + "grad_norm": 0.12093336135149002, + "learning_rate": 5.708310380010148e-06, + "loss": 0.4565, + "step": 12298 + }, + { + "epoch": 2.5283174015828966, + "grad_norm": 0.237099289894104, + "learning_rate": 5.703442604908635e-06, + "loss": 0.4034, + "step": 12299 + }, + { + "epoch": 2.5285229725562752, + "grad_norm": 0.26493915915489197, + "learning_rate": 5.698576765764832e-06, + "loss": 0.3807, + "step": 12300 + }, + { + "epoch": 2.5287285435296534, + "grad_norm": 0.2276540845632553, + "learning_rate": 5.693712862818446e-06, + "loss": 0.381, + "step": 12301 + }, + { + "epoch": 2.5289341145030324, + "grad_norm": 0.22775860130786896, + "learning_rate": 5.688850896309126e-06, + "loss": 0.3737, + "step": 12302 + }, + { + "epoch": 2.5291396854764105, + "grad_norm": 0.12160097062587738, + "learning_rate": 5.6839908664763745e-06, + "loss": 0.4252, + "step": 12303 + }, + { + "epoch": 2.529345256449789, + "grad_norm": 0.24990959465503693, + "learning_rate": 5.679132773559636e-06, + "loss": 0.3963, + "step": 12304 + }, + { + "epoch": 2.5295508274231677, + "grad_norm": 0.2322610765695572, + "learning_rate": 5.674276617798239e-06, + "loss": 0.3973, + "step": 12305 + }, + { + "epoch": 2.5297563983965463, + "grad_norm": 0.23721322417259216, + "learning_rate": 5.669422399431426e-06, + "loss": 0.4063, + "step": 12306 + }, + { + "epoch": 2.529961969369925, + "grad_norm": 0.23168140649795532, + "learning_rate": 5.6645701186983416e-06, + "loss": 0.3688, + "step": 12307 + }, + { + "epoch": 2.5301675403433035, + "grad_norm": 0.22344398498535156, + "learning_rate": 5.65971977583802e-06, + "loss": 0.3936, + "step": 12308 + }, + { + "epoch": 2.530373111316682, + "grad_norm": 0.2303028702735901, + "learning_rate": 5.6548713710894444e-06, + "loss": 0.3847, + "step": 12309 + }, + { + "epoch": 2.5305786822900607, + "grad_norm": 0.23595616221427917, + "learning_rate": 5.650024904691443e-06, + "loss": 0.3789, + "step": 12310 + }, + { + "epoch": 2.5307842532634393, + "grad_norm": 0.23113130033016205, + "learning_rate": 5.645180376882806e-06, + "loss": 0.3743, + "step": 12311 + }, + { + "epoch": 2.530989824236818, + "grad_norm": 0.23540560901165009, + "learning_rate": 5.640337787902188e-06, + "loss": 0.3958, + "step": 12312 + }, + { + "epoch": 2.5311953952101964, + "grad_norm": 0.23868121206760406, + "learning_rate": 5.635497137988157e-06, + "loss": 0.3984, + "step": 12313 + }, + { + "epoch": 2.531400966183575, + "grad_norm": 0.23868517577648163, + "learning_rate": 5.6306584273791965e-06, + "loss": 0.4009, + "step": 12314 + }, + { + "epoch": 2.5316065371569536, + "grad_norm": 0.1235857680439949, + "learning_rate": 5.625821656313673e-06, + "loss": 0.456, + "step": 12315 + }, + { + "epoch": 2.5318121081303318, + "grad_norm": 0.22929808497428894, + "learning_rate": 5.620986825029889e-06, + "loss": 0.3708, + "step": 12316 + }, + { + "epoch": 2.532017679103711, + "grad_norm": 0.22845540940761566, + "learning_rate": 5.6161539337660305e-06, + "loss": 0.3914, + "step": 12317 + }, + { + "epoch": 2.532223250077089, + "grad_norm": 0.22550463676452637, + "learning_rate": 5.611322982760191e-06, + "loss": 0.3671, + "step": 12318 + }, + { + "epoch": 2.5324288210504675, + "grad_norm": 0.223682701587677, + "learning_rate": 5.606493972250359e-06, + "loss": 0.3678, + "step": 12319 + }, + { + "epoch": 2.532634392023846, + "grad_norm": 0.22872628271579742, + "learning_rate": 5.601666902474447e-06, + "loss": 0.3995, + "step": 12320 + }, + { + "epoch": 2.5328399629972247, + "grad_norm": 0.12020973116159439, + "learning_rate": 5.596841773670258e-06, + "loss": 0.4526, + "step": 12321 + }, + { + "epoch": 2.5330455339706033, + "grad_norm": 0.22796304523944855, + "learning_rate": 5.592018586075498e-06, + "loss": 0.3907, + "step": 12322 + }, + { + "epoch": 2.533251104943982, + "grad_norm": 0.22996073961257935, + "learning_rate": 5.5871973399278e-06, + "loss": 0.3912, + "step": 12323 + }, + { + "epoch": 2.5334566759173605, + "grad_norm": 0.23374302685260773, + "learning_rate": 5.582378035464671e-06, + "loss": 0.3796, + "step": 12324 + }, + { + "epoch": 2.533662246890739, + "grad_norm": 0.12319260090589523, + "learning_rate": 5.577560672923539e-06, + "loss": 0.4496, + "step": 12325 + }, + { + "epoch": 2.5338678178641176, + "grad_norm": 0.23780031502246857, + "learning_rate": 5.572745252541736e-06, + "loss": 0.3854, + "step": 12326 + }, + { + "epoch": 2.534073388837496, + "grad_norm": 0.22886228561401367, + "learning_rate": 5.567931774556487e-06, + "loss": 0.3914, + "step": 12327 + }, + { + "epoch": 2.534278959810875, + "grad_norm": 0.2357688695192337, + "learning_rate": 5.563120239204937e-06, + "loss": 0.3849, + "step": 12328 + }, + { + "epoch": 2.5344845307842534, + "grad_norm": 0.23062871396541595, + "learning_rate": 5.558310646724115e-06, + "loss": 0.369, + "step": 12329 + }, + { + "epoch": 2.534690101757632, + "grad_norm": 0.24323634803295135, + "learning_rate": 5.553502997350989e-06, + "loss": 0.3802, + "step": 12330 + }, + { + "epoch": 2.53489567273101, + "grad_norm": 0.24064137041568756, + "learning_rate": 5.548697291322398e-06, + "loss": 0.3606, + "step": 12331 + }, + { + "epoch": 2.535101243704389, + "grad_norm": 0.22627827525138855, + "learning_rate": 5.543893528875087e-06, + "loss": 0.3883, + "step": 12332 + }, + { + "epoch": 2.5353068146777673, + "grad_norm": 0.2306792140007019, + "learning_rate": 5.539091710245729e-06, + "loss": 0.3751, + "step": 12333 + }, + { + "epoch": 2.535512385651146, + "grad_norm": 0.12287892401218414, + "learning_rate": 5.534291835670888e-06, + "loss": 0.4274, + "step": 12334 + }, + { + "epoch": 2.5357179566245245, + "grad_norm": 0.22518762946128845, + "learning_rate": 5.529493905387025e-06, + "loss": 0.3907, + "step": 12335 + }, + { + "epoch": 2.535923527597903, + "grad_norm": 0.2295764982700348, + "learning_rate": 5.524697919630501e-06, + "loss": 0.3925, + "step": 12336 + }, + { + "epoch": 2.5361290985712817, + "grad_norm": 0.23383575677871704, + "learning_rate": 5.519903878637617e-06, + "loss": 0.4127, + "step": 12337 + }, + { + "epoch": 2.5363346695446602, + "grad_norm": 0.24502725899219513, + "learning_rate": 5.515111782644535e-06, + "loss": 0.3995, + "step": 12338 + }, + { + "epoch": 2.536540240518039, + "grad_norm": 0.24664360284805298, + "learning_rate": 5.510321631887345e-06, + "loss": 0.3686, + "step": 12339 + }, + { + "epoch": 2.5367458114914174, + "grad_norm": 0.12008702009916306, + "learning_rate": 5.505533426602033e-06, + "loss": 0.4564, + "step": 12340 + }, + { + "epoch": 2.536951382464796, + "grad_norm": 0.23110847175121307, + "learning_rate": 5.500747167024496e-06, + "loss": 0.3741, + "step": 12341 + }, + { + "epoch": 2.5371569534381746, + "grad_norm": 0.22447291016578674, + "learning_rate": 5.495962853390521e-06, + "loss": 0.3598, + "step": 12342 + }, + { + "epoch": 2.537362524411553, + "grad_norm": 0.11924073100090027, + "learning_rate": 5.491180485935813e-06, + "loss": 0.4384, + "step": 12343 + }, + { + "epoch": 2.5375680953849318, + "grad_norm": 0.22318169474601746, + "learning_rate": 5.48640006489598e-06, + "loss": 0.3797, + "step": 12344 + }, + { + "epoch": 2.5377736663583104, + "grad_norm": 0.22601492702960968, + "learning_rate": 5.4816215905065375e-06, + "loss": 0.3914, + "step": 12345 + }, + { + "epoch": 2.5379792373316885, + "grad_norm": 0.23133814334869385, + "learning_rate": 5.476845063002888e-06, + "loss": 0.3854, + "step": 12346 + }, + { + "epoch": 2.5381848083050675, + "grad_norm": 0.12131541967391968, + "learning_rate": 5.472070482620347e-06, + "loss": 0.4463, + "step": 12347 + }, + { + "epoch": 2.5383903792784457, + "grad_norm": 0.23147541284561157, + "learning_rate": 5.467297849594143e-06, + "loss": 0.3833, + "step": 12348 + }, + { + "epoch": 2.5385959502518247, + "grad_norm": 0.23673370480537415, + "learning_rate": 5.462527164159402e-06, + "loss": 0.381, + "step": 12349 + }, + { + "epoch": 2.538801521225203, + "grad_norm": 0.22934825718402863, + "learning_rate": 5.457758426551136e-06, + "loss": 0.3894, + "step": 12350 + }, + { + "epoch": 2.5390070921985815, + "grad_norm": 0.22352683544158936, + "learning_rate": 5.4529916370043065e-06, + "loss": 0.3989, + "step": 12351 + }, + { + "epoch": 2.53921266317196, + "grad_norm": 0.23551110923290253, + "learning_rate": 5.448226795753732e-06, + "loss": 0.4017, + "step": 12352 + }, + { + "epoch": 2.5394182341453386, + "grad_norm": 0.23551301658153534, + "learning_rate": 5.443463903034154e-06, + "loss": 0.3999, + "step": 12353 + }, + { + "epoch": 2.539623805118717, + "grad_norm": 0.22477497160434723, + "learning_rate": 5.43870295908023e-06, + "loss": 0.3857, + "step": 12354 + }, + { + "epoch": 2.539829376092096, + "grad_norm": 0.222430020570755, + "learning_rate": 5.433943964126501e-06, + "loss": 0.3661, + "step": 12355 + }, + { + "epoch": 2.5400349470654744, + "grad_norm": 0.23153965175151825, + "learning_rate": 5.429186918407423e-06, + "loss": 0.3748, + "step": 12356 + }, + { + "epoch": 2.540240518038853, + "grad_norm": 0.12405303865671158, + "learning_rate": 5.4244318221573395e-06, + "loss": 0.4329, + "step": 12357 + }, + { + "epoch": 2.5404460890122316, + "grad_norm": 0.22384849190711975, + "learning_rate": 5.419678675610535e-06, + "loss": 0.3792, + "step": 12358 + }, + { + "epoch": 2.54065165998561, + "grad_norm": 0.2254662811756134, + "learning_rate": 5.414927479001167e-06, + "loss": 0.3913, + "step": 12359 + }, + { + "epoch": 2.5408572309589887, + "grad_norm": 0.23094946146011353, + "learning_rate": 5.410178232563299e-06, + "loss": 0.3677, + "step": 12360 + }, + { + "epoch": 2.541062801932367, + "grad_norm": 0.23447729647159576, + "learning_rate": 5.405430936530908e-06, + "loss": 0.3659, + "step": 12361 + }, + { + "epoch": 2.541268372905746, + "grad_norm": 0.23138076066970825, + "learning_rate": 5.400685591137871e-06, + "loss": 0.375, + "step": 12362 + }, + { + "epoch": 2.541473943879124, + "grad_norm": 0.12305799126625061, + "learning_rate": 5.395942196617968e-06, + "loss": 0.4492, + "step": 12363 + }, + { + "epoch": 2.541679514852503, + "grad_norm": 0.1181621253490448, + "learning_rate": 5.391200753204876e-06, + "loss": 0.4415, + "step": 12364 + }, + { + "epoch": 2.5418850858258812, + "grad_norm": 0.12282504886388779, + "learning_rate": 5.386461261132198e-06, + "loss": 0.4412, + "step": 12365 + }, + { + "epoch": 2.54209065679926, + "grad_norm": 0.23556901514530182, + "learning_rate": 5.381723720633422e-06, + "loss": 0.3947, + "step": 12366 + }, + { + "epoch": 2.5422962277726384, + "grad_norm": 0.1227208599448204, + "learning_rate": 5.376988131941943e-06, + "loss": 0.4529, + "step": 12367 + }, + { + "epoch": 2.542501798746017, + "grad_norm": 0.2223055213689804, + "learning_rate": 5.3722544952910625e-06, + "loss": 0.3783, + "step": 12368 + }, + { + "epoch": 2.5427073697193956, + "grad_norm": 0.12605048716068268, + "learning_rate": 5.367522810913984e-06, + "loss": 0.4487, + "step": 12369 + }, + { + "epoch": 2.542912940692774, + "grad_norm": 0.12217556685209274, + "learning_rate": 5.362793079043813e-06, + "loss": 0.4541, + "step": 12370 + }, + { + "epoch": 2.5431185116661528, + "grad_norm": 0.2289581149816513, + "learning_rate": 5.358065299913551e-06, + "loss": 0.3795, + "step": 12371 + }, + { + "epoch": 2.5433240826395314, + "grad_norm": 0.24024073779582977, + "learning_rate": 5.3533394737561425e-06, + "loss": 0.3983, + "step": 12372 + }, + { + "epoch": 2.54352965361291, + "grad_norm": 0.2249261736869812, + "learning_rate": 5.348615600804381e-06, + "loss": 0.397, + "step": 12373 + }, + { + "epoch": 2.5437352245862885, + "grad_norm": 0.231714129447937, + "learning_rate": 5.3438936812909965e-06, + "loss": 0.3847, + "step": 12374 + }, + { + "epoch": 2.543940795559667, + "grad_norm": 0.2364065796136856, + "learning_rate": 5.339173715448626e-06, + "loss": 0.3956, + "step": 12375 + }, + { + "epoch": 2.5441463665330453, + "grad_norm": 0.23184433579444885, + "learning_rate": 5.33445570350979e-06, + "loss": 0.3686, + "step": 12376 + }, + { + "epoch": 2.5443519375064243, + "grad_norm": 0.23327279090881348, + "learning_rate": 5.3297396457069164e-06, + "loss": 0.3834, + "step": 12377 + }, + { + "epoch": 2.5445575084798024, + "grad_norm": 0.2422483265399933, + "learning_rate": 5.3250255422723655e-06, + "loss": 0.3607, + "step": 12378 + }, + { + "epoch": 2.5447630794531815, + "grad_norm": 0.22602832317352295, + "learning_rate": 5.320313393438361e-06, + "loss": 0.3734, + "step": 12379 + }, + { + "epoch": 2.5449686504265596, + "grad_norm": 0.22192765772342682, + "learning_rate": 5.315603199437057e-06, + "loss": 0.3825, + "step": 12380 + }, + { + "epoch": 2.545174221399938, + "grad_norm": 0.2314867079257965, + "learning_rate": 5.310894960500493e-06, + "loss": 0.3918, + "step": 12381 + }, + { + "epoch": 2.545379792373317, + "grad_norm": 0.22359047830104828, + "learning_rate": 5.306188676860634e-06, + "loss": 0.3916, + "step": 12382 + }, + { + "epoch": 2.5455853633466954, + "grad_norm": 0.23620890080928802, + "learning_rate": 5.301484348749329e-06, + "loss": 0.4001, + "step": 12383 + }, + { + "epoch": 2.545790934320074, + "grad_norm": 0.23437555134296417, + "learning_rate": 5.296781976398327e-06, + "loss": 0.3721, + "step": 12384 + }, + { + "epoch": 2.5459965052934526, + "grad_norm": 0.22928333282470703, + "learning_rate": 5.292081560039319e-06, + "loss": 0.3894, + "step": 12385 + }, + { + "epoch": 2.546202076266831, + "grad_norm": 0.23154671490192413, + "learning_rate": 5.287383099903855e-06, + "loss": 0.3979, + "step": 12386 + }, + { + "epoch": 2.5464076472402097, + "grad_norm": 0.23585215210914612, + "learning_rate": 5.282686596223412e-06, + "loss": 0.3604, + "step": 12387 + }, + { + "epoch": 2.5466132182135883, + "grad_norm": 0.23773467540740967, + "learning_rate": 5.277992049229358e-06, + "loss": 0.3868, + "step": 12388 + }, + { + "epoch": 2.546818789186967, + "grad_norm": 0.22915463149547577, + "learning_rate": 5.273299459152977e-06, + "loss": 0.371, + "step": 12389 + }, + { + "epoch": 2.5470243601603455, + "grad_norm": 0.2335277944803238, + "learning_rate": 5.268608826225454e-06, + "loss": 0.3819, + "step": 12390 + }, + { + "epoch": 2.5472299311337236, + "grad_norm": 0.24060097336769104, + "learning_rate": 5.263920150677854e-06, + "loss": 0.4, + "step": 12391 + }, + { + "epoch": 2.5474355021071027, + "grad_norm": 0.2347687929868698, + "learning_rate": 5.259233432741198e-06, + "loss": 0.4035, + "step": 12392 + }, + { + "epoch": 2.547641073080481, + "grad_norm": 0.23900634050369263, + "learning_rate": 5.25454867264636e-06, + "loss": 0.3856, + "step": 12393 + }, + { + "epoch": 2.54784664405386, + "grad_norm": 0.22846248745918274, + "learning_rate": 5.249865870624136e-06, + "loss": 0.391, + "step": 12394 + }, + { + "epoch": 2.548052215027238, + "grad_norm": 0.2248847782611847, + "learning_rate": 5.2451850269052214e-06, + "loss": 0.3927, + "step": 12395 + }, + { + "epoch": 2.5482577860006166, + "grad_norm": 0.1248060017824173, + "learning_rate": 5.2405061417202366e-06, + "loss": 0.4394, + "step": 12396 + }, + { + "epoch": 2.548463356973995, + "grad_norm": 0.22413742542266846, + "learning_rate": 5.235829215299683e-06, + "loss": 0.3923, + "step": 12397 + }, + { + "epoch": 2.5486689279473738, + "grad_norm": 0.12117671221494675, + "learning_rate": 5.2311542478739505e-06, + "loss": 0.4538, + "step": 12398 + }, + { + "epoch": 2.5488744989207524, + "grad_norm": 0.22767889499664307, + "learning_rate": 5.226481239673385e-06, + "loss": 0.3679, + "step": 12399 + }, + { + "epoch": 2.549080069894131, + "grad_norm": 0.23341724276542664, + "learning_rate": 5.221810190928183e-06, + "loss": 0.3912, + "step": 12400 + }, + { + "epoch": 2.5492856408675095, + "grad_norm": 0.2280956357717514, + "learning_rate": 5.21714110186847e-06, + "loss": 0.3871, + "step": 12401 + }, + { + "epoch": 2.549491211840888, + "grad_norm": 0.22712482511997223, + "learning_rate": 5.212473972724271e-06, + "loss": 0.3725, + "step": 12402 + }, + { + "epoch": 2.5496967828142667, + "grad_norm": 0.22607560455799103, + "learning_rate": 5.207808803725519e-06, + "loss": 0.3927, + "step": 12403 + }, + { + "epoch": 2.5499023537876453, + "grad_norm": 0.26311928033828735, + "learning_rate": 5.203145595102033e-06, + "loss": 0.4036, + "step": 12404 + }, + { + "epoch": 2.550107924761024, + "grad_norm": 0.12306389212608337, + "learning_rate": 5.198484347083541e-06, + "loss": 0.4641, + "step": 12405 + }, + { + "epoch": 2.550313495734402, + "grad_norm": 0.12494704872369766, + "learning_rate": 5.193825059899709e-06, + "loss": 0.4593, + "step": 12406 + }, + { + "epoch": 2.550519066707781, + "grad_norm": 0.2371709644794464, + "learning_rate": 5.189167733780062e-06, + "loss": 0.4007, + "step": 12407 + }, + { + "epoch": 2.550724637681159, + "grad_norm": 0.22765342891216278, + "learning_rate": 5.184512368954043e-06, + "loss": 0.3812, + "step": 12408 + }, + { + "epoch": 2.5509302086545382, + "grad_norm": 0.12098430842161179, + "learning_rate": 5.1798589656510035e-06, + "loss": 0.4594, + "step": 12409 + }, + { + "epoch": 2.5511357796279164, + "grad_norm": 0.2373332679271698, + "learning_rate": 5.1752075241001945e-06, + "loss": 0.3945, + "step": 12410 + }, + { + "epoch": 2.551341350601295, + "grad_norm": 0.12555475533008575, + "learning_rate": 5.170558044530767e-06, + "loss": 0.4456, + "step": 12411 + }, + { + "epoch": 2.5515469215746736, + "grad_norm": 0.22965727746486664, + "learning_rate": 5.16591052717178e-06, + "loss": 0.3749, + "step": 12412 + }, + { + "epoch": 2.551752492548052, + "grad_norm": 0.23548352718353271, + "learning_rate": 5.161264972252198e-06, + "loss": 0.3978, + "step": 12413 + }, + { + "epoch": 2.5519580635214307, + "grad_norm": 0.22701576352119446, + "learning_rate": 5.156621380000889e-06, + "loss": 0.3722, + "step": 12414 + }, + { + "epoch": 2.5521636344948093, + "grad_norm": 0.24250133335590363, + "learning_rate": 5.15197975064662e-06, + "loss": 0.3762, + "step": 12415 + }, + { + "epoch": 2.552369205468188, + "grad_norm": 0.235441654920578, + "learning_rate": 5.147340084418053e-06, + "loss": 0.3904, + "step": 12416 + }, + { + "epoch": 2.5525747764415665, + "grad_norm": 0.2249901443719864, + "learning_rate": 5.1427023815437655e-06, + "loss": 0.3812, + "step": 12417 + }, + { + "epoch": 2.552780347414945, + "grad_norm": 0.11771126836538315, + "learning_rate": 5.138066642252249e-06, + "loss": 0.4564, + "step": 12418 + }, + { + "epoch": 2.5529859183883237, + "grad_norm": 0.22527842223644257, + "learning_rate": 5.133432866771862e-06, + "loss": 0.3784, + "step": 12419 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.22456350922584534, + "learning_rate": 5.1288010553309096e-06, + "loss": 0.367, + "step": 12420 + }, + { + "epoch": 2.553397060335081, + "grad_norm": 0.24883276224136353, + "learning_rate": 5.124171208157577e-06, + "loss": 0.4066, + "step": 12421 + }, + { + "epoch": 2.5536026313084594, + "grad_norm": 0.22530537843704224, + "learning_rate": 5.119543325479944e-06, + "loss": 0.3663, + "step": 12422 + }, + { + "epoch": 2.5538082022818376, + "grad_norm": 0.23030851781368256, + "learning_rate": 5.114917407526017e-06, + "loss": 0.3692, + "step": 12423 + }, + { + "epoch": 2.5540137732552166, + "grad_norm": 0.22793278098106384, + "learning_rate": 5.110293454523685e-06, + "loss": 0.3891, + "step": 12424 + }, + { + "epoch": 2.5542193442285948, + "grad_norm": 0.22494344413280487, + "learning_rate": 5.1056714667007475e-06, + "loss": 0.3759, + "step": 12425 + }, + { + "epoch": 2.5544249152019733, + "grad_norm": 0.2244558185338974, + "learning_rate": 5.101051444284902e-06, + "loss": 0.3901, + "step": 12426 + }, + { + "epoch": 2.554630486175352, + "grad_norm": 0.22287413477897644, + "learning_rate": 5.096433387503776e-06, + "loss": 0.3852, + "step": 12427 + }, + { + "epoch": 2.5548360571487305, + "grad_norm": 0.22315384447574615, + "learning_rate": 5.091817296584869e-06, + "loss": 0.3859, + "step": 12428 + }, + { + "epoch": 2.555041628122109, + "grad_norm": 0.2284417599439621, + "learning_rate": 5.087203171755592e-06, + "loss": 0.3805, + "step": 12429 + }, + { + "epoch": 2.5552471990954877, + "grad_norm": 0.22787770628929138, + "learning_rate": 5.08259101324326e-06, + "loss": 0.3772, + "step": 12430 + }, + { + "epoch": 2.5554527700688663, + "grad_norm": 0.11939222365617752, + "learning_rate": 5.0779808212751e-06, + "loss": 0.4522, + "step": 12431 + }, + { + "epoch": 2.555658341042245, + "grad_norm": 0.22622445225715637, + "learning_rate": 5.0733725960782266e-06, + "loss": 0.3636, + "step": 12432 + }, + { + "epoch": 2.5558639120156235, + "grad_norm": 0.23444552719593048, + "learning_rate": 5.068766337879662e-06, + "loss": 0.3865, + "step": 12433 + }, + { + "epoch": 2.556069482989002, + "grad_norm": 0.1237218827009201, + "learning_rate": 5.064162046906351e-06, + "loss": 0.4495, + "step": 12434 + }, + { + "epoch": 2.5562750539623806, + "grad_norm": 0.2306404858827591, + "learning_rate": 5.059559723385115e-06, + "loss": 0.3957, + "step": 12435 + }, + { + "epoch": 2.5564806249357592, + "grad_norm": 0.2336534857749939, + "learning_rate": 5.054959367542689e-06, + "loss": 0.3902, + "step": 12436 + }, + { + "epoch": 2.556686195909138, + "grad_norm": 0.23504294455051422, + "learning_rate": 5.0503609796057175e-06, + "loss": 0.3697, + "step": 12437 + }, + { + "epoch": 2.556891766882516, + "grad_norm": 0.23617815971374512, + "learning_rate": 5.045764559800722e-06, + "loss": 0.3986, + "step": 12438 + }, + { + "epoch": 2.557097337855895, + "grad_norm": 0.2346443086862564, + "learning_rate": 5.041170108354174e-06, + "loss": 0.3879, + "step": 12439 + }, + { + "epoch": 2.557302908829273, + "grad_norm": 0.2332235723733902, + "learning_rate": 5.0365776254924055e-06, + "loss": 0.3873, + "step": 12440 + }, + { + "epoch": 2.5575084798026517, + "grad_norm": 0.2314586490392685, + "learning_rate": 5.031987111441657e-06, + "loss": 0.3749, + "step": 12441 + }, + { + "epoch": 2.5577140507760303, + "grad_norm": 0.22944357991218567, + "learning_rate": 5.027398566428106e-06, + "loss": 0.3968, + "step": 12442 + }, + { + "epoch": 2.557919621749409, + "grad_norm": 0.23463036119937897, + "learning_rate": 5.0228119906777975e-06, + "loss": 0.3848, + "step": 12443 + }, + { + "epoch": 2.5581251927227875, + "grad_norm": 0.12207529693841934, + "learning_rate": 5.018227384416686e-06, + "loss": 0.4292, + "step": 12444 + }, + { + "epoch": 2.558330763696166, + "grad_norm": 0.12470246851444244, + "learning_rate": 5.013644747870641e-06, + "loss": 0.4441, + "step": 12445 + }, + { + "epoch": 2.5585363346695447, + "grad_norm": 0.2320421189069748, + "learning_rate": 5.009064081265421e-06, + "loss": 0.3746, + "step": 12446 + }, + { + "epoch": 2.5587419056429233, + "grad_norm": 0.22109293937683105, + "learning_rate": 5.004485384826685e-06, + "loss": 0.3845, + "step": 12447 + }, + { + "epoch": 2.558947476616302, + "grad_norm": 0.2225075662136078, + "learning_rate": 4.999908658780025e-06, + "loss": 0.3755, + "step": 12448 + }, + { + "epoch": 2.5591530475896804, + "grad_norm": 0.12030386924743652, + "learning_rate": 4.995333903350908e-06, + "loss": 0.4508, + "step": 12449 + }, + { + "epoch": 2.559358618563059, + "grad_norm": 0.2298811674118042, + "learning_rate": 4.990761118764711e-06, + "loss": 0.384, + "step": 12450 + }, + { + "epoch": 2.5595641895364376, + "grad_norm": 0.23811288177967072, + "learning_rate": 4.9861903052467065e-06, + "loss": 0.3781, + "step": 12451 + }, + { + "epoch": 2.559769760509816, + "grad_norm": 0.22522372007369995, + "learning_rate": 4.981621463022082e-06, + "loss": 0.3895, + "step": 12452 + }, + { + "epoch": 2.5599753314831943, + "grad_norm": 0.2294057160615921, + "learning_rate": 4.9770545923159244e-06, + "loss": 0.3782, + "step": 12453 + }, + { + "epoch": 2.5601809024565734, + "grad_norm": 0.225949227809906, + "learning_rate": 4.972489693353206e-06, + "loss": 0.3833, + "step": 12454 + }, + { + "epoch": 2.5603864734299515, + "grad_norm": 0.22200778126716614, + "learning_rate": 4.967926766358847e-06, + "loss": 0.3662, + "step": 12455 + }, + { + "epoch": 2.56059204440333, + "grad_norm": 0.25845810770988464, + "learning_rate": 4.963365811557625e-06, + "loss": 0.3953, + "step": 12456 + }, + { + "epoch": 2.5607976153767087, + "grad_norm": 0.24813143908977509, + "learning_rate": 4.958806829174239e-06, + "loss": 0.3734, + "step": 12457 + }, + { + "epoch": 2.5610031863500873, + "grad_norm": 0.2332116812467575, + "learning_rate": 4.954249819433291e-06, + "loss": 0.4004, + "step": 12458 + }, + { + "epoch": 2.561208757323466, + "grad_norm": 0.3196330666542053, + "learning_rate": 4.949694782559268e-06, + "loss": 0.3785, + "step": 12459 + }, + { + "epoch": 2.5614143282968445, + "grad_norm": 0.1242533028125763, + "learning_rate": 4.945141718776601e-06, + "loss": 0.4463, + "step": 12460 + }, + { + "epoch": 2.561619899270223, + "grad_norm": 0.12104514986276627, + "learning_rate": 4.94059062830958e-06, + "loss": 0.4533, + "step": 12461 + }, + { + "epoch": 2.5618254702436016, + "grad_norm": 0.23640932142734528, + "learning_rate": 4.9360415113824195e-06, + "loss": 0.3747, + "step": 12462 + }, + { + "epoch": 2.56203104121698, + "grad_norm": 0.13072489202022552, + "learning_rate": 4.931494368219237e-06, + "loss": 0.44, + "step": 12463 + }, + { + "epoch": 2.562236612190359, + "grad_norm": 0.22700245678424835, + "learning_rate": 4.926949199044052e-06, + "loss": 0.3968, + "step": 12464 + }, + { + "epoch": 2.5624421831637374, + "grad_norm": 0.23678947985172272, + "learning_rate": 4.922406004080776e-06, + "loss": 0.3869, + "step": 12465 + }, + { + "epoch": 2.562647754137116, + "grad_norm": 0.22681771218776703, + "learning_rate": 4.91786478355324e-06, + "loss": 0.3858, + "step": 12466 + }, + { + "epoch": 2.5628533251104946, + "grad_norm": 0.12208957225084305, + "learning_rate": 4.91332553768515e-06, + "loss": 0.4612, + "step": 12467 + }, + { + "epoch": 2.5630588960838727, + "grad_norm": 0.22747553884983063, + "learning_rate": 4.908788266700153e-06, + "loss": 0.3569, + "step": 12468 + }, + { + "epoch": 2.5632644670572517, + "grad_norm": 0.23218391835689545, + "learning_rate": 4.904252970821774e-06, + "loss": 0.3702, + "step": 12469 + }, + { + "epoch": 2.56347003803063, + "grad_norm": 0.23117561638355255, + "learning_rate": 4.899719650273443e-06, + "loss": 0.3971, + "step": 12470 + }, + { + "epoch": 2.5636756090040085, + "grad_norm": 0.22626996040344238, + "learning_rate": 4.895188305278499e-06, + "loss": 0.3674, + "step": 12471 + }, + { + "epoch": 2.563881179977387, + "grad_norm": 0.23139001429080963, + "learning_rate": 4.890658936060177e-06, + "loss": 0.3867, + "step": 12472 + }, + { + "epoch": 2.5640867509507657, + "grad_norm": 0.12001971155405045, + "learning_rate": 4.8861315428416195e-06, + "loss": 0.4376, + "step": 12473 + }, + { + "epoch": 2.5642923219241442, + "grad_norm": 0.22510318458080292, + "learning_rate": 4.8816061258458565e-06, + "loss": 0.3841, + "step": 12474 + }, + { + "epoch": 2.564497892897523, + "grad_norm": 0.22726254165172577, + "learning_rate": 4.877082685295861e-06, + "loss": 0.364, + "step": 12475 + }, + { + "epoch": 2.5647034638709014, + "grad_norm": 0.2362823784351349, + "learning_rate": 4.872561221414465e-06, + "loss": 0.376, + "step": 12476 + }, + { + "epoch": 2.56490903484428, + "grad_norm": 0.2281261384487152, + "learning_rate": 4.868041734424418e-06, + "loss": 0.3786, + "step": 12477 + }, + { + "epoch": 2.5651146058176586, + "grad_norm": 0.22835490107536316, + "learning_rate": 4.863524224548385e-06, + "loss": 0.3793, + "step": 12478 + }, + { + "epoch": 2.565320176791037, + "grad_norm": 0.22907859086990356, + "learning_rate": 4.859008692008911e-06, + "loss": 0.3848, + "step": 12479 + }, + { + "epoch": 2.5655257477644158, + "grad_norm": 0.12358912080526352, + "learning_rate": 4.854495137028458e-06, + "loss": 0.4492, + "step": 12480 + }, + { + "epoch": 2.5657313187377944, + "grad_norm": 0.23854859173297882, + "learning_rate": 4.849983559829394e-06, + "loss": 0.3861, + "step": 12481 + }, + { + "epoch": 2.565936889711173, + "grad_norm": 0.23517528176307678, + "learning_rate": 4.845473960633981e-06, + "loss": 0.3655, + "step": 12482 + }, + { + "epoch": 2.566142460684551, + "grad_norm": 0.11823319643735886, + "learning_rate": 4.840966339664371e-06, + "loss": 0.4302, + "step": 12483 + }, + { + "epoch": 2.56634803165793, + "grad_norm": 0.22691769897937775, + "learning_rate": 4.836460697142662e-06, + "loss": 0.3748, + "step": 12484 + }, + { + "epoch": 2.5665536026313083, + "grad_norm": 0.23544248938560486, + "learning_rate": 4.831957033290806e-06, + "loss": 0.3853, + "step": 12485 + }, + { + "epoch": 2.566759173604687, + "grad_norm": 0.22319963574409485, + "learning_rate": 4.827455348330684e-06, + "loss": 0.389, + "step": 12486 + }, + { + "epoch": 2.5669647445780654, + "grad_norm": 0.22622336447238922, + "learning_rate": 4.822955642484072e-06, + "loss": 0.376, + "step": 12487 + }, + { + "epoch": 2.567170315551444, + "grad_norm": 0.22860629856586456, + "learning_rate": 4.818457915972635e-06, + "loss": 0.3648, + "step": 12488 + }, + { + "epoch": 2.5673758865248226, + "grad_norm": 0.12275035679340363, + "learning_rate": 4.813962169017981e-06, + "loss": 0.441, + "step": 12489 + }, + { + "epoch": 2.567581457498201, + "grad_norm": 0.2211294323205948, + "learning_rate": 4.809468401841578e-06, + "loss": 0.3881, + "step": 12490 + }, + { + "epoch": 2.56778702847158, + "grad_norm": 0.2277289628982544, + "learning_rate": 4.804976614664821e-06, + "loss": 0.3607, + "step": 12491 + }, + { + "epoch": 2.5679925994449584, + "grad_norm": 0.23206478357315063, + "learning_rate": 4.800486807708995e-06, + "loss": 0.3881, + "step": 12492 + }, + { + "epoch": 2.568198170418337, + "grad_norm": 0.24027037620544434, + "learning_rate": 4.795998981195294e-06, + "loss": 0.3896, + "step": 12493 + }, + { + "epoch": 2.5684037413917156, + "grad_norm": 0.24075712263584137, + "learning_rate": 4.791513135344807e-06, + "loss": 0.3876, + "step": 12494 + }, + { + "epoch": 2.568609312365094, + "grad_norm": 0.23393917083740234, + "learning_rate": 4.787029270378522e-06, + "loss": 0.3844, + "step": 12495 + }, + { + "epoch": 2.5688148833384727, + "grad_norm": 0.22370143234729767, + "learning_rate": 4.782547386517362e-06, + "loss": 0.3913, + "step": 12496 + }, + { + "epoch": 2.5690204543118513, + "grad_norm": 0.23047272861003876, + "learning_rate": 4.778067483982119e-06, + "loss": 0.3883, + "step": 12497 + }, + { + "epoch": 2.5692260252852295, + "grad_norm": 0.12189171463251114, + "learning_rate": 4.773589562993489e-06, + "loss": 0.4429, + "step": 12498 + }, + { + "epoch": 2.5694315962586085, + "grad_norm": 0.23024466633796692, + "learning_rate": 4.769113623772089e-06, + "loss": 0.3858, + "step": 12499 + }, + { + "epoch": 2.5696371672319867, + "grad_norm": 0.12103652209043503, + "learning_rate": 4.764639666538418e-06, + "loss": 0.4603, + "step": 12500 + }, + { + "epoch": 2.5698427382053652, + "grad_norm": 0.23692312836647034, + "learning_rate": 4.76016769151289e-06, + "loss": 0.3932, + "step": 12501 + }, + { + "epoch": 2.570048309178744, + "grad_norm": 0.22613804042339325, + "learning_rate": 4.755697698915813e-06, + "loss": 0.3724, + "step": 12502 + }, + { + "epoch": 2.5702538801521224, + "grad_norm": 0.2332460582256317, + "learning_rate": 4.7512296889674205e-06, + "loss": 0.3811, + "step": 12503 + }, + { + "epoch": 2.570459451125501, + "grad_norm": 0.2254786342382431, + "learning_rate": 4.746763661887813e-06, + "loss": 0.3876, + "step": 12504 + }, + { + "epoch": 2.5706650220988796, + "grad_norm": 0.2281585931777954, + "learning_rate": 4.742299617897014e-06, + "loss": 0.3865, + "step": 12505 + }, + { + "epoch": 2.570870593072258, + "grad_norm": 0.23506386578083038, + "learning_rate": 4.737837557214951e-06, + "loss": 0.3798, + "step": 12506 + }, + { + "epoch": 2.5710761640456368, + "grad_norm": 0.23268993198871613, + "learning_rate": 4.7333774800614505e-06, + "loss": 0.3984, + "step": 12507 + }, + { + "epoch": 2.5712817350190154, + "grad_norm": 0.23153680562973022, + "learning_rate": 4.728919386656236e-06, + "loss": 0.386, + "step": 12508 + }, + { + "epoch": 2.571487305992394, + "grad_norm": 0.1192520409822464, + "learning_rate": 4.72446327721893e-06, + "loss": 0.4267, + "step": 12509 + }, + { + "epoch": 2.5716928769657725, + "grad_norm": 0.23475117981433868, + "learning_rate": 4.720009151969075e-06, + "loss": 0.3883, + "step": 12510 + }, + { + "epoch": 2.571898447939151, + "grad_norm": 0.24187681078910828, + "learning_rate": 4.715557011126102e-06, + "loss": 0.3814, + "step": 12511 + }, + { + "epoch": 2.5721040189125297, + "grad_norm": 0.23029695451259613, + "learning_rate": 4.7111068549093485e-06, + "loss": 0.3786, + "step": 12512 + }, + { + "epoch": 2.572309589885908, + "grad_norm": 0.1344357579946518, + "learning_rate": 4.7066586835380475e-06, + "loss": 0.4468, + "step": 12513 + }, + { + "epoch": 2.572515160859287, + "grad_norm": 0.2278510481119156, + "learning_rate": 4.7022124972313446e-06, + "loss": 0.3777, + "step": 12514 + }, + { + "epoch": 2.572720731832665, + "grad_norm": 0.23537226021289825, + "learning_rate": 4.697768296208279e-06, + "loss": 0.3934, + "step": 12515 + }, + { + "epoch": 2.572926302806044, + "grad_norm": 0.2430686354637146, + "learning_rate": 4.693326080687791e-06, + "loss": 0.4047, + "step": 12516 + }, + { + "epoch": 2.573131873779422, + "grad_norm": 0.2354026734828949, + "learning_rate": 4.688885850888745e-06, + "loss": 0.3855, + "step": 12517 + }, + { + "epoch": 2.573337444752801, + "grad_norm": 0.22142033278942108, + "learning_rate": 4.6844476070298715e-06, + "loss": 0.4079, + "step": 12518 + }, + { + "epoch": 2.5735430157261794, + "grad_norm": 0.23135825991630554, + "learning_rate": 4.680011349329835e-06, + "loss": 0.3854, + "step": 12519 + }, + { + "epoch": 2.573748586699558, + "grad_norm": 0.23446208238601685, + "learning_rate": 4.675577078007187e-06, + "loss": 0.3963, + "step": 12520 + }, + { + "epoch": 2.5739541576729366, + "grad_norm": 0.11839111894369125, + "learning_rate": 4.671144793280376e-06, + "loss": 0.4355, + "step": 12521 + }, + { + "epoch": 2.574159728646315, + "grad_norm": 0.23686189949512482, + "learning_rate": 4.666714495367763e-06, + "loss": 0.3901, + "step": 12522 + }, + { + "epoch": 2.5743652996196937, + "grad_norm": 0.11666683852672577, + "learning_rate": 4.662286184487604e-06, + "loss": 0.4504, + "step": 12523 + }, + { + "epoch": 2.5745708705930723, + "grad_norm": 0.23141448199748993, + "learning_rate": 4.6578598608580744e-06, + "loss": 0.3776, + "step": 12524 + }, + { + "epoch": 2.574776441566451, + "grad_norm": 0.23291108012199402, + "learning_rate": 4.653435524697234e-06, + "loss": 0.3911, + "step": 12525 + }, + { + "epoch": 2.5749820125398295, + "grad_norm": 0.2317928522825241, + "learning_rate": 4.649013176223034e-06, + "loss": 0.3803, + "step": 12526 + }, + { + "epoch": 2.575187583513208, + "grad_norm": 0.23690040409564972, + "learning_rate": 4.644592815653365e-06, + "loss": 0.3758, + "step": 12527 + }, + { + "epoch": 2.5753931544865862, + "grad_norm": 0.22948139905929565, + "learning_rate": 4.640174443205982e-06, + "loss": 0.3874, + "step": 12528 + }, + { + "epoch": 2.5755987254599653, + "grad_norm": 0.2342422902584076, + "learning_rate": 4.635758059098568e-06, + "loss": 0.3791, + "step": 12529 + }, + { + "epoch": 2.5758042964333434, + "grad_norm": 0.2423672080039978, + "learning_rate": 4.6313436635486865e-06, + "loss": 0.3912, + "step": 12530 + }, + { + "epoch": 2.5760098674067224, + "grad_norm": 0.23133836686611176, + "learning_rate": 4.626931256773821e-06, + "loss": 0.3838, + "step": 12531 + }, + { + "epoch": 2.5762154383801006, + "grad_norm": 0.22452816367149353, + "learning_rate": 4.622520838991355e-06, + "loss": 0.393, + "step": 12532 + }, + { + "epoch": 2.576421009353479, + "grad_norm": 0.23489362001419067, + "learning_rate": 4.618112410418561e-06, + "loss": 0.3839, + "step": 12533 + }, + { + "epoch": 2.5766265803268578, + "grad_norm": 0.22381377220153809, + "learning_rate": 4.613705971272626e-06, + "loss": 0.3874, + "step": 12534 + }, + { + "epoch": 2.5768321513002364, + "grad_norm": 0.22615040838718414, + "learning_rate": 4.6093015217706305e-06, + "loss": 0.3871, + "step": 12535 + }, + { + "epoch": 2.577037722273615, + "grad_norm": 0.12121882289648056, + "learning_rate": 4.604899062129556e-06, + "loss": 0.4319, + "step": 12536 + }, + { + "epoch": 2.5772432932469935, + "grad_norm": 0.21779850125312805, + "learning_rate": 4.600498592566309e-06, + "loss": 0.3738, + "step": 12537 + }, + { + "epoch": 2.577448864220372, + "grad_norm": 0.12148154526948929, + "learning_rate": 4.596100113297666e-06, + "loss": 0.4412, + "step": 12538 + }, + { + "epoch": 2.5776544351937507, + "grad_norm": 0.24054840207099915, + "learning_rate": 4.591703624540323e-06, + "loss": 0.3983, + "step": 12539 + }, + { + "epoch": 2.5778600061671293, + "grad_norm": 0.23665878176689148, + "learning_rate": 4.587309126510879e-06, + "loss": 0.391, + "step": 12540 + }, + { + "epoch": 2.578065577140508, + "grad_norm": 0.23454469442367554, + "learning_rate": 4.582916619425823e-06, + "loss": 0.38, + "step": 12541 + }, + { + "epoch": 2.5782711481138865, + "grad_norm": 0.22932063043117523, + "learning_rate": 4.578526103501554e-06, + "loss": 0.386, + "step": 12542 + }, + { + "epoch": 2.5784767190872646, + "grad_norm": 0.2300751805305481, + "learning_rate": 4.574137578954369e-06, + "loss": 0.3736, + "step": 12543 + }, + { + "epoch": 2.5786822900606436, + "grad_norm": 0.23985406756401062, + "learning_rate": 4.569751046000483e-06, + "loss": 0.4049, + "step": 12544 + }, + { + "epoch": 2.578887861034022, + "grad_norm": 0.2458028644323349, + "learning_rate": 4.5653665048559895e-06, + "loss": 0.3769, + "step": 12545 + }, + { + "epoch": 2.579093432007401, + "grad_norm": 0.23812150955200195, + "learning_rate": 4.560983955736901e-06, + "loss": 0.3921, + "step": 12546 + }, + { + "epoch": 2.579299002980779, + "grad_norm": 0.23339690268039703, + "learning_rate": 4.5566033988591146e-06, + "loss": 0.3839, + "step": 12547 + }, + { + "epoch": 2.5795045739541576, + "grad_norm": 0.22212813794612885, + "learning_rate": 4.5522248344384525e-06, + "loss": 0.3801, + "step": 12548 + }, + { + "epoch": 2.579710144927536, + "grad_norm": 0.23296941816806793, + "learning_rate": 4.547848262690621e-06, + "loss": 0.3995, + "step": 12549 + }, + { + "epoch": 2.5799157159009147, + "grad_norm": 0.23013028502464294, + "learning_rate": 4.543473683831221e-06, + "loss": 0.3542, + "step": 12550 + }, + { + "epoch": 2.5801212868742933, + "grad_norm": 0.23488110303878784, + "learning_rate": 4.539101098075791e-06, + "loss": 0.3884, + "step": 12551 + }, + { + "epoch": 2.580326857847672, + "grad_norm": 0.2313051074743271, + "learning_rate": 4.534730505639736e-06, + "loss": 0.3894, + "step": 12552 + }, + { + "epoch": 2.5805324288210505, + "grad_norm": 0.2325943112373352, + "learning_rate": 4.5303619067383785e-06, + "loss": 0.38, + "step": 12553 + }, + { + "epoch": 2.580737999794429, + "grad_norm": 0.22922460734844208, + "learning_rate": 4.525995301586931e-06, + "loss": 0.4037, + "step": 12554 + }, + { + "epoch": 2.5809435707678077, + "grad_norm": 0.12386429309844971, + "learning_rate": 4.521630690400517e-06, + "loss": 0.4532, + "step": 12555 + }, + { + "epoch": 2.5811491417411863, + "grad_norm": 0.223622664809227, + "learning_rate": 4.517268073394169e-06, + "loss": 0.3716, + "step": 12556 + }, + { + "epoch": 2.581354712714565, + "grad_norm": 0.23587054014205933, + "learning_rate": 4.512907450782795e-06, + "loss": 0.3963, + "step": 12557 + }, + { + "epoch": 2.581560283687943, + "grad_norm": 0.23143510520458221, + "learning_rate": 4.508548822781248e-06, + "loss": 0.3691, + "step": 12558 + }, + { + "epoch": 2.581765854661322, + "grad_norm": 0.227389857172966, + "learning_rate": 4.504192189604236e-06, + "loss": 0.3778, + "step": 12559 + }, + { + "epoch": 2.5819714256347, + "grad_norm": 0.2255561500787735, + "learning_rate": 4.499837551466404e-06, + "loss": 0.3913, + "step": 12560 + }, + { + "epoch": 2.582176996608079, + "grad_norm": 0.2301749736070633, + "learning_rate": 4.4954849085822795e-06, + "loss": 0.3736, + "step": 12561 + }, + { + "epoch": 2.5823825675814573, + "grad_norm": 0.23604105412960052, + "learning_rate": 4.491134261166295e-06, + "loss": 0.3876, + "step": 12562 + }, + { + "epoch": 2.582588138554836, + "grad_norm": 0.24186021089553833, + "learning_rate": 4.4867856094327845e-06, + "loss": 0.3961, + "step": 12563 + }, + { + "epoch": 2.5827937095282145, + "grad_norm": 0.23529918491840363, + "learning_rate": 4.482438953595982e-06, + "loss": 0.3746, + "step": 12564 + }, + { + "epoch": 2.582999280501593, + "grad_norm": 0.2367369532585144, + "learning_rate": 4.4780942938700425e-06, + "loss": 0.397, + "step": 12565 + }, + { + "epoch": 2.5832048514749717, + "grad_norm": 0.23594695329666138, + "learning_rate": 4.473751630468997e-06, + "loss": 0.395, + "step": 12566 + }, + { + "epoch": 2.5834104224483503, + "grad_norm": 0.221700519323349, + "learning_rate": 4.469410963606791e-06, + "loss": 0.3851, + "step": 12567 + }, + { + "epoch": 2.583615993421729, + "grad_norm": 0.12304549664258957, + "learning_rate": 4.465072293497258e-06, + "loss": 0.4312, + "step": 12568 + }, + { + "epoch": 2.5838215643951075, + "grad_norm": 0.22101683914661407, + "learning_rate": 4.460735620354163e-06, + "loss": 0.3857, + "step": 12569 + }, + { + "epoch": 2.584027135368486, + "grad_norm": 0.11826925724744797, + "learning_rate": 4.456400944391144e-06, + "loss": 0.4562, + "step": 12570 + }, + { + "epoch": 2.5842327063418646, + "grad_norm": 0.11855246126651764, + "learning_rate": 4.45206826582174e-06, + "loss": 0.4436, + "step": 12571 + }, + { + "epoch": 2.5844382773152432, + "grad_norm": 0.22833800315856934, + "learning_rate": 4.447737584859421e-06, + "loss": 0.38, + "step": 12572 + }, + { + "epoch": 2.5846438482886214, + "grad_norm": 0.12766826152801514, + "learning_rate": 4.443408901717526e-06, + "loss": 0.4538, + "step": 12573 + }, + { + "epoch": 2.5848494192620004, + "grad_norm": 0.2362251728773117, + "learning_rate": 4.43908221660932e-06, + "loss": 0.3791, + "step": 12574 + }, + { + "epoch": 2.5850549902353785, + "grad_norm": 0.2291366159915924, + "learning_rate": 4.434757529747952e-06, + "loss": 0.3797, + "step": 12575 + }, + { + "epoch": 2.5852605612087576, + "grad_norm": 0.11885092407464981, + "learning_rate": 4.430434841346476e-06, + "loss": 0.4403, + "step": 12576 + }, + { + "epoch": 2.5854661321821357, + "grad_norm": 0.2298079878091812, + "learning_rate": 4.426114151617852e-06, + "loss": 0.3733, + "step": 12577 + }, + { + "epoch": 2.5856717031555143, + "grad_norm": 0.2344081848859787, + "learning_rate": 4.421795460774936e-06, + "loss": 0.3807, + "step": 12578 + }, + { + "epoch": 2.585877274128893, + "grad_norm": 0.23046311736106873, + "learning_rate": 4.417478769030506e-06, + "loss": 0.3999, + "step": 12579 + }, + { + "epoch": 2.5860828451022715, + "grad_norm": 0.22831617295742035, + "learning_rate": 4.4131640765972125e-06, + "loss": 0.367, + "step": 12580 + }, + { + "epoch": 2.58628841607565, + "grad_norm": 0.2355516254901886, + "learning_rate": 4.408851383687621e-06, + "loss": 0.3845, + "step": 12581 + }, + { + "epoch": 2.5864939870490287, + "grad_norm": 0.23236438632011414, + "learning_rate": 4.4045406905142014e-06, + "loss": 0.399, + "step": 12582 + }, + { + "epoch": 2.5866995580224073, + "grad_norm": 0.2408120036125183, + "learning_rate": 4.400231997289323e-06, + "loss": 0.3817, + "step": 12583 + }, + { + "epoch": 2.586905128995786, + "grad_norm": 0.11931653320789337, + "learning_rate": 4.395925304225247e-06, + "loss": 0.4378, + "step": 12584 + }, + { + "epoch": 2.5871106999691644, + "grad_norm": 0.2242845743894577, + "learning_rate": 4.391620611534138e-06, + "loss": 0.3632, + "step": 12585 + }, + { + "epoch": 2.587316270942543, + "grad_norm": 0.2249579280614853, + "learning_rate": 4.387317919428092e-06, + "loss": 0.3774, + "step": 12586 + }, + { + "epoch": 2.5875218419159216, + "grad_norm": 0.245198056101799, + "learning_rate": 4.383017228119064e-06, + "loss": 0.3825, + "step": 12587 + }, + { + "epoch": 2.5877274128893, + "grad_norm": 0.23710772395133972, + "learning_rate": 4.378718537818934e-06, + "loss": 0.3911, + "step": 12588 + }, + { + "epoch": 2.587932983862679, + "grad_norm": 0.22874656319618225, + "learning_rate": 4.374421848739483e-06, + "loss": 0.3732, + "step": 12589 + }, + { + "epoch": 2.588138554836057, + "grad_norm": 0.2265346497297287, + "learning_rate": 4.370127161092373e-06, + "loss": 0.367, + "step": 12590 + }, + { + "epoch": 2.588344125809436, + "grad_norm": 0.22639183700084686, + "learning_rate": 4.365834475089203e-06, + "loss": 0.3966, + "step": 12591 + }, + { + "epoch": 2.588549696782814, + "grad_norm": 0.2292342483997345, + "learning_rate": 4.361543790941434e-06, + "loss": 0.3785, + "step": 12592 + }, + { + "epoch": 2.5887552677561927, + "grad_norm": 0.23128138482570648, + "learning_rate": 4.357255108860468e-06, + "loss": 0.3829, + "step": 12593 + }, + { + "epoch": 2.5889608387295713, + "grad_norm": 0.2288789004087448, + "learning_rate": 4.35296842905758e-06, + "loss": 0.3774, + "step": 12594 + }, + { + "epoch": 2.58916640970295, + "grad_norm": 0.22914868593215942, + "learning_rate": 4.348683751743952e-06, + "loss": 0.3669, + "step": 12595 + }, + { + "epoch": 2.5893719806763285, + "grad_norm": 0.12475783377885818, + "learning_rate": 4.344401077130674e-06, + "loss": 0.466, + "step": 12596 + }, + { + "epoch": 2.589577551649707, + "grad_norm": 0.22877533733844757, + "learning_rate": 4.340120405428733e-06, + "loss": 0.386, + "step": 12597 + }, + { + "epoch": 2.5897831226230856, + "grad_norm": 0.2277032434940338, + "learning_rate": 4.335841736849015e-06, + "loss": 0.3892, + "step": 12598 + }, + { + "epoch": 2.589988693596464, + "grad_norm": 0.22553406655788422, + "learning_rate": 4.331565071602301e-06, + "loss": 0.3725, + "step": 12599 + }, + { + "epoch": 2.590194264569843, + "grad_norm": 0.23949706554412842, + "learning_rate": 4.327290409899299e-06, + "loss": 0.3877, + "step": 12600 + }, + { + "epoch": 2.5903998355432214, + "grad_norm": 0.23723599314689636, + "learning_rate": 4.323017751950593e-06, + "loss": 0.3816, + "step": 12601 + }, + { + "epoch": 2.5906054065166, + "grad_norm": 0.2249545156955719, + "learning_rate": 4.318747097966682e-06, + "loss": 0.3656, + "step": 12602 + }, + { + "epoch": 2.5908109774899786, + "grad_norm": 0.22489015758037567, + "learning_rate": 4.314478448157962e-06, + "loss": 0.3837, + "step": 12603 + }, + { + "epoch": 2.591016548463357, + "grad_norm": 0.22682681679725647, + "learning_rate": 4.31021180273472e-06, + "loss": 0.388, + "step": 12604 + }, + { + "epoch": 2.5912221194367353, + "grad_norm": 0.22748106718063354, + "learning_rate": 4.305947161907161e-06, + "loss": 0.3789, + "step": 12605 + }, + { + "epoch": 2.5914276904101143, + "grad_norm": 0.22520016133785248, + "learning_rate": 4.301684525885369e-06, + "loss": 0.3921, + "step": 12606 + }, + { + "epoch": 2.5916332613834925, + "grad_norm": 0.22781188786029816, + "learning_rate": 4.297423894879371e-06, + "loss": 0.3787, + "step": 12607 + }, + { + "epoch": 2.591838832356871, + "grad_norm": 0.1223108097910881, + "learning_rate": 4.293165269099049e-06, + "loss": 0.4503, + "step": 12608 + }, + { + "epoch": 2.5920444033302497, + "grad_norm": 0.2299673855304718, + "learning_rate": 4.288908648754213e-06, + "loss": 0.3899, + "step": 12609 + }, + { + "epoch": 2.5922499743036282, + "grad_norm": 0.28654077649116516, + "learning_rate": 4.284654034054568e-06, + "loss": 0.3756, + "step": 12610 + }, + { + "epoch": 2.592455545277007, + "grad_norm": 0.24485927820205688, + "learning_rate": 4.280401425209705e-06, + "loss": 0.3807, + "step": 12611 + }, + { + "epoch": 2.5926611162503854, + "grad_norm": 0.12086854130029678, + "learning_rate": 4.276150822429146e-06, + "loss": 0.4414, + "step": 12612 + }, + { + "epoch": 2.592866687223764, + "grad_norm": 0.12190677970647812, + "learning_rate": 4.2719022259223e-06, + "loss": 0.4467, + "step": 12613 + }, + { + "epoch": 2.5930722581971426, + "grad_norm": 0.23326410353183746, + "learning_rate": 4.267655635898454e-06, + "loss": 0.3714, + "step": 12614 + }, + { + "epoch": 2.593277829170521, + "grad_norm": 0.12049704790115356, + "learning_rate": 4.263411052566845e-06, + "loss": 0.4688, + "step": 12615 + }, + { + "epoch": 2.5934834001438998, + "grad_norm": 0.22923533618450165, + "learning_rate": 4.259168476136571e-06, + "loss": 0.3817, + "step": 12616 + }, + { + "epoch": 2.5936889711172784, + "grad_norm": 0.23745588958263397, + "learning_rate": 4.25492790681664e-06, + "loss": 0.3685, + "step": 12617 + }, + { + "epoch": 2.593894542090657, + "grad_norm": 0.1278616040945053, + "learning_rate": 4.250689344815975e-06, + "loss": 0.4721, + "step": 12618 + }, + { + "epoch": 2.5941001130640355, + "grad_norm": 0.2362724095582962, + "learning_rate": 4.2464527903433685e-06, + "loss": 0.399, + "step": 12619 + }, + { + "epoch": 2.5943056840374137, + "grad_norm": 0.22642555832862854, + "learning_rate": 4.242218243607564e-06, + "loss": 0.3937, + "step": 12620 + }, + { + "epoch": 2.5945112550107927, + "grad_norm": 0.11976632475852966, + "learning_rate": 4.237985704817164e-06, + "loss": 0.4401, + "step": 12621 + }, + { + "epoch": 2.594716825984171, + "grad_norm": 0.22681719064712524, + "learning_rate": 4.233755174180688e-06, + "loss": 0.3904, + "step": 12622 + }, + { + "epoch": 2.5949223969575494, + "grad_norm": 0.23691929876804352, + "learning_rate": 4.2295266519065575e-06, + "loss": 0.3986, + "step": 12623 + }, + { + "epoch": 2.595127967930928, + "grad_norm": 0.2224111258983612, + "learning_rate": 4.225300138203082e-06, + "loss": 0.3766, + "step": 12624 + }, + { + "epoch": 2.5953335389043066, + "grad_norm": 0.22920066118240356, + "learning_rate": 4.22107563327849e-06, + "loss": 0.3883, + "step": 12625 + }, + { + "epoch": 2.595539109877685, + "grad_norm": 0.22633981704711914, + "learning_rate": 4.216853137340895e-06, + "loss": 0.381, + "step": 12626 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 0.2328253835439682, + "learning_rate": 4.21263265059833e-06, + "loss": 0.3936, + "step": 12627 + }, + { + "epoch": 2.5959502518244424, + "grad_norm": 0.2358826845884323, + "learning_rate": 4.208414173258719e-06, + "loss": 0.3572, + "step": 12628 + }, + { + "epoch": 2.596155822797821, + "grad_norm": 0.23712804913520813, + "learning_rate": 4.204197705529881e-06, + "loss": 0.4026, + "step": 12629 + }, + { + "epoch": 2.5963613937711996, + "grad_norm": 0.2283300757408142, + "learning_rate": 4.199983247619545e-06, + "loss": 0.3851, + "step": 12630 + }, + { + "epoch": 2.596566964744578, + "grad_norm": 0.22347889840602875, + "learning_rate": 4.195770799735333e-06, + "loss": 0.3673, + "step": 12631 + }, + { + "epoch": 2.5967725357179567, + "grad_norm": 0.23281633853912354, + "learning_rate": 4.1915603620847675e-06, + "loss": 0.4097, + "step": 12632 + }, + { + "epoch": 2.5969781066913353, + "grad_norm": 0.2382623255252838, + "learning_rate": 4.187351934875289e-06, + "loss": 0.409, + "step": 12633 + }, + { + "epoch": 2.597183677664714, + "grad_norm": 0.2310085892677307, + "learning_rate": 4.18314551831423e-06, + "loss": 0.3912, + "step": 12634 + }, + { + "epoch": 2.597389248638092, + "grad_norm": 0.2320510298013687, + "learning_rate": 4.1789411126088015e-06, + "loss": 0.3582, + "step": 12635 + }, + { + "epoch": 2.597594819611471, + "grad_norm": 0.23400908708572388, + "learning_rate": 4.174738717966154e-06, + "loss": 0.3725, + "step": 12636 + }, + { + "epoch": 2.5978003905848492, + "grad_norm": 0.23173139989376068, + "learning_rate": 4.170538334593318e-06, + "loss": 0.3992, + "step": 12637 + }, + { + "epoch": 2.598005961558228, + "grad_norm": 0.23350614309310913, + "learning_rate": 4.1663399626972175e-06, + "loss": 0.3795, + "step": 12638 + }, + { + "epoch": 2.5982115325316064, + "grad_norm": 0.23148328065872192, + "learning_rate": 4.162143602484692e-06, + "loss": 0.3611, + "step": 12639 + }, + { + "epoch": 2.598417103504985, + "grad_norm": 0.22413556277751923, + "learning_rate": 4.15794925416247e-06, + "loss": 0.3857, + "step": 12640 + }, + { + "epoch": 2.5986226744783636, + "grad_norm": 0.22561226785182953, + "learning_rate": 4.153756917937197e-06, + "loss": 0.3802, + "step": 12641 + }, + { + "epoch": 2.598828245451742, + "grad_norm": 0.22936685383319855, + "learning_rate": 4.149566594015408e-06, + "loss": 0.374, + "step": 12642 + }, + { + "epoch": 2.5990338164251208, + "grad_norm": 0.12491568177938461, + "learning_rate": 4.145378282603538e-06, + "loss": 0.4393, + "step": 12643 + }, + { + "epoch": 2.5992393873984994, + "grad_norm": 0.22605833411216736, + "learning_rate": 4.141191983907927e-06, + "loss": 0.3838, + "step": 12644 + }, + { + "epoch": 2.599444958371878, + "grad_norm": 0.22735659778118134, + "learning_rate": 4.137007698134814e-06, + "loss": 0.3879, + "step": 12645 + }, + { + "epoch": 2.5996505293452565, + "grad_norm": 0.23198509216308594, + "learning_rate": 4.1328254254903345e-06, + "loss": 0.3801, + "step": 12646 + }, + { + "epoch": 2.599856100318635, + "grad_norm": 0.11979317665100098, + "learning_rate": 4.12864516618053e-06, + "loss": 0.4608, + "step": 12647 + }, + { + "epoch": 2.6000616712920137, + "grad_norm": 0.23859287798404694, + "learning_rate": 4.124466920411354e-06, + "loss": 0.3896, + "step": 12648 + }, + { + "epoch": 2.6002672422653923, + "grad_norm": 0.22083625197410583, + "learning_rate": 4.120290688388638e-06, + "loss": 0.3972, + "step": 12649 + }, + { + "epoch": 2.6004728132387704, + "grad_norm": 0.23605285584926605, + "learning_rate": 4.116116470318131e-06, + "loss": 0.4005, + "step": 12650 + }, + { + "epoch": 2.6006783842121495, + "grad_norm": 0.2248559296131134, + "learning_rate": 4.111944266405476e-06, + "loss": 0.3765, + "step": 12651 + }, + { + "epoch": 2.6008839551855276, + "grad_norm": 0.23217467963695526, + "learning_rate": 4.107774076856211e-06, + "loss": 0.3721, + "step": 12652 + }, + { + "epoch": 2.601089526158906, + "grad_norm": 0.23271602392196655, + "learning_rate": 4.103605901875783e-06, + "loss": 0.383, + "step": 12653 + }, + { + "epoch": 2.601295097132285, + "grad_norm": 0.22513030469417572, + "learning_rate": 4.099439741669553e-06, + "loss": 0.375, + "step": 12654 + }, + { + "epoch": 2.6015006681056634, + "grad_norm": 0.2263166457414627, + "learning_rate": 4.0952755964427555e-06, + "loss": 0.3759, + "step": 12655 + }, + { + "epoch": 2.601706239079042, + "grad_norm": 0.22585409879684448, + "learning_rate": 4.091113466400533e-06, + "loss": 0.3809, + "step": 12656 + }, + { + "epoch": 2.6019118100524206, + "grad_norm": 0.22669021785259247, + "learning_rate": 4.08695335174795e-06, + "loss": 0.3782, + "step": 12657 + }, + { + "epoch": 2.602117381025799, + "grad_norm": 0.2331555336713791, + "learning_rate": 4.082795252689949e-06, + "loss": 0.3776, + "step": 12658 + }, + { + "epoch": 2.6023229519991777, + "grad_norm": 0.2259588986635208, + "learning_rate": 4.07863916943138e-06, + "loss": 0.4052, + "step": 12659 + }, + { + "epoch": 2.6025285229725563, + "grad_norm": 0.2281801849603653, + "learning_rate": 4.074485102176994e-06, + "loss": 0.391, + "step": 12660 + }, + { + "epoch": 2.602734093945935, + "grad_norm": 0.22164572775363922, + "learning_rate": 4.070333051131434e-06, + "loss": 0.3553, + "step": 12661 + }, + { + "epoch": 2.6029396649193135, + "grad_norm": 0.2315491884946823, + "learning_rate": 4.0661830164992644e-06, + "loss": 0.3879, + "step": 12662 + }, + { + "epoch": 2.603145235892692, + "grad_norm": 0.22418269515037537, + "learning_rate": 4.062034998484938e-06, + "loss": 0.3866, + "step": 12663 + }, + { + "epoch": 2.6033508068660707, + "grad_norm": 0.22450844943523407, + "learning_rate": 4.0578889972928e-06, + "loss": 0.3846, + "step": 12664 + }, + { + "epoch": 2.603556377839449, + "grad_norm": 0.22189322113990784, + "learning_rate": 4.053745013127109e-06, + "loss": 0.3828, + "step": 12665 + }, + { + "epoch": 2.603761948812828, + "grad_norm": 0.22486789524555206, + "learning_rate": 4.04960304619202e-06, + "loss": 0.3742, + "step": 12666 + }, + { + "epoch": 2.603967519786206, + "grad_norm": 0.23218779265880585, + "learning_rate": 4.045463096691585e-06, + "loss": 0.3841, + "step": 12667 + }, + { + "epoch": 2.6041730907595846, + "grad_norm": 0.24457047879695892, + "learning_rate": 4.041325164829752e-06, + "loss": 0.3924, + "step": 12668 + }, + { + "epoch": 2.604378661732963, + "grad_norm": 0.1159941703081131, + "learning_rate": 4.037189250810401e-06, + "loss": 0.4393, + "step": 12669 + }, + { + "epoch": 2.6045842327063418, + "grad_norm": 0.2236376702785492, + "learning_rate": 4.033055354837276e-06, + "loss": 0.376, + "step": 12670 + }, + { + "epoch": 2.6047898036797204, + "grad_norm": 0.23160508275032043, + "learning_rate": 4.0289234771140335e-06, + "loss": 0.3798, + "step": 12671 + }, + { + "epoch": 2.604995374653099, + "grad_norm": 0.11661987006664276, + "learning_rate": 4.02479361784423e-06, + "loss": 0.4366, + "step": 12672 + }, + { + "epoch": 2.6052009456264775, + "grad_norm": 0.22946786880493164, + "learning_rate": 4.020665777231327e-06, + "loss": 0.3895, + "step": 12673 + }, + { + "epoch": 2.605406516599856, + "grad_norm": 0.2358265221118927, + "learning_rate": 4.0165399554786894e-06, + "loss": 0.37, + "step": 12674 + }, + { + "epoch": 2.6056120875732347, + "grad_norm": 0.23617896437644958, + "learning_rate": 4.0124161527895635e-06, + "loss": 0.3892, + "step": 12675 + }, + { + "epoch": 2.6058176585466133, + "grad_norm": 0.23179112374782562, + "learning_rate": 4.008294369367121e-06, + "loss": 0.3648, + "step": 12676 + }, + { + "epoch": 2.606023229519992, + "grad_norm": 0.12185750156641006, + "learning_rate": 4.004174605414424e-06, + "loss": 0.4366, + "step": 12677 + }, + { + "epoch": 2.6062288004933705, + "grad_norm": 0.12000511586666107, + "learning_rate": 4.000056861134422e-06, + "loss": 0.4486, + "step": 12678 + }, + { + "epoch": 2.606434371466749, + "grad_norm": 0.25182247161865234, + "learning_rate": 3.995941136729992e-06, + "loss": 0.4121, + "step": 12679 + }, + { + "epoch": 2.606639942440127, + "grad_norm": 0.2277366816997528, + "learning_rate": 3.991827432403891e-06, + "loss": 0.3816, + "step": 12680 + }, + { + "epoch": 2.6068455134135062, + "grad_norm": 0.24362795054912567, + "learning_rate": 3.987715748358783e-06, + "loss": 0.3746, + "step": 12681 + }, + { + "epoch": 2.6070510843868844, + "grad_norm": 0.23480716347694397, + "learning_rate": 3.983606084797215e-06, + "loss": 0.3913, + "step": 12682 + }, + { + "epoch": 2.607256655360263, + "grad_norm": 0.232587993144989, + "learning_rate": 3.9794984419216755e-06, + "loss": 0.3749, + "step": 12683 + }, + { + "epoch": 2.6074622263336416, + "grad_norm": 0.22933539748191833, + "learning_rate": 3.9753928199345225e-06, + "loss": 0.3873, + "step": 12684 + }, + { + "epoch": 2.60766779730702, + "grad_norm": 0.22452722489833832, + "learning_rate": 3.971289219038014e-06, + "loss": 0.3707, + "step": 12685 + }, + { + "epoch": 2.6078733682803987, + "grad_norm": 0.23321999609470367, + "learning_rate": 3.967187639434315e-06, + "loss": 0.3864, + "step": 12686 + }, + { + "epoch": 2.6080789392537773, + "grad_norm": 0.1229674443602562, + "learning_rate": 3.963088081325497e-06, + "loss": 0.4495, + "step": 12687 + }, + { + "epoch": 2.608284510227156, + "grad_norm": 0.24325041472911835, + "learning_rate": 3.958990544913513e-06, + "loss": 0.3694, + "step": 12688 + }, + { + "epoch": 2.6084900812005345, + "grad_norm": 0.22455255687236786, + "learning_rate": 3.9548950304002536e-06, + "loss": 0.38, + "step": 12689 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.23633895814418793, + "learning_rate": 3.950801537987466e-06, + "loss": 0.3842, + "step": 12690 + }, + { + "epoch": 2.6089012231472917, + "grad_norm": 0.2320830076932907, + "learning_rate": 3.946710067876824e-06, + "loss": 0.3621, + "step": 12691 + }, + { + "epoch": 2.6091067941206703, + "grad_norm": 0.22698310017585754, + "learning_rate": 3.942620620269896e-06, + "loss": 0.3754, + "step": 12692 + }, + { + "epoch": 2.609312365094049, + "grad_norm": 0.23345276713371277, + "learning_rate": 3.938533195368147e-06, + "loss": 0.3781, + "step": 12693 + }, + { + "epoch": 2.6095179360674274, + "grad_norm": 0.23347729444503784, + "learning_rate": 3.93444779337295e-06, + "loss": 0.3761, + "step": 12694 + }, + { + "epoch": 2.6097235070408056, + "grad_norm": 0.2320747673511505, + "learning_rate": 3.9303644144855595e-06, + "loss": 0.3811, + "step": 12695 + }, + { + "epoch": 2.6099290780141846, + "grad_norm": 0.23767243325710297, + "learning_rate": 3.926283058907159e-06, + "loss": 0.3885, + "step": 12696 + }, + { + "epoch": 2.6101346489875628, + "grad_norm": 0.2290267050266266, + "learning_rate": 3.922203726838818e-06, + "loss": 0.3869, + "step": 12697 + }, + { + "epoch": 2.610340219960942, + "grad_norm": 0.2302408069372177, + "learning_rate": 3.918126418481507e-06, + "loss": 0.3751, + "step": 12698 + }, + { + "epoch": 2.61054579093432, + "grad_norm": 0.22879059612751007, + "learning_rate": 3.914051134036077e-06, + "loss": 0.3831, + "step": 12699 + }, + { + "epoch": 2.6107513619076985, + "grad_norm": 0.23263303935527802, + "learning_rate": 3.9099778737033215e-06, + "loss": 0.3813, + "step": 12700 + }, + { + "epoch": 2.610956932881077, + "grad_norm": 0.23693881928920746, + "learning_rate": 3.905906637683902e-06, + "loss": 0.3898, + "step": 12701 + }, + { + "epoch": 2.6111625038544557, + "grad_norm": 0.22104987502098083, + "learning_rate": 3.901837426178384e-06, + "loss": 0.3801, + "step": 12702 + }, + { + "epoch": 2.6113680748278343, + "grad_norm": 0.23495499789714813, + "learning_rate": 3.897770239387247e-06, + "loss": 0.3659, + "step": 12703 + }, + { + "epoch": 2.611573645801213, + "grad_norm": 0.23057711124420166, + "learning_rate": 3.893705077510861e-06, + "loss": 0.3858, + "step": 12704 + }, + { + "epoch": 2.6117792167745915, + "grad_norm": 0.2297714799642563, + "learning_rate": 3.8896419407494955e-06, + "loss": 0.3919, + "step": 12705 + }, + { + "epoch": 2.61198478774797, + "grad_norm": 0.12049584090709686, + "learning_rate": 3.885580829303326e-06, + "loss": 0.4581, + "step": 12706 + }, + { + "epoch": 2.6121903587213486, + "grad_norm": 0.23567521572113037, + "learning_rate": 3.8815217433724165e-06, + "loss": 0.3795, + "step": 12707 + }, + { + "epoch": 2.6123959296947272, + "grad_norm": 0.22988666594028473, + "learning_rate": 3.877464683156743e-06, + "loss": 0.3793, + "step": 12708 + }, + { + "epoch": 2.612601500668106, + "grad_norm": 0.2348259687423706, + "learning_rate": 3.873409648856175e-06, + "loss": 0.3949, + "step": 12709 + }, + { + "epoch": 2.612807071641484, + "grad_norm": 0.2306186556816101, + "learning_rate": 3.869356640670493e-06, + "loss": 0.3803, + "step": 12710 + }, + { + "epoch": 2.613012642614863, + "grad_norm": 0.23866905272006989, + "learning_rate": 3.865305658799362e-06, + "loss": 0.3926, + "step": 12711 + }, + { + "epoch": 2.613218213588241, + "grad_norm": 0.23288170993328094, + "learning_rate": 3.861256703442363e-06, + "loss": 0.3704, + "step": 12712 + }, + { + "epoch": 2.61342378456162, + "grad_norm": 0.12461275607347488, + "learning_rate": 3.857209774798965e-06, + "loss": 0.4396, + "step": 12713 + }, + { + "epoch": 2.6136293555349983, + "grad_norm": 0.2187352180480957, + "learning_rate": 3.853164873068535e-06, + "loss": 0.3681, + "step": 12714 + }, + { + "epoch": 2.613834926508377, + "grad_norm": 0.1244712844491005, + "learning_rate": 3.849121998450358e-06, + "loss": 0.4467, + "step": 12715 + }, + { + "epoch": 2.6140404974817555, + "grad_norm": 0.12192442268133163, + "learning_rate": 3.84508115114359e-06, + "loss": 0.4495, + "step": 12716 + }, + { + "epoch": 2.614246068455134, + "grad_norm": 0.22983375191688538, + "learning_rate": 3.841042331347321e-06, + "loss": 0.3779, + "step": 12717 + }, + { + "epoch": 2.6144516394285127, + "grad_norm": 0.23377950489521027, + "learning_rate": 3.8370055392605225e-06, + "loss": 0.3805, + "step": 12718 + }, + { + "epoch": 2.6146572104018913, + "grad_norm": 0.24588587880134583, + "learning_rate": 3.832970775082071e-06, + "loss": 0.393, + "step": 12719 + }, + { + "epoch": 2.61486278137527, + "grad_norm": 0.23526208102703094, + "learning_rate": 3.82893803901072e-06, + "loss": 0.3813, + "step": 12720 + }, + { + "epoch": 2.6150683523486484, + "grad_norm": 0.24966707825660706, + "learning_rate": 3.824907331245169e-06, + "loss": 0.3916, + "step": 12721 + }, + { + "epoch": 2.615273923322027, + "grad_norm": 0.2245279848575592, + "learning_rate": 3.820878651983982e-06, + "loss": 0.3745, + "step": 12722 + }, + { + "epoch": 2.6154794942954056, + "grad_norm": 0.2309785783290863, + "learning_rate": 3.816852001425625e-06, + "loss": 0.3854, + "step": 12723 + }, + { + "epoch": 2.615685065268784, + "grad_norm": 0.22770026326179504, + "learning_rate": 3.812827379768491e-06, + "loss": 0.3834, + "step": 12724 + }, + { + "epoch": 2.6158906362421623, + "grad_norm": 0.12256407737731934, + "learning_rate": 3.80880478721084e-06, + "loss": 0.4432, + "step": 12725 + }, + { + "epoch": 2.6160962072155414, + "grad_norm": 0.12518732249736786, + "learning_rate": 3.8047842239508542e-06, + "loss": 0.446, + "step": 12726 + }, + { + "epoch": 2.6163017781889195, + "grad_norm": 0.2416573315858841, + "learning_rate": 3.8007656901865996e-06, + "loss": 0.3825, + "step": 12727 + }, + { + "epoch": 2.6165073491622985, + "grad_norm": 0.23824763298034668, + "learning_rate": 3.7967491861160583e-06, + "loss": 0.3737, + "step": 12728 + }, + { + "epoch": 2.6167129201356767, + "grad_norm": 0.2198529988527298, + "learning_rate": 3.7927347119370966e-06, + "loss": 0.3874, + "step": 12729 + }, + { + "epoch": 2.6169184911090553, + "grad_norm": 0.2278008759021759, + "learning_rate": 3.7887222678474868e-06, + "loss": 0.356, + "step": 12730 + }, + { + "epoch": 2.617124062082434, + "grad_norm": 0.2338324338197708, + "learning_rate": 3.7847118540449202e-06, + "loss": 0.3923, + "step": 12731 + }, + { + "epoch": 2.6173296330558125, + "grad_norm": 0.21889689564704895, + "learning_rate": 3.780703470726959e-06, + "loss": 0.3612, + "step": 12732 + }, + { + "epoch": 2.617535204029191, + "grad_norm": 0.238824263215065, + "learning_rate": 3.7766971180910803e-06, + "loss": 0.3983, + "step": 12733 + }, + { + "epoch": 2.6177407750025696, + "grad_norm": 0.12167999893426895, + "learning_rate": 3.7726927963346564e-06, + "loss": 0.4387, + "step": 12734 + }, + { + "epoch": 2.617946345975948, + "grad_norm": 0.23274268209934235, + "learning_rate": 3.768690505654964e-06, + "loss": 0.3855, + "step": 12735 + }, + { + "epoch": 2.618151916949327, + "grad_norm": 0.24479152262210846, + "learning_rate": 3.7646902462491765e-06, + "loss": 0.3923, + "step": 12736 + }, + { + "epoch": 2.6183574879227054, + "grad_norm": 0.2456178367137909, + "learning_rate": 3.7606920183143546e-06, + "loss": 0.4016, + "step": 12737 + }, + { + "epoch": 2.618563058896084, + "grad_norm": 0.23310574889183044, + "learning_rate": 3.756695822047497e-06, + "loss": 0.3859, + "step": 12738 + }, + { + "epoch": 2.6187686298694626, + "grad_norm": 0.12393064796924591, + "learning_rate": 3.7527016576454603e-06, + "loss": 0.4474, + "step": 12739 + }, + { + "epoch": 2.6189742008428407, + "grad_norm": 0.23465074598789215, + "learning_rate": 3.748709525305028e-06, + "loss": 0.3948, + "step": 12740 + }, + { + "epoch": 2.6191797718162197, + "grad_norm": 0.2182430624961853, + "learning_rate": 3.7447194252228624e-06, + "loss": 0.3807, + "step": 12741 + }, + { + "epoch": 2.619385342789598, + "grad_norm": 0.23260043561458588, + "learning_rate": 3.740731357595551e-06, + "loss": 0.3909, + "step": 12742 + }, + { + "epoch": 2.619590913762977, + "grad_norm": 0.11928752809762955, + "learning_rate": 3.736745322619557e-06, + "loss": 0.45, + "step": 12743 + }, + { + "epoch": 2.619796484736355, + "grad_norm": 0.23598788678646088, + "learning_rate": 3.7327613204912532e-06, + "loss": 0.3824, + "step": 12744 + }, + { + "epoch": 2.6200020557097337, + "grad_norm": 0.24089393019676208, + "learning_rate": 3.7287793514069226e-06, + "loss": 0.3849, + "step": 12745 + }, + { + "epoch": 2.6202076266831122, + "grad_norm": 0.23498208820819855, + "learning_rate": 3.724799415562733e-06, + "loss": 0.3896, + "step": 12746 + }, + { + "epoch": 2.620413197656491, + "grad_norm": 0.23025161027908325, + "learning_rate": 3.720821513154758e-06, + "loss": 0.3816, + "step": 12747 + }, + { + "epoch": 2.6206187686298694, + "grad_norm": 0.22431518137454987, + "learning_rate": 3.7168456443789656e-06, + "loss": 0.3795, + "step": 12748 + }, + { + "epoch": 2.620824339603248, + "grad_norm": 0.12069787830114365, + "learning_rate": 3.7128718094312293e-06, + "loss": 0.4559, + "step": 12749 + }, + { + "epoch": 2.6210299105766266, + "grad_norm": 0.2284901887178421, + "learning_rate": 3.708900008507327e-06, + "loss": 0.3781, + "step": 12750 + }, + { + "epoch": 2.621235481550005, + "grad_norm": 0.12058551609516144, + "learning_rate": 3.704930241802918e-06, + "loss": 0.446, + "step": 12751 + }, + { + "epoch": 2.6214410525233838, + "grad_norm": 0.22641105949878693, + "learning_rate": 3.700962509513595e-06, + "loss": 0.3953, + "step": 12752 + }, + { + "epoch": 2.6216466234967624, + "grad_norm": 0.11827776581048965, + "learning_rate": 3.6969968118348127e-06, + "loss": 0.4369, + "step": 12753 + }, + { + "epoch": 2.621852194470141, + "grad_norm": 0.12709125876426697, + "learning_rate": 3.6930331489619537e-06, + "loss": 0.4419, + "step": 12754 + }, + { + "epoch": 2.6220577654435195, + "grad_norm": 0.24272185564041138, + "learning_rate": 3.689071521090277e-06, + "loss": 0.396, + "step": 12755 + }, + { + "epoch": 2.622263336416898, + "grad_norm": 0.23644161224365234, + "learning_rate": 3.685111928414962e-06, + "loss": 0.3728, + "step": 12756 + }, + { + "epoch": 2.6224689073902763, + "grad_norm": 0.23552681505680084, + "learning_rate": 3.6811543711310777e-06, + "loss": 0.4081, + "step": 12757 + }, + { + "epoch": 2.6226744783636553, + "grad_norm": 0.23269321024417877, + "learning_rate": 3.6771988494335823e-06, + "loss": 0.3695, + "step": 12758 + }, + { + "epoch": 2.6228800493370334, + "grad_norm": 0.22955691814422607, + "learning_rate": 3.673245363517371e-06, + "loss": 0.3751, + "step": 12759 + }, + { + "epoch": 2.623085620310412, + "grad_norm": 0.22635483741760254, + "learning_rate": 3.669293913577197e-06, + "loss": 0.4007, + "step": 12760 + }, + { + "epoch": 2.6232911912837906, + "grad_norm": 0.2312253713607788, + "learning_rate": 3.6653444998077302e-06, + "loss": 0.3691, + "step": 12761 + }, + { + "epoch": 2.623496762257169, + "grad_norm": 0.24891069531440735, + "learning_rate": 3.661397122403545e-06, + "loss": 0.3871, + "step": 12762 + }, + { + "epoch": 2.623702333230548, + "grad_norm": 0.22625122964382172, + "learning_rate": 3.6574517815591002e-06, + "loss": 0.3823, + "step": 12763 + }, + { + "epoch": 2.6239079042039264, + "grad_norm": 0.2243538647890091, + "learning_rate": 3.653508477468781e-06, + "loss": 0.3765, + "step": 12764 + }, + { + "epoch": 2.624113475177305, + "grad_norm": 0.23248639702796936, + "learning_rate": 3.649567210326832e-06, + "loss": 0.3974, + "step": 12765 + }, + { + "epoch": 2.6243190461506836, + "grad_norm": 0.22589054703712463, + "learning_rate": 3.6456279803274474e-06, + "loss": 0.3502, + "step": 12766 + }, + { + "epoch": 2.624524617124062, + "grad_norm": 0.23221521079540253, + "learning_rate": 3.6416907876646824e-06, + "loss": 0.3724, + "step": 12767 + }, + { + "epoch": 2.6247301880974407, + "grad_norm": 0.22528620064258575, + "learning_rate": 3.6377556325325014e-06, + "loss": 0.3963, + "step": 12768 + }, + { + "epoch": 2.6249357590708193, + "grad_norm": 0.2201879322528839, + "learning_rate": 3.6338225151247797e-06, + "loss": 0.3879, + "step": 12769 + }, + { + "epoch": 2.625141330044198, + "grad_norm": 0.2363358587026596, + "learning_rate": 3.629891435635272e-06, + "loss": 0.3906, + "step": 12770 + }, + { + "epoch": 2.6253469010175765, + "grad_norm": 0.12273525446653366, + "learning_rate": 3.625962394257644e-06, + "loss": 0.4524, + "step": 12771 + }, + { + "epoch": 2.6255524719909547, + "grad_norm": 0.23554597795009613, + "learning_rate": 3.6220353911854748e-06, + "loss": 0.3759, + "step": 12772 + }, + { + "epoch": 2.6257580429643337, + "grad_norm": 0.12295962870121002, + "learning_rate": 3.6181104266122206e-06, + "loss": 0.4549, + "step": 12773 + }, + { + "epoch": 2.625963613937712, + "grad_norm": 0.22669798135757446, + "learning_rate": 3.6141875007312465e-06, + "loss": 0.3875, + "step": 12774 + }, + { + "epoch": 2.6261691849110904, + "grad_norm": 0.12479596585035324, + "learning_rate": 3.610266613735818e-06, + "loss": 0.4492, + "step": 12775 + }, + { + "epoch": 2.626374755884469, + "grad_norm": 0.1302644908428192, + "learning_rate": 3.6063477658191e-06, + "loss": 0.4415, + "step": 12776 + }, + { + "epoch": 2.6265803268578476, + "grad_norm": 0.12051938474178314, + "learning_rate": 3.6024309571741533e-06, + "loss": 0.4435, + "step": 12777 + }, + { + "epoch": 2.626785897831226, + "grad_norm": 0.2350439727306366, + "learning_rate": 3.5985161879939338e-06, + "loss": 0.3832, + "step": 12778 + }, + { + "epoch": 2.6269914688046048, + "grad_norm": 0.12057623267173767, + "learning_rate": 3.5946034584713225e-06, + "loss": 0.454, + "step": 12779 + }, + { + "epoch": 2.6271970397779834, + "grad_norm": 0.22473283112049103, + "learning_rate": 3.5906927687990644e-06, + "loss": 0.38, + "step": 12780 + }, + { + "epoch": 2.627402610751362, + "grad_norm": 0.2340707778930664, + "learning_rate": 3.586784119169831e-06, + "loss": 0.3914, + "step": 12781 + }, + { + "epoch": 2.6276081817247405, + "grad_norm": 0.11768918484449387, + "learning_rate": 3.582877509776178e-06, + "loss": 0.4492, + "step": 12782 + }, + { + "epoch": 2.627813752698119, + "grad_norm": 0.2253478765487671, + "learning_rate": 3.5789729408105665e-06, + "loss": 0.3697, + "step": 12783 + }, + { + "epoch": 2.6280193236714977, + "grad_norm": 0.22701376676559448, + "learning_rate": 3.575070412465353e-06, + "loss": 0.3721, + "step": 12784 + }, + { + "epoch": 2.6282248946448763, + "grad_norm": 0.1314348578453064, + "learning_rate": 3.571169924932803e-06, + "loss": 0.4403, + "step": 12785 + }, + { + "epoch": 2.628430465618255, + "grad_norm": 0.22781673073768616, + "learning_rate": 3.567271478405078e-06, + "loss": 0.3857, + "step": 12786 + }, + { + "epoch": 2.628636036591633, + "grad_norm": 0.2242654412984848, + "learning_rate": 3.56337507307422e-06, + "loss": 0.383, + "step": 12787 + }, + { + "epoch": 2.628841607565012, + "grad_norm": 0.23189879953861237, + "learning_rate": 3.5594807091322047e-06, + "loss": 0.3873, + "step": 12788 + }, + { + "epoch": 2.62904717853839, + "grad_norm": 0.23214222490787506, + "learning_rate": 3.555588386770884e-06, + "loss": 0.4035, + "step": 12789 + }, + { + "epoch": 2.629252749511769, + "grad_norm": 0.23310469090938568, + "learning_rate": 3.551698106182014e-06, + "loss": 0.3994, + "step": 12790 + }, + { + "epoch": 2.6294583204851474, + "grad_norm": 0.2216804176568985, + "learning_rate": 3.5478098675572474e-06, + "loss": 0.3785, + "step": 12791 + }, + { + "epoch": 2.629663891458526, + "grad_norm": 0.23494820296764374, + "learning_rate": 3.543923671088135e-06, + "loss": 0.3643, + "step": 12792 + }, + { + "epoch": 2.6298694624319046, + "grad_norm": 0.23349052667617798, + "learning_rate": 3.540039516966144e-06, + "loss": 0.4023, + "step": 12793 + }, + { + "epoch": 2.630075033405283, + "grad_norm": 0.12207508087158203, + "learning_rate": 3.536157405382627e-06, + "loss": 0.4413, + "step": 12794 + }, + { + "epoch": 2.6302806043786617, + "grad_norm": 0.22845908999443054, + "learning_rate": 3.5322773365288298e-06, + "loss": 0.3583, + "step": 12795 + }, + { + "epoch": 2.6304861753520403, + "grad_norm": 0.24039143323898315, + "learning_rate": 3.5283993105959103e-06, + "loss": 0.3928, + "step": 12796 + }, + { + "epoch": 2.630691746325419, + "grad_norm": 0.23867449164390564, + "learning_rate": 3.524523327774915e-06, + "loss": 0.3646, + "step": 12797 + }, + { + "epoch": 2.6308973172987975, + "grad_norm": 0.12439849972724915, + "learning_rate": 3.520649388256802e-06, + "loss": 0.4475, + "step": 12798 + }, + { + "epoch": 2.631102888272176, + "grad_norm": 0.23779630661010742, + "learning_rate": 3.516777492232413e-06, + "loss": 0.371, + "step": 12799 + }, + { + "epoch": 2.6313084592455547, + "grad_norm": 0.2336895763874054, + "learning_rate": 3.512907639892511e-06, + "loss": 0.3766, + "step": 12800 + }, + { + "epoch": 2.6315140302189333, + "grad_norm": 0.23059743642807007, + "learning_rate": 3.5090398314277427e-06, + "loss": 0.4003, + "step": 12801 + }, + { + "epoch": 2.6317196011923114, + "grad_norm": 0.2365521341562271, + "learning_rate": 3.5051740670286466e-06, + "loss": 0.396, + "step": 12802 + }, + { + "epoch": 2.6319251721656904, + "grad_norm": 0.2317541390657425, + "learning_rate": 3.5013103468856846e-06, + "loss": 0.3752, + "step": 12803 + }, + { + "epoch": 2.6321307431390686, + "grad_norm": 0.23596826195716858, + "learning_rate": 3.4974486711891948e-06, + "loss": 0.3894, + "step": 12804 + }, + { + "epoch": 2.632336314112447, + "grad_norm": 0.22818133234977722, + "learning_rate": 3.49358904012942e-06, + "loss": 0.3867, + "step": 12805 + }, + { + "epoch": 2.6325418850858258, + "grad_norm": 0.23365046083927155, + "learning_rate": 3.4897314538965178e-06, + "loss": 0.3997, + "step": 12806 + }, + { + "epoch": 2.6327474560592043, + "grad_norm": 0.21868395805358887, + "learning_rate": 3.4858759126805315e-06, + "loss": 0.3677, + "step": 12807 + }, + { + "epoch": 2.632953027032583, + "grad_norm": 0.23094679415225983, + "learning_rate": 3.4820224166713938e-06, + "loss": 0.407, + "step": 12808 + }, + { + "epoch": 2.6331585980059615, + "grad_norm": 0.2458486258983612, + "learning_rate": 3.4781709660589636e-06, + "loss": 0.3881, + "step": 12809 + }, + { + "epoch": 2.63336416897934, + "grad_norm": 0.23680594563484192, + "learning_rate": 3.4743215610329785e-06, + "loss": 0.3726, + "step": 12810 + }, + { + "epoch": 2.6335697399527187, + "grad_norm": 0.22720733284950256, + "learning_rate": 3.4704742017830815e-06, + "loss": 0.3768, + "step": 12811 + }, + { + "epoch": 2.6337753109260973, + "grad_norm": 0.1218261644244194, + "learning_rate": 3.466628888498807e-06, + "loss": 0.4454, + "step": 12812 + }, + { + "epoch": 2.633980881899476, + "grad_norm": 0.2283984124660492, + "learning_rate": 3.4627856213695977e-06, + "loss": 0.3878, + "step": 12813 + }, + { + "epoch": 2.6341864528728545, + "grad_norm": 0.22897951304912567, + "learning_rate": 3.4589444005848023e-06, + "loss": 0.3816, + "step": 12814 + }, + { + "epoch": 2.634392023846233, + "grad_norm": 0.12013474106788635, + "learning_rate": 3.455105226333654e-06, + "loss": 0.4596, + "step": 12815 + }, + { + "epoch": 2.6345975948196116, + "grad_norm": 0.22502024471759796, + "learning_rate": 3.4512680988052878e-06, + "loss": 0.371, + "step": 12816 + }, + { + "epoch": 2.63480316579299, + "grad_norm": 0.23252278566360474, + "learning_rate": 3.447433018188751e-06, + "loss": 0.3728, + "step": 12817 + }, + { + "epoch": 2.635008736766369, + "grad_norm": 0.23052488267421722, + "learning_rate": 3.4435999846729684e-06, + "loss": 0.395, + "step": 12818 + }, + { + "epoch": 2.635214307739747, + "grad_norm": 0.22712182998657227, + "learning_rate": 3.4397689984467786e-06, + "loss": 0.3595, + "step": 12819 + }, + { + "epoch": 2.6354198787131256, + "grad_norm": 0.11764495819807053, + "learning_rate": 3.4359400596989154e-06, + "loss": 0.4578, + "step": 12820 + }, + { + "epoch": 2.635625449686504, + "grad_norm": 0.23514217138290405, + "learning_rate": 3.4321131686180186e-06, + "loss": 0.3898, + "step": 12821 + }, + { + "epoch": 2.6358310206598827, + "grad_norm": 0.11776993423700333, + "learning_rate": 3.428288325392622e-06, + "loss": 0.444, + "step": 12822 + }, + { + "epoch": 2.6360365916332613, + "grad_norm": 0.21985335648059845, + "learning_rate": 3.4244655302111493e-06, + "loss": 0.3853, + "step": 12823 + }, + { + "epoch": 2.63624216260664, + "grad_norm": 0.23297996819019318, + "learning_rate": 3.420644783261941e-06, + "loss": 0.3617, + "step": 12824 + }, + { + "epoch": 2.6364477335800185, + "grad_norm": 0.23519185185432434, + "learning_rate": 3.4168260847332207e-06, + "loss": 0.3854, + "step": 12825 + }, + { + "epoch": 2.636653304553397, + "grad_norm": 0.23188424110412598, + "learning_rate": 3.413009434813113e-06, + "loss": 0.378, + "step": 12826 + }, + { + "epoch": 2.6368588755267757, + "grad_norm": 0.12326161563396454, + "learning_rate": 3.409194833689663e-06, + "loss": 0.4533, + "step": 12827 + }, + { + "epoch": 2.6370644465001543, + "grad_norm": 0.230136439204216, + "learning_rate": 3.405382281550785e-06, + "loss": 0.3748, + "step": 12828 + }, + { + "epoch": 2.637270017473533, + "grad_norm": 0.23117460310459137, + "learning_rate": 3.4015717785843033e-06, + "loss": 0.4093, + "step": 12829 + }, + { + "epoch": 2.6374755884469114, + "grad_norm": 0.12335003167390823, + "learning_rate": 3.3977633249779582e-06, + "loss": 0.4496, + "step": 12830 + }, + { + "epoch": 2.63768115942029, + "grad_norm": 0.22897659242153168, + "learning_rate": 3.393956920919365e-06, + "loss": 0.3755, + "step": 12831 + }, + { + "epoch": 2.637886730393668, + "grad_norm": 0.24940082430839539, + "learning_rate": 3.390152566596048e-06, + "loss": 0.3918, + "step": 12832 + }, + { + "epoch": 2.638092301367047, + "grad_norm": 0.22477301955223083, + "learning_rate": 3.386350262195428e-06, + "loss": 0.3838, + "step": 12833 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.2210949808359146, + "learning_rate": 3.3825500079048244e-06, + "loss": 0.3877, + "step": 12834 + }, + { + "epoch": 2.638503443313804, + "grad_norm": 0.23475997149944305, + "learning_rate": 3.378751803911468e-06, + "loss": 0.379, + "step": 12835 + }, + { + "epoch": 2.6387090142871825, + "grad_norm": 0.23455579578876495, + "learning_rate": 3.3749556504024738e-06, + "loss": 0.3899, + "step": 12836 + }, + { + "epoch": 2.638914585260561, + "grad_norm": 0.2290680706501007, + "learning_rate": 3.3711615475648574e-06, + "loss": 0.3715, + "step": 12837 + }, + { + "epoch": 2.6391201562339397, + "grad_norm": 0.23036018013954163, + "learning_rate": 3.367369495585544e-06, + "loss": 0.3757, + "step": 12838 + }, + { + "epoch": 2.6393257272073183, + "grad_norm": 0.2289496660232544, + "learning_rate": 3.3635794946513393e-06, + "loss": 0.3768, + "step": 12839 + }, + { + "epoch": 2.639531298180697, + "grad_norm": 0.23706848919391632, + "learning_rate": 3.3597915449489694e-06, + "loss": 0.3983, + "step": 12840 + }, + { + "epoch": 2.6397368691540755, + "grad_norm": 0.22767172753810883, + "learning_rate": 3.356005646665034e-06, + "loss": 0.4012, + "step": 12841 + }, + { + "epoch": 2.639942440127454, + "grad_norm": 0.2272636741399765, + "learning_rate": 3.352221799986065e-06, + "loss": 0.3776, + "step": 12842 + }, + { + "epoch": 2.6401480111008326, + "grad_norm": 0.2348739355802536, + "learning_rate": 3.3484400050984677e-06, + "loss": 0.3836, + "step": 12843 + }, + { + "epoch": 2.6403535820742112, + "grad_norm": 0.23279324173927307, + "learning_rate": 3.3446602621885533e-06, + "loss": 0.38, + "step": 12844 + }, + { + "epoch": 2.64055915304759, + "grad_norm": 0.23363368213176727, + "learning_rate": 3.3408825714425273e-06, + "loss": 0.3875, + "step": 12845 + }, + { + "epoch": 2.6407647240209684, + "grad_norm": 0.2381156086921692, + "learning_rate": 3.3371069330465066e-06, + "loss": 0.3899, + "step": 12846 + }, + { + "epoch": 2.6409702949943465, + "grad_norm": 0.22464582324028015, + "learning_rate": 3.333333347186487e-06, + "loss": 0.3757, + "step": 12847 + }, + { + "epoch": 2.6411758659677256, + "grad_norm": 0.22474953532218933, + "learning_rate": 3.3295618140483898e-06, + "loss": 0.3826, + "step": 12848 + }, + { + "epoch": 2.6413814369411037, + "grad_norm": 0.23333978652954102, + "learning_rate": 3.3257923338180166e-06, + "loss": 0.3958, + "step": 12849 + }, + { + "epoch": 2.6415870079144823, + "grad_norm": 0.22718356549739838, + "learning_rate": 3.3220249066810683e-06, + "loss": 0.3781, + "step": 12850 + }, + { + "epoch": 2.641792578887861, + "grad_norm": 0.2350386381149292, + "learning_rate": 3.318259532823147e-06, + "loss": 0.382, + "step": 12851 + }, + { + "epoch": 2.6419981498612395, + "grad_norm": 0.23376323282718658, + "learning_rate": 3.314496212429764e-06, + "loss": 0.3888, + "step": 12852 + }, + { + "epoch": 2.642203720834618, + "grad_norm": 0.12379579991102219, + "learning_rate": 3.3107349456863164e-06, + "loss": 0.4619, + "step": 12853 + }, + { + "epoch": 2.6424092918079967, + "grad_norm": 0.2283748835325241, + "learning_rate": 3.3069757327780903e-06, + "loss": 0.3903, + "step": 12854 + }, + { + "epoch": 2.6426148627813753, + "grad_norm": 0.12083147466182709, + "learning_rate": 3.303218573890308e-06, + "loss": 0.4596, + "step": 12855 + }, + { + "epoch": 2.642820433754754, + "grad_norm": 0.12731818854808807, + "learning_rate": 3.2994634692080566e-06, + "loss": 0.4275, + "step": 12856 + }, + { + "epoch": 2.6430260047281324, + "grad_norm": 0.22971482574939728, + "learning_rate": 3.295710418916333e-06, + "loss": 0.3684, + "step": 12857 + }, + { + "epoch": 2.643231575701511, + "grad_norm": 0.23392242193222046, + "learning_rate": 3.291959423200029e-06, + "loss": 0.3943, + "step": 12858 + }, + { + "epoch": 2.6434371466748896, + "grad_norm": 0.23680520057678223, + "learning_rate": 3.288210482243942e-06, + "loss": 0.3717, + "step": 12859 + }, + { + "epoch": 2.643642717648268, + "grad_norm": 0.2301972508430481, + "learning_rate": 3.284463596232769e-06, + "loss": 0.3778, + "step": 12860 + }, + { + "epoch": 2.643848288621647, + "grad_norm": 0.2333422154188156, + "learning_rate": 3.280718765351083e-06, + "loss": 0.3792, + "step": 12861 + }, + { + "epoch": 2.644053859595025, + "grad_norm": 0.12037578970193863, + "learning_rate": 3.2769759897834006e-06, + "loss": 0.447, + "step": 12862 + }, + { + "epoch": 2.644259430568404, + "grad_norm": 0.23077453672885895, + "learning_rate": 3.273235269714095e-06, + "loss": 0.3731, + "step": 12863 + }, + { + "epoch": 2.644465001541782, + "grad_norm": 0.23764555156230927, + "learning_rate": 3.2694966053274583e-06, + "loss": 0.3879, + "step": 12864 + }, + { + "epoch": 2.644670572515161, + "grad_norm": 0.23211322724819183, + "learning_rate": 3.2657599968076737e-06, + "loss": 0.3737, + "step": 12865 + }, + { + "epoch": 2.6448761434885393, + "grad_norm": 0.24329562485218048, + "learning_rate": 3.2620254443388283e-06, + "loss": 0.3848, + "step": 12866 + }, + { + "epoch": 2.645081714461918, + "grad_norm": 0.12165253609418869, + "learning_rate": 3.25829294810491e-06, + "loss": 0.4326, + "step": 12867 + }, + { + "epoch": 2.6452872854352965, + "grad_norm": 0.1259094923734665, + "learning_rate": 3.2545625082897874e-06, + "loss": 0.4411, + "step": 12868 + }, + { + "epoch": 2.645492856408675, + "grad_norm": 0.9432693123817444, + "learning_rate": 3.250834125077263e-06, + "loss": 0.4054, + "step": 12869 + }, + { + "epoch": 2.6456984273820536, + "grad_norm": 0.23042386770248413, + "learning_rate": 3.2471077986510045e-06, + "loss": 0.3848, + "step": 12870 + }, + { + "epoch": 2.645903998355432, + "grad_norm": 0.24361389875411987, + "learning_rate": 3.243383529194591e-06, + "loss": 0.3802, + "step": 12871 + }, + { + "epoch": 2.646109569328811, + "grad_norm": 0.2344541847705841, + "learning_rate": 3.2396613168914945e-06, + "loss": 0.382, + "step": 12872 + }, + { + "epoch": 2.6463151403021894, + "grad_norm": 0.2395932823419571, + "learning_rate": 3.2359411619251094e-06, + "loss": 0.3851, + "step": 12873 + }, + { + "epoch": 2.646520711275568, + "grad_norm": 0.23469178378582, + "learning_rate": 3.232223064478694e-06, + "loss": 0.3817, + "step": 12874 + }, + { + "epoch": 2.6467262822489466, + "grad_norm": 0.12158174812793732, + "learning_rate": 3.228507024735416e-06, + "loss": 0.4588, + "step": 12875 + }, + { + "epoch": 2.646931853222325, + "grad_norm": 0.2328415811061859, + "learning_rate": 3.2247930428783698e-06, + "loss": 0.3656, + "step": 12876 + }, + { + "epoch": 2.6471374241957033, + "grad_norm": 0.2348622977733612, + "learning_rate": 3.2210811190905133e-06, + "loss": 0.3897, + "step": 12877 + }, + { + "epoch": 2.6473429951690823, + "grad_norm": 0.2322535365819931, + "learning_rate": 3.2173712535547156e-06, + "loss": 0.3972, + "step": 12878 + }, + { + "epoch": 2.6475485661424605, + "grad_norm": 0.2271070033311844, + "learning_rate": 3.2136634464537407e-06, + "loss": 0.3874, + "step": 12879 + }, + { + "epoch": 2.6477541371158395, + "grad_norm": 0.22299452126026154, + "learning_rate": 3.209957697970262e-06, + "loss": 0.3853, + "step": 12880 + }, + { + "epoch": 2.6479597080892177, + "grad_norm": 0.22766993939876556, + "learning_rate": 3.206254008286844e-06, + "loss": 0.3715, + "step": 12881 + }, + { + "epoch": 2.6481652790625962, + "grad_norm": 0.23258061707019806, + "learning_rate": 3.202552377585936e-06, + "loss": 0.3779, + "step": 12882 + }, + { + "epoch": 2.648370850035975, + "grad_norm": 0.2237747460603714, + "learning_rate": 3.198852806049921e-06, + "loss": 0.3782, + "step": 12883 + }, + { + "epoch": 2.6485764210093534, + "grad_norm": 0.2268817126750946, + "learning_rate": 3.1951552938610486e-06, + "loss": 0.3831, + "step": 12884 + }, + { + "epoch": 2.648781991982732, + "grad_norm": 0.22952783107757568, + "learning_rate": 3.1914598412014784e-06, + "loss": 0.387, + "step": 12885 + }, + { + "epoch": 2.6489875629561106, + "grad_norm": 0.11640308797359467, + "learning_rate": 3.1877664482532748e-06, + "loss": 0.4273, + "step": 12886 + }, + { + "epoch": 2.649193133929489, + "grad_norm": 0.23823915421962738, + "learning_rate": 3.184075115198382e-06, + "loss": 0.36, + "step": 12887 + }, + { + "epoch": 2.6493987049028678, + "grad_norm": 0.2281506359577179, + "learning_rate": 3.180385842218665e-06, + "loss": 0.3906, + "step": 12888 + }, + { + "epoch": 2.6496042758762464, + "grad_norm": 0.22782327234745026, + "learning_rate": 3.176698629495868e-06, + "loss": 0.3764, + "step": 12889 + }, + { + "epoch": 2.649809846849625, + "grad_norm": 0.23879340291023254, + "learning_rate": 3.1730134772116507e-06, + "loss": 0.3965, + "step": 12890 + }, + { + "epoch": 2.6500154178230035, + "grad_norm": 0.22930049896240234, + "learning_rate": 3.1693303855475626e-06, + "loss": 0.3817, + "step": 12891 + }, + { + "epoch": 2.6502209887963817, + "grad_norm": 0.12056277692317963, + "learning_rate": 3.1656493546850492e-06, + "loss": 0.4472, + "step": 12892 + }, + { + "epoch": 2.6504265597697607, + "grad_norm": 0.22143647074699402, + "learning_rate": 3.16197038480545e-06, + "loss": 0.3693, + "step": 12893 + }, + { + "epoch": 2.650632130743139, + "grad_norm": 0.2295861691236496, + "learning_rate": 3.1582934760900302e-06, + "loss": 0.3659, + "step": 12894 + }, + { + "epoch": 2.650837701716518, + "grad_norm": 0.2308284193277359, + "learning_rate": 3.1546186287199196e-06, + "loss": 0.3898, + "step": 12895 + }, + { + "epoch": 2.651043272689896, + "grad_norm": 0.12249313294887543, + "learning_rate": 3.1509458428761593e-06, + "loss": 0.4343, + "step": 12896 + }, + { + "epoch": 2.6512488436632746, + "grad_norm": 0.2282872498035431, + "learning_rate": 3.1472751187397034e-06, + "loss": 0.3752, + "step": 12897 + }, + { + "epoch": 2.651454414636653, + "grad_norm": 0.23152220249176025, + "learning_rate": 3.1436064564913824e-06, + "loss": 0.3756, + "step": 12898 + }, + { + "epoch": 2.651659985610032, + "grad_norm": 0.2310069352388382, + "learning_rate": 3.1399398563119376e-06, + "loss": 0.3788, + "step": 12899 + }, + { + "epoch": 2.6518655565834104, + "grad_norm": 0.24678552150726318, + "learning_rate": 3.1362753183819987e-06, + "loss": 0.4028, + "step": 12900 + }, + { + "epoch": 2.652071127556789, + "grad_norm": 0.2333284318447113, + "learning_rate": 3.1326128428821065e-06, + "loss": 0.3579, + "step": 12901 + }, + { + "epoch": 2.6522766985301676, + "grad_norm": 0.22593623399734497, + "learning_rate": 3.128952429992692e-06, + "loss": 0.3587, + "step": 12902 + }, + { + "epoch": 2.652482269503546, + "grad_norm": 0.22910958528518677, + "learning_rate": 3.1252940798940757e-06, + "loss": 0.3806, + "step": 12903 + }, + { + "epoch": 2.6526878404769247, + "grad_norm": 0.2289619743824005, + "learning_rate": 3.1216377927665083e-06, + "loss": 0.3707, + "step": 12904 + }, + { + "epoch": 2.6528934114503033, + "grad_norm": 0.2235552966594696, + "learning_rate": 3.1179835687901104e-06, + "loss": 0.3581, + "step": 12905 + }, + { + "epoch": 2.653098982423682, + "grad_norm": 0.23109345138072968, + "learning_rate": 3.1143314081449036e-06, + "loss": 0.3845, + "step": 12906 + }, + { + "epoch": 2.65330455339706, + "grad_norm": 0.22421690821647644, + "learning_rate": 3.1106813110108143e-06, + "loss": 0.349, + "step": 12907 + }, + { + "epoch": 2.653510124370439, + "grad_norm": 0.23160065710544586, + "learning_rate": 3.1070332775676675e-06, + "loss": 0.3736, + "step": 12908 + }, + { + "epoch": 2.6537156953438172, + "grad_norm": 0.24303646385669708, + "learning_rate": 3.1033873079951803e-06, + "loss": 0.3866, + "step": 12909 + }, + { + "epoch": 2.6539212663171963, + "grad_norm": 0.23496584594249725, + "learning_rate": 3.0997434024729737e-06, + "loss": 0.3996, + "step": 12910 + }, + { + "epoch": 2.6541268372905744, + "grad_norm": 0.23909246921539307, + "learning_rate": 3.0961015611805742e-06, + "loss": 0.3741, + "step": 12911 + }, + { + "epoch": 2.654332408263953, + "grad_norm": 0.24438230693340302, + "learning_rate": 3.0924617842973936e-06, + "loss": 0.3972, + "step": 12912 + }, + { + "epoch": 2.6545379792373316, + "grad_norm": 0.23331284523010254, + "learning_rate": 3.0888240720027427e-06, + "loss": 0.3722, + "step": 12913 + }, + { + "epoch": 2.65474355021071, + "grad_norm": 0.22317220270633698, + "learning_rate": 3.085188424475834e-06, + "loss": 0.3871, + "step": 12914 + }, + { + "epoch": 2.6549491211840888, + "grad_norm": 0.2298220545053482, + "learning_rate": 3.0815548418957884e-06, + "loss": 0.3819, + "step": 12915 + }, + { + "epoch": 2.6551546921574674, + "grad_norm": 0.22008730471134186, + "learning_rate": 3.0779233244416084e-06, + "loss": 0.374, + "step": 12916 + }, + { + "epoch": 2.655360263130846, + "grad_norm": 0.22563999891281128, + "learning_rate": 3.0742938722921956e-06, + "loss": 0.3705, + "step": 12917 + }, + { + "epoch": 2.6555658341042245, + "grad_norm": 0.231903076171875, + "learning_rate": 3.070666485626367e-06, + "loss": 0.389, + "step": 12918 + }, + { + "epoch": 2.655771405077603, + "grad_norm": 0.2327851802110672, + "learning_rate": 3.067041164622829e-06, + "loss": 0.3835, + "step": 12919 + }, + { + "epoch": 2.6559769760509817, + "grad_norm": 0.225325807929039, + "learning_rate": 3.063417909460175e-06, + "loss": 0.3936, + "step": 12920 + }, + { + "epoch": 2.6561825470243603, + "grad_norm": 0.12957926094532013, + "learning_rate": 3.0597967203169113e-06, + "loss": 0.4463, + "step": 12921 + }, + { + "epoch": 2.656388117997739, + "grad_norm": 0.22962552309036255, + "learning_rate": 3.056177597371436e-06, + "loss": 0.3842, + "step": 12922 + }, + { + "epoch": 2.6565936889711175, + "grad_norm": 0.22653664648532867, + "learning_rate": 3.0525605408020405e-06, + "loss": 0.3896, + "step": 12923 + }, + { + "epoch": 2.6567992599444956, + "grad_norm": 0.23864829540252686, + "learning_rate": 3.0489455507869275e-06, + "loss": 0.3847, + "step": 12924 + }, + { + "epoch": 2.6570048309178746, + "grad_norm": 0.22004222869873047, + "learning_rate": 3.0453326275041898e-06, + "loss": 0.3739, + "step": 12925 + }, + { + "epoch": 2.657210401891253, + "grad_norm": 0.22327813506126404, + "learning_rate": 3.0417217711318203e-06, + "loss": 0.3769, + "step": 12926 + }, + { + "epoch": 2.6574159728646314, + "grad_norm": 0.2343619167804718, + "learning_rate": 3.038112981847706e-06, + "loss": 0.395, + "step": 12927 + }, + { + "epoch": 2.65762154383801, + "grad_norm": 0.2278946340084076, + "learning_rate": 3.034506259829635e-06, + "loss": 0.3972, + "step": 12928 + }, + { + "epoch": 2.6578271148113886, + "grad_norm": 0.23201905190944672, + "learning_rate": 3.030901605255296e-06, + "loss": 0.3898, + "step": 12929 + }, + { + "epoch": 2.658032685784767, + "grad_norm": 0.2293037325143814, + "learning_rate": 3.0272990183022606e-06, + "loss": 0.3821, + "step": 12930 + }, + { + "epoch": 2.6582382567581457, + "grad_norm": 0.12265797704458237, + "learning_rate": 3.0236984991480323e-06, + "loss": 0.447, + "step": 12931 + }, + { + "epoch": 2.6584438277315243, + "grad_norm": 0.2273169308900833, + "learning_rate": 3.0201000479699793e-06, + "loss": 0.373, + "step": 12932 + }, + { + "epoch": 2.658649398704903, + "grad_norm": 0.12377490103244781, + "learning_rate": 3.01650366494539e-06, + "loss": 0.458, + "step": 12933 + }, + { + "epoch": 2.6588549696782815, + "grad_norm": 0.2221972793340683, + "learning_rate": 3.012909350251427e-06, + "loss": 0.3638, + "step": 12934 + }, + { + "epoch": 2.65906054065166, + "grad_norm": 0.22611477971076965, + "learning_rate": 3.0093171040651795e-06, + "loss": 0.3919, + "step": 12935 + }, + { + "epoch": 2.6592661116250387, + "grad_norm": 0.22873830795288086, + "learning_rate": 3.005726926563606e-06, + "loss": 0.3743, + "step": 12936 + }, + { + "epoch": 2.6594716825984173, + "grad_norm": 0.24191910028457642, + "learning_rate": 3.0021388179235887e-06, + "loss": 0.3736, + "step": 12937 + }, + { + "epoch": 2.659677253571796, + "grad_norm": 0.2301923930644989, + "learning_rate": 2.9985527783218924e-06, + "loss": 0.3863, + "step": 12938 + }, + { + "epoch": 2.659882824545174, + "grad_norm": 0.24405382573604584, + "learning_rate": 2.9949688079351906e-06, + "loss": 0.3997, + "step": 12939 + }, + { + "epoch": 2.660088395518553, + "grad_norm": 0.23321811854839325, + "learning_rate": 2.991386906940047e-06, + "loss": 0.3724, + "step": 12940 + }, + { + "epoch": 2.660293966491931, + "grad_norm": 0.1216077208518982, + "learning_rate": 2.98780707551292e-06, + "loss": 0.438, + "step": 12941 + }, + { + "epoch": 2.6604995374653098, + "grad_norm": 0.23895247280597687, + "learning_rate": 2.984229313830179e-06, + "loss": 0.3645, + "step": 12942 + }, + { + "epoch": 2.6607051084386883, + "grad_norm": 0.2277345210313797, + "learning_rate": 2.9806536220680733e-06, + "loss": 0.3865, + "step": 12943 + }, + { + "epoch": 2.660910679412067, + "grad_norm": 0.22947533428668976, + "learning_rate": 2.977080000402761e-06, + "loss": 0.3807, + "step": 12944 + }, + { + "epoch": 2.6611162503854455, + "grad_norm": 0.22254477441310883, + "learning_rate": 2.973508449010307e-06, + "loss": 0.3799, + "step": 12945 + }, + { + "epoch": 2.661321821358824, + "grad_norm": 0.23514899611473083, + "learning_rate": 2.9699389680666607e-06, + "loss": 0.3769, + "step": 12946 + }, + { + "epoch": 2.6615273923322027, + "grad_norm": 0.2325250208377838, + "learning_rate": 2.9663715577476757e-06, + "loss": 0.3932, + "step": 12947 + }, + { + "epoch": 2.6617329633055813, + "grad_norm": 0.23977595567703247, + "learning_rate": 2.962806218229097e-06, + "loss": 0.3916, + "step": 12948 + }, + { + "epoch": 2.66193853427896, + "grad_norm": 0.23064671456813812, + "learning_rate": 2.9592429496865793e-06, + "loss": 0.3747, + "step": 12949 + }, + { + "epoch": 2.6621441052523385, + "grad_norm": 0.22543418407440186, + "learning_rate": 2.9556817522956613e-06, + "loss": 0.3767, + "step": 12950 + }, + { + "epoch": 2.662349676225717, + "grad_norm": 0.23423805832862854, + "learning_rate": 2.9521226262317785e-06, + "loss": 0.3838, + "step": 12951 + }, + { + "epoch": 2.6625552471990956, + "grad_norm": 0.22551970183849335, + "learning_rate": 2.9485655716702904e-06, + "loss": 0.3817, + "step": 12952 + }, + { + "epoch": 2.6627608181724742, + "grad_norm": 0.2365717738866806, + "learning_rate": 2.9450105887864316e-06, + "loss": 0.3874, + "step": 12953 + }, + { + "epoch": 2.6629663891458524, + "grad_norm": 0.24153275787830353, + "learning_rate": 2.941457677755337e-06, + "loss": 0.3949, + "step": 12954 + }, + { + "epoch": 2.6631719601192314, + "grad_norm": 0.23784461617469788, + "learning_rate": 2.937906838752037e-06, + "loss": 0.3925, + "step": 12955 + }, + { + "epoch": 2.6633775310926096, + "grad_norm": 0.23372387886047363, + "learning_rate": 2.934358071951471e-06, + "loss": 0.4013, + "step": 12956 + }, + { + "epoch": 2.663583102065988, + "grad_norm": 0.24772094190120697, + "learning_rate": 2.930811377528465e-06, + "loss": 0.3938, + "step": 12957 + }, + { + "epoch": 2.6637886730393667, + "grad_norm": 0.24129636585712433, + "learning_rate": 2.927266755657754e-06, + "loss": 0.3854, + "step": 12958 + }, + { + "epoch": 2.6639942440127453, + "grad_norm": 0.11964880675077438, + "learning_rate": 2.9237242065139626e-06, + "loss": 0.4409, + "step": 12959 + }, + { + "epoch": 2.664199814986124, + "grad_norm": 0.23363502323627472, + "learning_rate": 2.9201837302716118e-06, + "loss": 0.3931, + "step": 12960 + }, + { + "epoch": 2.6644053859595025, + "grad_norm": 0.23559674620628357, + "learning_rate": 2.916645327105132e-06, + "loss": 0.3897, + "step": 12961 + }, + { + "epoch": 2.664610956932881, + "grad_norm": 0.2335934042930603, + "learning_rate": 2.913108997188844e-06, + "loss": 0.3799, + "step": 12962 + }, + { + "epoch": 2.6648165279062597, + "grad_norm": 0.23663899302482605, + "learning_rate": 2.9095747406969577e-06, + "loss": 0.3606, + "step": 12963 + }, + { + "epoch": 2.6650220988796383, + "grad_norm": 0.22651928663253784, + "learning_rate": 2.9060425578035995e-06, + "loss": 0.3795, + "step": 12964 + }, + { + "epoch": 2.665227669853017, + "grad_norm": 0.22793136537075043, + "learning_rate": 2.902512448682765e-06, + "loss": 0.3749, + "step": 12965 + }, + { + "epoch": 2.6654332408263954, + "grad_norm": 0.2406536191701889, + "learning_rate": 2.898984413508385e-06, + "loss": 0.3877, + "step": 12966 + }, + { + "epoch": 2.665638811799774, + "grad_norm": 0.24164964258670807, + "learning_rate": 2.8954584524542707e-06, + "loss": 0.3982, + "step": 12967 + }, + { + "epoch": 2.6658443827731526, + "grad_norm": 0.2386479675769806, + "learning_rate": 2.891934565694118e-06, + "loss": 0.3901, + "step": 12968 + }, + { + "epoch": 2.6660499537465308, + "grad_norm": 0.231131449341774, + "learning_rate": 2.8884127534015327e-06, + "loss": 0.3654, + "step": 12969 + }, + { + "epoch": 2.66625552471991, + "grad_norm": 0.12683962285518646, + "learning_rate": 2.8848930157500264e-06, + "loss": 0.4251, + "step": 12970 + }, + { + "epoch": 2.666461095693288, + "grad_norm": 0.23223094642162323, + "learning_rate": 2.8813753529129956e-06, + "loss": 0.3818, + "step": 12971 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.22929398715496063, + "learning_rate": 2.8778597650637312e-06, + "loss": 0.3858, + "step": 12972 + }, + { + "epoch": 2.666872237640045, + "grad_norm": 0.22222407162189484, + "learning_rate": 2.874346252375445e-06, + "loss": 0.3972, + "step": 12973 + }, + { + "epoch": 2.6670778086134237, + "grad_norm": 0.22839607298374176, + "learning_rate": 2.8708348150212236e-06, + "loss": 0.3687, + "step": 12974 + }, + { + "epoch": 2.6672833795868023, + "grad_norm": 0.22204485535621643, + "learning_rate": 2.867325453174063e-06, + "loss": 0.3772, + "step": 12975 + }, + { + "epoch": 2.667488950560181, + "grad_norm": 0.12451615929603577, + "learning_rate": 2.8638181670068452e-06, + "loss": 0.4541, + "step": 12976 + }, + { + "epoch": 2.6676945215335595, + "grad_norm": 0.24160999059677124, + "learning_rate": 2.8603129566923676e-06, + "loss": 0.3808, + "step": 12977 + }, + { + "epoch": 2.667900092506938, + "grad_norm": 0.23395465314388275, + "learning_rate": 2.8568098224032963e-06, + "loss": 0.4002, + "step": 12978 + }, + { + "epoch": 2.6681056634803166, + "grad_norm": 0.22567373514175415, + "learning_rate": 2.8533087643122387e-06, + "loss": 0.3679, + "step": 12979 + }, + { + "epoch": 2.6683112344536952, + "grad_norm": 0.2304529845714569, + "learning_rate": 2.8498097825916664e-06, + "loss": 0.3783, + "step": 12980 + }, + { + "epoch": 2.668516805427074, + "grad_norm": 0.22995389997959137, + "learning_rate": 2.846312877413947e-06, + "loss": 0.3743, + "step": 12981 + }, + { + "epoch": 2.6687223764004524, + "grad_norm": 0.2304750382900238, + "learning_rate": 2.842818048951377e-06, + "loss": 0.3882, + "step": 12982 + }, + { + "epoch": 2.668927947373831, + "grad_norm": 0.23300042748451233, + "learning_rate": 2.8393252973761146e-06, + "loss": 0.3901, + "step": 12983 + }, + { + "epoch": 2.669133518347209, + "grad_norm": 0.231519877910614, + "learning_rate": 2.8358346228602416e-06, + "loss": 0.3797, + "step": 12984 + }, + { + "epoch": 2.669339089320588, + "grad_norm": 0.22919991612434387, + "learning_rate": 2.8323460255757206e-06, + "loss": 0.3678, + "step": 12985 + }, + { + "epoch": 2.6695446602939663, + "grad_norm": 0.2351444512605667, + "learning_rate": 2.828859505694409e-06, + "loss": 0.3931, + "step": 12986 + }, + { + "epoch": 2.669750231267345, + "grad_norm": 0.12017477303743362, + "learning_rate": 2.8253750633880943e-06, + "loss": 0.4364, + "step": 12987 + }, + { + "epoch": 2.6699558022407235, + "grad_norm": 0.22112242877483368, + "learning_rate": 2.8218926988284245e-06, + "loss": 0.3572, + "step": 12988 + }, + { + "epoch": 2.670161373214102, + "grad_norm": 0.23203538358211517, + "learning_rate": 2.8184124121869572e-06, + "loss": 0.3769, + "step": 12989 + }, + { + "epoch": 2.6703669441874807, + "grad_norm": 0.24650564789772034, + "learning_rate": 2.81493420363516e-06, + "loss": 0.3939, + "step": 12990 + }, + { + "epoch": 2.6705725151608593, + "grad_norm": 0.22376540303230286, + "learning_rate": 2.8114580733443815e-06, + "loss": 0.3736, + "step": 12991 + }, + { + "epoch": 2.670778086134238, + "grad_norm": 0.23529557883739471, + "learning_rate": 2.8079840214858738e-06, + "loss": 0.4071, + "step": 12992 + }, + { + "epoch": 2.6709836571076164, + "grad_norm": 0.22694729268550873, + "learning_rate": 2.804512048230781e-06, + "loss": 0.366, + "step": 12993 + }, + { + "epoch": 2.671189228080995, + "grad_norm": 0.11928309500217438, + "learning_rate": 2.8010421537501653e-06, + "loss": 0.4387, + "step": 12994 + }, + { + "epoch": 2.6713947990543736, + "grad_norm": 0.22787900269031525, + "learning_rate": 2.7975743382149655e-06, + "loss": 0.3641, + "step": 12995 + }, + { + "epoch": 2.671600370027752, + "grad_norm": 0.23789376020431519, + "learning_rate": 2.79410860179602e-06, + "loss": 0.3965, + "step": 12996 + }, + { + "epoch": 2.671805941001131, + "grad_norm": 0.23560819029808044, + "learning_rate": 2.790644944664082e-06, + "loss": 0.3903, + "step": 12997 + }, + { + "epoch": 2.6720115119745094, + "grad_norm": 0.1282123327255249, + "learning_rate": 2.787183366989775e-06, + "loss": 0.4493, + "step": 12998 + }, + { + "epoch": 2.6722170829478875, + "grad_norm": 0.2356158196926117, + "learning_rate": 2.783723868943638e-06, + "loss": 0.3806, + "step": 12999 + }, + { + "epoch": 2.6724226539212665, + "grad_norm": 0.22649535536766052, + "learning_rate": 2.780266450696114e-06, + "loss": 0.3694, + "step": 13000 + }, + { + "epoch": 2.6726282248946447, + "grad_norm": 0.22387070953845978, + "learning_rate": 2.7768111124175274e-06, + "loss": 0.3648, + "step": 13001 + }, + { + "epoch": 2.6728337958680233, + "grad_norm": 0.12043121457099915, + "learning_rate": 2.7733578542780964e-06, + "loss": 0.4574, + "step": 13002 + }, + { + "epoch": 2.673039366841402, + "grad_norm": 0.22910076379776, + "learning_rate": 2.7699066764479703e-06, + "loss": 0.3764, + "step": 13003 + }, + { + "epoch": 2.6732449378147805, + "grad_norm": 0.2299896627664566, + "learning_rate": 2.766457579097153e-06, + "loss": 0.3939, + "step": 13004 + }, + { + "epoch": 2.673450508788159, + "grad_norm": 0.24078021943569183, + "learning_rate": 2.763010562395579e-06, + "loss": 0.385, + "step": 13005 + }, + { + "epoch": 2.6736560797615376, + "grad_norm": 0.11954071372747421, + "learning_rate": 2.7595656265130464e-06, + "loss": 0.4594, + "step": 13006 + }, + { + "epoch": 2.673861650734916, + "grad_norm": 0.23649781942367554, + "learning_rate": 2.7561227716192906e-06, + "loss": 0.3816, + "step": 13007 + }, + { + "epoch": 2.674067221708295, + "grad_norm": 0.21963661909103394, + "learning_rate": 2.75268199788392e-06, + "loss": 0.3746, + "step": 13008 + }, + { + "epoch": 2.6742727926816734, + "grad_norm": 0.2391149252653122, + "learning_rate": 2.749243305476445e-06, + "loss": 0.3899, + "step": 13009 + }, + { + "epoch": 2.674478363655052, + "grad_norm": 0.22582948207855225, + "learning_rate": 2.745806694566274e-06, + "loss": 0.3797, + "step": 13010 + }, + { + "epoch": 2.6746839346284306, + "grad_norm": 0.23774947226047516, + "learning_rate": 2.7423721653227076e-06, + "loss": 0.3978, + "step": 13011 + }, + { + "epoch": 2.674889505601809, + "grad_norm": 0.2316160500049591, + "learning_rate": 2.7389397179149596e-06, + "loss": 0.3722, + "step": 13012 + }, + { + "epoch": 2.6750950765751877, + "grad_norm": 0.22677737474441528, + "learning_rate": 2.73550935251211e-06, + "loss": 0.3865, + "step": 13013 + }, + { + "epoch": 2.675300647548566, + "grad_norm": 0.2274550050497055, + "learning_rate": 2.732081069283179e-06, + "loss": 0.3732, + "step": 13014 + }, + { + "epoch": 2.675506218521945, + "grad_norm": 0.23151232302188873, + "learning_rate": 2.728654868397056e-06, + "loss": 0.3861, + "step": 13015 + }, + { + "epoch": 2.675711789495323, + "grad_norm": 0.12545832991600037, + "learning_rate": 2.725230750022531e-06, + "loss": 0.4512, + "step": 13016 + }, + { + "epoch": 2.6759173604687017, + "grad_norm": 0.24073415994644165, + "learning_rate": 2.7218087143282994e-06, + "loss": 0.3836, + "step": 13017 + }, + { + "epoch": 2.6761229314420802, + "grad_norm": 0.23176778852939606, + "learning_rate": 2.7183887614829412e-06, + "loss": 0.4068, + "step": 13018 + }, + { + "epoch": 2.676328502415459, + "grad_norm": 0.23318178951740265, + "learning_rate": 2.7149708916549418e-06, + "loss": 0.3968, + "step": 13019 + }, + { + "epoch": 2.6765340733888374, + "grad_norm": 0.24132607877254486, + "learning_rate": 2.711555105012681e-06, + "loss": 0.3644, + "step": 13020 + }, + { + "epoch": 2.676739644362216, + "grad_norm": 0.22596125304698944, + "learning_rate": 2.7081414017244543e-06, + "loss": 0.3598, + "step": 13021 + }, + { + "epoch": 2.6769452153355946, + "grad_norm": 0.2259039431810379, + "learning_rate": 2.7047297819584276e-06, + "loss": 0.3662, + "step": 13022 + }, + { + "epoch": 2.677150786308973, + "grad_norm": 0.23511864244937897, + "learning_rate": 2.7013202458826765e-06, + "loss": 0.4058, + "step": 13023 + }, + { + "epoch": 2.6773563572823518, + "grad_norm": 0.24032087624073029, + "learning_rate": 2.697912793665171e-06, + "loss": 0.3719, + "step": 13024 + }, + { + "epoch": 2.6775619282557304, + "grad_norm": 0.23492936789989471, + "learning_rate": 2.6945074254737823e-06, + "loss": 0.3734, + "step": 13025 + }, + { + "epoch": 2.677767499229109, + "grad_norm": 0.23162946105003357, + "learning_rate": 2.691104141476281e-06, + "loss": 0.3805, + "step": 13026 + }, + { + "epoch": 2.6779730702024875, + "grad_norm": 0.24035188555717468, + "learning_rate": 2.6877029418403233e-06, + "loss": 0.3693, + "step": 13027 + }, + { + "epoch": 2.678178641175866, + "grad_norm": 0.23720598220825195, + "learning_rate": 2.6843038267334797e-06, + "loss": 0.4006, + "step": 13028 + }, + { + "epoch": 2.6783842121492443, + "grad_norm": 0.23743665218353271, + "learning_rate": 2.6809067963232016e-06, + "loss": 0.4038, + "step": 13029 + }, + { + "epoch": 2.6785897831226233, + "grad_norm": 0.240424245595932, + "learning_rate": 2.677511850776845e-06, + "loss": 0.3842, + "step": 13030 + }, + { + "epoch": 2.6787953540960014, + "grad_norm": 0.1235266923904419, + "learning_rate": 2.674118990261666e-06, + "loss": 0.4391, + "step": 13031 + }, + { + "epoch": 2.6790009250693805, + "grad_norm": 0.23002861440181732, + "learning_rate": 2.670728214944816e-06, + "loss": 0.384, + "step": 13032 + }, + { + "epoch": 2.6792064960427586, + "grad_norm": 0.22837281227111816, + "learning_rate": 2.6673395249933415e-06, + "loss": 0.38, + "step": 13033 + }, + { + "epoch": 2.679412067016137, + "grad_norm": 0.24020573496818542, + "learning_rate": 2.6639529205741737e-06, + "loss": 0.3887, + "step": 13034 + }, + { + "epoch": 2.679617637989516, + "grad_norm": 0.24188318848609924, + "learning_rate": 2.6605684018541794e-06, + "loss": 0.3972, + "step": 13035 + }, + { + "epoch": 2.6798232089628944, + "grad_norm": 0.12417499721050262, + "learning_rate": 2.657185969000085e-06, + "loss": 0.4522, + "step": 13036 + }, + { + "epoch": 2.680028779936273, + "grad_norm": 0.21679937839508057, + "learning_rate": 2.653805622178527e-06, + "loss": 0.3873, + "step": 13037 + }, + { + "epoch": 2.6802343509096516, + "grad_norm": 0.22777822613716125, + "learning_rate": 2.6504273615560383e-06, + "loss": 0.3618, + "step": 13038 + }, + { + "epoch": 2.68043992188303, + "grad_norm": 0.12555932998657227, + "learning_rate": 2.6470511872990544e-06, + "loss": 0.4445, + "step": 13039 + }, + { + "epoch": 2.6806454928564087, + "grad_norm": 0.23415377736091614, + "learning_rate": 2.643677099573903e-06, + "loss": 0.3909, + "step": 13040 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 0.22409114241600037, + "learning_rate": 2.640305098546801e-06, + "loss": 0.362, + "step": 13041 + }, + { + "epoch": 2.681056634803166, + "grad_norm": 0.23534564673900604, + "learning_rate": 2.6369351843838803e-06, + "loss": 0.3977, + "step": 13042 + }, + { + "epoch": 2.6812622057765445, + "grad_norm": 0.23140472173690796, + "learning_rate": 2.633567357251163e-06, + "loss": 0.3775, + "step": 13043 + }, + { + "epoch": 2.6814677767499226, + "grad_norm": 0.23929573595523834, + "learning_rate": 2.630201617314557e-06, + "loss": 0.3746, + "step": 13044 + }, + { + "epoch": 2.6816733477233017, + "grad_norm": 0.23926587402820587, + "learning_rate": 2.6268379647398795e-06, + "loss": 0.3772, + "step": 13045 + }, + { + "epoch": 2.68187891869668, + "grad_norm": 0.23361510038375854, + "learning_rate": 2.6234763996928526e-06, + "loss": 0.3855, + "step": 13046 + }, + { + "epoch": 2.682084489670059, + "grad_norm": 0.23641300201416016, + "learning_rate": 2.620116922339069e-06, + "loss": 0.3859, + "step": 13047 + }, + { + "epoch": 2.682290060643437, + "grad_norm": 0.22642360627651215, + "learning_rate": 2.616759532844041e-06, + "loss": 0.375, + "step": 13048 + }, + { + "epoch": 2.6824956316168156, + "grad_norm": 0.22510544955730438, + "learning_rate": 2.6134042313731765e-06, + "loss": 0.3614, + "step": 13049 + }, + { + "epoch": 2.682701202590194, + "grad_norm": 0.23352572321891785, + "learning_rate": 2.6100510180917686e-06, + "loss": 0.3866, + "step": 13050 + }, + { + "epoch": 2.6829067735635728, + "grad_norm": 0.2314728945493698, + "learning_rate": 2.60669989316502e-06, + "loss": 0.3931, + "step": 13051 + }, + { + "epoch": 2.6831123445369514, + "grad_norm": 0.23167473077774048, + "learning_rate": 2.603350856758018e-06, + "loss": 0.3845, + "step": 13052 + }, + { + "epoch": 2.68331791551033, + "grad_norm": 0.23171542584896088, + "learning_rate": 2.600003909035762e-06, + "loss": 0.3828, + "step": 13053 + }, + { + "epoch": 2.6835234864837085, + "grad_norm": 0.12145873159170151, + "learning_rate": 2.596659050163139e-06, + "loss": 0.4463, + "step": 13054 + }, + { + "epoch": 2.683729057457087, + "grad_norm": 0.22926872968673706, + "learning_rate": 2.593316280304917e-06, + "loss": 0.3856, + "step": 13055 + }, + { + "epoch": 2.6839346284304657, + "grad_norm": 0.2303893268108368, + "learning_rate": 2.589975599625805e-06, + "loss": 0.3838, + "step": 13056 + }, + { + "epoch": 2.6841401994038443, + "grad_norm": 0.2381599098443985, + "learning_rate": 2.5866370082903713e-06, + "loss": 0.3799, + "step": 13057 + }, + { + "epoch": 2.684345770377223, + "grad_norm": 0.23543013632297516, + "learning_rate": 2.583300506463094e-06, + "loss": 0.385, + "step": 13058 + }, + { + "epoch": 2.684551341350601, + "grad_norm": 0.2375613898038864, + "learning_rate": 2.5799660943083415e-06, + "loss": 0.388, + "step": 13059 + }, + { + "epoch": 2.68475691232398, + "grad_norm": 0.22905340790748596, + "learning_rate": 2.5766337719903927e-06, + "loss": 0.3664, + "step": 13060 + }, + { + "epoch": 2.684962483297358, + "grad_norm": 0.23459582030773163, + "learning_rate": 2.5733035396734113e-06, + "loss": 0.3786, + "step": 13061 + }, + { + "epoch": 2.6851680542707372, + "grad_norm": 0.23848964273929596, + "learning_rate": 2.569975397521451e-06, + "loss": 0.374, + "step": 13062 + }, + { + "epoch": 2.6853736252441154, + "grad_norm": 0.22707267105579376, + "learning_rate": 2.5666493456985e-06, + "loss": 0.3724, + "step": 13063 + }, + { + "epoch": 2.685579196217494, + "grad_norm": 0.22259126603603363, + "learning_rate": 2.5633253843683986e-06, + "loss": 0.3879, + "step": 13064 + }, + { + "epoch": 2.6857847671908726, + "grad_norm": 0.23496946692466736, + "learning_rate": 2.5600035136949045e-06, + "loss": 0.3857, + "step": 13065 + }, + { + "epoch": 2.685990338164251, + "grad_norm": 0.23178550601005554, + "learning_rate": 2.5566837338416676e-06, + "loss": 0.3643, + "step": 13066 + }, + { + "epoch": 2.6861959091376297, + "grad_norm": 0.22792139649391174, + "learning_rate": 2.553366044972252e-06, + "loss": 0.3983, + "step": 13067 + }, + { + "epoch": 2.6864014801110083, + "grad_norm": 0.12821319699287415, + "learning_rate": 2.5500504472500965e-06, + "loss": 0.4591, + "step": 13068 + }, + { + "epoch": 2.686607051084387, + "grad_norm": 0.22924353182315826, + "learning_rate": 2.5467369408385405e-06, + "loss": 0.3922, + "step": 13069 + }, + { + "epoch": 2.6868126220577655, + "grad_norm": 0.2222532331943512, + "learning_rate": 2.5434255259008338e-06, + "loss": 0.3853, + "step": 13070 + }, + { + "epoch": 2.687018193031144, + "grad_norm": 0.2258753478527069, + "learning_rate": 2.5401162026001056e-06, + "loss": 0.3812, + "step": 13071 + }, + { + "epoch": 2.6872237640045227, + "grad_norm": 0.12898650765419006, + "learning_rate": 2.536808971099401e-06, + "loss": 0.459, + "step": 13072 + }, + { + "epoch": 2.6874293349779013, + "grad_norm": 0.23579534888267517, + "learning_rate": 2.533503831561644e-06, + "loss": 0.3781, + "step": 13073 + }, + { + "epoch": 2.6876349059512794, + "grad_norm": 0.23496629297733307, + "learning_rate": 2.5302007841496646e-06, + "loss": 0.3986, + "step": 13074 + }, + { + "epoch": 2.6878404769246584, + "grad_norm": 0.23853163421154022, + "learning_rate": 2.5268998290261877e-06, + "loss": 0.3919, + "step": 13075 + }, + { + "epoch": 2.6880460478980366, + "grad_norm": 0.23851320147514343, + "learning_rate": 2.523600966353833e-06, + "loss": 0.3835, + "step": 13076 + }, + { + "epoch": 2.6882516188714156, + "grad_norm": 0.12485864758491516, + "learning_rate": 2.5203041962951306e-06, + "loss": 0.447, + "step": 13077 + }, + { + "epoch": 2.6884571898447938, + "grad_norm": 0.22811704874038696, + "learning_rate": 2.517009519012496e-06, + "loss": 0.3961, + "step": 13078 + }, + { + "epoch": 2.6886627608181723, + "grad_norm": 0.2245602309703827, + "learning_rate": 2.513716934668229e-06, + "loss": 0.3827, + "step": 13079 + }, + { + "epoch": 2.688868331791551, + "grad_norm": 0.23157405853271484, + "learning_rate": 2.5104264434245545e-06, + "loss": 0.3791, + "step": 13080 + }, + { + "epoch": 2.6890739027649295, + "grad_norm": 0.2352142632007599, + "learning_rate": 2.5071380454435682e-06, + "loss": 0.3827, + "step": 13081 + }, + { + "epoch": 2.689279473738308, + "grad_norm": 0.24821443855762482, + "learning_rate": 2.503851740887276e-06, + "loss": 0.3834, + "step": 13082 + }, + { + "epoch": 2.6894850447116867, + "grad_norm": 0.2236967235803604, + "learning_rate": 2.5005675299175875e-06, + "loss": 0.3846, + "step": 13083 + }, + { + "epoch": 2.6896906156850653, + "grad_norm": 0.2317054569721222, + "learning_rate": 2.4972854126962986e-06, + "loss": 0.3587, + "step": 13084 + }, + { + "epoch": 2.689896186658444, + "grad_norm": 0.2305641770362854, + "learning_rate": 2.494005389385095e-06, + "loss": 0.3853, + "step": 13085 + }, + { + "epoch": 2.6901017576318225, + "grad_norm": 0.22506798803806305, + "learning_rate": 2.4907274601455726e-06, + "loss": 0.371, + "step": 13086 + }, + { + "epoch": 2.690307328605201, + "grad_norm": 0.23190316557884216, + "learning_rate": 2.487451625139217e-06, + "loss": 0.3807, + "step": 13087 + }, + { + "epoch": 2.6905128995785796, + "grad_norm": 0.23732031881809235, + "learning_rate": 2.4841778845274242e-06, + "loss": 0.3917, + "step": 13088 + }, + { + "epoch": 2.690718470551958, + "grad_norm": 0.23446981608867645, + "learning_rate": 2.4809062384714706e-06, + "loss": 0.3926, + "step": 13089 + }, + { + "epoch": 2.690924041525337, + "grad_norm": 0.2362259030342102, + "learning_rate": 2.4776366871325213e-06, + "loss": 0.3592, + "step": 13090 + }, + { + "epoch": 2.691129612498715, + "grad_norm": 0.12213429063558578, + "learning_rate": 2.4743692306716734e-06, + "loss": 0.4355, + "step": 13091 + }, + { + "epoch": 2.691335183472094, + "grad_norm": 0.23348113894462585, + "learning_rate": 2.4711038692498873e-06, + "loss": 0.3789, + "step": 13092 + }, + { + "epoch": 2.691540754445472, + "grad_norm": 0.23292915523052216, + "learning_rate": 2.46784060302803e-06, + "loss": 0.3832, + "step": 13093 + }, + { + "epoch": 2.6917463254188507, + "grad_norm": 0.12205676734447479, + "learning_rate": 2.4645794321668774e-06, + "loss": 0.4589, + "step": 13094 + }, + { + "epoch": 2.6919518963922293, + "grad_norm": 0.24196146428585052, + "learning_rate": 2.4613203568270864e-06, + "loss": 0.3818, + "step": 13095 + }, + { + "epoch": 2.692157467365608, + "grad_norm": 0.23547834157943726, + "learning_rate": 2.4580633771692036e-06, + "loss": 0.3813, + "step": 13096 + }, + { + "epoch": 2.6923630383389865, + "grad_norm": 0.23056018352508545, + "learning_rate": 2.4548084933537104e-06, + "loss": 0.3645, + "step": 13097 + }, + { + "epoch": 2.692568609312365, + "grad_norm": 0.23042161762714386, + "learning_rate": 2.4515557055409433e-06, + "loss": 0.3756, + "step": 13098 + }, + { + "epoch": 2.6927741802857437, + "grad_norm": 0.22752924263477325, + "learning_rate": 2.4483050138911598e-06, + "loss": 0.3805, + "step": 13099 + }, + { + "epoch": 2.6929797512591223, + "grad_norm": 0.12008198350667953, + "learning_rate": 2.445056418564496e-06, + "loss": 0.442, + "step": 13100 + }, + { + "epoch": 2.693185322232501, + "grad_norm": 0.23437613248825073, + "learning_rate": 2.4418099197210043e-06, + "loss": 0.3716, + "step": 13101 + }, + { + "epoch": 2.6933908932058794, + "grad_norm": 0.2585579752922058, + "learning_rate": 2.438565517520622e-06, + "loss": 0.3716, + "step": 13102 + }, + { + "epoch": 2.693596464179258, + "grad_norm": 0.23899348080158234, + "learning_rate": 2.4353232121231807e-06, + "loss": 0.3918, + "step": 13103 + }, + { + "epoch": 2.6938020351526366, + "grad_norm": 0.23468652367591858, + "learning_rate": 2.432083003688423e-06, + "loss": 0.4057, + "step": 13104 + }, + { + "epoch": 2.694007606126015, + "grad_norm": 0.2242051213979721, + "learning_rate": 2.428844892375971e-06, + "loss": 0.3746, + "step": 13105 + }, + { + "epoch": 2.6942131770993933, + "grad_norm": 0.22958362102508545, + "learning_rate": 2.4256088783453573e-06, + "loss": 0.3729, + "step": 13106 + }, + { + "epoch": 2.6944187480727724, + "grad_norm": 0.2308982014656067, + "learning_rate": 2.4223749617559994e-06, + "loss": 0.3924, + "step": 13107 + }, + { + "epoch": 2.6946243190461505, + "grad_norm": 0.22810958325862885, + "learning_rate": 2.4191431427672194e-06, + "loss": 0.378, + "step": 13108 + }, + { + "epoch": 2.694829890019529, + "grad_norm": 0.2260715216398239, + "learning_rate": 2.4159134215382305e-06, + "loss": 0.3683, + "step": 13109 + }, + { + "epoch": 2.6950354609929077, + "grad_norm": 0.22648762166500092, + "learning_rate": 2.4126857982281553e-06, + "loss": 0.3933, + "step": 13110 + }, + { + "epoch": 2.6952410319662863, + "grad_norm": 0.2309451550245285, + "learning_rate": 2.4094602729959916e-06, + "loss": 0.3877, + "step": 13111 + }, + { + "epoch": 2.695446602939665, + "grad_norm": 0.23046442866325378, + "learning_rate": 2.406236846000657e-06, + "loss": 0.3708, + "step": 13112 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.23607337474822998, + "learning_rate": 2.4030155174009545e-06, + "loss": 0.3971, + "step": 13113 + }, + { + "epoch": 2.695857744886422, + "grad_norm": 0.12376264482736588, + "learning_rate": 2.3997962873555773e-06, + "loss": 0.4165, + "step": 13114 + }, + { + "epoch": 2.6960633158598006, + "grad_norm": 0.22271588444709778, + "learning_rate": 2.396579156023124e-06, + "loss": 0.3727, + "step": 13115 + }, + { + "epoch": 2.6962688868331792, + "grad_norm": 0.22929471731185913, + "learning_rate": 2.393364123562087e-06, + "loss": 0.3777, + "step": 13116 + }, + { + "epoch": 2.696474457806558, + "grad_norm": 0.12126558274030685, + "learning_rate": 2.39015119013085e-06, + "loss": 0.4323, + "step": 13117 + }, + { + "epoch": 2.6966800287799364, + "grad_norm": 0.22162644565105438, + "learning_rate": 2.3869403558877163e-06, + "loss": 0.3741, + "step": 13118 + }, + { + "epoch": 2.696885599753315, + "grad_norm": 0.12397125363349915, + "learning_rate": 2.3837316209908546e-06, + "loss": 0.4487, + "step": 13119 + }, + { + "epoch": 2.6970911707266936, + "grad_norm": 0.11935931444168091, + "learning_rate": 2.380524985598348e-06, + "loss": 0.4464, + "step": 13120 + }, + { + "epoch": 2.6972967417000717, + "grad_norm": 0.23302531242370605, + "learning_rate": 2.3773204498681758e-06, + "loss": 0.3829, + "step": 13121 + }, + { + "epoch": 2.6975023126734508, + "grad_norm": 0.12195220589637756, + "learning_rate": 2.374118013958206e-06, + "loss": 0.4471, + "step": 13122 + }, + { + "epoch": 2.697707883646829, + "grad_norm": 0.23271100223064423, + "learning_rate": 2.3709176780262076e-06, + "loss": 0.3808, + "step": 13123 + }, + { + "epoch": 2.6979134546202075, + "grad_norm": 0.22822687029838562, + "learning_rate": 2.36771944222984e-06, + "loss": 0.3695, + "step": 13124 + }, + { + "epoch": 2.698119025593586, + "grad_norm": 0.23481127619743347, + "learning_rate": 2.3645233067266815e-06, + "loss": 0.3999, + "step": 13125 + }, + { + "epoch": 2.6983245965669647, + "grad_norm": 0.225934699177742, + "learning_rate": 2.3613292716741816e-06, + "loss": 0.3737, + "step": 13126 + }, + { + "epoch": 2.6985301675403432, + "grad_norm": 0.23984263837337494, + "learning_rate": 2.358137337229694e-06, + "loss": 0.3819, + "step": 13127 + }, + { + "epoch": 2.698735738513722, + "grad_norm": 0.2241523116827011, + "learning_rate": 2.3549475035504733e-06, + "loss": 0.3683, + "step": 13128 + }, + { + "epoch": 2.6989413094871004, + "grad_norm": 0.2185996025800705, + "learning_rate": 2.3517597707936636e-06, + "loss": 0.3635, + "step": 13129 + }, + { + "epoch": 2.699146880460479, + "grad_norm": 0.23561379313468933, + "learning_rate": 2.3485741391163092e-06, + "loss": 0.3819, + "step": 13130 + }, + { + "epoch": 2.6993524514338576, + "grad_norm": 0.12253455072641373, + "learning_rate": 2.3453906086753646e-06, + "loss": 0.4446, + "step": 13131 + }, + { + "epoch": 2.699558022407236, + "grad_norm": 0.22365736961364746, + "learning_rate": 2.34220917962765e-06, + "loss": 0.382, + "step": 13132 + }, + { + "epoch": 2.699763593380615, + "grad_norm": 0.12028893828392029, + "learning_rate": 2.339029852129909e-06, + "loss": 0.4583, + "step": 13133 + }, + { + "epoch": 2.6999691643539934, + "grad_norm": 0.12149006873369217, + "learning_rate": 2.3358526263387715e-06, + "loss": 0.4422, + "step": 13134 + }, + { + "epoch": 2.700174735327372, + "grad_norm": 0.23423054814338684, + "learning_rate": 2.3326775024107627e-06, + "loss": 0.3989, + "step": 13135 + }, + { + "epoch": 2.70038030630075, + "grad_norm": 0.2282380312681198, + "learning_rate": 2.3295044805023075e-06, + "loss": 0.3798, + "step": 13136 + }, + { + "epoch": 2.700585877274129, + "grad_norm": 0.22905749082565308, + "learning_rate": 2.3263335607697258e-06, + "loss": 0.3987, + "step": 13137 + }, + { + "epoch": 2.7007914482475073, + "grad_norm": 0.223682701587677, + "learning_rate": 2.3231647433692273e-06, + "loss": 0.3574, + "step": 13138 + }, + { + "epoch": 2.700997019220886, + "grad_norm": 0.12208550423383713, + "learning_rate": 2.3199980284569373e-06, + "loss": 0.4494, + "step": 13139 + }, + { + "epoch": 2.7012025901942645, + "grad_norm": 0.2424362748861313, + "learning_rate": 2.316833416188861e-06, + "loss": 0.3936, + "step": 13140 + }, + { + "epoch": 2.701408161167643, + "grad_norm": 0.12195998430252075, + "learning_rate": 2.313670906720899e-06, + "loss": 0.4549, + "step": 13141 + }, + { + "epoch": 2.7016137321410216, + "grad_norm": 0.23335258662700653, + "learning_rate": 2.310510500208856e-06, + "loss": 0.398, + "step": 13142 + }, + { + "epoch": 2.7018193031144, + "grad_norm": 0.239247128367424, + "learning_rate": 2.3073521968084285e-06, + "loss": 0.3936, + "step": 13143 + }, + { + "epoch": 2.702024874087779, + "grad_norm": 0.23259401321411133, + "learning_rate": 2.304195996675216e-06, + "loss": 0.395, + "step": 13144 + }, + { + "epoch": 2.7022304450611574, + "grad_norm": 0.2279106080532074, + "learning_rate": 2.3010418999646995e-06, + "loss": 0.3653, + "step": 13145 + }, + { + "epoch": 2.702436016034536, + "grad_norm": 0.23209701478481293, + "learning_rate": 2.2978899068322845e-06, + "loss": 0.3991, + "step": 13146 + }, + { + "epoch": 2.7026415870079146, + "grad_norm": 0.23221181333065033, + "learning_rate": 2.294740017433242e-06, + "loss": 0.3696, + "step": 13147 + }, + { + "epoch": 2.702847157981293, + "grad_norm": 0.23377148807048798, + "learning_rate": 2.2915922319227536e-06, + "loss": 0.3691, + "step": 13148 + }, + { + "epoch": 2.7030527289546717, + "grad_norm": 0.23326507210731506, + "learning_rate": 2.288446550455899e-06, + "loss": 0.371, + "step": 13149 + }, + { + "epoch": 2.7032582999280503, + "grad_norm": 0.11786891520023346, + "learning_rate": 2.2853029731876445e-06, + "loss": 0.4332, + "step": 13150 + }, + { + "epoch": 2.7034638709014285, + "grad_norm": 0.23896630108356476, + "learning_rate": 2.282161500272867e-06, + "loss": 0.3665, + "step": 13151 + }, + { + "epoch": 2.7036694418748075, + "grad_norm": 0.12194350361824036, + "learning_rate": 2.2790221318663267e-06, + "loss": 0.4324, + "step": 13152 + }, + { + "epoch": 2.7038750128481857, + "grad_norm": 0.23250941932201385, + "learning_rate": 2.275884868122696e-06, + "loss": 0.3881, + "step": 13153 + }, + { + "epoch": 2.7040805838215642, + "grad_norm": 0.232101172208786, + "learning_rate": 2.272749709196515e-06, + "loss": 0.3908, + "step": 13154 + }, + { + "epoch": 2.704286154794943, + "grad_norm": 0.22767187654972076, + "learning_rate": 2.269616655242261e-06, + "loss": 0.3864, + "step": 13155 + }, + { + "epoch": 2.7044917257683214, + "grad_norm": 0.232827827334404, + "learning_rate": 2.2664857064142654e-06, + "loss": 0.3791, + "step": 13156 + }, + { + "epoch": 2.7046972967417, + "grad_norm": 0.12185569107532501, + "learning_rate": 2.2633568628667894e-06, + "loss": 0.4662, + "step": 13157 + }, + { + "epoch": 2.7049028677150786, + "grad_norm": 0.23200562596321106, + "learning_rate": 2.2602301247539605e-06, + "loss": 0.3772, + "step": 13158 + }, + { + "epoch": 2.705108438688457, + "grad_norm": 0.24151834845542908, + "learning_rate": 2.2571054922298347e-06, + "loss": 0.372, + "step": 13159 + }, + { + "epoch": 2.7053140096618358, + "grad_norm": 0.23124399781227112, + "learning_rate": 2.253982965448344e-06, + "loss": 0.4018, + "step": 13160 + }, + { + "epoch": 2.7055195806352144, + "grad_norm": 0.11762725561857224, + "learning_rate": 2.250862544563316e-06, + "loss": 0.4491, + "step": 13161 + }, + { + "epoch": 2.705725151608593, + "grad_norm": 0.11691106110811234, + "learning_rate": 2.2477442297284817e-06, + "loss": 0.4284, + "step": 13162 + }, + { + "epoch": 2.7059307225819715, + "grad_norm": 0.22589966654777527, + "learning_rate": 2.244628021097469e-06, + "loss": 0.3723, + "step": 13163 + }, + { + "epoch": 2.70613629355535, + "grad_norm": 0.23272038996219635, + "learning_rate": 2.24151391882379e-06, + "loss": 0.3985, + "step": 13164 + }, + { + "epoch": 2.7063418645287287, + "grad_norm": 0.22128084301948547, + "learning_rate": 2.2384019230608664e-06, + "loss": 0.3743, + "step": 13165 + }, + { + "epoch": 2.706547435502107, + "grad_norm": 0.11852707713842392, + "learning_rate": 2.2352920339620166e-06, + "loss": 0.4401, + "step": 13166 + }, + { + "epoch": 2.706753006475486, + "grad_norm": 0.23066598176956177, + "learning_rate": 2.232184251680447e-06, + "loss": 0.3604, + "step": 13167 + }, + { + "epoch": 2.706958577448864, + "grad_norm": 0.11477980017662048, + "learning_rate": 2.229078576369261e-06, + "loss": 0.4282, + "step": 13168 + }, + { + "epoch": 2.7071641484222426, + "grad_norm": 0.2299182116985321, + "learning_rate": 2.2259750081814653e-06, + "loss": 0.3933, + "step": 13169 + }, + { + "epoch": 2.707369719395621, + "grad_norm": 0.22713468968868256, + "learning_rate": 2.222873547269953e-06, + "loss": 0.4137, + "step": 13170 + }, + { + "epoch": 2.707575290369, + "grad_norm": 0.23036979138851166, + "learning_rate": 2.2197741937875274e-06, + "loss": 0.3846, + "step": 13171 + }, + { + "epoch": 2.7077808613423784, + "grad_norm": 0.2358933985233307, + "learning_rate": 2.2166769478868607e-06, + "loss": 0.3745, + "step": 13172 + }, + { + "epoch": 2.707986432315757, + "grad_norm": 0.25210806727409363, + "learning_rate": 2.2135818097205606e-06, + "loss": 0.4011, + "step": 13173 + }, + { + "epoch": 2.7081920032891356, + "grad_norm": 0.23163393139839172, + "learning_rate": 2.210488779441101e-06, + "loss": 0.3744, + "step": 13174 + }, + { + "epoch": 2.708397574262514, + "grad_norm": 0.24861502647399902, + "learning_rate": 2.207397857200855e-06, + "loss": 0.3905, + "step": 13175 + }, + { + "epoch": 2.7086031452358927, + "grad_norm": 0.23797625303268433, + "learning_rate": 2.20430904315211e-06, + "loss": 0.3745, + "step": 13176 + }, + { + "epoch": 2.7088087162092713, + "grad_norm": 0.23504245281219482, + "learning_rate": 2.201222337447034e-06, + "loss": 0.3763, + "step": 13177 + }, + { + "epoch": 2.70901428718265, + "grad_norm": 0.23205745220184326, + "learning_rate": 2.1981377402376917e-06, + "loss": 0.3683, + "step": 13178 + }, + { + "epoch": 2.7092198581560285, + "grad_norm": 0.12594787776470184, + "learning_rate": 2.195055251676041e-06, + "loss": 0.4583, + "step": 13179 + }, + { + "epoch": 2.709425429129407, + "grad_norm": 0.22511595487594604, + "learning_rate": 2.191974871913955e-06, + "loss": 0.3569, + "step": 13180 + }, + { + "epoch": 2.7096310001027852, + "grad_norm": 0.2308862954378128, + "learning_rate": 2.1888966011031823e-06, + "loss": 0.3656, + "step": 13181 + }, + { + "epoch": 2.7098365710761643, + "grad_norm": 0.2143402099609375, + "learning_rate": 2.1858204393953726e-06, + "loss": 0.3644, + "step": 13182 + }, + { + "epoch": 2.7100421420495424, + "grad_norm": 0.24129731953144073, + "learning_rate": 2.1827463869420834e-06, + "loss": 0.3925, + "step": 13183 + }, + { + "epoch": 2.710247713022921, + "grad_norm": 0.23186422884464264, + "learning_rate": 2.179674443894749e-06, + "loss": 0.3777, + "step": 13184 + }, + { + "epoch": 2.7104532839962996, + "grad_norm": 0.22773997485637665, + "learning_rate": 2.176604610404709e-06, + "loss": 0.3691, + "step": 13185 + }, + { + "epoch": 2.710658854969678, + "grad_norm": 0.22896374762058258, + "learning_rate": 2.1735368866232013e-06, + "loss": 0.3764, + "step": 13186 + }, + { + "epoch": 2.7108644259430568, + "grad_norm": 0.23243440687656403, + "learning_rate": 2.170471272701371e-06, + "loss": 0.367, + "step": 13187 + }, + { + "epoch": 2.7110699969164354, + "grad_norm": 0.2411787211894989, + "learning_rate": 2.1674077687902318e-06, + "loss": 0.3867, + "step": 13188 + }, + { + "epoch": 2.711275567889814, + "grad_norm": 0.21772977709770203, + "learning_rate": 2.164346375040713e-06, + "loss": 0.372, + "step": 13189 + }, + { + "epoch": 2.7114811388631925, + "grad_norm": 0.2415088415145874, + "learning_rate": 2.1612870916036336e-06, + "loss": 0.3886, + "step": 13190 + }, + { + "epoch": 2.711686709836571, + "grad_norm": 0.22934192419052124, + "learning_rate": 2.1582299186297138e-06, + "loss": 0.384, + "step": 13191 + }, + { + "epoch": 2.7118922808099497, + "grad_norm": 0.23373478651046753, + "learning_rate": 2.1551748562695627e-06, + "loss": 0.3916, + "step": 13192 + }, + { + "epoch": 2.7120978517833283, + "grad_norm": 0.23007836937904358, + "learning_rate": 2.152121904673685e-06, + "loss": 0.3613, + "step": 13193 + }, + { + "epoch": 2.712303422756707, + "grad_norm": 0.12053580582141876, + "learning_rate": 2.1490710639925003e-06, + "loss": 0.4356, + "step": 13194 + }, + { + "epoch": 2.7125089937300855, + "grad_norm": 0.23001576960086823, + "learning_rate": 2.1460223343762937e-06, + "loss": 0.3559, + "step": 13195 + }, + { + "epoch": 2.7127145647034636, + "grad_norm": 0.2180272787809372, + "learning_rate": 2.1429757159752697e-06, + "loss": 0.3824, + "step": 13196 + }, + { + "epoch": 2.7129201356768426, + "grad_norm": 0.23263666033744812, + "learning_rate": 2.139931208939513e-06, + "loss": 0.3618, + "step": 13197 + }, + { + "epoch": 2.713125706650221, + "grad_norm": 0.2428431212902069, + "learning_rate": 2.136888813419024e-06, + "loss": 0.3795, + "step": 13198 + }, + { + "epoch": 2.7133312776236, + "grad_norm": 0.11948748677968979, + "learning_rate": 2.133848529563683e-06, + "loss": 0.4315, + "step": 13199 + }, + { + "epoch": 2.713536848596978, + "grad_norm": 0.2290315479040146, + "learning_rate": 2.1308103575232645e-06, + "loss": 0.3947, + "step": 13200 + }, + { + "epoch": 2.7137424195703566, + "grad_norm": 0.11666145920753479, + "learning_rate": 2.12777429744745e-06, + "loss": 0.4485, + "step": 13201 + }, + { + "epoch": 2.713947990543735, + "grad_norm": 0.2218608856201172, + "learning_rate": 2.124740349485818e-06, + "loss": 0.3646, + "step": 13202 + }, + { + "epoch": 2.7141535615171137, + "grad_norm": 0.23134684562683105, + "learning_rate": 2.1217085137878256e-06, + "loss": 0.3925, + "step": 13203 + }, + { + "epoch": 2.7143591324904923, + "grad_norm": 0.22924216091632843, + "learning_rate": 2.118678790502843e-06, + "loss": 0.3695, + "step": 13204 + }, + { + "epoch": 2.714564703463871, + "grad_norm": 0.23405931890010834, + "learning_rate": 2.11565117978013e-06, + "loss": 0.3772, + "step": 13205 + }, + { + "epoch": 2.7147702744372495, + "grad_norm": 0.22839610278606415, + "learning_rate": 2.1126256817688427e-06, + "loss": 0.3642, + "step": 13206 + }, + { + "epoch": 2.714975845410628, + "grad_norm": 0.22964198887348175, + "learning_rate": 2.1096022966180274e-06, + "loss": 0.3658, + "step": 13207 + }, + { + "epoch": 2.7151814163840067, + "grad_norm": 0.2313418686389923, + "learning_rate": 2.106581024476644e-06, + "loss": 0.3822, + "step": 13208 + }, + { + "epoch": 2.7153869873573853, + "grad_norm": 0.23704691231250763, + "learning_rate": 2.10356186549353e-06, + "loss": 0.3822, + "step": 13209 + }, + { + "epoch": 2.715592558330764, + "grad_norm": 0.2327091097831726, + "learning_rate": 2.100544819817424e-06, + "loss": 0.3948, + "step": 13210 + }, + { + "epoch": 2.715798129304142, + "grad_norm": 0.23315146565437317, + "learning_rate": 2.0975298875969646e-06, + "loss": 0.384, + "step": 13211 + }, + { + "epoch": 2.716003700277521, + "grad_norm": 0.22651349008083344, + "learning_rate": 2.0945170689806813e-06, + "loss": 0.3692, + "step": 13212 + }, + { + "epoch": 2.716209271250899, + "grad_norm": 0.22469674050807953, + "learning_rate": 2.0915063641170015e-06, + "loss": 0.3868, + "step": 13213 + }, + { + "epoch": 2.716414842224278, + "grad_norm": 0.12531313300132751, + "learning_rate": 2.0884977731542454e-06, + "loss": 0.4479, + "step": 13214 + }, + { + "epoch": 2.7166204131976563, + "grad_norm": 0.11977065354585648, + "learning_rate": 2.0854912962406403e-06, + "loss": 0.4418, + "step": 13215 + }, + { + "epoch": 2.716825984171035, + "grad_norm": 0.23392470180988312, + "learning_rate": 2.0824869335242976e-06, + "loss": 0.3844, + "step": 13216 + }, + { + "epoch": 2.7170315551444135, + "grad_norm": 0.23545394837856293, + "learning_rate": 2.0794846851532287e-06, + "loss": 0.3921, + "step": 13217 + }, + { + "epoch": 2.717237126117792, + "grad_norm": 0.11989542841911316, + "learning_rate": 2.076484551275335e-06, + "loss": 0.4487, + "step": 13218 + }, + { + "epoch": 2.7174426970911707, + "grad_norm": 0.2391575127840042, + "learning_rate": 2.073486532038424e-06, + "loss": 0.3802, + "step": 13219 + }, + { + "epoch": 2.7176482680645493, + "grad_norm": 0.12006057053804398, + "learning_rate": 2.0704906275901968e-06, + "loss": 0.4567, + "step": 13220 + }, + { + "epoch": 2.717853839037928, + "grad_norm": 0.23207461833953857, + "learning_rate": 2.067496838078241e-06, + "loss": 0.3808, + "step": 13221 + }, + { + "epoch": 2.7180594100113065, + "grad_norm": 0.2288663387298584, + "learning_rate": 2.0645051636500534e-06, + "loss": 0.3663, + "step": 13222 + }, + { + "epoch": 2.718264980984685, + "grad_norm": 0.23807507753372192, + "learning_rate": 2.061515604453016e-06, + "loss": 0.3887, + "step": 13223 + }, + { + "epoch": 2.7184705519580636, + "grad_norm": 0.23132917284965515, + "learning_rate": 2.058528160634411e-06, + "loss": 0.4107, + "step": 13224 + }, + { + "epoch": 2.7186761229314422, + "grad_norm": 0.2228475958108902, + "learning_rate": 2.0555428323414157e-06, + "loss": 0.3912, + "step": 13225 + }, + { + "epoch": 2.7188816939048204, + "grad_norm": 0.2397620677947998, + "learning_rate": 2.0525596197211022e-06, + "loss": 0.3716, + "step": 13226 + }, + { + "epoch": 2.7190872648781994, + "grad_norm": 0.23399171233177185, + "learning_rate": 2.0495785229204432e-06, + "loss": 0.3902, + "step": 13227 + }, + { + "epoch": 2.7192928358515775, + "grad_norm": 0.23543019592761993, + "learning_rate": 2.0465995420862917e-06, + "loss": 0.3836, + "step": 13228 + }, + { + "epoch": 2.7194984068249566, + "grad_norm": 0.12511909008026123, + "learning_rate": 2.043622677365424e-06, + "loss": 0.4333, + "step": 13229 + }, + { + "epoch": 2.7197039777983347, + "grad_norm": 0.22897110879421234, + "learning_rate": 2.0406479289044895e-06, + "loss": 0.3777, + "step": 13230 + }, + { + "epoch": 2.7199095487717133, + "grad_norm": 0.23776206374168396, + "learning_rate": 2.0376752968500397e-06, + "loss": 0.385, + "step": 13231 + }, + { + "epoch": 2.720115119745092, + "grad_norm": 0.22551818192005157, + "learning_rate": 2.0347047813485274e-06, + "loss": 0.3732, + "step": 13232 + }, + { + "epoch": 2.7203206907184705, + "grad_norm": 0.23058441281318665, + "learning_rate": 2.0317363825462867e-06, + "loss": 0.3617, + "step": 13233 + }, + { + "epoch": 2.720526261691849, + "grad_norm": 0.22936634719371796, + "learning_rate": 2.0287701005895543e-06, + "loss": 0.3821, + "step": 13234 + }, + { + "epoch": 2.7207318326652277, + "grad_norm": 0.2348644882440567, + "learning_rate": 2.025805935624479e-06, + "loss": 0.3783, + "step": 13235 + }, + { + "epoch": 2.7209374036386063, + "grad_norm": 0.22684963047504425, + "learning_rate": 2.022843887797084e-06, + "loss": 0.3885, + "step": 13236 + }, + { + "epoch": 2.721142974611985, + "grad_norm": 0.2363407462835312, + "learning_rate": 2.0198839572532972e-06, + "loss": 0.379, + "step": 13237 + }, + { + "epoch": 2.7213485455853634, + "grad_norm": 0.11835481971502304, + "learning_rate": 2.0169261441389376e-06, + "loss": 0.4572, + "step": 13238 + }, + { + "epoch": 2.721554116558742, + "grad_norm": 0.11983449012041092, + "learning_rate": 2.013970448599723e-06, + "loss": 0.4419, + "step": 13239 + }, + { + "epoch": 2.7217596875321206, + "grad_norm": 0.23388756811618805, + "learning_rate": 2.011016870781267e-06, + "loss": 0.3931, + "step": 13240 + }, + { + "epoch": 2.7219652585054988, + "grad_norm": 0.23092390596866608, + "learning_rate": 2.0080654108290835e-06, + "loss": 0.3978, + "step": 13241 + }, + { + "epoch": 2.722170829478878, + "grad_norm": 0.23938718438148499, + "learning_rate": 2.0051160688885714e-06, + "loss": 0.3733, + "step": 13242 + }, + { + "epoch": 2.722376400452256, + "grad_norm": 0.11977725476026535, + "learning_rate": 2.0021688451050334e-06, + "loss": 0.4444, + "step": 13243 + }, + { + "epoch": 2.722581971425635, + "grad_norm": 0.22424502670764923, + "learning_rate": 1.9992237396236645e-06, + "loss": 0.383, + "step": 13244 + }, + { + "epoch": 2.722787542399013, + "grad_norm": 0.23597703874111176, + "learning_rate": 1.996280752589563e-06, + "loss": 0.369, + "step": 13245 + }, + { + "epoch": 2.7229931133723917, + "grad_norm": 0.23005138337612152, + "learning_rate": 1.993339884147704e-06, + "loss": 0.3738, + "step": 13246 + }, + { + "epoch": 2.7231986843457703, + "grad_norm": 0.22499051690101624, + "learning_rate": 1.9904011344429797e-06, + "loss": 0.3796, + "step": 13247 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.22470605373382568, + "learning_rate": 1.9874645036201557e-06, + "loss": 0.3958, + "step": 13248 + }, + { + "epoch": 2.7236098262925275, + "grad_norm": 0.22984722256660461, + "learning_rate": 1.9845299918239257e-06, + "loss": 0.4004, + "step": 13249 + }, + { + "epoch": 2.723815397265906, + "grad_norm": 0.2346932291984558, + "learning_rate": 1.9815975991988445e-06, + "loss": 0.3883, + "step": 13250 + }, + { + "epoch": 2.7240209682392846, + "grad_norm": 0.24680159986019135, + "learning_rate": 1.978667325889386e-06, + "loss": 0.3792, + "step": 13251 + }, + { + "epoch": 2.7242265392126632, + "grad_norm": 0.2350476086139679, + "learning_rate": 1.9757391720399056e-06, + "loss": 0.3892, + "step": 13252 + }, + { + "epoch": 2.724432110186042, + "grad_norm": 0.22723430395126343, + "learning_rate": 1.972813137794662e-06, + "loss": 0.3875, + "step": 13253 + }, + { + "epoch": 2.7246376811594204, + "grad_norm": 0.238324373960495, + "learning_rate": 1.969889223297805e-06, + "loss": 0.3872, + "step": 13254 + }, + { + "epoch": 2.724843252132799, + "grad_norm": 0.12185484915971756, + "learning_rate": 1.96696742869338e-06, + "loss": 0.4491, + "step": 13255 + }, + { + "epoch": 2.725048823106177, + "grad_norm": 0.23587733507156372, + "learning_rate": 1.964047754125341e-06, + "loss": 0.3813, + "step": 13256 + }, + { + "epoch": 2.725254394079556, + "grad_norm": 0.22594283521175385, + "learning_rate": 1.961130199737514e-06, + "loss": 0.3808, + "step": 13257 + }, + { + "epoch": 2.7254599650529343, + "grad_norm": 0.117709681391716, + "learning_rate": 1.9582147656736426e-06, + "loss": 0.4448, + "step": 13258 + }, + { + "epoch": 2.7256655360263133, + "grad_norm": 0.1209564283490181, + "learning_rate": 1.9553014520773535e-06, + "loss": 0.4412, + "step": 13259 + }, + { + "epoch": 2.7258711069996915, + "grad_norm": 0.23455938696861267, + "learning_rate": 1.9523902590921657e-06, + "loss": 0.3511, + "step": 13260 + }, + { + "epoch": 2.72607667797307, + "grad_norm": 0.2380744367837906, + "learning_rate": 1.94948118686151e-06, + "loss": 0.376, + "step": 13261 + }, + { + "epoch": 2.7262822489464487, + "grad_norm": 0.11584602296352386, + "learning_rate": 1.9465742355287014e-06, + "loss": 0.4523, + "step": 13262 + }, + { + "epoch": 2.7264878199198272, + "grad_norm": 0.12415748089551926, + "learning_rate": 1.943669405236941e-06, + "loss": 0.435, + "step": 13263 + }, + { + "epoch": 2.726693390893206, + "grad_norm": 0.227211594581604, + "learning_rate": 1.9407666961293487e-06, + "loss": 0.383, + "step": 13264 + }, + { + "epoch": 2.7268989618665844, + "grad_norm": 0.12154388427734375, + "learning_rate": 1.9378661083489255e-06, + "loss": 0.4509, + "step": 13265 + }, + { + "epoch": 2.727104532839963, + "grad_norm": 0.23259896039962769, + "learning_rate": 1.9349676420385665e-06, + "loss": 0.385, + "step": 13266 + }, + { + "epoch": 2.7273101038133416, + "grad_norm": 0.2539547383785248, + "learning_rate": 1.9320712973410634e-06, + "loss": 0.397, + "step": 13267 + }, + { + "epoch": 2.72751567478672, + "grad_norm": 0.22695066034793854, + "learning_rate": 1.929177074399111e-06, + "loss": 0.3688, + "step": 13268 + }, + { + "epoch": 2.727721245760099, + "grad_norm": 0.23069900274276733, + "learning_rate": 1.9262849733552864e-06, + "loss": 0.3655, + "step": 13269 + }, + { + "epoch": 2.7279268167334774, + "grad_norm": 0.22440584003925323, + "learning_rate": 1.9233949943520798e-06, + "loss": 0.3756, + "step": 13270 + }, + { + "epoch": 2.728132387706856, + "grad_norm": 0.23021718859672546, + "learning_rate": 1.920507137531862e-06, + "loss": 0.3766, + "step": 13271 + }, + { + "epoch": 2.7283379586802345, + "grad_norm": 0.22625602781772614, + "learning_rate": 1.9176214030369055e-06, + "loss": 0.3834, + "step": 13272 + }, + { + "epoch": 2.7285435296536127, + "grad_norm": 0.22805316746234894, + "learning_rate": 1.9147377910093754e-06, + "loss": 0.3796, + "step": 13273 + }, + { + "epoch": 2.7287491006269917, + "grad_norm": 0.12454908341169357, + "learning_rate": 1.9118563015913337e-06, + "loss": 0.4406, + "step": 13274 + }, + { + "epoch": 2.72895467160037, + "grad_norm": 0.23324701189994812, + "learning_rate": 1.9089769349247417e-06, + "loss": 0.3713, + "step": 13275 + }, + { + "epoch": 2.7291602425737485, + "grad_norm": 0.23551899194717407, + "learning_rate": 1.9060996911514407e-06, + "loss": 0.3907, + "step": 13276 + }, + { + "epoch": 2.729365813547127, + "grad_norm": 0.24228408932685852, + "learning_rate": 1.9032245704131973e-06, + "loss": 0.386, + "step": 13277 + }, + { + "epoch": 2.7295713845205056, + "grad_norm": 0.22902436554431915, + "learning_rate": 1.9003515728516386e-06, + "loss": 0.3902, + "step": 13278 + }, + { + "epoch": 2.729776955493884, + "grad_norm": 0.2340046763420105, + "learning_rate": 1.897480698608316e-06, + "loss": 0.3771, + "step": 13279 + }, + { + "epoch": 2.729982526467263, + "grad_norm": 0.12157081812620163, + "learning_rate": 1.8946119478246565e-06, + "loss": 0.4443, + "step": 13280 + }, + { + "epoch": 2.7301880974406414, + "grad_norm": 0.22382938861846924, + "learning_rate": 1.8917453206419922e-06, + "loss": 0.3782, + "step": 13281 + }, + { + "epoch": 2.73039366841402, + "grad_norm": 0.2339484691619873, + "learning_rate": 1.888880817201545e-06, + "loss": 0.3879, + "step": 13282 + }, + { + "epoch": 2.7305992393873986, + "grad_norm": 0.23548907041549683, + "learning_rate": 1.8860184376444418e-06, + "loss": 0.3741, + "step": 13283 + }, + { + "epoch": 2.730804810360777, + "grad_norm": 0.22213146090507507, + "learning_rate": 1.8831581821116901e-06, + "loss": 0.3853, + "step": 13284 + }, + { + "epoch": 2.7310103813341557, + "grad_norm": 0.22502891719341278, + "learning_rate": 1.8803000507442171e-06, + "loss": 0.3773, + "step": 13285 + }, + { + "epoch": 2.7312159523075343, + "grad_norm": 0.22634708881378174, + "learning_rate": 1.877444043682815e-06, + "loss": 0.3758, + "step": 13286 + }, + { + "epoch": 2.731421523280913, + "grad_norm": 0.23306407034397125, + "learning_rate": 1.8745901610681915e-06, + "loss": 0.373, + "step": 13287 + }, + { + "epoch": 2.731627094254291, + "grad_norm": 0.2324984073638916, + "learning_rate": 1.8717384030409442e-06, + "loss": 0.3757, + "step": 13288 + }, + { + "epoch": 2.73183266522767, + "grad_norm": 0.2317853718996048, + "learning_rate": 1.8688887697415653e-06, + "loss": 0.3755, + "step": 13289 + }, + { + "epoch": 2.7320382362010482, + "grad_norm": 0.2325008064508438, + "learning_rate": 1.8660412613104379e-06, + "loss": 0.3827, + "step": 13290 + }, + { + "epoch": 2.732243807174427, + "grad_norm": 0.23261573910713196, + "learning_rate": 1.8631958778878495e-06, + "loss": 0.3833, + "step": 13291 + }, + { + "epoch": 2.7324493781478054, + "grad_norm": 0.22458091378211975, + "learning_rate": 1.860352619613983e-06, + "loss": 0.3718, + "step": 13292 + }, + { + "epoch": 2.732654949121184, + "grad_norm": 0.23159871995449066, + "learning_rate": 1.8575114866289118e-06, + "loss": 0.3698, + "step": 13293 + }, + { + "epoch": 2.7328605200945626, + "grad_norm": 0.22185635566711426, + "learning_rate": 1.8546724790725984e-06, + "loss": 0.3753, + "step": 13294 + }, + { + "epoch": 2.733066091067941, + "grad_norm": 0.23244544863700867, + "learning_rate": 1.851835597084911e-06, + "loss": 0.3959, + "step": 13295 + }, + { + "epoch": 2.7332716620413198, + "grad_norm": 0.2387784868478775, + "learning_rate": 1.8490008408056131e-06, + "loss": 0.3712, + "step": 13296 + }, + { + "epoch": 2.7334772330146984, + "grad_norm": 0.2200855165719986, + "learning_rate": 1.8461682103743478e-06, + "loss": 0.3655, + "step": 13297 + }, + { + "epoch": 2.733682803988077, + "grad_norm": 0.22560839354991913, + "learning_rate": 1.8433377059306835e-06, + "loss": 0.3768, + "step": 13298 + }, + { + "epoch": 2.7338883749614555, + "grad_norm": 0.23543007671833038, + "learning_rate": 1.8405093276140534e-06, + "loss": 0.4065, + "step": 13299 + }, + { + "epoch": 2.734093945934834, + "grad_norm": 0.2383396327495575, + "learning_rate": 1.8376830755638013e-06, + "loss": 0.3916, + "step": 13300 + }, + { + "epoch": 2.7342995169082127, + "grad_norm": 0.22618170082569122, + "learning_rate": 1.834858949919166e-06, + "loss": 0.3665, + "step": 13301 + }, + { + "epoch": 2.7345050878815913, + "grad_norm": 0.12377354502677917, + "learning_rate": 1.8320369508192759e-06, + "loss": 0.4598, + "step": 13302 + }, + { + "epoch": 2.7347106588549694, + "grad_norm": 0.24000823497772217, + "learning_rate": 1.8292170784031548e-06, + "loss": 0.3789, + "step": 13303 + }, + { + "epoch": 2.7349162298283485, + "grad_norm": 0.23978973925113678, + "learning_rate": 1.8263993328097318e-06, + "loss": 0.385, + "step": 13304 + }, + { + "epoch": 2.7351218008017266, + "grad_norm": 0.21829509735107422, + "learning_rate": 1.8235837141778206e-06, + "loss": 0.3776, + "step": 13305 + }, + { + "epoch": 2.735327371775105, + "grad_norm": 0.2309700846672058, + "learning_rate": 1.8207702226461305e-06, + "loss": 0.3829, + "step": 13306 + }, + { + "epoch": 2.735532942748484, + "grad_norm": 0.225687175989151, + "learning_rate": 1.8179588583532753e-06, + "loss": 0.3622, + "step": 13307 + }, + { + "epoch": 2.7357385137218624, + "grad_norm": 0.22879765927791595, + "learning_rate": 1.8151496214377546e-06, + "loss": 0.3916, + "step": 13308 + }, + { + "epoch": 2.735944084695241, + "grad_norm": 0.23625633120536804, + "learning_rate": 1.8123425120379672e-06, + "loss": 0.405, + "step": 13309 + }, + { + "epoch": 2.7361496556686196, + "grad_norm": 0.2239421159029007, + "learning_rate": 1.809537530292203e-06, + "loss": 0.3717, + "step": 13310 + }, + { + "epoch": 2.736355226641998, + "grad_norm": 0.23815853893756866, + "learning_rate": 1.806734676338656e-06, + "loss": 0.3639, + "step": 13311 + }, + { + "epoch": 2.7365607976153767, + "grad_norm": 0.23043270409107208, + "learning_rate": 1.8039339503154062e-06, + "loss": 0.3773, + "step": 13312 + }, + { + "epoch": 2.7367663685887553, + "grad_norm": 0.23327013850212097, + "learning_rate": 1.801135352360433e-06, + "loss": 0.3803, + "step": 13313 + }, + { + "epoch": 2.736971939562134, + "grad_norm": 0.2213982492685318, + "learning_rate": 1.798338882611611e-06, + "loss": 0.3834, + "step": 13314 + }, + { + "epoch": 2.7371775105355125, + "grad_norm": 0.23775465786457062, + "learning_rate": 1.7955445412067102e-06, + "loss": 0.3809, + "step": 13315 + }, + { + "epoch": 2.737383081508891, + "grad_norm": 0.23932182788848877, + "learning_rate": 1.7927523282833902e-06, + "loss": 0.3749, + "step": 13316 + }, + { + "epoch": 2.7375886524822697, + "grad_norm": 0.24264240264892578, + "learning_rate": 1.7899622439792063e-06, + "loss": 0.378, + "step": 13317 + }, + { + "epoch": 2.737794223455648, + "grad_norm": 0.22181403636932373, + "learning_rate": 1.7871742884316284e-06, + "loss": 0.3769, + "step": 13318 + }, + { + "epoch": 2.737999794429027, + "grad_norm": 0.231339693069458, + "learning_rate": 1.7843884617779917e-06, + "loss": 0.3888, + "step": 13319 + }, + { + "epoch": 2.738205365402405, + "grad_norm": 0.23549319803714752, + "learning_rate": 1.7816047641555512e-06, + "loss": 0.3774, + "step": 13320 + }, + { + "epoch": 2.7384109363757836, + "grad_norm": 0.12438720464706421, + "learning_rate": 1.7788231957014424e-06, + "loss": 0.4356, + "step": 13321 + }, + { + "epoch": 2.738616507349162, + "grad_norm": 0.22666363418102264, + "learning_rate": 1.7760437565526955e-06, + "loss": 0.3691, + "step": 13322 + }, + { + "epoch": 2.7388220783225408, + "grad_norm": 0.23813243210315704, + "learning_rate": 1.7732664468462463e-06, + "loss": 0.3833, + "step": 13323 + }, + { + "epoch": 2.7390276492959194, + "grad_norm": 0.23540125787258148, + "learning_rate": 1.77049126671891e-06, + "loss": 0.3851, + "step": 13324 + }, + { + "epoch": 2.739233220269298, + "grad_norm": 0.22330401837825775, + "learning_rate": 1.7677182163074224e-06, + "loss": 0.3608, + "step": 13325 + }, + { + "epoch": 2.7394387912426765, + "grad_norm": 0.2364005595445633, + "learning_rate": 1.7649472957483942e-06, + "loss": 0.3804, + "step": 13326 + }, + { + "epoch": 2.739644362216055, + "grad_norm": 0.22581815719604492, + "learning_rate": 1.7621785051783213e-06, + "loss": 0.3868, + "step": 13327 + }, + { + "epoch": 2.7398499331894337, + "grad_norm": 0.22957521677017212, + "learning_rate": 1.7594118447336294e-06, + "loss": 0.3731, + "step": 13328 + }, + { + "epoch": 2.7400555041628123, + "grad_norm": 0.2309531569480896, + "learning_rate": 1.7566473145506097e-06, + "loss": 0.3712, + "step": 13329 + }, + { + "epoch": 2.740261075136191, + "grad_norm": 0.22596019506454468, + "learning_rate": 1.753884914765458e-06, + "loss": 0.352, + "step": 13330 + }, + { + "epoch": 2.7404666461095695, + "grad_norm": 0.1303359568119049, + "learning_rate": 1.7511246455142555e-06, + "loss": 0.4202, + "step": 13331 + }, + { + "epoch": 2.740672217082948, + "grad_norm": 0.22711026668548584, + "learning_rate": 1.7483665069330086e-06, + "loss": 0.3873, + "step": 13332 + }, + { + "epoch": 2.740877788056326, + "grad_norm": 0.12200283259153366, + "learning_rate": 1.7456104991575834e-06, + "loss": 0.4612, + "step": 13333 + }, + { + "epoch": 2.7410833590297052, + "grad_norm": 0.2212943285703659, + "learning_rate": 1.7428566223237564e-06, + "loss": 0.3736, + "step": 13334 + }, + { + "epoch": 2.7412889300030834, + "grad_norm": 0.2410779595375061, + "learning_rate": 1.740104876567204e-06, + "loss": 0.3901, + "step": 13335 + }, + { + "epoch": 2.741494500976462, + "grad_norm": 0.229153111577034, + "learning_rate": 1.737355262023483e-06, + "loss": 0.3911, + "step": 13336 + }, + { + "epoch": 2.7417000719498406, + "grad_norm": 0.2225484400987625, + "learning_rate": 1.7346077788280646e-06, + "loss": 0.3882, + "step": 13337 + }, + { + "epoch": 2.741905642923219, + "grad_norm": 0.23056496679782867, + "learning_rate": 1.731862427116291e-06, + "loss": 0.3961, + "step": 13338 + }, + { + "epoch": 2.7421112138965977, + "grad_norm": 0.12282350659370422, + "learning_rate": 1.7291192070234285e-06, + "loss": 0.4445, + "step": 13339 + }, + { + "epoch": 2.7423167848699763, + "grad_norm": 0.23378808796405792, + "learning_rate": 1.7263781186846096e-06, + "loss": 0.3796, + "step": 13340 + }, + { + "epoch": 2.742522355843355, + "grad_norm": 0.22584164142608643, + "learning_rate": 1.7236391622348857e-06, + "loss": 0.3835, + "step": 13341 + }, + { + "epoch": 2.7427279268167335, + "grad_norm": 0.23093900084495544, + "learning_rate": 1.7209023378091844e-06, + "loss": 0.3808, + "step": 13342 + }, + { + "epoch": 2.742933497790112, + "grad_norm": 0.22885221242904663, + "learning_rate": 1.7181676455423425e-06, + "loss": 0.3878, + "step": 13343 + }, + { + "epoch": 2.7431390687634907, + "grad_norm": 0.11858902126550674, + "learning_rate": 1.715435085569077e-06, + "loss": 0.4476, + "step": 13344 + }, + { + "epoch": 2.7433446397368693, + "grad_norm": 0.21561181545257568, + "learning_rate": 1.712704658024011e-06, + "loss": 0.3538, + "step": 13345 + }, + { + "epoch": 2.743550210710248, + "grad_norm": 0.12362432479858398, + "learning_rate": 1.709976363041666e-06, + "loss": 0.4305, + "step": 13346 + }, + { + "epoch": 2.7437557816836264, + "grad_norm": 0.2305142730474472, + "learning_rate": 1.7072502007564501e-06, + "loss": 0.3542, + "step": 13347 + }, + { + "epoch": 2.7439613526570046, + "grad_norm": 0.22127611935138702, + "learning_rate": 1.7045261713026607e-06, + "loss": 0.3839, + "step": 13348 + }, + { + "epoch": 2.7441669236303836, + "grad_norm": 0.22477680444717407, + "learning_rate": 1.7018042748145103e-06, + "loss": 0.3914, + "step": 13349 + }, + { + "epoch": 2.7443724946037618, + "grad_norm": 0.23768995702266693, + "learning_rate": 1.6990845114260868e-06, + "loss": 0.3856, + "step": 13350 + }, + { + "epoch": 2.7445780655771403, + "grad_norm": 0.23600324988365173, + "learning_rate": 1.696366881271383e-06, + "loss": 0.3844, + "step": 13351 + }, + { + "epoch": 2.744783636550519, + "grad_norm": 0.19089475274085999, + "learning_rate": 1.6936513844842767e-06, + "loss": 0.4466, + "step": 13352 + }, + { + "epoch": 2.7449892075238975, + "grad_norm": 0.12042814493179321, + "learning_rate": 1.690938021198556e-06, + "loss": 0.446, + "step": 13353 + }, + { + "epoch": 2.745194778497276, + "grad_norm": 0.23038606345653534, + "learning_rate": 1.688226791547899e-06, + "loss": 0.3753, + "step": 13354 + }, + { + "epoch": 2.7454003494706547, + "grad_norm": 0.12141604721546173, + "learning_rate": 1.6855176956658635e-06, + "loss": 0.4526, + "step": 13355 + }, + { + "epoch": 2.7456059204440333, + "grad_norm": 0.23495237529277802, + "learning_rate": 1.6828107336859233e-06, + "loss": 0.3864, + "step": 13356 + }, + { + "epoch": 2.745811491417412, + "grad_norm": 0.11687915772199631, + "learning_rate": 1.6801059057414314e-06, + "loss": 0.4553, + "step": 13357 + }, + { + "epoch": 2.7460170623907905, + "grad_norm": 0.23180118203163147, + "learning_rate": 1.6774032119656463e-06, + "loss": 0.3715, + "step": 13358 + }, + { + "epoch": 2.746222633364169, + "grad_norm": 0.2334972769021988, + "learning_rate": 1.6747026524917114e-06, + "loss": 0.3905, + "step": 13359 + }, + { + "epoch": 2.7464282043375476, + "grad_norm": 0.11535855382680893, + "learning_rate": 1.6720042274526754e-06, + "loss": 0.4416, + "step": 13360 + }, + { + "epoch": 2.7466337753109262, + "grad_norm": 0.2297336459159851, + "learning_rate": 1.6693079369814819e-06, + "loss": 0.3875, + "step": 13361 + }, + { + "epoch": 2.746839346284305, + "grad_norm": 0.23500292003154755, + "learning_rate": 1.6666137812109595e-06, + "loss": 0.3828, + "step": 13362 + }, + { + "epoch": 2.747044917257683, + "grad_norm": 0.22683893144130707, + "learning_rate": 1.6639217602738322e-06, + "loss": 0.3724, + "step": 13363 + }, + { + "epoch": 2.747250488231062, + "grad_norm": 0.22383469343185425, + "learning_rate": 1.6612318743027288e-06, + "loss": 0.3791, + "step": 13364 + }, + { + "epoch": 2.74745605920444, + "grad_norm": 0.23103176057338715, + "learning_rate": 1.6585441234301686e-06, + "loss": 0.372, + "step": 13365 + }, + { + "epoch": 2.7476616301778187, + "grad_norm": 0.12242773920297623, + "learning_rate": 1.6558585077885553e-06, + "loss": 0.431, + "step": 13366 + }, + { + "epoch": 2.7478672011511973, + "grad_norm": 0.24223840236663818, + "learning_rate": 1.6531750275102082e-06, + "loss": 0.4015, + "step": 13367 + }, + { + "epoch": 2.748072772124576, + "grad_norm": 0.24244338274002075, + "learning_rate": 1.6504936827273216e-06, + "loss": 0.3931, + "step": 13368 + }, + { + "epoch": 2.7482783430979545, + "grad_norm": 0.23767444491386414, + "learning_rate": 1.6478144735719997e-06, + "loss": 0.39, + "step": 13369 + }, + { + "epoch": 2.748483914071333, + "grad_norm": 0.2309853583574295, + "learning_rate": 1.6451374001762272e-06, + "loss": 0.3812, + "step": 13370 + }, + { + "epoch": 2.7486894850447117, + "grad_norm": 0.2257552295923233, + "learning_rate": 1.6424624626718982e-06, + "loss": 0.3695, + "step": 13371 + }, + { + "epoch": 2.7488950560180903, + "grad_norm": 0.22814756631851196, + "learning_rate": 1.6397896611907925e-06, + "loss": 0.3859, + "step": 13372 + }, + { + "epoch": 2.749100626991469, + "grad_norm": 0.23302559554576874, + "learning_rate": 1.63711899586458e-06, + "loss": 0.389, + "step": 13373 + }, + { + "epoch": 2.7493061979648474, + "grad_norm": 0.1236652210354805, + "learning_rate": 1.6344504668248401e-06, + "loss": 0.4524, + "step": 13374 + }, + { + "epoch": 2.749511768938226, + "grad_norm": 0.12643174827098846, + "learning_rate": 1.6317840742030328e-06, + "loss": 0.4511, + "step": 13375 + }, + { + "epoch": 2.7497173399116046, + "grad_norm": 0.122630275785923, + "learning_rate": 1.6291198181305279e-06, + "loss": 0.4569, + "step": 13376 + }, + { + "epoch": 2.749922910884983, + "grad_norm": 0.12135348469018936, + "learning_rate": 1.6264576987385705e-06, + "loss": 0.4401, + "step": 13377 + }, + { + "epoch": 2.7501284818583613, + "grad_norm": 0.2322167158126831, + "learning_rate": 1.6237977161583157e-06, + "loss": 0.3739, + "step": 13378 + }, + { + "epoch": 2.7503340528317404, + "grad_norm": 0.22395208477973938, + "learning_rate": 1.6211398705208086e-06, + "loss": 0.3799, + "step": 13379 + }, + { + "epoch": 2.7505396238051185, + "grad_norm": 0.23402728140354156, + "learning_rate": 1.6184841619569847e-06, + "loss": 0.4011, + "step": 13380 + }, + { + "epoch": 2.7507451947784975, + "grad_norm": 0.23176077008247375, + "learning_rate": 1.6158305905976839e-06, + "loss": 0.3742, + "step": 13381 + }, + { + "epoch": 2.7509507657518757, + "grad_norm": 0.23246802389621735, + "learning_rate": 1.6131791565736322e-06, + "loss": 0.3758, + "step": 13382 + }, + { + "epoch": 2.7511563367252543, + "grad_norm": 0.23959018290042877, + "learning_rate": 1.6105298600154545e-06, + "loss": 0.3795, + "step": 13383 + }, + { + "epoch": 2.751361907698633, + "grad_norm": 0.22550779581069946, + "learning_rate": 1.6078827010536717e-06, + "loss": 0.3797, + "step": 13384 + }, + { + "epoch": 2.7515674786720115, + "grad_norm": 0.23018544912338257, + "learning_rate": 1.6052376798186896e-06, + "loss": 0.4094, + "step": 13385 + }, + { + "epoch": 2.75177304964539, + "grad_norm": 0.12095669656991959, + "learning_rate": 1.602594796440824e-06, + "loss": 0.4569, + "step": 13386 + }, + { + "epoch": 2.7519786206187686, + "grad_norm": 0.22460295259952545, + "learning_rate": 1.5999540510502653e-06, + "loss": 0.3634, + "step": 13387 + }, + { + "epoch": 2.7521841915921472, + "grad_norm": 0.23332244157791138, + "learning_rate": 1.597315443777125e-06, + "loss": 0.3988, + "step": 13388 + }, + { + "epoch": 2.752389762565526, + "grad_norm": 0.22451160848140717, + "learning_rate": 1.5946789747513935e-06, + "loss": 0.3883, + "step": 13389 + }, + { + "epoch": 2.7525953335389044, + "grad_norm": 0.23024022579193115, + "learning_rate": 1.5920446441029474e-06, + "loss": 0.3961, + "step": 13390 + }, + { + "epoch": 2.752800904512283, + "grad_norm": 0.2318277209997177, + "learning_rate": 1.5894124519615678e-06, + "loss": 0.3562, + "step": 13391 + }, + { + "epoch": 2.7530064754856616, + "grad_norm": 0.22565960884094238, + "learning_rate": 1.5867823984569458e-06, + "loss": 0.3716, + "step": 13392 + }, + { + "epoch": 2.7532120464590397, + "grad_norm": 0.23131342232227325, + "learning_rate": 1.5841544837186428e-06, + "loss": 0.3782, + "step": 13393 + }, + { + "epoch": 2.7534176174324188, + "grad_norm": 0.24259567260742188, + "learning_rate": 1.5815287078761155e-06, + "loss": 0.3828, + "step": 13394 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.23125715553760529, + "learning_rate": 1.578905071058735e-06, + "loss": 0.3934, + "step": 13395 + }, + { + "epoch": 2.753828759379176, + "grad_norm": 0.22932687401771545, + "learning_rate": 1.5762835733957531e-06, + "loss": 0.3589, + "step": 13396 + }, + { + "epoch": 2.754034330352554, + "grad_norm": 0.22249168157577515, + "learning_rate": 1.5736642150163168e-06, + "loss": 0.3756, + "step": 13397 + }, + { + "epoch": 2.7542399013259327, + "grad_norm": 0.23607565462589264, + "learning_rate": 1.5710469960494723e-06, + "loss": 0.362, + "step": 13398 + }, + { + "epoch": 2.7544454722993112, + "grad_norm": 0.2208351343870163, + "learning_rate": 1.5684319166241568e-06, + "loss": 0.3629, + "step": 13399 + }, + { + "epoch": 2.75465104327269, + "grad_norm": 0.22543267905712128, + "learning_rate": 1.5658189768691923e-06, + "loss": 0.3795, + "step": 13400 + }, + { + "epoch": 2.7548566142460684, + "grad_norm": 0.233917698264122, + "learning_rate": 1.5632081769133255e-06, + "loss": 0.4046, + "step": 13401 + }, + { + "epoch": 2.755062185219447, + "grad_norm": 0.23534435033798218, + "learning_rate": 1.560599516885169e-06, + "loss": 0.3964, + "step": 13402 + }, + { + "epoch": 2.7552677561928256, + "grad_norm": 0.2306138277053833, + "learning_rate": 1.5579929969132395e-06, + "loss": 0.3845, + "step": 13403 + }, + { + "epoch": 2.755473327166204, + "grad_norm": 0.23294024169445038, + "learning_rate": 1.5553886171259446e-06, + "loss": 0.3764, + "step": 13404 + }, + { + "epoch": 2.7556788981395828, + "grad_norm": 0.23482374846935272, + "learning_rate": 1.5527863776515918e-06, + "loss": 0.3821, + "step": 13405 + }, + { + "epoch": 2.7558844691129614, + "grad_norm": 0.12278321385383606, + "learning_rate": 1.550186278618388e-06, + "loss": 0.4522, + "step": 13406 + }, + { + "epoch": 2.75609004008634, + "grad_norm": 0.23249104619026184, + "learning_rate": 1.5475883201544111e-06, + "loss": 0.3943, + "step": 13407 + }, + { + "epoch": 2.756295611059718, + "grad_norm": 0.23344482481479645, + "learning_rate": 1.544992502387669e-06, + "loss": 0.3739, + "step": 13408 + }, + { + "epoch": 2.756501182033097, + "grad_norm": 0.22119440138339996, + "learning_rate": 1.5423988254460386e-06, + "loss": 0.3884, + "step": 13409 + }, + { + "epoch": 2.7567067530064753, + "grad_norm": 0.2359190583229065, + "learning_rate": 1.5398072894572984e-06, + "loss": 0.3688, + "step": 13410 + }, + { + "epoch": 2.7569123239798543, + "grad_norm": 0.23092953860759735, + "learning_rate": 1.537217894549121e-06, + "loss": 0.3673, + "step": 13411 + }, + { + "epoch": 2.7571178949532325, + "grad_norm": 0.23413263261318207, + "learning_rate": 1.5346306408490697e-06, + "loss": 0.3826, + "step": 13412 + }, + { + "epoch": 2.757323465926611, + "grad_norm": 0.23555617034435272, + "learning_rate": 1.532045528484612e-06, + "loss": 0.3885, + "step": 13413 + }, + { + "epoch": 2.7575290368999896, + "grad_norm": 0.2366188019514084, + "learning_rate": 1.5294625575831012e-06, + "loss": 0.4028, + "step": 13414 + }, + { + "epoch": 2.757734607873368, + "grad_norm": 0.23909413814544678, + "learning_rate": 1.5268817282717857e-06, + "loss": 0.387, + "step": 13415 + }, + { + "epoch": 2.757940178846747, + "grad_norm": 0.11871679872274399, + "learning_rate": 1.5243030406778237e-06, + "loss": 0.4433, + "step": 13416 + }, + { + "epoch": 2.7581457498201254, + "grad_norm": 0.24047636985778809, + "learning_rate": 1.5217264949282384e-06, + "loss": 0.368, + "step": 13417 + }, + { + "epoch": 2.758351320793504, + "grad_norm": 0.1217452734708786, + "learning_rate": 1.5191520911499786e-06, + "loss": 0.4494, + "step": 13418 + }, + { + "epoch": 2.7585568917668826, + "grad_norm": 0.22655776143074036, + "learning_rate": 1.5165798294698625e-06, + "loss": 0.3789, + "step": 13419 + }, + { + "epoch": 2.758762462740261, + "grad_norm": 0.12003947049379349, + "learning_rate": 1.5140097100146188e-06, + "loss": 0.4444, + "step": 13420 + }, + { + "epoch": 2.7589680337136397, + "grad_norm": 0.24004173278808594, + "learning_rate": 1.5114417329108565e-06, + "loss": 0.3891, + "step": 13421 + }, + { + "epoch": 2.7591736046870183, + "grad_norm": 0.24962955713272095, + "learning_rate": 1.5088758982851042e-06, + "loss": 0.3885, + "step": 13422 + }, + { + "epoch": 2.7593791756603965, + "grad_norm": 0.22153249382972717, + "learning_rate": 1.5063122062637558e-06, + "loss": 0.3772, + "step": 13423 + }, + { + "epoch": 2.7595847466337755, + "grad_norm": 0.2372589111328125, + "learning_rate": 1.5037506569731202e-06, + "loss": 0.3729, + "step": 13424 + }, + { + "epoch": 2.7597903176071537, + "grad_norm": 0.2346244603395462, + "learning_rate": 1.5011912505393867e-06, + "loss": 0.4007, + "step": 13425 + }, + { + "epoch": 2.7599958885805327, + "grad_norm": 0.22780828177928925, + "learning_rate": 1.498633987088644e-06, + "loss": 0.3916, + "step": 13426 + }, + { + "epoch": 2.760201459553911, + "grad_norm": 0.23186977207660675, + "learning_rate": 1.4960788667468816e-06, + "loss": 0.384, + "step": 13427 + }, + { + "epoch": 2.7604070305272894, + "grad_norm": 0.22554920613765717, + "learning_rate": 1.493525889639974e-06, + "loss": 0.3773, + "step": 13428 + }, + { + "epoch": 2.760612601500668, + "grad_norm": 0.22903324663639069, + "learning_rate": 1.4909750558937003e-06, + "loss": 0.3697, + "step": 13429 + }, + { + "epoch": 2.7608181724740466, + "grad_norm": 0.23504561185836792, + "learning_rate": 1.488426365633725e-06, + "loss": 0.3907, + "step": 13430 + }, + { + "epoch": 2.761023743447425, + "grad_norm": 0.2242177128791809, + "learning_rate": 1.4858798189856076e-06, + "loss": 0.3697, + "step": 13431 + }, + { + "epoch": 2.7612293144208038, + "grad_norm": 0.21883516013622284, + "learning_rate": 1.4833354160748131e-06, + "loss": 0.4012, + "step": 13432 + }, + { + "epoch": 2.7614348853941824, + "grad_norm": 0.23364631831645966, + "learning_rate": 1.480793157026676e-06, + "loss": 0.403, + "step": 13433 + }, + { + "epoch": 2.761640456367561, + "grad_norm": 0.21907542645931244, + "learning_rate": 1.478253041966461e-06, + "loss": 0.3822, + "step": 13434 + }, + { + "epoch": 2.7618460273409395, + "grad_norm": 0.22405709326267242, + "learning_rate": 1.475715071019293e-06, + "loss": 0.3756, + "step": 13435 + }, + { + "epoch": 2.762051598314318, + "grad_norm": 0.2236599624156952, + "learning_rate": 1.473179244310212e-06, + "loss": 0.3914, + "step": 13436 + }, + { + "epoch": 2.7622571692876967, + "grad_norm": 0.22199760377407074, + "learning_rate": 1.4706455619641485e-06, + "loss": 0.3774, + "step": 13437 + }, + { + "epoch": 2.7624627402610753, + "grad_norm": 0.12080203741788864, + "learning_rate": 1.4681140241059221e-06, + "loss": 0.4438, + "step": 13438 + }, + { + "epoch": 2.762668311234454, + "grad_norm": 0.22729521989822388, + "learning_rate": 1.4655846308602483e-06, + "loss": 0.378, + "step": 13439 + }, + { + "epoch": 2.762873882207832, + "grad_norm": 0.23560449481010437, + "learning_rate": 1.4630573823517425e-06, + "loss": 0.3705, + "step": 13440 + }, + { + "epoch": 2.763079453181211, + "grad_norm": 0.22415785491466522, + "learning_rate": 1.4605322787049097e-06, + "loss": 0.3701, + "step": 13441 + }, + { + "epoch": 2.763285024154589, + "grad_norm": 0.12505774199962616, + "learning_rate": 1.4580093200441408e-06, + "loss": 0.4395, + "step": 13442 + }, + { + "epoch": 2.763490595127968, + "grad_norm": 0.23564325273036957, + "learning_rate": 1.4554885064937462e-06, + "loss": 0.4002, + "step": 13443 + }, + { + "epoch": 2.7636961661013464, + "grad_norm": 0.2280401885509491, + "learning_rate": 1.4529698381779067e-06, + "loss": 0.3785, + "step": 13444 + }, + { + "epoch": 2.763901737074725, + "grad_norm": 0.2783918082714081, + "learning_rate": 1.4504533152207028e-06, + "loss": 0.3812, + "step": 13445 + }, + { + "epoch": 2.7641073080481036, + "grad_norm": 0.2317892611026764, + "learning_rate": 1.4479389377461105e-06, + "loss": 0.3993, + "step": 13446 + }, + { + "epoch": 2.764312879021482, + "grad_norm": 0.23319129645824432, + "learning_rate": 1.4454267058780108e-06, + "loss": 0.3947, + "step": 13447 + }, + { + "epoch": 2.7645184499948607, + "grad_norm": 0.23588362336158752, + "learning_rate": 1.4429166197401594e-06, + "loss": 0.389, + "step": 13448 + }, + { + "epoch": 2.7647240209682393, + "grad_norm": 0.23121041059494019, + "learning_rate": 1.4404086794562177e-06, + "loss": 0.3932, + "step": 13449 + }, + { + "epoch": 2.764929591941618, + "grad_norm": 0.22811183333396912, + "learning_rate": 1.4379028851497516e-06, + "loss": 0.371, + "step": 13450 + }, + { + "epoch": 2.7651351629149965, + "grad_norm": 0.24577990174293518, + "learning_rate": 1.4353992369441976e-06, + "loss": 0.3868, + "step": 13451 + }, + { + "epoch": 2.765340733888375, + "grad_norm": 0.22762644290924072, + "learning_rate": 1.4328977349629019e-06, + "loss": 0.3735, + "step": 13452 + }, + { + "epoch": 2.7655463048617537, + "grad_norm": 0.23274122178554535, + "learning_rate": 1.430398379329106e-06, + "loss": 0.3751, + "step": 13453 + }, + { + "epoch": 2.7657518758351323, + "grad_norm": 0.23309700191020966, + "learning_rate": 1.4279011701659362e-06, + "loss": 0.3838, + "step": 13454 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 0.23482760787010193, + "learning_rate": 1.4254061075964143e-06, + "loss": 0.3797, + "step": 13455 + }, + { + "epoch": 2.7661630177818894, + "grad_norm": 0.11869847029447556, + "learning_rate": 1.4229131917434769e-06, + "loss": 0.4533, + "step": 13456 + }, + { + "epoch": 2.7663685887552676, + "grad_norm": 0.12506672739982605, + "learning_rate": 1.4204224227299156e-06, + "loss": 0.4454, + "step": 13457 + }, + { + "epoch": 2.766574159728646, + "grad_norm": 0.12263701856136322, + "learning_rate": 1.4179338006784626e-06, + "loss": 0.4483, + "step": 13458 + }, + { + "epoch": 2.7667797307020248, + "grad_norm": 0.24314455687999725, + "learning_rate": 1.4154473257117047e-06, + "loss": 0.3715, + "step": 13459 + }, + { + "epoch": 2.7669853016754034, + "grad_norm": 0.22938649356365204, + "learning_rate": 1.4129629979521436e-06, + "loss": 0.3668, + "step": 13460 + }, + { + "epoch": 2.767190872648782, + "grad_norm": 0.23843181133270264, + "learning_rate": 1.4104808175221717e-06, + "loss": 0.3938, + "step": 13461 + }, + { + "epoch": 2.7673964436221605, + "grad_norm": 0.22452838718891144, + "learning_rate": 1.4080007845440713e-06, + "loss": 0.3485, + "step": 13462 + }, + { + "epoch": 2.767602014595539, + "grad_norm": 0.2309243083000183, + "learning_rate": 1.4055228991400193e-06, + "loss": 0.3835, + "step": 13463 + }, + { + "epoch": 2.7678075855689177, + "grad_norm": 0.24043014645576477, + "learning_rate": 1.4030471614320984e-06, + "loss": 0.3677, + "step": 13464 + }, + { + "epoch": 2.7680131565422963, + "grad_norm": 0.24299444258213043, + "learning_rate": 1.4005735715422757e-06, + "loss": 0.392, + "step": 13465 + }, + { + "epoch": 2.768218727515675, + "grad_norm": 0.2283448427915573, + "learning_rate": 1.3981021295924091e-06, + "loss": 0.3609, + "step": 13466 + }, + { + "epoch": 2.7684242984890535, + "grad_norm": 0.2528086304664612, + "learning_rate": 1.395632835704251e-06, + "loss": 0.3771, + "step": 13467 + }, + { + "epoch": 2.768629869462432, + "grad_norm": 0.23460538685321808, + "learning_rate": 1.393165689999464e-06, + "loss": 0.3766, + "step": 13468 + }, + { + "epoch": 2.7688354404358106, + "grad_norm": 0.23330725729465485, + "learning_rate": 1.390700692599576e-06, + "loss": 0.3756, + "step": 13469 + }, + { + "epoch": 2.769041011409189, + "grad_norm": 0.11881226301193237, + "learning_rate": 1.3882378436260396e-06, + "loss": 0.4427, + "step": 13470 + }, + { + "epoch": 2.769246582382568, + "grad_norm": 0.2295810878276825, + "learning_rate": 1.3857771432001881e-06, + "loss": 0.374, + "step": 13471 + }, + { + "epoch": 2.769452153355946, + "grad_norm": 0.12282148748636246, + "learning_rate": 1.3833185914432396e-06, + "loss": 0.4614, + "step": 13472 + }, + { + "epoch": 2.7696577243293246, + "grad_norm": 0.2301100343465805, + "learning_rate": 1.3808621884763218e-06, + "loss": 0.3805, + "step": 13473 + }, + { + "epoch": 2.769863295302703, + "grad_norm": 0.23633736371994019, + "learning_rate": 1.378407934420448e-06, + "loss": 0.3947, + "step": 13474 + }, + { + "epoch": 2.7700688662760817, + "grad_norm": 0.22140897810459137, + "learning_rate": 1.375955829396532e-06, + "loss": 0.3804, + "step": 13475 + }, + { + "epoch": 2.7702744372494603, + "grad_norm": 0.12200979143381119, + "learning_rate": 1.3735058735253663e-06, + "loss": 0.4414, + "step": 13476 + }, + { + "epoch": 2.770480008222839, + "grad_norm": 0.23393046855926514, + "learning_rate": 1.3710580669276601e-06, + "loss": 0.3847, + "step": 13477 + }, + { + "epoch": 2.7706855791962175, + "grad_norm": 0.1250157356262207, + "learning_rate": 1.3686124097240066e-06, + "loss": 0.4526, + "step": 13478 + }, + { + "epoch": 2.770891150169596, + "grad_norm": 0.2439979910850525, + "learning_rate": 1.3661689020348795e-06, + "loss": 0.4, + "step": 13479 + }, + { + "epoch": 2.7710967211429747, + "grad_norm": 0.23264381289482117, + "learning_rate": 1.3637275439806723e-06, + "loss": 0.3863, + "step": 13480 + }, + { + "epoch": 2.7713022921163533, + "grad_norm": 0.23298904299736023, + "learning_rate": 1.3612883356816493e-06, + "loss": 0.3768, + "step": 13481 + }, + { + "epoch": 2.771507863089732, + "grad_norm": 0.23129281401634216, + "learning_rate": 1.3588512772579887e-06, + "loss": 0.3775, + "step": 13482 + }, + { + "epoch": 2.7717134340631104, + "grad_norm": 0.22436164319515228, + "learning_rate": 1.3564163688297398e-06, + "loss": 0.386, + "step": 13483 + }, + { + "epoch": 2.771919005036489, + "grad_norm": 0.12501150369644165, + "learning_rate": 1.353983610516872e-06, + "loss": 0.457, + "step": 13484 + }, + { + "epoch": 2.772124576009867, + "grad_norm": 0.22815532982349396, + "learning_rate": 1.3515530024392286e-06, + "loss": 0.3907, + "step": 13485 + }, + { + "epoch": 2.772330146983246, + "grad_norm": 0.23359054327011108, + "learning_rate": 1.3491245447165596e-06, + "loss": 0.38, + "step": 13486 + }, + { + "epoch": 2.7725357179566243, + "grad_norm": 0.23818518221378326, + "learning_rate": 1.3466982374684988e-06, + "loss": 0.3788, + "step": 13487 + }, + { + "epoch": 2.772741288930003, + "grad_norm": 0.2215246707201004, + "learning_rate": 1.344274080814586e-06, + "loss": 0.3717, + "step": 13488 + }, + { + "epoch": 2.7729468599033815, + "grad_norm": 0.2501251697540283, + "learning_rate": 1.3418520748742352e-06, + "loss": 0.3799, + "step": 13489 + }, + { + "epoch": 2.77315243087676, + "grad_norm": 0.22703197598457336, + "learning_rate": 1.3394322197667763e-06, + "loss": 0.3996, + "step": 13490 + }, + { + "epoch": 2.7733580018501387, + "grad_norm": 0.1154903993010521, + "learning_rate": 1.3370145156114239e-06, + "loss": 0.4539, + "step": 13491 + }, + { + "epoch": 2.7735635728235173, + "grad_norm": 0.23296396434307098, + "learning_rate": 1.3345989625272875e-06, + "loss": 0.397, + "step": 13492 + }, + { + "epoch": 2.773769143796896, + "grad_norm": 0.23191124200820923, + "learning_rate": 1.3321855606333673e-06, + "loss": 0.3711, + "step": 13493 + }, + { + "epoch": 2.7739747147702745, + "grad_norm": 0.22586466372013092, + "learning_rate": 1.3297743100485627e-06, + "loss": 0.3726, + "step": 13494 + }, + { + "epoch": 2.774180285743653, + "grad_norm": 0.23403707146644592, + "learning_rate": 1.327365210891664e-06, + "loss": 0.3872, + "step": 13495 + }, + { + "epoch": 2.7743858567170316, + "grad_norm": 0.2312314212322235, + "learning_rate": 1.3249582632813563e-06, + "loss": 0.3814, + "step": 13496 + }, + { + "epoch": 2.7745914276904102, + "grad_norm": 0.12255984544754028, + "learning_rate": 1.3225534673362144e-06, + "loss": 0.4595, + "step": 13497 + }, + { + "epoch": 2.774796998663789, + "grad_norm": 0.24602609872817993, + "learning_rate": 1.320150823174719e-06, + "loss": 0.3767, + "step": 13498 + }, + { + "epoch": 2.7750025696371674, + "grad_norm": 0.22925207018852234, + "learning_rate": 1.3177503309152351e-06, + "loss": 0.3627, + "step": 13499 + }, + { + "epoch": 2.7752081406105455, + "grad_norm": 0.2292235791683197, + "learning_rate": 1.3153519906760132e-06, + "loss": 0.3868, + "step": 13500 + }, + { + "epoch": 2.7754137115839246, + "grad_norm": 0.2379998415708542, + "learning_rate": 1.3129558025752236e-06, + "loss": 0.3589, + "step": 13501 + }, + { + "epoch": 2.7756192825573027, + "grad_norm": 0.12553149461746216, + "learning_rate": 1.3105617667309124e-06, + "loss": 0.4474, + "step": 13502 + }, + { + "epoch": 2.7758248535306813, + "grad_norm": 0.12078402936458588, + "learning_rate": 1.3081698832610146e-06, + "loss": 0.4486, + "step": 13503 + }, + { + "epoch": 2.77603042450406, + "grad_norm": 0.22506415843963623, + "learning_rate": 1.3057801522833662e-06, + "loss": 0.3689, + "step": 13504 + }, + { + "epoch": 2.7762359954774385, + "grad_norm": 0.12277916818857193, + "learning_rate": 1.3033925739157133e-06, + "loss": 0.447, + "step": 13505 + }, + { + "epoch": 2.776441566450817, + "grad_norm": 0.2416677474975586, + "learning_rate": 1.3010071482756665e-06, + "loss": 0.3757, + "step": 13506 + }, + { + "epoch": 2.7766471374241957, + "grad_norm": 0.23068147897720337, + "learning_rate": 1.2986238754807518e-06, + "loss": 0.371, + "step": 13507 + }, + { + "epoch": 2.7768527083975743, + "grad_norm": 0.23163877427577972, + "learning_rate": 1.2962427556483753e-06, + "loss": 0.358, + "step": 13508 + }, + { + "epoch": 2.777058279370953, + "grad_norm": 0.22567118704319, + "learning_rate": 1.2938637888958482e-06, + "loss": 0.3855, + "step": 13509 + }, + { + "epoch": 2.7772638503443314, + "grad_norm": 0.2283574789762497, + "learning_rate": 1.2914869753403718e-06, + "loss": 0.3802, + "step": 13510 + }, + { + "epoch": 2.77746942131771, + "grad_norm": 0.1205214112997055, + "learning_rate": 1.2891123150990376e-06, + "loss": 0.4479, + "step": 13511 + }, + { + "epoch": 2.7776749922910886, + "grad_norm": 0.12035630643367767, + "learning_rate": 1.2867398082888366e-06, + "loss": 0.4525, + "step": 13512 + }, + { + "epoch": 2.777880563264467, + "grad_norm": 0.22281195223331451, + "learning_rate": 1.2843694550266506e-06, + "loss": 0.3897, + "step": 13513 + }, + { + "epoch": 2.778086134237846, + "grad_norm": 0.22318775951862335, + "learning_rate": 1.282001255429251e-06, + "loss": 0.3767, + "step": 13514 + }, + { + "epoch": 2.778291705211224, + "grad_norm": 0.11824406683444977, + "learning_rate": 1.2796352096133195e-06, + "loss": 0.4477, + "step": 13515 + }, + { + "epoch": 2.778497276184603, + "grad_norm": 0.2342909723520279, + "learning_rate": 1.2772713176954082e-06, + "loss": 0.3699, + "step": 13516 + }, + { + "epoch": 2.778702847157981, + "grad_norm": 0.2417282909154892, + "learning_rate": 1.2749095797919785e-06, + "loss": 0.3834, + "step": 13517 + }, + { + "epoch": 2.7789084181313597, + "grad_norm": 0.2351347953081131, + "learning_rate": 1.2725499960193826e-06, + "loss": 0.3835, + "step": 13518 + }, + { + "epoch": 2.7791139891047383, + "grad_norm": 0.11795809119939804, + "learning_rate": 1.2701925664938675e-06, + "loss": 0.4358, + "step": 13519 + }, + { + "epoch": 2.779319560078117, + "grad_norm": 0.2242356687784195, + "learning_rate": 1.267837291331575e-06, + "loss": 0.3773, + "step": 13520 + }, + { + "epoch": 2.7795251310514955, + "grad_norm": 0.23602786660194397, + "learning_rate": 1.2654841706485326e-06, + "loss": 0.3895, + "step": 13521 + }, + { + "epoch": 2.779730702024874, + "grad_norm": 0.23076754808425903, + "learning_rate": 1.2631332045606725e-06, + "loss": 0.3835, + "step": 13522 + }, + { + "epoch": 2.7799362729982526, + "grad_norm": 0.11837997287511826, + "learning_rate": 1.260784393183812e-06, + "loss": 0.4325, + "step": 13523 + }, + { + "epoch": 2.780141843971631, + "grad_norm": 0.23414716124534607, + "learning_rate": 1.2584377366336687e-06, + "loss": 0.3697, + "step": 13524 + }, + { + "epoch": 2.78034741494501, + "grad_norm": 0.2339978665113449, + "learning_rate": 1.2560932350258498e-06, + "loss": 0.3645, + "step": 13525 + }, + { + "epoch": 2.7805529859183884, + "grad_norm": 0.22943206131458282, + "learning_rate": 1.2537508884758581e-06, + "loss": 0.3743, + "step": 13526 + }, + { + "epoch": 2.780758556891767, + "grad_norm": 0.24171233177185059, + "learning_rate": 1.2514106970990962e-06, + "loss": 0.3866, + "step": 13527 + }, + { + "epoch": 2.7809641278651456, + "grad_norm": 0.2301642745733261, + "learning_rate": 1.2490726610108423e-06, + "loss": 0.3776, + "step": 13528 + }, + { + "epoch": 2.781169698838524, + "grad_norm": 0.23848628997802734, + "learning_rate": 1.2467367803262937e-06, + "loss": 0.378, + "step": 13529 + }, + { + "epoch": 2.7813752698119023, + "grad_norm": 0.23197340965270996, + "learning_rate": 1.2444030551605185e-06, + "loss": 0.3848, + "step": 13530 + }, + { + "epoch": 2.7815808407852813, + "grad_norm": 0.2298150360584259, + "learning_rate": 1.24207148562849e-06, + "loss": 0.3695, + "step": 13531 + }, + { + "epoch": 2.7817864117586595, + "grad_norm": 0.23591184616088867, + "learning_rate": 1.2397420718450708e-06, + "loss": 0.3615, + "step": 13532 + }, + { + "epoch": 2.781991982732038, + "grad_norm": 0.23564256727695465, + "learning_rate": 1.2374148139250348e-06, + "loss": 0.3773, + "step": 13533 + }, + { + "epoch": 2.7821975537054167, + "grad_norm": 0.2334429770708084, + "learning_rate": 1.2350897119830195e-06, + "loss": 0.3737, + "step": 13534 + }, + { + "epoch": 2.7824031246787952, + "grad_norm": 0.12291624397039413, + "learning_rate": 1.232766766133579e-06, + "loss": 0.4507, + "step": 13535 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.23279529809951782, + "learning_rate": 1.2304459764911514e-06, + "loss": 0.385, + "step": 13536 + }, + { + "epoch": 2.7828142666255524, + "grad_norm": 0.22842784225940704, + "learning_rate": 1.2281273431700752e-06, + "loss": 0.3793, + "step": 13537 + }, + { + "epoch": 2.783019837598931, + "grad_norm": 0.2227255403995514, + "learning_rate": 1.225810866284574e-06, + "loss": 0.3711, + "step": 13538 + }, + { + "epoch": 2.7832254085723096, + "grad_norm": 0.21851521730422974, + "learning_rate": 1.2234965459487668e-06, + "loss": 0.3655, + "step": 13539 + }, + { + "epoch": 2.783430979545688, + "grad_norm": 0.5399391651153564, + "learning_rate": 1.2211843822766771e-06, + "loss": 0.3967, + "step": 13540 + }, + { + "epoch": 2.7836365505190668, + "grad_norm": 0.21860133111476898, + "learning_rate": 1.218874375382214e-06, + "loss": 0.3758, + "step": 13541 + }, + { + "epoch": 2.7838421214924454, + "grad_norm": 0.2336316704750061, + "learning_rate": 1.2165665253791764e-06, + "loss": 0.393, + "step": 13542 + }, + { + "epoch": 2.784047692465824, + "grad_norm": 0.22940628230571747, + "learning_rate": 1.2142608323812582e-06, + "loss": 0.3921, + "step": 13543 + }, + { + "epoch": 2.7842532634392025, + "grad_norm": 0.12518246471881866, + "learning_rate": 1.2119572965020588e-06, + "loss": 0.4595, + "step": 13544 + }, + { + "epoch": 2.7844588344125807, + "grad_norm": 0.12655936181545258, + "learning_rate": 1.209655917855057e-06, + "loss": 0.4395, + "step": 13545 + }, + { + "epoch": 2.7846644053859597, + "grad_norm": 0.22794488072395325, + "learning_rate": 1.2073566965536327e-06, + "loss": 0.4002, + "step": 13546 + }, + { + "epoch": 2.784869976359338, + "grad_norm": 0.23608453571796417, + "learning_rate": 1.2050596327110598e-06, + "loss": 0.3706, + "step": 13547 + }, + { + "epoch": 2.785075547332717, + "grad_norm": 0.22875483334064484, + "learning_rate": 1.202764726440503e-06, + "loss": 0.3529, + "step": 13548 + }, + { + "epoch": 2.785281118306095, + "grad_norm": 0.23389698565006256, + "learning_rate": 1.2004719778550167e-06, + "loss": 0.3826, + "step": 13549 + }, + { + "epoch": 2.7854866892794736, + "grad_norm": 0.1383344829082489, + "learning_rate": 1.1981813870675608e-06, + "loss": 0.4421, + "step": 13550 + }, + { + "epoch": 2.785692260252852, + "grad_norm": 0.2320520430803299, + "learning_rate": 1.1958929541909798e-06, + "loss": 0.3664, + "step": 13551 + }, + { + "epoch": 2.785897831226231, + "grad_norm": 0.236195906996727, + "learning_rate": 1.1936066793380035e-06, + "loss": 0.3693, + "step": 13552 + }, + { + "epoch": 2.7861034021996094, + "grad_norm": 0.23397257924079895, + "learning_rate": 1.191322562621287e-06, + "loss": 0.377, + "step": 13553 + }, + { + "epoch": 2.786308973172988, + "grad_norm": 0.23099073767662048, + "learning_rate": 1.1890406041533404e-06, + "loss": 0.3729, + "step": 13554 + }, + { + "epoch": 2.7865145441463666, + "grad_norm": 0.23826487362384796, + "learning_rate": 1.1867608040465933e-06, + "loss": 0.3772, + "step": 13555 + }, + { + "epoch": 2.786720115119745, + "grad_norm": 0.23329326510429382, + "learning_rate": 1.1844831624133611e-06, + "loss": 0.3604, + "step": 13556 + }, + { + "epoch": 2.7869256860931237, + "grad_norm": 0.22708694636821747, + "learning_rate": 1.1822076793658493e-06, + "loss": 0.3632, + "step": 13557 + }, + { + "epoch": 2.7871312570665023, + "grad_norm": 0.23591133952140808, + "learning_rate": 1.179934355016158e-06, + "loss": 0.3962, + "step": 13558 + }, + { + "epoch": 2.787336828039881, + "grad_norm": 0.23737779259681702, + "learning_rate": 1.1776631894762874e-06, + "loss": 0.37, + "step": 13559 + }, + { + "epoch": 2.787542399013259, + "grad_norm": 0.22978876531124115, + "learning_rate": 1.1753941828581283e-06, + "loss": 0.3697, + "step": 13560 + }, + { + "epoch": 2.787747969986638, + "grad_norm": 0.22215284407138824, + "learning_rate": 1.1731273352734612e-06, + "loss": 0.38, + "step": 13561 + }, + { + "epoch": 2.7879535409600162, + "grad_norm": 0.22937338054180145, + "learning_rate": 1.1708626468339619e-06, + "loss": 0.3759, + "step": 13562 + }, + { + "epoch": 2.7881591119333953, + "grad_norm": 0.2433684915304184, + "learning_rate": 1.1686001176512108e-06, + "loss": 0.383, + "step": 13563 + }, + { + "epoch": 2.7883646829067734, + "grad_norm": 0.2298046201467514, + "learning_rate": 1.1663397478366539e-06, + "loss": 0.3739, + "step": 13564 + }, + { + "epoch": 2.788570253880152, + "grad_norm": 0.12105909734964371, + "learning_rate": 1.1640815375016623e-06, + "loss": 0.4447, + "step": 13565 + }, + { + "epoch": 2.7887758248535306, + "grad_norm": 0.22252750396728516, + "learning_rate": 1.1618254867574918e-06, + "loss": 0.3771, + "step": 13566 + }, + { + "epoch": 2.788981395826909, + "grad_norm": 0.2337454855442047, + "learning_rate": 1.1595715957152686e-06, + "loss": 0.3896, + "step": 13567 + }, + { + "epoch": 2.7891869668002878, + "grad_norm": 0.23451459407806396, + "learning_rate": 1.157319864486054e-06, + "loss": 0.3887, + "step": 13568 + }, + { + "epoch": 2.7893925377736664, + "grad_norm": 0.2217395305633545, + "learning_rate": 1.155070293180764e-06, + "loss": 0.38, + "step": 13569 + }, + { + "epoch": 2.789598108747045, + "grad_norm": 0.23397988080978394, + "learning_rate": 1.1528228819102348e-06, + "loss": 0.3831, + "step": 13570 + }, + { + "epoch": 2.7898036797204235, + "grad_norm": 0.23299863934516907, + "learning_rate": 1.1505776307851784e-06, + "loss": 0.3913, + "step": 13571 + }, + { + "epoch": 2.790009250693802, + "grad_norm": 0.12697121500968933, + "learning_rate": 1.148334539916211e-06, + "loss": 0.4494, + "step": 13572 + }, + { + "epoch": 2.7902148216671807, + "grad_norm": 0.23409722745418549, + "learning_rate": 1.1460936094138342e-06, + "loss": 0.3775, + "step": 13573 + }, + { + "epoch": 2.7904203926405593, + "grad_norm": 0.23983454704284668, + "learning_rate": 1.1438548393884545e-06, + "loss": 0.3591, + "step": 13574 + }, + { + "epoch": 2.7906259636139374, + "grad_norm": 0.12223486602306366, + "learning_rate": 1.1416182299503692e-06, + "loss": 0.4504, + "step": 13575 + }, + { + "epoch": 2.7908315345873165, + "grad_norm": 0.2318880558013916, + "learning_rate": 1.1393837812097546e-06, + "loss": 0.3754, + "step": 13576 + }, + { + "epoch": 2.7910371055606946, + "grad_norm": 0.22092534601688385, + "learning_rate": 1.137151493276703e-06, + "loss": 0.3631, + "step": 13577 + }, + { + "epoch": 2.7912426765340737, + "grad_norm": 0.23008911311626434, + "learning_rate": 1.1349213662611764e-06, + "loss": 0.3736, + "step": 13578 + }, + { + "epoch": 2.791448247507452, + "grad_norm": 0.2271631807088852, + "learning_rate": 1.1326934002730516e-06, + "loss": 0.4001, + "step": 13579 + }, + { + "epoch": 2.7916538184808304, + "grad_norm": 0.22966791689395905, + "learning_rate": 1.1304675954220861e-06, + "loss": 0.3774, + "step": 13580 + }, + { + "epoch": 2.791859389454209, + "grad_norm": 0.2344343513250351, + "learning_rate": 1.1282439518179373e-06, + "loss": 0.3852, + "step": 13581 + }, + { + "epoch": 2.7920649604275876, + "grad_norm": 0.21964535117149353, + "learning_rate": 1.1260224695701571e-06, + "loss": 0.3675, + "step": 13582 + }, + { + "epoch": 2.792270531400966, + "grad_norm": 0.23566703498363495, + "learning_rate": 1.1238031487881785e-06, + "loss": 0.3684, + "step": 13583 + }, + { + "epoch": 2.7924761023743447, + "grad_norm": 0.23792453110218048, + "learning_rate": 1.1215859895813436e-06, + "loss": 0.4032, + "step": 13584 + }, + { + "epoch": 2.7926816733477233, + "grad_norm": 0.23992085456848145, + "learning_rate": 1.1193709920588803e-06, + "loss": 0.3779, + "step": 13585 + }, + { + "epoch": 2.792887244321102, + "grad_norm": 0.23918254673480988, + "learning_rate": 1.117158156329911e-06, + "loss": 0.38, + "step": 13586 + }, + { + "epoch": 2.7930928152944805, + "grad_norm": 0.23621824383735657, + "learning_rate": 1.114947482503449e-06, + "loss": 0.3967, + "step": 13587 + }, + { + "epoch": 2.793298386267859, + "grad_norm": 0.23575182259082794, + "learning_rate": 1.1127389706884017e-06, + "loss": 0.3905, + "step": 13588 + }, + { + "epoch": 2.7935039572412377, + "grad_norm": 0.11634790897369385, + "learning_rate": 1.1105326209935874e-06, + "loss": 0.4412, + "step": 13589 + }, + { + "epoch": 2.793709528214616, + "grad_norm": 0.11823614686727524, + "learning_rate": 1.108328433527689e-06, + "loss": 0.4613, + "step": 13590 + }, + { + "epoch": 2.793915099187995, + "grad_norm": 0.2277180552482605, + "learning_rate": 1.1061264083992995e-06, + "loss": 0.4023, + "step": 13591 + }, + { + "epoch": 2.794120670161373, + "grad_norm": 0.23190085589885712, + "learning_rate": 1.1039265457168973e-06, + "loss": 0.3905, + "step": 13592 + }, + { + "epoch": 2.794326241134752, + "grad_norm": 0.22355376183986664, + "learning_rate": 1.1017288455888708e-06, + "loss": 0.3748, + "step": 13593 + }, + { + "epoch": 2.79453181210813, + "grad_norm": 0.22133591771125793, + "learning_rate": 1.0995333081234783e-06, + "loss": 0.3757, + "step": 13594 + }, + { + "epoch": 2.7947373830815088, + "grad_norm": 0.2462836503982544, + "learning_rate": 1.097339933428893e-06, + "loss": 0.3903, + "step": 13595 + }, + { + "epoch": 2.7949429540548874, + "grad_norm": 0.2253459244966507, + "learning_rate": 1.095148721613169e-06, + "loss": 0.3692, + "step": 13596 + }, + { + "epoch": 2.795148525028266, + "grad_norm": 0.2545377016067505, + "learning_rate": 1.0929596727842545e-06, + "loss": 0.3871, + "step": 13597 + }, + { + "epoch": 2.7953540960016445, + "grad_norm": 0.2286592274904251, + "learning_rate": 1.0907727870499985e-06, + "loss": 0.3749, + "step": 13598 + }, + { + "epoch": 2.795559666975023, + "grad_norm": 0.23702724277973175, + "learning_rate": 1.0885880645181395e-06, + "loss": 0.3861, + "step": 13599 + }, + { + "epoch": 2.7957652379484017, + "grad_norm": 0.12967750430107117, + "learning_rate": 1.086405505296302e-06, + "loss": 0.4553, + "step": 13600 + }, + { + "epoch": 2.7959708089217803, + "grad_norm": 0.23417022824287415, + "learning_rate": 1.0842251094920042e-06, + "loss": 0.3808, + "step": 13601 + }, + { + "epoch": 2.796176379895159, + "grad_norm": 0.23133817315101624, + "learning_rate": 1.0820468772126858e-06, + "loss": 0.3838, + "step": 13602 + }, + { + "epoch": 2.7963819508685375, + "grad_norm": 0.12354867160320282, + "learning_rate": 1.0798708085656406e-06, + "loss": 0.4403, + "step": 13603 + }, + { + "epoch": 2.796587521841916, + "grad_norm": 0.2297942191362381, + "learning_rate": 1.0776969036580831e-06, + "loss": 0.3838, + "step": 13604 + }, + { + "epoch": 2.7967930928152946, + "grad_norm": 0.12144782394170761, + "learning_rate": 1.0755251625971025e-06, + "loss": 0.4596, + "step": 13605 + }, + { + "epoch": 2.7969986637886732, + "grad_norm": 0.23768429458141327, + "learning_rate": 1.0733555854896931e-06, + "loss": 0.389, + "step": 13606 + }, + { + "epoch": 2.7972042347620514, + "grad_norm": 0.11554042994976044, + "learning_rate": 1.0711881724427398e-06, + "loss": 0.4285, + "step": 13607 + }, + { + "epoch": 2.7974098057354304, + "grad_norm": 0.23497696220874786, + "learning_rate": 1.0690229235630318e-06, + "loss": 0.3711, + "step": 13608 + }, + { + "epoch": 2.7976153767088086, + "grad_norm": 0.23065055906772614, + "learning_rate": 1.0668598389572187e-06, + "loss": 0.3752, + "step": 13609 + }, + { + "epoch": 2.797820947682187, + "grad_norm": 0.2266397476196289, + "learning_rate": 1.0646989187318856e-06, + "loss": 0.3693, + "step": 13610 + }, + { + "epoch": 2.7980265186555657, + "grad_norm": 0.2287440001964569, + "learning_rate": 1.0625401629934873e-06, + "loss": 0.3822, + "step": 13611 + }, + { + "epoch": 2.7982320896289443, + "grad_norm": 0.23608548939228058, + "learning_rate": 1.0603835718483686e-06, + "loss": 0.3633, + "step": 13612 + }, + { + "epoch": 2.798437660602323, + "grad_norm": 0.23724471032619476, + "learning_rate": 1.0582291454027792e-06, + "loss": 0.3976, + "step": 13613 + }, + { + "epoch": 2.7986432315757015, + "grad_norm": 0.3131234645843506, + "learning_rate": 1.0560768837628549e-06, + "loss": 0.3685, + "step": 13614 + }, + { + "epoch": 2.79884880254908, + "grad_norm": 0.24307109415531158, + "learning_rate": 1.0539267870346253e-06, + "loss": 0.3986, + "step": 13615 + }, + { + "epoch": 2.7990543735224587, + "grad_norm": 0.23056308925151825, + "learning_rate": 1.051778855324026e-06, + "loss": 0.3667, + "step": 13616 + }, + { + "epoch": 2.7992599444958373, + "grad_norm": 0.2293158620595932, + "learning_rate": 1.0496330887368672e-06, + "loss": 0.3761, + "step": 13617 + }, + { + "epoch": 2.799465515469216, + "grad_norm": 0.23687753081321716, + "learning_rate": 1.0474894873788643e-06, + "loss": 0.4005, + "step": 13618 + }, + { + "epoch": 2.7996710864425944, + "grad_norm": 0.2287084013223648, + "learning_rate": 1.045348051355618e-06, + "loss": 0.3946, + "step": 13619 + }, + { + "epoch": 2.799876657415973, + "grad_norm": 0.23279039561748505, + "learning_rate": 1.0432087807726288e-06, + "loss": 0.3591, + "step": 13620 + }, + { + "epoch": 2.8000822283893516, + "grad_norm": 0.23075073957443237, + "learning_rate": 1.0410716757352923e-06, + "loss": 0.3777, + "step": 13621 + }, + { + "epoch": 2.8002877993627298, + "grad_norm": 0.23093274235725403, + "learning_rate": 1.0389367363488895e-06, + "loss": 0.4152, + "step": 13622 + }, + { + "epoch": 2.800493370336109, + "grad_norm": 0.2433861345052719, + "learning_rate": 1.036803962718601e-06, + "loss": 0.3827, + "step": 13623 + }, + { + "epoch": 2.800698941309487, + "grad_norm": 0.2396126538515091, + "learning_rate": 1.034673354949498e-06, + "loss": 0.3938, + "step": 13624 + }, + { + "epoch": 2.8009045122828655, + "grad_norm": 0.231951504945755, + "learning_rate": 1.0325449131465414e-06, + "loss": 0.3815, + "step": 13625 + }, + { + "epoch": 2.801110083256244, + "grad_norm": 0.23407815396785736, + "learning_rate": 1.0304186374145975e-06, + "loss": 0.3898, + "step": 13626 + }, + { + "epoch": 2.8013156542296227, + "grad_norm": 0.23772378265857697, + "learning_rate": 1.0282945278584172e-06, + "loss": 0.3771, + "step": 13627 + }, + { + "epoch": 2.8015212252030013, + "grad_norm": 0.3126058578491211, + "learning_rate": 1.026172584582632e-06, + "loss": 0.3637, + "step": 13628 + }, + { + "epoch": 2.80172679617638, + "grad_norm": 0.1172962412238121, + "learning_rate": 1.0240528076917982e-06, + "loss": 0.4601, + "step": 13629 + }, + { + "epoch": 2.8019323671497585, + "grad_norm": 0.2429337203502655, + "learning_rate": 1.0219351972903375e-06, + "loss": 0.368, + "step": 13630 + }, + { + "epoch": 2.802137938123137, + "grad_norm": 0.24216631054878235, + "learning_rate": 1.019819753482576e-06, + "loss": 0.3908, + "step": 13631 + }, + { + "epoch": 2.8023435090965156, + "grad_norm": 0.23483149707317352, + "learning_rate": 1.0177064763727356e-06, + "loss": 0.3796, + "step": 13632 + }, + { + "epoch": 2.8025490800698942, + "grad_norm": 0.22960315644741058, + "learning_rate": 1.0155953660649232e-06, + "loss": 0.3897, + "step": 13633 + }, + { + "epoch": 2.802754651043273, + "grad_norm": 0.22772780060768127, + "learning_rate": 1.0134864226631402e-06, + "loss": 0.3716, + "step": 13634 + }, + { + "epoch": 2.8029602220166514, + "grad_norm": 0.12643857300281525, + "learning_rate": 1.0113796462712888e-06, + "loss": 0.4547, + "step": 13635 + }, + { + "epoch": 2.80316579299003, + "grad_norm": 0.2468147575855255, + "learning_rate": 1.009275036993166e-06, + "loss": 0.3853, + "step": 13636 + }, + { + "epoch": 2.803371363963408, + "grad_norm": 0.25671377778053284, + "learning_rate": 1.0071725949324484e-06, + "loss": 0.3783, + "step": 13637 + }, + { + "epoch": 2.803576934936787, + "grad_norm": 0.2457493543624878, + "learning_rate": 1.0050723201927136e-06, + "loss": 0.391, + "step": 13638 + }, + { + "epoch": 2.8037825059101653, + "grad_norm": 0.23700536787509918, + "learning_rate": 1.002974212877439e-06, + "loss": 0.3792, + "step": 13639 + }, + { + "epoch": 2.803988076883544, + "grad_norm": 0.23173773288726807, + "learning_rate": 1.0008782730899764e-06, + "loss": 0.4102, + "step": 13640 + }, + { + "epoch": 2.8041936478569225, + "grad_norm": 0.22917988896369934, + "learning_rate": 9.987845009335933e-07, + "loss": 0.3808, + "step": 13641 + }, + { + "epoch": 2.804399218830301, + "grad_norm": 0.22746974229812622, + "learning_rate": 9.966928965114325e-07, + "loss": 0.3807, + "step": 13642 + }, + { + "epoch": 2.8046047898036797, + "grad_norm": 0.2384635955095291, + "learning_rate": 9.946034599265464e-07, + "loss": 0.3678, + "step": 13643 + }, + { + "epoch": 2.8048103607770583, + "grad_norm": 0.12134691327810287, + "learning_rate": 9.925161912818625e-07, + "loss": 0.4635, + "step": 13644 + }, + { + "epoch": 2.805015931750437, + "grad_norm": 0.12150728702545166, + "learning_rate": 9.90431090680224e-07, + "loss": 0.4485, + "step": 13645 + }, + { + "epoch": 2.8052215027238154, + "grad_norm": 0.2309166043996811, + "learning_rate": 9.88348158224338e-07, + "loss": 0.3792, + "step": 13646 + }, + { + "epoch": 2.805427073697194, + "grad_norm": 0.12357798218727112, + "learning_rate": 9.862673940168332e-07, + "loss": 0.4412, + "step": 13647 + }, + { + "epoch": 2.8056326446705726, + "grad_norm": 0.1247616782784462, + "learning_rate": 9.841887981602121e-07, + "loss": 0.4396, + "step": 13648 + }, + { + "epoch": 2.805838215643951, + "grad_norm": 0.231892392039299, + "learning_rate": 9.82112370756873e-07, + "loss": 0.3681, + "step": 13649 + }, + { + "epoch": 2.80604378661733, + "grad_norm": 0.23392446339130402, + "learning_rate": 9.80038111909124e-07, + "loss": 0.3689, + "step": 13650 + }, + { + "epoch": 2.8062493575907084, + "grad_norm": 0.22620777785778046, + "learning_rate": 9.779660217191484e-07, + "loss": 0.3742, + "step": 13651 + }, + { + "epoch": 2.8064549285640865, + "grad_norm": 0.23345611989498138, + "learning_rate": 9.758961002890242e-07, + "loss": 0.3886, + "step": 13652 + }, + { + "epoch": 2.8066604995374655, + "grad_norm": 0.22174043953418732, + "learning_rate": 9.738283477207405e-07, + "loss": 0.3853, + "step": 13653 + }, + { + "epoch": 2.8068660705108437, + "grad_norm": 0.125930517911911, + "learning_rate": 9.717627641161502e-07, + "loss": 0.4399, + "step": 13654 + }, + { + "epoch": 2.8070716414842223, + "grad_norm": 0.23766390979290009, + "learning_rate": 9.696993495770224e-07, + "loss": 0.3779, + "step": 13655 + }, + { + "epoch": 2.807277212457601, + "grad_norm": 0.22734849154949188, + "learning_rate": 9.676381042050053e-07, + "loss": 0.3785, + "step": 13656 + }, + { + "epoch": 2.8074827834309795, + "grad_norm": 0.12355753779411316, + "learning_rate": 9.65579028101658e-07, + "loss": 0.4431, + "step": 13657 + }, + { + "epoch": 2.807688354404358, + "grad_norm": 0.23215292394161224, + "learning_rate": 9.635221213684143e-07, + "loss": 0.3898, + "step": 13658 + }, + { + "epoch": 2.8078939253777366, + "grad_norm": 0.2282809466123581, + "learning_rate": 9.61467384106613e-07, + "loss": 0.3711, + "step": 13659 + }, + { + "epoch": 2.808099496351115, + "grad_norm": 0.23502275347709656, + "learning_rate": 9.594148164174731e-07, + "loss": 0.3777, + "step": 13660 + }, + { + "epoch": 2.808305067324494, + "grad_norm": 0.13005268573760986, + "learning_rate": 9.57364418402124e-07, + "loss": 0.4485, + "step": 13661 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.12916310131549835, + "learning_rate": 9.553161901615748e-07, + "loss": 0.433, + "step": 13662 + }, + { + "epoch": 2.808716209271251, + "grad_norm": 0.22590284049510956, + "learning_rate": 9.532701317967247e-07, + "loss": 0.374, + "step": 13663 + }, + { + "epoch": 2.8089217802446296, + "grad_norm": 0.22926348447799683, + "learning_rate": 9.512262434083879e-07, + "loss": 0.3615, + "step": 13664 + }, + { + "epoch": 2.809127351218008, + "grad_norm": 0.22875775396823883, + "learning_rate": 9.491845250972542e-07, + "loss": 0.3893, + "step": 13665 + }, + { + "epoch": 2.8093329221913867, + "grad_norm": 0.22488847374916077, + "learning_rate": 9.47144976963903e-07, + "loss": 0.3798, + "step": 13666 + }, + { + "epoch": 2.809538493164765, + "grad_norm": 0.2324180006980896, + "learning_rate": 9.451075991088138e-07, + "loss": 0.3821, + "step": 13667 + }, + { + "epoch": 2.809744064138144, + "grad_norm": 0.22795747220516205, + "learning_rate": 9.430723916323663e-07, + "loss": 0.3638, + "step": 13668 + }, + { + "epoch": 2.809949635111522, + "grad_norm": 0.23047983646392822, + "learning_rate": 9.410393546348156e-07, + "loss": 0.4035, + "step": 13669 + }, + { + "epoch": 2.8101552060849007, + "grad_norm": 0.23792816698551178, + "learning_rate": 9.390084882163214e-07, + "loss": 0.3815, + "step": 13670 + }, + { + "epoch": 2.8103607770582792, + "grad_norm": 0.22404231131076813, + "learning_rate": 9.369797924769436e-07, + "loss": 0.3589, + "step": 13671 + }, + { + "epoch": 2.810566348031658, + "grad_norm": 0.23044191300868988, + "learning_rate": 9.349532675166223e-07, + "loss": 0.3835, + "step": 13672 + }, + { + "epoch": 2.8107719190050364, + "grad_norm": 0.232622891664505, + "learning_rate": 9.329289134351927e-07, + "loss": 0.3969, + "step": 13673 + }, + { + "epoch": 2.810977489978415, + "grad_norm": 0.2428961992263794, + "learning_rate": 9.309067303323848e-07, + "loss": 0.3955, + "step": 13674 + }, + { + "epoch": 2.8111830609517936, + "grad_norm": 0.12209093570709229, + "learning_rate": 9.288867183078243e-07, + "loss": 0.4581, + "step": 13675 + }, + { + "epoch": 2.811388631925172, + "grad_norm": 0.2335900366306305, + "learning_rate": 9.268688774610313e-07, + "loss": 0.3835, + "step": 13676 + }, + { + "epoch": 2.8115942028985508, + "grad_norm": 0.22861804068088531, + "learning_rate": 9.248532078914063e-07, + "loss": 0.3936, + "step": 13677 + }, + { + "epoch": 2.8117997738719294, + "grad_norm": 0.2362525463104248, + "learning_rate": 9.2283970969826e-07, + "loss": 0.3921, + "step": 13678 + }, + { + "epoch": 2.812005344845308, + "grad_norm": 0.2308216392993927, + "learning_rate": 9.208283829807829e-07, + "loss": 0.4013, + "step": 13679 + }, + { + "epoch": 2.8122109158186865, + "grad_norm": 0.22565658390522003, + "learning_rate": 9.188192278380709e-07, + "loss": 0.3744, + "step": 13680 + }, + { + "epoch": 2.812416486792065, + "grad_norm": 0.22707243263721466, + "learning_rate": 9.168122443690997e-07, + "loss": 0.3629, + "step": 13681 + }, + { + "epoch": 2.8126220577654433, + "grad_norm": 0.22881367802619934, + "learning_rate": 9.148074326727402e-07, + "loss": 0.3871, + "step": 13682 + }, + { + "epoch": 2.8128276287388223, + "grad_norm": 0.21950559318065643, + "learning_rate": 9.128047928477685e-07, + "loss": 0.3675, + "step": 13683 + }, + { + "epoch": 2.8130331997122004, + "grad_norm": 0.23215575516223907, + "learning_rate": 9.108043249928355e-07, + "loss": 0.3695, + "step": 13684 + }, + { + "epoch": 2.813238770685579, + "grad_norm": 0.250355988740921, + "learning_rate": 9.088060292065076e-07, + "loss": 0.3879, + "step": 13685 + }, + { + "epoch": 2.8134443416589576, + "grad_norm": 0.2308340221643448, + "learning_rate": 9.068099055872259e-07, + "loss": 0.3749, + "step": 13686 + }, + { + "epoch": 2.813649912632336, + "grad_norm": 0.11878734081983566, + "learning_rate": 9.048159542333268e-07, + "loss": 0.4479, + "step": 13687 + }, + { + "epoch": 2.813855483605715, + "grad_norm": 0.12243502587080002, + "learning_rate": 9.028241752430417e-07, + "loss": 0.463, + "step": 13688 + }, + { + "epoch": 2.8140610545790934, + "grad_norm": 0.22359062731266022, + "learning_rate": 9.00834568714507e-07, + "loss": 0.3762, + "step": 13689 + }, + { + "epoch": 2.814266625552472, + "grad_norm": 0.22925424575805664, + "learning_rate": 8.988471347457295e-07, + "loss": 0.3776, + "step": 13690 + }, + { + "epoch": 2.8144721965258506, + "grad_norm": 0.2405097633600235, + "learning_rate": 8.968618734346207e-07, + "loss": 0.3733, + "step": 13691 + }, + { + "epoch": 2.814677767499229, + "grad_norm": 0.21798452734947205, + "learning_rate": 8.948787848789974e-07, + "loss": 0.365, + "step": 13692 + }, + { + "epoch": 2.8148833384726077, + "grad_norm": 0.24408473074436188, + "learning_rate": 8.928978691765466e-07, + "loss": 0.3723, + "step": 13693 + }, + { + "epoch": 2.8150889094459863, + "grad_norm": 0.23546668887138367, + "learning_rate": 8.909191264248601e-07, + "loss": 0.399, + "step": 13694 + }, + { + "epoch": 2.815294480419365, + "grad_norm": 0.2411290407180786, + "learning_rate": 8.889425567214249e-07, + "loss": 0.3898, + "step": 13695 + }, + { + "epoch": 2.8155000513927435, + "grad_norm": 0.12170881778001785, + "learning_rate": 8.869681601636181e-07, + "loss": 0.453, + "step": 13696 + }, + { + "epoch": 2.8157056223661217, + "grad_norm": 0.22512827813625336, + "learning_rate": 8.849959368487021e-07, + "loss": 0.3593, + "step": 13697 + }, + { + "epoch": 2.8159111933395007, + "grad_norm": 0.23673585057258606, + "learning_rate": 8.830258868738439e-07, + "loss": 0.3814, + "step": 13698 + }, + { + "epoch": 2.816116764312879, + "grad_norm": 0.23666398227214813, + "learning_rate": 8.81058010336101e-07, + "loss": 0.3891, + "step": 13699 + }, + { + "epoch": 2.8163223352862574, + "grad_norm": 0.23257263004779816, + "learning_rate": 8.790923073324159e-07, + "loss": 0.3874, + "step": 13700 + }, + { + "epoch": 2.816527906259636, + "grad_norm": 0.11912301182746887, + "learning_rate": 8.771287779596361e-07, + "loss": 0.4726, + "step": 13701 + }, + { + "epoch": 2.8167334772330146, + "grad_norm": 0.24169382452964783, + "learning_rate": 8.75167422314489e-07, + "loss": 0.3887, + "step": 13702 + }, + { + "epoch": 2.816939048206393, + "grad_norm": 0.24105940759181976, + "learning_rate": 8.732082404936026e-07, + "loss": 0.3656, + "step": 13703 + }, + { + "epoch": 2.8171446191797718, + "grad_norm": 0.23163765668869019, + "learning_rate": 8.712512325934946e-07, + "loss": 0.3995, + "step": 13704 + }, + { + "epoch": 2.8173501901531504, + "grad_norm": 0.24219734966754913, + "learning_rate": 8.692963987105878e-07, + "loss": 0.3994, + "step": 13705 + }, + { + "epoch": 2.817555761126529, + "grad_norm": 0.23079170286655426, + "learning_rate": 8.673437389411804e-07, + "loss": 0.386, + "step": 13706 + }, + { + "epoch": 2.8177613320999075, + "grad_norm": 0.23005284368991852, + "learning_rate": 8.653932533814702e-07, + "loss": 0.3753, + "step": 13707 + }, + { + "epoch": 2.817966903073286, + "grad_norm": 0.23586174845695496, + "learning_rate": 8.634449421275504e-07, + "loss": 0.3902, + "step": 13708 + }, + { + "epoch": 2.8181724740466647, + "grad_norm": 0.22992920875549316, + "learning_rate": 8.614988052754042e-07, + "loss": 0.3829, + "step": 13709 + }, + { + "epoch": 2.8183780450200433, + "grad_norm": 0.2352675497531891, + "learning_rate": 8.5955484292091e-07, + "loss": 0.3804, + "step": 13710 + }, + { + "epoch": 2.818583615993422, + "grad_norm": 0.22630825638771057, + "learning_rate": 8.576130551598311e-07, + "loss": 0.3642, + "step": 13711 + }, + { + "epoch": 2.8187891869668, + "grad_norm": 0.23707729578018188, + "learning_rate": 8.556734420878409e-07, + "loss": 0.3683, + "step": 13712 + }, + { + "epoch": 2.818994757940179, + "grad_norm": 0.23465366661548615, + "learning_rate": 8.537360038004883e-07, + "loss": 0.3868, + "step": 13713 + }, + { + "epoch": 2.819200328913557, + "grad_norm": 0.23585152626037598, + "learning_rate": 8.518007403932266e-07, + "loss": 0.4204, + "step": 13714 + }, + { + "epoch": 2.8194058998869362, + "grad_norm": 0.23271988332271576, + "learning_rate": 8.498676519613947e-07, + "loss": 0.3661, + "step": 13715 + }, + { + "epoch": 2.8196114708603144, + "grad_norm": 0.23224134743213654, + "learning_rate": 8.479367386002163e-07, + "loss": 0.3807, + "step": 13716 + }, + { + "epoch": 2.819817041833693, + "grad_norm": 0.22672690451145172, + "learning_rate": 8.460080004048404e-07, + "loss": 0.3921, + "step": 13717 + }, + { + "epoch": 2.8200226128070716, + "grad_norm": 0.2301137000322342, + "learning_rate": 8.44081437470266e-07, + "loss": 0.3761, + "step": 13718 + }, + { + "epoch": 2.82022818378045, + "grad_norm": 0.24038895964622498, + "learning_rate": 8.421570498914222e-07, + "loss": 0.3823, + "step": 13719 + }, + { + "epoch": 2.8204337547538287, + "grad_norm": 0.11897142231464386, + "learning_rate": 8.402348377631031e-07, + "loss": 0.4372, + "step": 13720 + }, + { + "epoch": 2.8206393257272073, + "grad_norm": 0.23280301690101624, + "learning_rate": 8.383148011800179e-07, + "loss": 0.3707, + "step": 13721 + }, + { + "epoch": 2.820844896700586, + "grad_norm": 0.2358703911304474, + "learning_rate": 8.363969402367461e-07, + "loss": 0.3826, + "step": 13722 + }, + { + "epoch": 2.8210504676739645, + "grad_norm": 0.2333759367465973, + "learning_rate": 8.34481255027777e-07, + "loss": 0.3911, + "step": 13723 + }, + { + "epoch": 2.821256038647343, + "grad_norm": 0.23327887058258057, + "learning_rate": 8.325677456474901e-07, + "loss": 0.3781, + "step": 13724 + }, + { + "epoch": 2.8214616096207217, + "grad_norm": 0.23647433519363403, + "learning_rate": 8.30656412190145e-07, + "loss": 0.3817, + "step": 13725 + }, + { + "epoch": 2.8216671805941003, + "grad_norm": 0.12305039912462234, + "learning_rate": 8.287472547499165e-07, + "loss": 0.4555, + "step": 13726 + }, + { + "epoch": 2.8218727515674784, + "grad_norm": 0.22186824679374695, + "learning_rate": 8.268402734208592e-07, + "loss": 0.3963, + "step": 13727 + }, + { + "epoch": 2.8220783225408574, + "grad_norm": 0.22588272392749786, + "learning_rate": 8.249354682969129e-07, + "loss": 0.3854, + "step": 13728 + }, + { + "epoch": 2.8222838935142356, + "grad_norm": 0.23009559512138367, + "learning_rate": 8.230328394719228e-07, + "loss": 0.3894, + "step": 13729 + }, + { + "epoch": 2.8224894644876146, + "grad_norm": 0.23012928664684296, + "learning_rate": 8.211323870396187e-07, + "loss": 0.3711, + "step": 13730 + }, + { + "epoch": 2.8226950354609928, + "grad_norm": 0.12790702283382416, + "learning_rate": 8.192341110936358e-07, + "loss": 0.466, + "step": 13731 + }, + { + "epoch": 2.8229006064343714, + "grad_norm": 0.2347603589296341, + "learning_rate": 8.173380117274792e-07, + "loss": 0.3855, + "step": 13732 + }, + { + "epoch": 2.82310617740775, + "grad_norm": 0.11841531097888947, + "learning_rate": 8.154440890345794e-07, + "loss": 0.4421, + "step": 13733 + }, + { + "epoch": 2.8233117483811285, + "grad_norm": 0.22990132868289948, + "learning_rate": 8.135523431082265e-07, + "loss": 0.373, + "step": 13734 + }, + { + "epoch": 2.823517319354507, + "grad_norm": 0.2206183522939682, + "learning_rate": 8.11662774041626e-07, + "loss": 0.3587, + "step": 13735 + }, + { + "epoch": 2.8237228903278857, + "grad_norm": 0.2378583699464798, + "learning_rate": 8.097753819278636e-07, + "loss": 0.3793, + "step": 13736 + }, + { + "epoch": 2.8239284613012643, + "grad_norm": 0.22767938673496246, + "learning_rate": 8.078901668599149e-07, + "loss": 0.3706, + "step": 13737 + }, + { + "epoch": 2.824134032274643, + "grad_norm": 0.23271609842777252, + "learning_rate": 8.060071289306753e-07, + "loss": 0.3807, + "step": 13738 + }, + { + "epoch": 2.8243396032480215, + "grad_norm": 0.21641339361667633, + "learning_rate": 8.04126268232901e-07, + "loss": 0.3673, + "step": 13739 + }, + { + "epoch": 2.8245451742214, + "grad_norm": 0.2371521145105362, + "learning_rate": 8.022475848592475e-07, + "loss": 0.3795, + "step": 13740 + }, + { + "epoch": 2.8247507451947786, + "grad_norm": 0.22861357033252716, + "learning_rate": 8.003710789022811e-07, + "loss": 0.3907, + "step": 13741 + }, + { + "epoch": 2.824956316168157, + "grad_norm": 0.23238502442836761, + "learning_rate": 7.984967504544427e-07, + "loss": 0.376, + "step": 13742 + }, + { + "epoch": 2.825161887141536, + "grad_norm": 0.2233378142118454, + "learning_rate": 7.966245996080734e-07, + "loss": 0.3744, + "step": 13743 + }, + { + "epoch": 2.825367458114914, + "grad_norm": 0.22623707354068756, + "learning_rate": 7.947546264553996e-07, + "loss": 0.3867, + "step": 13744 + }, + { + "epoch": 2.825573029088293, + "grad_norm": 0.24018484354019165, + "learning_rate": 7.928868310885573e-07, + "loss": 0.3648, + "step": 13745 + }, + { + "epoch": 2.825778600061671, + "grad_norm": 0.13057471811771393, + "learning_rate": 7.910212135995481e-07, + "loss": 0.4654, + "step": 13746 + }, + { + "epoch": 2.8259841710350497, + "grad_norm": 0.22883687913417816, + "learning_rate": 7.891577740802985e-07, + "loss": 0.3663, + "step": 13747 + }, + { + "epoch": 2.8261897420084283, + "grad_norm": 0.23778748512268066, + "learning_rate": 7.872965126226e-07, + "loss": 0.3603, + "step": 13748 + }, + { + "epoch": 2.826395312981807, + "grad_norm": 0.12038971483707428, + "learning_rate": 7.854374293181593e-07, + "loss": 0.4537, + "step": 13749 + }, + { + "epoch": 2.8266008839551855, + "grad_norm": 0.11914535611867905, + "learning_rate": 7.835805242585531e-07, + "loss": 0.4408, + "step": 13750 + }, + { + "epoch": 2.826806454928564, + "grad_norm": 0.22773846983909607, + "learning_rate": 7.817257975352682e-07, + "loss": 0.3739, + "step": 13751 + }, + { + "epoch": 2.8270120259019427, + "grad_norm": 0.2309103161096573, + "learning_rate": 7.798732492396815e-07, + "loss": 0.3781, + "step": 13752 + }, + { + "epoch": 2.8272175968753213, + "grad_norm": 0.12411284446716309, + "learning_rate": 7.780228794630451e-07, + "loss": 0.4418, + "step": 13753 + }, + { + "epoch": 2.8274231678487, + "grad_norm": 0.22320185601711273, + "learning_rate": 7.761746882965359e-07, + "loss": 0.3706, + "step": 13754 + }, + { + "epoch": 2.8276287388220784, + "grad_norm": 0.23378294706344604, + "learning_rate": 7.743286758312013e-07, + "loss": 0.3784, + "step": 13755 + }, + { + "epoch": 2.827834309795457, + "grad_norm": 0.23577441275119781, + "learning_rate": 7.724848421579784e-07, + "loss": 0.371, + "step": 13756 + }, + { + "epoch": 2.828039880768835, + "grad_norm": 0.22351431846618652, + "learning_rate": 7.706431873677094e-07, + "loss": 0.3703, + "step": 13757 + }, + { + "epoch": 2.828245451742214, + "grad_norm": 0.24170389771461487, + "learning_rate": 7.688037115511171e-07, + "loss": 0.391, + "step": 13758 + }, + { + "epoch": 2.8284510227155923, + "grad_norm": 0.23205341398715973, + "learning_rate": 7.669664147988387e-07, + "loss": 0.3744, + "step": 13759 + }, + { + "epoch": 2.8286565936889714, + "grad_norm": 0.22255219519138336, + "learning_rate": 7.651312972013769e-07, + "loss": 0.3775, + "step": 13760 + }, + { + "epoch": 2.8288621646623495, + "grad_norm": 0.22708290815353394, + "learning_rate": 7.632983588491393e-07, + "loss": 0.3945, + "step": 13761 + }, + { + "epoch": 2.829067735635728, + "grad_norm": 0.23190079629421234, + "learning_rate": 7.614675998324339e-07, + "loss": 0.3955, + "step": 13762 + }, + { + "epoch": 2.8292733066091067, + "grad_norm": 0.11703302711248398, + "learning_rate": 7.596390202414483e-07, + "loss": 0.4556, + "step": 13763 + }, + { + "epoch": 2.8294788775824853, + "grad_norm": 0.232466459274292, + "learning_rate": 7.578126201662706e-07, + "loss": 0.3894, + "step": 13764 + }, + { + "epoch": 2.829684448555864, + "grad_norm": 0.23175998032093048, + "learning_rate": 7.559883996968787e-07, + "loss": 0.36, + "step": 13765 + }, + { + "epoch": 2.8298900195292425, + "grad_norm": 0.2221493124961853, + "learning_rate": 7.541663589231407e-07, + "loss": 0.3767, + "step": 13766 + }, + { + "epoch": 2.830095590502621, + "grad_norm": 0.23145779967308044, + "learning_rate": 7.5234649793482e-07, + "loss": 0.3761, + "step": 13767 + }, + { + "epoch": 2.8303011614759996, + "grad_norm": 0.2308301031589508, + "learning_rate": 7.505288168215746e-07, + "loss": 0.3777, + "step": 13768 + }, + { + "epoch": 2.8305067324493782, + "grad_norm": 0.22926832735538483, + "learning_rate": 7.487133156729531e-07, + "loss": 0.3794, + "step": 13769 + }, + { + "epoch": 2.830712303422757, + "grad_norm": 0.22793909907341003, + "learning_rate": 7.468999945783989e-07, + "loss": 0.3854, + "step": 13770 + }, + { + "epoch": 2.8309178743961354, + "grad_norm": 0.23420362174510956, + "learning_rate": 7.450888536272455e-07, + "loss": 0.3804, + "step": 13771 + }, + { + "epoch": 2.8311234453695135, + "grad_norm": 0.2258753925561905, + "learning_rate": 7.432798929087115e-07, + "loss": 0.386, + "step": 13772 + }, + { + "epoch": 2.8313290163428926, + "grad_norm": 0.12601035833358765, + "learning_rate": 7.414731125119256e-07, + "loss": 0.4424, + "step": 13773 + }, + { + "epoch": 2.8315345873162707, + "grad_norm": 0.22683130204677582, + "learning_rate": 7.396685125258917e-07, + "loss": 0.3806, + "step": 13774 + }, + { + "epoch": 2.8317401582896498, + "grad_norm": 0.23239809274673462, + "learning_rate": 7.378660930395237e-07, + "loss": 0.373, + "step": 13775 + }, + { + "epoch": 2.831945729263028, + "grad_norm": 0.23171231150627136, + "learning_rate": 7.360658541416054e-07, + "loss": 0.3781, + "step": 13776 + }, + { + "epoch": 2.8321513002364065, + "grad_norm": 0.23430903255939484, + "learning_rate": 7.34267795920841e-07, + "loss": 0.3819, + "step": 13777 + }, + { + "epoch": 2.832356871209785, + "grad_norm": 0.22949565947055817, + "learning_rate": 7.324719184657997e-07, + "loss": 0.378, + "step": 13778 + }, + { + "epoch": 2.8325624421831637, + "grad_norm": 0.11871360242366791, + "learning_rate": 7.306782218649605e-07, + "loss": 0.4448, + "step": 13779 + }, + { + "epoch": 2.8327680131565423, + "grad_norm": 0.2298881858587265, + "learning_rate": 7.288867062066928e-07, + "loss": 0.3606, + "step": 13780 + }, + { + "epoch": 2.832973584129921, + "grad_norm": 0.11663959920406342, + "learning_rate": 7.270973715792562e-07, + "loss": 0.4501, + "step": 13781 + }, + { + "epoch": 2.8331791551032994, + "grad_norm": 0.12173844128847122, + "learning_rate": 7.253102180707949e-07, + "loss": 0.4564, + "step": 13782 + }, + { + "epoch": 2.833384726076678, + "grad_norm": 0.2263535112142563, + "learning_rate": 7.235252457693686e-07, + "loss": 0.3858, + "step": 13783 + }, + { + "epoch": 2.8335902970500566, + "grad_norm": 0.11779969185590744, + "learning_rate": 7.21742454762902e-07, + "loss": 0.4431, + "step": 13784 + }, + { + "epoch": 2.833795868023435, + "grad_norm": 0.2434069812297821, + "learning_rate": 7.199618451392298e-07, + "loss": 0.4067, + "step": 13785 + }, + { + "epoch": 2.834001438996814, + "grad_norm": 0.22886650264263153, + "learning_rate": 7.181834169860719e-07, + "loss": 0.3828, + "step": 13786 + }, + { + "epoch": 2.8342070099701924, + "grad_norm": 0.2306927889585495, + "learning_rate": 7.16407170391038e-07, + "loss": 0.3762, + "step": 13787 + }, + { + "epoch": 2.834412580943571, + "grad_norm": 0.2322409451007843, + "learning_rate": 7.146331054416483e-07, + "loss": 0.3907, + "step": 13788 + }, + { + "epoch": 2.834618151916949, + "grad_norm": 0.22728115320205688, + "learning_rate": 7.128612222252979e-07, + "loss": 0.3824, + "step": 13789 + }, + { + "epoch": 2.834823722890328, + "grad_norm": 0.225159153342247, + "learning_rate": 7.110915208292768e-07, + "loss": 0.4054, + "step": 13790 + }, + { + "epoch": 2.8350292938637063, + "grad_norm": 0.12113186717033386, + "learning_rate": 7.093240013407704e-07, + "loss": 0.439, + "step": 13791 + }, + { + "epoch": 2.835234864837085, + "grad_norm": 0.2332168072462082, + "learning_rate": 7.07558663846854e-07, + "loss": 0.3793, + "step": 13792 + }, + { + "epoch": 2.8354404358104635, + "grad_norm": 0.22835347056388855, + "learning_rate": 7.05795508434503e-07, + "loss": 0.3758, + "step": 13793 + }, + { + "epoch": 2.835646006783842, + "grad_norm": 0.12069544196128845, + "learning_rate": 7.040345351905731e-07, + "loss": 0.4602, + "step": 13794 + }, + { + "epoch": 2.8358515777572206, + "grad_norm": 0.22868898510932922, + "learning_rate": 7.022757442018246e-07, + "loss": 0.3804, + "step": 13795 + }, + { + "epoch": 2.836057148730599, + "grad_norm": 0.232134148478508, + "learning_rate": 7.005191355549034e-07, + "loss": 0.3889, + "step": 13796 + }, + { + "epoch": 2.836262719703978, + "grad_norm": 0.23718050122261047, + "learning_rate": 6.987647093363503e-07, + "loss": 0.3728, + "step": 13797 + }, + { + "epoch": 2.8364682906773564, + "grad_norm": 0.24368955194950104, + "learning_rate": 6.970124656325911e-07, + "loss": 0.3852, + "step": 13798 + }, + { + "epoch": 2.836673861650735, + "grad_norm": 0.2304588258266449, + "learning_rate": 6.952624045299617e-07, + "loss": 0.3809, + "step": 13799 + }, + { + "epoch": 2.8368794326241136, + "grad_norm": 0.23114575445652008, + "learning_rate": 6.935145261146731e-07, + "loss": 0.3808, + "step": 13800 + }, + { + "epoch": 2.837085003597492, + "grad_norm": 0.22746378183364868, + "learning_rate": 6.917688304728315e-07, + "loss": 0.3887, + "step": 13801 + }, + { + "epoch": 2.8372905745708707, + "grad_norm": 0.22767049074172974, + "learning_rate": 6.900253176904481e-07, + "loss": 0.3729, + "step": 13802 + }, + { + "epoch": 2.8374961455442493, + "grad_norm": 0.22864069044589996, + "learning_rate": 6.882839878534092e-07, + "loss": 0.3854, + "step": 13803 + }, + { + "epoch": 2.8377017165176275, + "grad_norm": 0.22305408120155334, + "learning_rate": 6.865448410475112e-07, + "loss": 0.4005, + "step": 13804 + }, + { + "epoch": 2.8379072874910065, + "grad_norm": 0.22816435992717743, + "learning_rate": 6.848078773584255e-07, + "loss": 0.3775, + "step": 13805 + }, + { + "epoch": 2.8381128584643847, + "grad_norm": 0.23188713192939758, + "learning_rate": 6.830730968717236e-07, + "loss": 0.3879, + "step": 13806 + }, + { + "epoch": 2.8383184294377632, + "grad_norm": 0.11994650214910507, + "learning_rate": 6.813404996728823e-07, + "loss": 0.4432, + "step": 13807 + }, + { + "epoch": 2.838524000411142, + "grad_norm": 0.23941002786159515, + "learning_rate": 6.796100858472382e-07, + "loss": 0.3655, + "step": 13808 + }, + { + "epoch": 2.8387295713845204, + "grad_norm": 0.12042734026908875, + "learning_rate": 6.778818554800581e-07, + "loss": 0.451, + "step": 13809 + }, + { + "epoch": 2.838935142357899, + "grad_norm": 0.23225072026252747, + "learning_rate": 6.76155808656479e-07, + "loss": 0.3759, + "step": 13810 + }, + { + "epoch": 2.8391407133312776, + "grad_norm": 0.23144301772117615, + "learning_rate": 6.744319454615328e-07, + "loss": 0.3922, + "step": 13811 + }, + { + "epoch": 2.839346284304656, + "grad_norm": 0.24022118747234344, + "learning_rate": 6.727102659801515e-07, + "loss": 0.3847, + "step": 13812 + }, + { + "epoch": 2.8395518552780348, + "grad_norm": 0.22620242834091187, + "learning_rate": 6.709907702971474e-07, + "loss": 0.3849, + "step": 13813 + }, + { + "epoch": 2.8397574262514134, + "grad_norm": 0.2255433201789856, + "learning_rate": 6.692734584972326e-07, + "loss": 0.3737, + "step": 13814 + }, + { + "epoch": 2.839962997224792, + "grad_norm": 0.2278052270412445, + "learning_rate": 6.675583306650096e-07, + "loss": 0.3742, + "step": 13815 + }, + { + "epoch": 2.8401685681981705, + "grad_norm": 0.22527383267879486, + "learning_rate": 6.658453868849857e-07, + "loss": 0.3887, + "step": 13816 + }, + { + "epoch": 2.840374139171549, + "grad_norm": 0.2278517484664917, + "learning_rate": 6.641346272415383e-07, + "loss": 0.3734, + "step": 13817 + }, + { + "epoch": 2.8405797101449277, + "grad_norm": 0.23448723554611206, + "learning_rate": 6.624260518189551e-07, + "loss": 0.3784, + "step": 13818 + }, + { + "epoch": 2.840785281118306, + "grad_norm": 0.24033266305923462, + "learning_rate": 6.607196607014088e-07, + "loss": 0.3812, + "step": 13819 + }, + { + "epoch": 2.840990852091685, + "grad_norm": 0.22752645611763, + "learning_rate": 6.590154539729621e-07, + "loss": 0.3747, + "step": 13820 + }, + { + "epoch": 2.841196423065063, + "grad_norm": 0.2382228821516037, + "learning_rate": 6.573134317175728e-07, + "loss": 0.3989, + "step": 13821 + }, + { + "epoch": 2.8414019940384416, + "grad_norm": 0.23340356349945068, + "learning_rate": 6.556135940190888e-07, + "loss": 0.3767, + "step": 13822 + }, + { + "epoch": 2.84160756501182, + "grad_norm": 0.12209226191043854, + "learning_rate": 6.539159409612633e-07, + "loss": 0.4466, + "step": 13823 + }, + { + "epoch": 2.841813135985199, + "grad_norm": 0.22561949491500854, + "learning_rate": 6.522204726277293e-07, + "loss": 0.3758, + "step": 13824 + }, + { + "epoch": 2.8420187069585774, + "grad_norm": 0.225555419921875, + "learning_rate": 6.505271891020048e-07, + "loss": 0.3724, + "step": 13825 + }, + { + "epoch": 2.842224277931956, + "grad_norm": 0.2285340279340744, + "learning_rate": 6.488360904675234e-07, + "loss": 0.3866, + "step": 13826 + }, + { + "epoch": 2.8424298489053346, + "grad_norm": 0.2325884997844696, + "learning_rate": 6.471471768075882e-07, + "loss": 0.3787, + "step": 13827 + }, + { + "epoch": 2.842635419878713, + "grad_norm": 0.1197914183139801, + "learning_rate": 6.454604482054077e-07, + "loss": 0.4564, + "step": 13828 + }, + { + "epoch": 2.8428409908520917, + "grad_norm": 0.24161775410175323, + "learning_rate": 6.437759047440706e-07, + "loss": 0.3779, + "step": 13829 + }, + { + "epoch": 2.8430465618254703, + "grad_norm": 0.23106519877910614, + "learning_rate": 6.420935465065853e-07, + "loss": 0.3715, + "step": 13830 + }, + { + "epoch": 2.843252132798849, + "grad_norm": 0.22928760945796967, + "learning_rate": 6.404133735758156e-07, + "loss": 0.3916, + "step": 13831 + }, + { + "epoch": 2.8434577037722275, + "grad_norm": 0.22873489558696747, + "learning_rate": 6.387353860345452e-07, + "loss": 0.381, + "step": 13832 + }, + { + "epoch": 2.843663274745606, + "grad_norm": 0.23243139684200287, + "learning_rate": 6.370595839654431e-07, + "loss": 0.3902, + "step": 13833 + }, + { + "epoch": 2.8438688457189842, + "grad_norm": 0.2291172593832016, + "learning_rate": 6.353859674510582e-07, + "loss": 0.3911, + "step": 13834 + }, + { + "epoch": 2.8440744166923633, + "grad_norm": 0.22925592958927155, + "learning_rate": 6.337145365738495e-07, + "loss": 0.3684, + "step": 13835 + }, + { + "epoch": 2.8442799876657414, + "grad_norm": 0.22563427686691284, + "learning_rate": 6.320452914161512e-07, + "loss": 0.3863, + "step": 13836 + }, + { + "epoch": 2.84448555863912, + "grad_norm": 0.23132719099521637, + "learning_rate": 6.303782320602126e-07, + "loss": 0.397, + "step": 13837 + }, + { + "epoch": 2.8446911296124986, + "grad_norm": 0.12186164408922195, + "learning_rate": 6.287133585881528e-07, + "loss": 0.4323, + "step": 13838 + }, + { + "epoch": 2.844896700585877, + "grad_norm": 0.1260182410478592, + "learning_rate": 6.270506710819963e-07, + "loss": 0.4418, + "step": 13839 + }, + { + "epoch": 2.8451022715592558, + "grad_norm": 0.11887041479349136, + "learning_rate": 6.253901696236575e-07, + "loss": 0.4506, + "step": 13840 + }, + { + "epoch": 2.8453078425326344, + "grad_norm": 0.23686912655830383, + "learning_rate": 6.237318542949361e-07, + "loss": 0.3608, + "step": 13841 + }, + { + "epoch": 2.845513413506013, + "grad_norm": 0.2436566948890686, + "learning_rate": 6.220757251775316e-07, + "loss": 0.3661, + "step": 13842 + }, + { + "epoch": 2.8457189844793915, + "grad_norm": 0.2323562502861023, + "learning_rate": 6.20421782353034e-07, + "loss": 0.3828, + "step": 13843 + }, + { + "epoch": 2.84592455545277, + "grad_norm": 0.12596507370471954, + "learning_rate": 6.187700259029227e-07, + "loss": 0.4397, + "step": 13844 + }, + { + "epoch": 2.8461301264261487, + "grad_norm": 0.243175208568573, + "learning_rate": 6.17120455908578e-07, + "loss": 0.3926, + "step": 13845 + }, + { + "epoch": 2.8463356973995273, + "grad_norm": 0.24358853697776794, + "learning_rate": 6.154730724512648e-07, + "loss": 0.3934, + "step": 13846 + }, + { + "epoch": 2.846541268372906, + "grad_norm": 0.23144344985485077, + "learning_rate": 6.13827875612138e-07, + "loss": 0.3733, + "step": 13847 + }, + { + "epoch": 2.8467468393462845, + "grad_norm": 0.33637747168540955, + "learning_rate": 6.121848654722528e-07, + "loss": 0.3871, + "step": 13848 + }, + { + "epoch": 2.8469524103196626, + "grad_norm": 0.24188685417175293, + "learning_rate": 6.105440421125497e-07, + "loss": 0.3871, + "step": 13849 + }, + { + "epoch": 2.8471579812930417, + "grad_norm": 0.12031394243240356, + "learning_rate": 6.089054056138687e-07, + "loss": 0.441, + "step": 13850 + }, + { + "epoch": 2.84736355226642, + "grad_norm": 0.23142001032829285, + "learning_rate": 6.072689560569306e-07, + "loss": 0.3923, + "step": 13851 + }, + { + "epoch": 2.8475691232397984, + "grad_norm": 0.23788262903690338, + "learning_rate": 6.056346935223656e-07, + "loss": 0.3881, + "step": 13852 + }, + { + "epoch": 2.847774694213177, + "grad_norm": 0.23109963536262512, + "learning_rate": 6.040026180906744e-07, + "loss": 0.3941, + "step": 13853 + }, + { + "epoch": 2.8479802651865556, + "grad_norm": 0.23182469606399536, + "learning_rate": 6.023727298422726e-07, + "loss": 0.3771, + "step": 13854 + }, + { + "epoch": 2.848185836159934, + "grad_norm": 0.23489411175251007, + "learning_rate": 6.007450288574512e-07, + "loss": 0.3841, + "step": 13855 + }, + { + "epoch": 2.8483914071333127, + "grad_norm": 0.23740611970424652, + "learning_rate": 5.991195152164009e-07, + "loss": 0.3707, + "step": 13856 + }, + { + "epoch": 2.8485969781066913, + "grad_norm": 0.23565572500228882, + "learning_rate": 5.974961889992026e-07, + "loss": 0.4023, + "step": 13857 + }, + { + "epoch": 2.84880254908007, + "grad_norm": 0.23655489087104797, + "learning_rate": 5.958750502858274e-07, + "loss": 0.3848, + "step": 13858 + }, + { + "epoch": 2.8490081200534485, + "grad_norm": 0.2304118573665619, + "learning_rate": 5.942560991561464e-07, + "loss": 0.3871, + "step": 13859 + }, + { + "epoch": 2.849213691026827, + "grad_norm": 0.22532600164413452, + "learning_rate": 5.926393356899207e-07, + "loss": 0.3746, + "step": 13860 + }, + { + "epoch": 2.8494192620002057, + "grad_norm": 0.22565500438213348, + "learning_rate": 5.910247599667867e-07, + "loss": 0.4012, + "step": 13861 + }, + { + "epoch": 2.8496248329735843, + "grad_norm": 0.22938272356987, + "learning_rate": 5.894123720663009e-07, + "loss": 0.3793, + "step": 13862 + }, + { + "epoch": 2.849830403946963, + "grad_norm": 0.2282402366399765, + "learning_rate": 5.878021720678894e-07, + "loss": 0.3631, + "step": 13863 + }, + { + "epoch": 2.850035974920341, + "grad_norm": 0.23935887217521667, + "learning_rate": 5.861941600508841e-07, + "loss": 0.3811, + "step": 13864 + }, + { + "epoch": 2.85024154589372, + "grad_norm": 0.12173505127429962, + "learning_rate": 5.845883360945065e-07, + "loss": 0.4352, + "step": 13865 + }, + { + "epoch": 2.850447116867098, + "grad_norm": 0.12043416500091553, + "learning_rate": 5.829847002778633e-07, + "loss": 0.4488, + "step": 13866 + }, + { + "epoch": 2.8506526878404768, + "grad_norm": 0.23177044093608856, + "learning_rate": 5.813832526799562e-07, + "loss": 0.3819, + "step": 13867 + }, + { + "epoch": 2.8508582588138554, + "grad_norm": 0.12020587176084518, + "learning_rate": 5.797839933796823e-07, + "loss": 0.4398, + "step": 13868 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 0.2312840223312378, + "learning_rate": 5.781869224558384e-07, + "loss": 0.3687, + "step": 13869 + }, + { + "epoch": 2.8512694007606125, + "grad_norm": 0.12858018279075623, + "learning_rate": 5.765920399870917e-07, + "loss": 0.4559, + "step": 13870 + }, + { + "epoch": 2.851474971733991, + "grad_norm": 0.24785396456718445, + "learning_rate": 5.749993460520242e-07, + "loss": 0.3848, + "step": 13871 + }, + { + "epoch": 2.8516805427073697, + "grad_norm": 0.23876793682575226, + "learning_rate": 5.734088407290933e-07, + "loss": 0.4002, + "step": 13872 + }, + { + "epoch": 2.8518861136807483, + "grad_norm": 0.12341229617595673, + "learning_rate": 5.718205240966662e-07, + "loss": 0.4539, + "step": 13873 + }, + { + "epoch": 2.852091684654127, + "grad_norm": 0.23897776007652283, + "learning_rate": 5.702343962329803e-07, + "loss": 0.3986, + "step": 13874 + }, + { + "epoch": 2.8522972556275055, + "grad_norm": 0.11988009512424469, + "learning_rate": 5.686504572161833e-07, + "loss": 0.4562, + "step": 13875 + }, + { + "epoch": 2.852502826600884, + "grad_norm": 0.23703759908676147, + "learning_rate": 5.670687071243075e-07, + "loss": 0.382, + "step": 13876 + }, + { + "epoch": 2.8527083975742626, + "grad_norm": 0.23015399277210236, + "learning_rate": 5.654891460352707e-07, + "loss": 0.3671, + "step": 13877 + }, + { + "epoch": 2.8529139685476412, + "grad_norm": 0.23037444055080414, + "learning_rate": 5.639117740269056e-07, + "loss": 0.3773, + "step": 13878 + }, + { + "epoch": 2.8531195395210194, + "grad_norm": 0.2336786836385727, + "learning_rate": 5.623365911769102e-07, + "loss": 0.385, + "step": 13879 + }, + { + "epoch": 2.8533251104943984, + "grad_norm": 0.24950271844863892, + "learning_rate": 5.607635975628922e-07, + "loss": 0.3763, + "step": 13880 + }, + { + "epoch": 2.8535306814677766, + "grad_norm": 0.2312586009502411, + "learning_rate": 5.591927932623397e-07, + "loss": 0.3725, + "step": 13881 + }, + { + "epoch": 2.8537362524411556, + "grad_norm": 0.23014506697654724, + "learning_rate": 5.57624178352646e-07, + "loss": 0.3614, + "step": 13882 + }, + { + "epoch": 2.8539418234145337, + "grad_norm": 0.22436246275901794, + "learning_rate": 5.560577529110839e-07, + "loss": 0.3772, + "step": 13883 + }, + { + "epoch": 2.8541473943879123, + "grad_norm": 0.12695522606372833, + "learning_rate": 5.544935170148218e-07, + "loss": 0.4635, + "step": 13884 + }, + { + "epoch": 2.854352965361291, + "grad_norm": 0.24410668015480042, + "learning_rate": 5.529314707409333e-07, + "loss": 0.378, + "step": 13885 + }, + { + "epoch": 2.8545585363346695, + "grad_norm": 0.12377558648586273, + "learning_rate": 5.513716141663616e-07, + "loss": 0.435, + "step": 13886 + }, + { + "epoch": 2.854764107308048, + "grad_norm": 0.24002113938331604, + "learning_rate": 5.498139473679603e-07, + "loss": 0.3777, + "step": 13887 + }, + { + "epoch": 2.8549696782814267, + "grad_norm": 0.23580054938793182, + "learning_rate": 5.48258470422463e-07, + "loss": 0.3832, + "step": 13888 + }, + { + "epoch": 2.8551752492548053, + "grad_norm": 0.23273934423923492, + "learning_rate": 5.467051834065084e-07, + "loss": 0.3725, + "step": 13889 + }, + { + "epoch": 2.855380820228184, + "grad_norm": 0.23366734385490417, + "learning_rate": 5.451540863966103e-07, + "loss": 0.3706, + "step": 13890 + }, + { + "epoch": 2.8555863912015624, + "grad_norm": 0.11989044398069382, + "learning_rate": 5.436051794691926e-07, + "loss": 0.4374, + "step": 13891 + }, + { + "epoch": 2.855791962174941, + "grad_norm": 0.22055114805698395, + "learning_rate": 5.420584627005593e-07, + "loss": 0.3711, + "step": 13892 + }, + { + "epoch": 2.8559975331483196, + "grad_norm": 0.12336910516023636, + "learning_rate": 5.405139361669093e-07, + "loss": 0.444, + "step": 13893 + }, + { + "epoch": 2.8562031041216978, + "grad_norm": 0.1187121644616127, + "learning_rate": 5.389715999443318e-07, + "loss": 0.4488, + "step": 13894 + }, + { + "epoch": 2.856408675095077, + "grad_norm": 0.21668803691864014, + "learning_rate": 5.37431454108816e-07, + "loss": 0.3714, + "step": 13895 + }, + { + "epoch": 2.856614246068455, + "grad_norm": 0.11917508393526077, + "learning_rate": 5.358934987362363e-07, + "loss": 0.4409, + "step": 13896 + }, + { + "epoch": 2.856819817041834, + "grad_norm": 0.22866788506507874, + "learning_rate": 5.34357733902357e-07, + "loss": 0.3774, + "step": 13897 + }, + { + "epoch": 2.857025388015212, + "grad_norm": 0.12167064100503922, + "learning_rate": 5.328241596828376e-07, + "loss": 0.452, + "step": 13898 + }, + { + "epoch": 2.8572309589885907, + "grad_norm": 0.12296809256076813, + "learning_rate": 5.312927761532377e-07, + "loss": 0.4389, + "step": 13899 + }, + { + "epoch": 2.8574365299619693, + "grad_norm": 0.24001666903495789, + "learning_rate": 5.297635833889969e-07, + "loss": 0.3771, + "step": 13900 + }, + { + "epoch": 2.857642100935348, + "grad_norm": 0.22801834344863892, + "learning_rate": 5.2823658146545e-07, + "loss": 0.3763, + "step": 13901 + }, + { + "epoch": 2.8578476719087265, + "grad_norm": 0.22676675021648407, + "learning_rate": 5.267117704578267e-07, + "loss": 0.3693, + "step": 13902 + }, + { + "epoch": 2.858053242882105, + "grad_norm": 0.2277052402496338, + "learning_rate": 5.251891504412421e-07, + "loss": 0.3509, + "step": 13903 + }, + { + "epoch": 2.8582588138554836, + "grad_norm": 0.22454136610031128, + "learning_rate": 5.23668721490716e-07, + "loss": 0.3813, + "step": 13904 + }, + { + "epoch": 2.8584643848288622, + "grad_norm": 0.2237093299627304, + "learning_rate": 5.221504836811486e-07, + "loss": 0.3734, + "step": 13905 + }, + { + "epoch": 2.858669955802241, + "grad_norm": 0.24160228669643402, + "learning_rate": 5.2063443708734e-07, + "loss": 0.3786, + "step": 13906 + }, + { + "epoch": 2.8588755267756194, + "grad_norm": 0.2331501841545105, + "learning_rate": 5.191205817839806e-07, + "loss": 0.3789, + "step": 13907 + }, + { + "epoch": 2.859081097748998, + "grad_norm": 0.24461065232753754, + "learning_rate": 5.176089178456406e-07, + "loss": 0.3826, + "step": 13908 + }, + { + "epoch": 2.859286668722376, + "grad_norm": 0.22187209129333496, + "learning_rate": 5.160994453468055e-07, + "loss": 0.364, + "step": 13909 + }, + { + "epoch": 2.859492239695755, + "grad_norm": 0.232316792011261, + "learning_rate": 5.145921643618257e-07, + "loss": 0.3813, + "step": 13910 + }, + { + "epoch": 2.8596978106691333, + "grad_norm": 0.22536687552928925, + "learning_rate": 5.130870749649669e-07, + "loss": 0.3738, + "step": 13911 + }, + { + "epoch": 2.8599033816425123, + "grad_norm": 0.2332964688539505, + "learning_rate": 5.115841772303798e-07, + "loss": 0.376, + "step": 13912 + }, + { + "epoch": 2.8601089526158905, + "grad_norm": 0.23040318489074707, + "learning_rate": 5.100834712321001e-07, + "loss": 0.3887, + "step": 13913 + }, + { + "epoch": 2.860314523589269, + "grad_norm": 0.2240133285522461, + "learning_rate": 5.085849570440638e-07, + "loss": 0.3693, + "step": 13914 + }, + { + "epoch": 2.8605200945626477, + "grad_norm": 0.2326270490884781, + "learning_rate": 5.070886347400966e-07, + "loss": 0.3749, + "step": 13915 + }, + { + "epoch": 2.8607256655360263, + "grad_norm": 0.12496310472488403, + "learning_rate": 5.055945043939098e-07, + "loss": 0.4531, + "step": 13916 + }, + { + "epoch": 2.860931236509405, + "grad_norm": 0.12099100649356842, + "learning_rate": 5.041025660791193e-07, + "loss": 0.4613, + "step": 13917 + }, + { + "epoch": 2.8611368074827834, + "grad_norm": 0.23122435808181763, + "learning_rate": 5.026128198692165e-07, + "loss": 0.3912, + "step": 13918 + }, + { + "epoch": 2.861342378456162, + "grad_norm": 0.24232856929302216, + "learning_rate": 5.011252658376025e-07, + "loss": 0.3617, + "step": 13919 + }, + { + "epoch": 2.8615479494295406, + "grad_norm": 0.2327503263950348, + "learning_rate": 4.996399040575589e-07, + "loss": 0.3817, + "step": 13920 + }, + { + "epoch": 2.861753520402919, + "grad_norm": 0.2326626479625702, + "learning_rate": 4.981567346022619e-07, + "loss": 0.3987, + "step": 13921 + }, + { + "epoch": 2.861959091376298, + "grad_norm": 0.22813312709331512, + "learning_rate": 4.966757575447833e-07, + "loss": 0.3884, + "step": 13922 + }, + { + "epoch": 2.8621646623496764, + "grad_norm": 0.22625859081745148, + "learning_rate": 4.951969729580846e-07, + "loss": 0.3947, + "step": 13923 + }, + { + "epoch": 2.8623702333230545, + "grad_norm": 0.23106195032596588, + "learning_rate": 4.937203809150126e-07, + "loss": 0.376, + "step": 13924 + }, + { + "epoch": 2.8625758042964335, + "grad_norm": 0.1207781508564949, + "learning_rate": 4.92245981488319e-07, + "loss": 0.4405, + "step": 13925 + }, + { + "epoch": 2.8627813752698117, + "grad_norm": 0.232728511095047, + "learning_rate": 4.907737747506308e-07, + "loss": 0.3792, + "step": 13926 + }, + { + "epoch": 2.8629869462431907, + "grad_norm": 0.2338234782218933, + "learning_rate": 4.893037607744849e-07, + "loss": 0.3716, + "step": 13927 + }, + { + "epoch": 2.863192517216569, + "grad_norm": 0.24571533501148224, + "learning_rate": 4.878359396323035e-07, + "loss": 0.3928, + "step": 13928 + }, + { + "epoch": 2.8633980881899475, + "grad_norm": 0.23208092153072357, + "learning_rate": 4.863703113963986e-07, + "loss": 0.3748, + "step": 13929 + }, + { + "epoch": 2.863603659163326, + "grad_norm": 0.23107780516147614, + "learning_rate": 4.849068761389675e-07, + "loss": 0.3716, + "step": 13930 + }, + { + "epoch": 2.8638092301367046, + "grad_norm": 0.12082730978727341, + "learning_rate": 4.834456339321075e-07, + "loss": 0.4541, + "step": 13931 + }, + { + "epoch": 2.864014801110083, + "grad_norm": 0.12191561609506607, + "learning_rate": 4.819865848478212e-07, + "loss": 0.4471, + "step": 13932 + }, + { + "epoch": 2.864220372083462, + "grad_norm": 0.23875342309474945, + "learning_rate": 4.805297289579708e-07, + "loss": 0.4194, + "step": 13933 + }, + { + "epoch": 2.8644259430568404, + "grad_norm": 0.22163498401641846, + "learning_rate": 4.790750663343391e-07, + "loss": 0.3613, + "step": 13934 + }, + { + "epoch": 2.864631514030219, + "grad_norm": 0.24136824905872345, + "learning_rate": 4.776225970485937e-07, + "loss": 0.3839, + "step": 13935 + }, + { + "epoch": 2.8648370850035976, + "grad_norm": 0.22400477528572083, + "learning_rate": 4.761723211722824e-07, + "loss": 0.3655, + "step": 13936 + }, + { + "epoch": 2.865042655976976, + "grad_norm": 0.23349706828594208, + "learning_rate": 4.7472423877685804e-07, + "loss": 0.3814, + "step": 13937 + }, + { + "epoch": 2.8652482269503547, + "grad_norm": 0.24638283252716064, + "learning_rate": 4.732783499336585e-07, + "loss": 0.3953, + "step": 13938 + }, + { + "epoch": 2.865453797923733, + "grad_norm": 0.23078061640262604, + "learning_rate": 4.718346547139119e-07, + "loss": 0.3858, + "step": 13939 + }, + { + "epoch": 2.865659368897112, + "grad_norm": 0.23065340518951416, + "learning_rate": 4.7039315318875623e-07, + "loss": 0.3522, + "step": 13940 + }, + { + "epoch": 2.86586493987049, + "grad_norm": 0.22871986031532288, + "learning_rate": 4.6895384542919477e-07, + "loss": 0.3913, + "step": 13941 + }, + { + "epoch": 2.866070510843869, + "grad_norm": 0.23301458358764648, + "learning_rate": 4.6751673150614575e-07, + "loss": 0.3834, + "step": 13942 + }, + { + "epoch": 2.8662760818172472, + "grad_norm": 0.22655089199543, + "learning_rate": 4.6608181149039757e-07, + "loss": 0.3899, + "step": 13943 + }, + { + "epoch": 2.866481652790626, + "grad_norm": 0.12195513397455215, + "learning_rate": 4.646490854526486e-07, + "loss": 0.4349, + "step": 13944 + }, + { + "epoch": 2.8666872237640044, + "grad_norm": 0.23551727831363678, + "learning_rate": 4.6321855346348254e-07, + "loss": 0.3738, + "step": 13945 + }, + { + "epoch": 2.866892794737383, + "grad_norm": 0.23190248012542725, + "learning_rate": 4.617902155933679e-07, + "loss": 0.3944, + "step": 13946 + }, + { + "epoch": 2.8670983657107616, + "grad_norm": 0.22424408793449402, + "learning_rate": 4.6036407191268337e-07, + "loss": 0.3904, + "step": 13947 + }, + { + "epoch": 2.86730393668414, + "grad_norm": 0.11816349625587463, + "learning_rate": 4.5894012249168285e-07, + "loss": 0.4426, + "step": 13948 + }, + { + "epoch": 2.8675095076575188, + "grad_norm": 0.22937704622745514, + "learning_rate": 4.5751836740052015e-07, + "loss": 0.3796, + "step": 13949 + }, + { + "epoch": 2.8677150786308974, + "grad_norm": 0.11853787302970886, + "learning_rate": 4.560988067092342e-07, + "loss": 0.4408, + "step": 13950 + }, + { + "epoch": 2.867920649604276, + "grad_norm": 0.23124562203884125, + "learning_rate": 4.5468144048776416e-07, + "loss": 0.3838, + "step": 13951 + }, + { + "epoch": 2.8681262205776545, + "grad_norm": 0.23542582988739014, + "learning_rate": 4.5326626880593416e-07, + "loss": 0.3749, + "step": 13952 + }, + { + "epoch": 2.868331791551033, + "grad_norm": 0.22498956322669983, + "learning_rate": 4.5185329173346334e-07, + "loss": 0.3877, + "step": 13953 + }, + { + "epoch": 2.8685373625244117, + "grad_norm": 0.12203694880008698, + "learning_rate": 4.5044250933996615e-07, + "loss": 0.4589, + "step": 13954 + }, + { + "epoch": 2.8687429334977903, + "grad_norm": 0.22876019775867462, + "learning_rate": 4.490339216949369e-07, + "loss": 0.3773, + "step": 13955 + }, + { + "epoch": 2.8689485044711684, + "grad_norm": 0.22930005192756653, + "learning_rate": 4.4762752886778004e-07, + "loss": 0.3838, + "step": 13956 + }, + { + "epoch": 2.8691540754445475, + "grad_norm": 0.2380819171667099, + "learning_rate": 4.4622333092777524e-07, + "loss": 0.3939, + "step": 13957 + }, + { + "epoch": 2.8693596464179256, + "grad_norm": 0.24039901793003082, + "learning_rate": 4.4482132794410714e-07, + "loss": 0.3881, + "step": 13958 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.2359398603439331, + "learning_rate": 4.434215199858355e-07, + "loss": 0.386, + "step": 13959 + }, + { + "epoch": 2.869770788364683, + "grad_norm": 0.12011504173278809, + "learning_rate": 4.420239071219301e-07, + "loss": 0.4551, + "step": 13960 + }, + { + "epoch": 2.8699763593380614, + "grad_norm": 0.2287997305393219, + "learning_rate": 4.406284894212459e-07, + "loss": 0.3777, + "step": 13961 + }, + { + "epoch": 2.87018193031144, + "grad_norm": 0.21278510987758636, + "learning_rate": 4.392352669525279e-07, + "loss": 0.3631, + "step": 13962 + }, + { + "epoch": 2.8703875012848186, + "grad_norm": 0.23229098320007324, + "learning_rate": 4.3784423978441125e-07, + "loss": 0.384, + "step": 13963 + }, + { + "epoch": 2.870593072258197, + "grad_norm": 0.2308778017759323, + "learning_rate": 4.3645540798542605e-07, + "loss": 0.394, + "step": 13964 + }, + { + "epoch": 2.8707986432315757, + "grad_norm": 0.23160767555236816, + "learning_rate": 4.3506877162399263e-07, + "loss": 0.3779, + "step": 13965 + }, + { + "epoch": 2.8710042142049543, + "grad_norm": 0.23534901440143585, + "learning_rate": 4.336843307684213e-07, + "loss": 0.365, + "step": 13966 + }, + { + "epoch": 2.871209785178333, + "grad_norm": 0.11934797465801239, + "learning_rate": 4.323020854869225e-07, + "loss": 0.4542, + "step": 13967 + }, + { + "epoch": 2.8714153561517115, + "grad_norm": 0.11757402122020721, + "learning_rate": 4.3092203584759185e-07, + "loss": 0.4468, + "step": 13968 + }, + { + "epoch": 2.87162092712509, + "grad_norm": 0.22173817455768585, + "learning_rate": 4.2954418191841484e-07, + "loss": 0.3748, + "step": 13969 + }, + { + "epoch": 2.8718264980984687, + "grad_norm": 0.23279330134391785, + "learning_rate": 4.281685237672772e-07, + "loss": 0.3775, + "step": 13970 + }, + { + "epoch": 2.872032069071847, + "grad_norm": 0.23133385181427002, + "learning_rate": 4.267950614619498e-07, + "loss": 0.3657, + "step": 13971 + }, + { + "epoch": 2.872237640045226, + "grad_norm": 0.2283874899148941, + "learning_rate": 4.2542379507009347e-07, + "loss": 0.3612, + "step": 13972 + }, + { + "epoch": 2.872443211018604, + "grad_norm": 0.12400206178426743, + "learning_rate": 4.240547246592641e-07, + "loss": 0.4621, + "step": 13973 + }, + { + "epoch": 2.8726487819919826, + "grad_norm": 0.22691883146762848, + "learning_rate": 4.2268785029690783e-07, + "loss": 0.362, + "step": 13974 + }, + { + "epoch": 2.872854352965361, + "grad_norm": 0.23167765140533447, + "learning_rate": 4.2132317205037573e-07, + "loss": 0.3854, + "step": 13975 + }, + { + "epoch": 2.8730599239387398, + "grad_norm": 0.26033303141593933, + "learning_rate": 4.199606899868841e-07, + "loss": 0.3508, + "step": 13976 + }, + { + "epoch": 2.8732654949121184, + "grad_norm": 0.22448518872261047, + "learning_rate": 4.186004041735642e-07, + "loss": 0.3895, + "step": 13977 + }, + { + "epoch": 2.873471065885497, + "grad_norm": 0.11807616800069809, + "learning_rate": 4.1724231467743236e-07, + "loss": 0.4393, + "step": 13978 + }, + { + "epoch": 2.8736766368588755, + "grad_norm": 0.23837019503116608, + "learning_rate": 4.1588642156539014e-07, + "loss": 0.4048, + "step": 13979 + }, + { + "epoch": 2.873882207832254, + "grad_norm": 0.24100029468536377, + "learning_rate": 4.145327249042391e-07, + "loss": 0.3877, + "step": 13980 + }, + { + "epoch": 2.8740877788056327, + "grad_norm": 0.23236291110515594, + "learning_rate": 4.131812247606659e-07, + "loss": 0.3805, + "step": 13981 + }, + { + "epoch": 2.8742933497790113, + "grad_norm": 0.234677255153656, + "learning_rate": 4.1183192120125723e-07, + "loss": 0.3882, + "step": 13982 + }, + { + "epoch": 2.87449892075239, + "grad_norm": 0.22873461246490479, + "learning_rate": 4.10484814292485e-07, + "loss": 0.3691, + "step": 13983 + }, + { + "epoch": 2.8747044917257685, + "grad_norm": 0.22885732352733612, + "learning_rate": 4.09139904100716e-07, + "loss": 0.3814, + "step": 13984 + }, + { + "epoch": 2.874910062699147, + "grad_norm": 0.23706702888011932, + "learning_rate": 4.0779719069220735e-07, + "loss": 0.3747, + "step": 13985 + }, + { + "epoch": 2.875115633672525, + "grad_norm": 0.22555503249168396, + "learning_rate": 4.0645667413310605e-07, + "loss": 0.3678, + "step": 13986 + }, + { + "epoch": 2.8753212046459042, + "grad_norm": 0.11815163493156433, + "learning_rate": 4.0511835448945934e-07, + "loss": 0.4461, + "step": 13987 + }, + { + "epoch": 2.8755267756192824, + "grad_norm": 0.23131482303142548, + "learning_rate": 4.0378223182718943e-07, + "loss": 0.3946, + "step": 13988 + }, + { + "epoch": 2.875732346592661, + "grad_norm": 0.22287005186080933, + "learning_rate": 4.024483062121287e-07, + "loss": 0.3732, + "step": 13989 + }, + { + "epoch": 2.8759379175660396, + "grad_norm": 0.22222553193569183, + "learning_rate": 4.011165777099896e-07, + "loss": 0.3618, + "step": 13990 + }, + { + "epoch": 2.876143488539418, + "grad_norm": 0.22416678071022034, + "learning_rate": 3.9978704638638455e-07, + "loss": 0.3859, + "step": 13991 + }, + { + "epoch": 2.8763490595127967, + "grad_norm": 0.23659634590148926, + "learning_rate": 3.984597123068112e-07, + "loss": 0.3624, + "step": 13992 + }, + { + "epoch": 2.8765546304861753, + "grad_norm": 0.12456272542476654, + "learning_rate": 3.971345755366623e-07, + "loss": 0.4535, + "step": 13993 + }, + { + "epoch": 2.876760201459554, + "grad_norm": 0.23349931836128235, + "learning_rate": 3.9581163614121564e-07, + "loss": 0.3767, + "step": 13994 + }, + { + "epoch": 2.8769657724329325, + "grad_norm": 0.2434905469417572, + "learning_rate": 3.94490894185649e-07, + "loss": 0.3731, + "step": 13995 + }, + { + "epoch": 2.877171343406311, + "grad_norm": 0.12112405896186829, + "learning_rate": 3.9317234973503536e-07, + "loss": 0.4481, + "step": 13996 + }, + { + "epoch": 2.8773769143796897, + "grad_norm": 0.22560545802116394, + "learning_rate": 3.9185600285432777e-07, + "loss": 0.3906, + "step": 13997 + }, + { + "epoch": 2.8775824853530683, + "grad_norm": 0.12590011954307556, + "learning_rate": 3.905418536083744e-07, + "loss": 0.4603, + "step": 13998 + }, + { + "epoch": 2.877788056326447, + "grad_norm": 0.11752758920192719, + "learning_rate": 3.8922990206191833e-07, + "loss": 0.4465, + "step": 13999 + }, + { + "epoch": 2.8779936272998254, + "grad_norm": 0.22191815078258514, + "learning_rate": 3.87920148279598e-07, + "loss": 0.3697, + "step": 14000 + }, + { + "epoch": 2.8781991982732036, + "grad_norm": 0.23301634192466736, + "learning_rate": 3.866125923259367e-07, + "loss": 0.3553, + "step": 14001 + }, + { + "epoch": 2.8784047692465826, + "grad_norm": 0.22838152945041656, + "learning_rate": 3.8530723426534797e-07, + "loss": 0.3772, + "step": 14002 + }, + { + "epoch": 2.8786103402199608, + "grad_norm": 0.2294638454914093, + "learning_rate": 3.840040741621404e-07, + "loss": 0.3832, + "step": 14003 + }, + { + "epoch": 2.8788159111933393, + "grad_norm": 0.24881219863891602, + "learning_rate": 3.8270311208052246e-07, + "loss": 0.3631, + "step": 14004 + }, + { + "epoch": 2.879021482166718, + "grad_norm": 0.2229405790567398, + "learning_rate": 3.81404348084583e-07, + "loss": 0.3767, + "step": 14005 + }, + { + "epoch": 2.8792270531400965, + "grad_norm": 0.11796759814023972, + "learning_rate": 3.801077822383009e-07, + "loss": 0.4422, + "step": 14006 + }, + { + "epoch": 2.879432624113475, + "grad_norm": 0.23424452543258667, + "learning_rate": 3.7881341460555496e-07, + "loss": 0.3664, + "step": 14007 + }, + { + "epoch": 2.8796381950868537, + "grad_norm": 0.23670734465122223, + "learning_rate": 3.775212452501192e-07, + "loss": 0.3929, + "step": 14008 + }, + { + "epoch": 2.8798437660602323, + "grad_norm": 0.12096056342124939, + "learning_rate": 3.762312742356378e-07, + "loss": 0.4595, + "step": 14009 + }, + { + "epoch": 2.880049337033611, + "grad_norm": 0.2295764833688736, + "learning_rate": 3.749435016256747e-07, + "loss": 0.3821, + "step": 14010 + }, + { + "epoch": 2.8802549080069895, + "grad_norm": 0.2285950481891632, + "learning_rate": 3.7365792748366934e-07, + "loss": 0.3757, + "step": 14011 + }, + { + "epoch": 2.880460478980368, + "grad_norm": 0.12199006229639053, + "learning_rate": 3.72374551872956e-07, + "loss": 0.4473, + "step": 14012 + }, + { + "epoch": 2.8806660499537466, + "grad_norm": 0.22347088158130646, + "learning_rate": 3.710933748567541e-07, + "loss": 0.3702, + "step": 14013 + }, + { + "epoch": 2.8808716209271252, + "grad_norm": 0.23266130685806274, + "learning_rate": 3.698143964981932e-07, + "loss": 0.3802, + "step": 14014 + }, + { + "epoch": 2.881077191900504, + "grad_norm": 0.23003004491329193, + "learning_rate": 3.6853761686026776e-07, + "loss": 0.3668, + "step": 14015 + }, + { + "epoch": 2.881282762873882, + "grad_norm": 0.22506079077720642, + "learning_rate": 3.672630360058926e-07, + "loss": 0.3672, + "step": 14016 + }, + { + "epoch": 2.881488333847261, + "grad_norm": 0.23392482101917267, + "learning_rate": 3.659906539978575e-07, + "loss": 0.3907, + "step": 14017 + }, + { + "epoch": 2.881693904820639, + "grad_norm": 0.22708185017108917, + "learning_rate": 3.647204708988422e-07, + "loss": 0.3736, + "step": 14018 + }, + { + "epoch": 2.8818994757940177, + "grad_norm": 0.11717811226844788, + "learning_rate": 3.6345248677142176e-07, + "loss": 0.4522, + "step": 14019 + }, + { + "epoch": 2.8821050467673963, + "grad_norm": 0.22868549823760986, + "learning_rate": 3.621867016780661e-07, + "loss": 0.3855, + "step": 14020 + }, + { + "epoch": 2.882310617740775, + "grad_norm": 0.12395808845758438, + "learning_rate": 3.6092311568113546e-07, + "loss": 0.4369, + "step": 14021 + }, + { + "epoch": 2.8825161887141535, + "grad_norm": 0.22594808042049408, + "learning_rate": 3.5966172884287995e-07, + "loss": 0.3708, + "step": 14022 + }, + { + "epoch": 2.882721759687532, + "grad_norm": 0.11887579411268234, + "learning_rate": 3.5840254122544495e-07, + "loss": 0.4554, + "step": 14023 + }, + { + "epoch": 2.8829273306609107, + "grad_norm": 0.12510953843593597, + "learning_rate": 3.571455528908657e-07, + "loss": 0.4457, + "step": 14024 + }, + { + "epoch": 2.8831329016342893, + "grad_norm": 0.22904570400714874, + "learning_rate": 3.558907639010628e-07, + "loss": 0.3703, + "step": 14025 + }, + { + "epoch": 2.883338472607668, + "grad_norm": 0.24266590178012848, + "learning_rate": 3.5463817431785176e-07, + "loss": 0.3713, + "step": 14026 + }, + { + "epoch": 2.8835440435810464, + "grad_norm": 0.22441810369491577, + "learning_rate": 3.5338778420294817e-07, + "loss": 0.4028, + "step": 14027 + }, + { + "epoch": 2.883749614554425, + "grad_norm": 0.23846034705638885, + "learning_rate": 3.521395936179528e-07, + "loss": 0.3993, + "step": 14028 + }, + { + "epoch": 2.8839551855278036, + "grad_norm": 0.2247145175933838, + "learning_rate": 3.5089360262435146e-07, + "loss": 0.3895, + "step": 14029 + }, + { + "epoch": 2.884160756501182, + "grad_norm": 0.2352132946252823, + "learning_rate": 3.4964981128354e-07, + "loss": 0.3754, + "step": 14030 + }, + { + "epoch": 2.8843663274745603, + "grad_norm": 0.22683286666870117, + "learning_rate": 3.484082196567795e-07, + "loss": 0.3893, + "step": 14031 + }, + { + "epoch": 2.8845718984479394, + "grad_norm": 0.2301369607448578, + "learning_rate": 3.4716882780525097e-07, + "loss": 0.3909, + "step": 14032 + }, + { + "epoch": 2.8847774694213175, + "grad_norm": 0.23967629671096802, + "learning_rate": 3.4593163579000553e-07, + "loss": 0.3981, + "step": 14033 + }, + { + "epoch": 2.884983040394696, + "grad_norm": 0.2322077453136444, + "learning_rate": 3.446966436719945e-07, + "loss": 0.3826, + "step": 14034 + }, + { + "epoch": 2.8851886113680747, + "grad_norm": 0.12146010994911194, + "learning_rate": 3.4346385151206416e-07, + "loss": 0.4504, + "step": 14035 + }, + { + "epoch": 2.8853941823414533, + "grad_norm": 0.24295859038829803, + "learning_rate": 3.4223325937094096e-07, + "loss": 0.369, + "step": 14036 + }, + { + "epoch": 2.885599753314832, + "grad_norm": 0.24125894904136658, + "learning_rate": 3.410048673092614e-07, + "loss": 0.3895, + "step": 14037 + }, + { + "epoch": 2.8858053242882105, + "grad_norm": 0.11830901354551315, + "learning_rate": 3.397786753875321e-07, + "loss": 0.4409, + "step": 14038 + }, + { + "epoch": 2.886010895261589, + "grad_norm": 0.2366967350244522, + "learning_rate": 3.385546836661696e-07, + "loss": 0.3942, + "step": 14039 + }, + { + "epoch": 2.8862164662349676, + "grad_norm": 0.22245019674301147, + "learning_rate": 3.373328922054658e-07, + "loss": 0.3795, + "step": 14040 + }, + { + "epoch": 2.8864220372083462, + "grad_norm": 0.12539364397525787, + "learning_rate": 3.3611330106561754e-07, + "loss": 0.4422, + "step": 14041 + }, + { + "epoch": 2.886627608181725, + "grad_norm": 0.22733426094055176, + "learning_rate": 3.3489591030671174e-07, + "loss": 0.3805, + "step": 14042 + }, + { + "epoch": 2.8868331791551034, + "grad_norm": 0.24280138313770294, + "learning_rate": 3.336807199887204e-07, + "loss": 0.3993, + "step": 14043 + }, + { + "epoch": 2.887038750128482, + "grad_norm": 0.12910622358322144, + "learning_rate": 3.3246773017151066e-07, + "loss": 0.4552, + "step": 14044 + }, + { + "epoch": 2.8872443211018606, + "grad_norm": 0.11929771304130554, + "learning_rate": 3.3125694091483474e-07, + "loss": 0.4486, + "step": 14045 + }, + { + "epoch": 2.8874498920752387, + "grad_norm": 0.23444950580596924, + "learning_rate": 3.3004835227835485e-07, + "loss": 0.3619, + "step": 14046 + }, + { + "epoch": 2.8876554630486178, + "grad_norm": 0.2314281016588211, + "learning_rate": 3.2884196432160343e-07, + "loss": 0.3573, + "step": 14047 + }, + { + "epoch": 2.887861034021996, + "grad_norm": 0.22594213485717773, + "learning_rate": 3.276377771040179e-07, + "loss": 0.3828, + "step": 14048 + }, + { + "epoch": 2.8880666049953745, + "grad_norm": 0.2312646061182022, + "learning_rate": 3.264357906849208e-07, + "loss": 0.3858, + "step": 14049 + }, + { + "epoch": 2.888272175968753, + "grad_norm": 0.23432159423828125, + "learning_rate": 3.252360051235248e-07, + "loss": 0.3754, + "step": 14050 + }, + { + "epoch": 2.8884777469421317, + "grad_norm": 0.23932310938835144, + "learning_rate": 3.240384204789426e-07, + "loss": 0.3918, + "step": 14051 + }, + { + "epoch": 2.8886833179155103, + "grad_norm": 0.2506803572177887, + "learning_rate": 3.2284303681017203e-07, + "loss": 0.368, + "step": 14052 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.22862713038921356, + "learning_rate": 3.2164985417610596e-07, + "loss": 0.3896, + "step": 14053 + }, + { + "epoch": 2.8890944598622674, + "grad_norm": 0.23301179707050323, + "learning_rate": 3.204588726355273e-07, + "loss": 0.3869, + "step": 14054 + }, + { + "epoch": 2.889300030835646, + "grad_norm": 0.2313561588525772, + "learning_rate": 3.1927009224710925e-07, + "loss": 0.3629, + "step": 14055 + }, + { + "epoch": 2.8895056018090246, + "grad_norm": 0.22642727196216583, + "learning_rate": 3.1808351306941486e-07, + "loss": 0.3816, + "step": 14056 + }, + { + "epoch": 2.889711172782403, + "grad_norm": 0.2348901927471161, + "learning_rate": 3.1689913516089743e-07, + "loss": 0.3855, + "step": 14057 + }, + { + "epoch": 2.889916743755782, + "grad_norm": 0.24844767153263092, + "learning_rate": 3.1571695857991523e-07, + "loss": 0.3891, + "step": 14058 + }, + { + "epoch": 2.8901223147291604, + "grad_norm": 0.226862832903862, + "learning_rate": 3.145369833847067e-07, + "loss": 0.3812, + "step": 14059 + }, + { + "epoch": 2.890327885702539, + "grad_norm": 0.22782935202121735, + "learning_rate": 3.1335920963340037e-07, + "loss": 0.3698, + "step": 14060 + }, + { + "epoch": 2.890533456675917, + "grad_norm": 0.22575967013835907, + "learning_rate": 3.121836373840198e-07, + "loss": 0.3807, + "step": 14061 + }, + { + "epoch": 2.890739027649296, + "grad_norm": 0.24145731329917908, + "learning_rate": 3.110102666944836e-07, + "loss": 0.3619, + "step": 14062 + }, + { + "epoch": 2.8909445986226743, + "grad_norm": 0.24116384983062744, + "learning_rate": 3.0983909762259567e-07, + "loss": 0.3831, + "step": 14063 + }, + { + "epoch": 2.8911501695960533, + "grad_norm": 0.21999165415763855, + "learning_rate": 3.0867013022604977e-07, + "loss": 0.3963, + "step": 14064 + }, + { + "epoch": 2.8913557405694315, + "grad_norm": 0.23448392748832703, + "learning_rate": 3.075033645624448e-07, + "loss": 0.3707, + "step": 14065 + }, + { + "epoch": 2.89156131154281, + "grad_norm": 0.11776132136583328, + "learning_rate": 3.063388006892548e-07, + "loss": 0.4614, + "step": 14066 + }, + { + "epoch": 2.8917668825161886, + "grad_norm": 0.12120406329631805, + "learning_rate": 3.0517643866385395e-07, + "loss": 0.4609, + "step": 14067 + }, + { + "epoch": 2.891972453489567, + "grad_norm": 0.22066402435302734, + "learning_rate": 3.0401627854351133e-07, + "loss": 0.3709, + "step": 14068 + }, + { + "epoch": 2.892178024462946, + "grad_norm": 0.22971779108047485, + "learning_rate": 3.0285832038537134e-07, + "loss": 0.3811, + "step": 14069 + }, + { + "epoch": 2.8923835954363244, + "grad_norm": 0.12074688076972961, + "learning_rate": 3.0170256424649325e-07, + "loss": 0.4428, + "step": 14070 + }, + { + "epoch": 2.892589166409703, + "grad_norm": 0.23068879544734955, + "learning_rate": 3.0054901018380656e-07, + "loss": 0.3824, + "step": 14071 + }, + { + "epoch": 2.8927947373830816, + "grad_norm": 0.23140643537044525, + "learning_rate": 2.993976582541458e-07, + "loss": 0.3776, + "step": 14072 + }, + { + "epoch": 2.89300030835646, + "grad_norm": 0.2334955334663391, + "learning_rate": 2.982485085142356e-07, + "loss": 0.3668, + "step": 14073 + }, + { + "epoch": 2.8932058793298387, + "grad_norm": 0.22583140432834625, + "learning_rate": 2.9710156102068563e-07, + "loss": 0.3872, + "step": 14074 + }, + { + "epoch": 2.8934114503032173, + "grad_norm": 0.23303750157356262, + "learning_rate": 2.959568158300008e-07, + "loss": 0.383, + "step": 14075 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.2299990952014923, + "learning_rate": 2.948142729985759e-07, + "loss": 0.36, + "step": 14076 + }, + { + "epoch": 2.8938225922499745, + "grad_norm": 0.12316111475229263, + "learning_rate": 2.9367393258270094e-07, + "loss": 0.4644, + "step": 14077 + }, + { + "epoch": 2.8940281632233527, + "grad_norm": 0.24173006415367126, + "learning_rate": 2.9253579463855097e-07, + "loss": 0.3787, + "step": 14078 + }, + { + "epoch": 2.8942337341967317, + "grad_norm": 0.12229252606630325, + "learning_rate": 2.9139985922220114e-07, + "loss": 0.4535, + "step": 14079 + }, + { + "epoch": 2.89443930517011, + "grad_norm": 0.22947286069393158, + "learning_rate": 2.9026612638961673e-07, + "loss": 0.3694, + "step": 14080 + }, + { + "epoch": 2.8946448761434884, + "grad_norm": 0.2314113825559616, + "learning_rate": 2.8913459619664795e-07, + "loss": 0.3772, + "step": 14081 + }, + { + "epoch": 2.894850447116867, + "grad_norm": 0.23245009779930115, + "learning_rate": 2.880052686990353e-07, + "loss": 0.3879, + "step": 14082 + }, + { + "epoch": 2.8950560180902456, + "grad_norm": 0.23955170810222626, + "learning_rate": 2.868781439524193e-07, + "loss": 0.3769, + "step": 14083 + }, + { + "epoch": 2.895261589063624, + "grad_norm": 0.22946025431156158, + "learning_rate": 2.857532220123305e-07, + "loss": 0.3739, + "step": 14084 + }, + { + "epoch": 2.8954671600370028, + "grad_norm": 0.22186554968357086, + "learning_rate": 2.8463050293418946e-07, + "loss": 0.3714, + "step": 14085 + }, + { + "epoch": 2.8956727310103814, + "grad_norm": 0.24299030005931854, + "learning_rate": 2.835099867733021e-07, + "loss": 0.384, + "step": 14086 + }, + { + "epoch": 2.89587830198376, + "grad_norm": 0.24568887054920197, + "learning_rate": 2.823916735848742e-07, + "loss": 0.3973, + "step": 14087 + }, + { + "epoch": 2.8960838729571385, + "grad_norm": 0.23442420363426208, + "learning_rate": 2.812755634239966e-07, + "loss": 0.3832, + "step": 14088 + }, + { + "epoch": 2.896289443930517, + "grad_norm": 0.22998051345348358, + "learning_rate": 2.801616563456605e-07, + "loss": 0.394, + "step": 14089 + }, + { + "epoch": 2.8964950149038957, + "grad_norm": 0.2347511351108551, + "learning_rate": 2.7904995240473684e-07, + "loss": 0.3739, + "step": 14090 + }, + { + "epoch": 2.896700585877274, + "grad_norm": 0.11745678633451462, + "learning_rate": 2.779404516559969e-07, + "loss": 0.4466, + "step": 14091 + }, + { + "epoch": 2.896906156850653, + "grad_norm": 0.23240487277507782, + "learning_rate": 2.7683315415410195e-07, + "loss": 0.3759, + "step": 14092 + }, + { + "epoch": 2.897111727824031, + "grad_norm": 0.23751090466976166, + "learning_rate": 2.757280599535983e-07, + "loss": 0.3839, + "step": 14093 + }, + { + "epoch": 2.89731729879741, + "grad_norm": 0.12017631530761719, + "learning_rate": 2.7462516910893745e-07, + "loss": 0.4444, + "step": 14094 + }, + { + "epoch": 2.897522869770788, + "grad_norm": 0.22520147264003754, + "learning_rate": 2.735244816744459e-07, + "loss": 0.3866, + "step": 14095 + }, + { + "epoch": 2.897728440744167, + "grad_norm": 0.23022042214870453, + "learning_rate": 2.7242599770435527e-07, + "loss": 0.3813, + "step": 14096 + }, + { + "epoch": 2.8979340117175454, + "grad_norm": 0.23134584724903107, + "learning_rate": 2.7132971725277736e-07, + "loss": 0.3583, + "step": 14097 + }, + { + "epoch": 2.898139582690924, + "grad_norm": 0.23426006734371185, + "learning_rate": 2.7023564037372383e-07, + "loss": 0.3785, + "step": 14098 + }, + { + "epoch": 2.8983451536643026, + "grad_norm": 0.11582779884338379, + "learning_rate": 2.6914376712109166e-07, + "loss": 0.4577, + "step": 14099 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.12305998057126999, + "learning_rate": 2.6805409754867783e-07, + "loss": 0.4363, + "step": 14100 + }, + { + "epoch": 2.8987562956110597, + "grad_norm": 0.21655605733394623, + "learning_rate": 2.6696663171015933e-07, + "loss": 0.3598, + "step": 14101 + }, + { + "epoch": 2.8989618665844383, + "grad_norm": 0.2362840622663498, + "learning_rate": 2.658813696591134e-07, + "loss": 0.3895, + "step": 14102 + }, + { + "epoch": 2.899167437557817, + "grad_norm": 0.11493504792451859, + "learning_rate": 2.6479831144900714e-07, + "loss": 0.4464, + "step": 14103 + }, + { + "epoch": 2.8993730085311955, + "grad_norm": 0.2330009937286377, + "learning_rate": 2.63717457133193e-07, + "loss": 0.3753, + "step": 14104 + }, + { + "epoch": 2.899578579504574, + "grad_norm": 0.23272541165351868, + "learning_rate": 2.6263880676492823e-07, + "loss": 0.3787, + "step": 14105 + }, + { + "epoch": 2.8997841504779522, + "grad_norm": 0.12182939052581787, + "learning_rate": 2.615623603973405e-07, + "loss": 0.4519, + "step": 14106 + }, + { + "epoch": 2.8999897214513313, + "grad_norm": 0.23827330768108368, + "learning_rate": 2.6048811808347227e-07, + "loss": 0.3878, + "step": 14107 + }, + { + "epoch": 2.9001952924247094, + "grad_norm": 0.22565053403377533, + "learning_rate": 2.5941607987624626e-07, + "loss": 0.3737, + "step": 14108 + }, + { + "epoch": 2.9004008633980884, + "grad_norm": 0.22274649143218994, + "learning_rate": 2.583462458284652e-07, + "loss": 0.3767, + "step": 14109 + }, + { + "epoch": 2.9006064343714666, + "grad_norm": 0.2283952683210373, + "learning_rate": 2.57278615992852e-07, + "loss": 0.3775, + "step": 14110 + }, + { + "epoch": 2.900812005344845, + "grad_norm": 0.2287856638431549, + "learning_rate": 2.5621319042198945e-07, + "loss": 0.3912, + "step": 14111 + }, + { + "epoch": 2.9010175763182238, + "grad_norm": 0.23402291536331177, + "learning_rate": 2.5514996916836564e-07, + "loss": 0.3875, + "step": 14112 + }, + { + "epoch": 2.9012231472916024, + "grad_norm": 0.23402421176433563, + "learning_rate": 2.5408895228437366e-07, + "loss": 0.3955, + "step": 14113 + }, + { + "epoch": 2.901428718264981, + "grad_norm": 0.13076432049274445, + "learning_rate": 2.530301398222767e-07, + "loss": 0.4522, + "step": 14114 + }, + { + "epoch": 2.9016342892383595, + "grad_norm": 0.23658651113510132, + "learning_rate": 2.519735318342331e-07, + "loss": 0.3845, + "step": 14115 + }, + { + "epoch": 2.901839860211738, + "grad_norm": 0.12689454853534698, + "learning_rate": 2.509191283723061e-07, + "loss": 0.4543, + "step": 14116 + }, + { + "epoch": 2.9020454311851167, + "grad_norm": 0.12162572145462036, + "learning_rate": 2.4986692948843925e-07, + "loss": 0.4385, + "step": 14117 + }, + { + "epoch": 2.9022510021584953, + "grad_norm": 0.23482230305671692, + "learning_rate": 2.48816935234461e-07, + "loss": 0.381, + "step": 14118 + }, + { + "epoch": 2.902456573131874, + "grad_norm": 0.22458134591579437, + "learning_rate": 2.477691456621051e-07, + "loss": 0.3733, + "step": 14119 + }, + { + "epoch": 2.9026621441052525, + "grad_norm": 0.22772455215454102, + "learning_rate": 2.467235608230001e-07, + "loss": 0.373, + "step": 14120 + }, + { + "epoch": 2.902867715078631, + "grad_norm": 0.22812238335609436, + "learning_rate": 2.4568018076864484e-07, + "loss": 0.3561, + "step": 14121 + }, + { + "epoch": 2.9030732860520096, + "grad_norm": 0.24260129034519196, + "learning_rate": 2.446390055504433e-07, + "loss": 0.3947, + "step": 14122 + }, + { + "epoch": 2.903278857025388, + "grad_norm": 0.224505215883255, + "learning_rate": 2.436000352196943e-07, + "loss": 0.374, + "step": 14123 + }, + { + "epoch": 2.903484427998767, + "grad_norm": 0.22481678426265717, + "learning_rate": 2.42563269827582e-07, + "loss": 0.3731, + "step": 14124 + }, + { + "epoch": 2.903689998972145, + "grad_norm": 0.2302400767803192, + "learning_rate": 2.415287094251756e-07, + "loss": 0.3861, + "step": 14125 + }, + { + "epoch": 2.9038955699455236, + "grad_norm": 0.12294553965330124, + "learning_rate": 2.404963540634542e-07, + "loss": 0.4498, + "step": 14126 + }, + { + "epoch": 2.904101140918902, + "grad_norm": 0.23122653365135193, + "learning_rate": 2.3946620379327214e-07, + "loss": 0.3611, + "step": 14127 + }, + { + "epoch": 2.9043067118922807, + "grad_norm": 0.2358085960149765, + "learning_rate": 2.3843825866537883e-07, + "loss": 0.396, + "step": 14128 + }, + { + "epoch": 2.9045122828656593, + "grad_norm": 0.12024319916963577, + "learning_rate": 2.374125187304188e-07, + "loss": 0.4486, + "step": 14129 + }, + { + "epoch": 2.904717853839038, + "grad_norm": 0.22727903723716736, + "learning_rate": 2.3638898403892162e-07, + "loss": 0.355, + "step": 14130 + }, + { + "epoch": 2.9049234248124165, + "grad_norm": 0.23375868797302246, + "learning_rate": 2.3536765464131695e-07, + "loss": 0.3874, + "step": 14131 + }, + { + "epoch": 2.905128995785795, + "grad_norm": 0.23008592426776886, + "learning_rate": 2.343485305879195e-07, + "loss": 0.4048, + "step": 14132 + }, + { + "epoch": 2.9053345667591737, + "grad_norm": 0.12159692496061325, + "learning_rate": 2.3333161192893416e-07, + "loss": 0.4508, + "step": 14133 + }, + { + "epoch": 2.9055401377325523, + "grad_norm": 0.22369949519634247, + "learning_rate": 2.3231689871446083e-07, + "loss": 0.3681, + "step": 14134 + }, + { + "epoch": 2.905745708705931, + "grad_norm": 0.23495550453662872, + "learning_rate": 2.3130439099448953e-07, + "loss": 0.3923, + "step": 14135 + }, + { + "epoch": 2.9059512796793094, + "grad_norm": 0.23136933147907257, + "learning_rate": 2.3029408881890535e-07, + "loss": 0.3752, + "step": 14136 + }, + { + "epoch": 2.906156850652688, + "grad_norm": 0.12260935455560684, + "learning_rate": 2.292859922374785e-07, + "loss": 0.4599, + "step": 14137 + }, + { + "epoch": 2.906362421626066, + "grad_norm": 0.22997353971004486, + "learning_rate": 2.2828010129986922e-07, + "loss": 0.4108, + "step": 14138 + }, + { + "epoch": 2.906567992599445, + "grad_norm": 0.23019467294216156, + "learning_rate": 2.2727641605564287e-07, + "loss": 0.391, + "step": 14139 + }, + { + "epoch": 2.9067735635728233, + "grad_norm": 0.219661682844162, + "learning_rate": 2.2627493655423492e-07, + "loss": 0.3711, + "step": 14140 + }, + { + "epoch": 2.906979134546202, + "grad_norm": 0.23242846131324768, + "learning_rate": 2.252756628449909e-07, + "loss": 0.3808, + "step": 14141 + }, + { + "epoch": 2.9071847055195805, + "grad_norm": 0.21850700676441193, + "learning_rate": 2.2427859497713644e-07, + "loss": 0.3812, + "step": 14142 + }, + { + "epoch": 2.907390276492959, + "grad_norm": 0.22501927614212036, + "learning_rate": 2.2328373299979723e-07, + "loss": 0.3597, + "step": 14143 + }, + { + "epoch": 2.9075958474663377, + "grad_norm": 0.22534947097301483, + "learning_rate": 2.2229107696198403e-07, + "loss": 0.3821, + "step": 14144 + }, + { + "epoch": 2.9078014184397163, + "grad_norm": 0.22857366502285004, + "learning_rate": 2.213006269125978e-07, + "loss": 0.3938, + "step": 14145 + }, + { + "epoch": 2.908006989413095, + "grad_norm": 0.23802757263183594, + "learning_rate": 2.2031238290042943e-07, + "loss": 0.3871, + "step": 14146 + }, + { + "epoch": 2.9082125603864735, + "grad_norm": 0.2242707461118698, + "learning_rate": 2.1932634497417505e-07, + "loss": 0.3684, + "step": 14147 + }, + { + "epoch": 2.908418131359852, + "grad_norm": 0.12156816571950912, + "learning_rate": 2.1834251318240573e-07, + "loss": 0.4328, + "step": 14148 + }, + { + "epoch": 2.9086237023332306, + "grad_norm": 0.22905348241329193, + "learning_rate": 2.1736088757359274e-07, + "loss": 0.3772, + "step": 14149 + }, + { + "epoch": 2.9088292733066092, + "grad_norm": 0.2287713587284088, + "learning_rate": 2.163814681960924e-07, + "loss": 0.3649, + "step": 14150 + }, + { + "epoch": 2.909034844279988, + "grad_norm": 0.2233857661485672, + "learning_rate": 2.1540425509816608e-07, + "loss": 0.3777, + "step": 14151 + }, + { + "epoch": 2.9092404152533664, + "grad_norm": 0.23792994022369385, + "learning_rate": 2.1442924832794532e-07, + "loss": 0.3854, + "step": 14152 + }, + { + "epoch": 2.9094459862267446, + "grad_norm": 0.22729872167110443, + "learning_rate": 2.1345644793346663e-07, + "loss": 0.3748, + "step": 14153 + }, + { + "epoch": 2.9096515572001236, + "grad_norm": 0.2334190458059311, + "learning_rate": 2.1248585396265674e-07, + "loss": 0.3878, + "step": 14154 + }, + { + "epoch": 2.9098571281735017, + "grad_norm": 0.11780460178852081, + "learning_rate": 2.1151746646333237e-07, + "loss": 0.4551, + "step": 14155 + }, + { + "epoch": 2.9100626991468803, + "grad_norm": 0.12107131630182266, + "learning_rate": 2.1055128548320534e-07, + "loss": 0.4406, + "step": 14156 + }, + { + "epoch": 2.910268270120259, + "grad_norm": 0.12362342327833176, + "learning_rate": 2.0958731106986762e-07, + "loss": 0.4345, + "step": 14157 + }, + { + "epoch": 2.9104738410936375, + "grad_norm": 0.12129798531532288, + "learning_rate": 2.086255432708162e-07, + "loss": 0.4588, + "step": 14158 + }, + { + "epoch": 2.910679412067016, + "grad_norm": 0.2197551131248474, + "learning_rate": 2.0766598213342814e-07, + "loss": 0.3835, + "step": 14159 + }, + { + "epoch": 2.9108849830403947, + "grad_norm": 0.23252207040786743, + "learning_rate": 2.0670862770498068e-07, + "loss": 0.3737, + "step": 14160 + }, + { + "epoch": 2.9110905540137733, + "grad_norm": 0.23787905275821686, + "learning_rate": 2.0575348003263107e-07, + "loss": 0.3891, + "step": 14161 + }, + { + "epoch": 2.911296124987152, + "grad_norm": 0.3696140944957733, + "learning_rate": 2.0480053916344666e-07, + "loss": 0.3845, + "step": 14162 + }, + { + "epoch": 2.9115016959605304, + "grad_norm": 0.22981351613998413, + "learning_rate": 2.0384980514435993e-07, + "loss": 0.3624, + "step": 14163 + }, + { + "epoch": 2.911707266933909, + "grad_norm": 0.24888327717781067, + "learning_rate": 2.0290127802222337e-07, + "loss": 0.3841, + "step": 14164 + }, + { + "epoch": 2.9119128379072876, + "grad_norm": 0.22164028882980347, + "learning_rate": 2.0195495784375463e-07, + "loss": 0.3711, + "step": 14165 + }, + { + "epoch": 2.912118408880666, + "grad_norm": 0.23485776782035828, + "learning_rate": 2.0101084465558141e-07, + "loss": 0.3845, + "step": 14166 + }, + { + "epoch": 2.912323979854045, + "grad_norm": 0.23392504453659058, + "learning_rate": 2.000689385042115e-07, + "loss": 0.374, + "step": 14167 + }, + { + "epoch": 2.912529550827423, + "grad_norm": 0.23598188161849976, + "learning_rate": 1.9912923943605278e-07, + "loss": 0.384, + "step": 14168 + }, + { + "epoch": 2.912735121800802, + "grad_norm": 0.23599591851234436, + "learning_rate": 1.9819174749739822e-07, + "loss": 0.3803, + "step": 14169 + }, + { + "epoch": 2.91294069277418, + "grad_norm": 0.22775763273239136, + "learning_rate": 1.972564627344359e-07, + "loss": 0.3712, + "step": 14170 + }, + { + "epoch": 2.9131462637475587, + "grad_norm": 0.22463448345661163, + "learning_rate": 1.9632338519323391e-07, + "loss": 0.3685, + "step": 14171 + }, + { + "epoch": 2.9133518347209373, + "grad_norm": 0.12167978286743164, + "learning_rate": 1.9539251491977052e-07, + "loss": 0.4519, + "step": 14172 + }, + { + "epoch": 2.913557405694316, + "grad_norm": 0.22514131665229797, + "learning_rate": 1.9446385195990403e-07, + "loss": 0.368, + "step": 14173 + }, + { + "epoch": 2.9137629766676945, + "grad_norm": 0.22279466688632965, + "learning_rate": 1.9353739635937784e-07, + "loss": 0.3641, + "step": 14174 + }, + { + "epoch": 2.913968547641073, + "grad_norm": 0.11997415125370026, + "learning_rate": 1.9261314816384046e-07, + "loss": 0.4445, + "step": 14175 + }, + { + "epoch": 2.9141741186144516, + "grad_norm": 0.12249033898115158, + "learning_rate": 1.9169110741882546e-07, + "loss": 0.4391, + "step": 14176 + }, + { + "epoch": 2.9143796895878302, + "grad_norm": 0.2306622713804245, + "learning_rate": 1.907712741697565e-07, + "loss": 0.3835, + "step": 14177 + }, + { + "epoch": 2.914585260561209, + "grad_norm": 0.11753799021244049, + "learning_rate": 1.898536484619473e-07, + "loss": 0.4275, + "step": 14178 + }, + { + "epoch": 2.9147908315345874, + "grad_norm": 0.2338220775127411, + "learning_rate": 1.8893823034061176e-07, + "loss": 0.4019, + "step": 14179 + }, + { + "epoch": 2.914996402507966, + "grad_norm": 0.1225576251745224, + "learning_rate": 1.8802501985083875e-07, + "loss": 0.4512, + "step": 14180 + }, + { + "epoch": 2.9152019734813446, + "grad_norm": 0.11842867732048035, + "learning_rate": 1.8711401703762232e-07, + "loss": 0.4508, + "step": 14181 + }, + { + "epoch": 2.915407544454723, + "grad_norm": 0.22266638278961182, + "learning_rate": 1.8620522194584156e-07, + "loss": 0.377, + "step": 14182 + }, + { + "epoch": 2.9156131154281013, + "grad_norm": 0.2360514998435974, + "learning_rate": 1.8529863462027563e-07, + "loss": 0.3851, + "step": 14183 + }, + { + "epoch": 2.9158186864014803, + "grad_norm": 0.24209058284759521, + "learning_rate": 1.8439425510557885e-07, + "loss": 0.3813, + "step": 14184 + }, + { + "epoch": 2.9160242573748585, + "grad_norm": 0.22401560842990875, + "learning_rate": 1.8349208344631052e-07, + "loss": 0.381, + "step": 14185 + }, + { + "epoch": 2.916229828348237, + "grad_norm": 0.12131255865097046, + "learning_rate": 1.8259211968691514e-07, + "loss": 0.462, + "step": 14186 + }, + { + "epoch": 2.9164353993216157, + "grad_norm": 0.22824054956436157, + "learning_rate": 1.8169436387173222e-07, + "loss": 0.3783, + "step": 14187 + }, + { + "epoch": 2.9166409702949943, + "grad_norm": 0.2334217131137848, + "learning_rate": 1.807988160449864e-07, + "loss": 0.3739, + "step": 14188 + }, + { + "epoch": 2.916846541268373, + "grad_norm": 0.22181300818920135, + "learning_rate": 1.7990547625079735e-07, + "loss": 0.3881, + "step": 14189 + }, + { + "epoch": 2.9170521122417514, + "grad_norm": 0.22861194610595703, + "learning_rate": 1.790143445331749e-07, + "loss": 0.3828, + "step": 14190 + }, + { + "epoch": 2.91725768321513, + "grad_norm": 0.2301216721534729, + "learning_rate": 1.781254209360289e-07, + "loss": 0.3711, + "step": 14191 + }, + { + "epoch": 2.9174632541885086, + "grad_norm": 0.22555860877037048, + "learning_rate": 1.7723870550313938e-07, + "loss": 0.3734, + "step": 14192 + }, + { + "epoch": 2.917668825161887, + "grad_norm": 0.23354589939117432, + "learning_rate": 1.7635419827820132e-07, + "loss": 0.3687, + "step": 14193 + }, + { + "epoch": 2.917874396135266, + "grad_norm": 0.23408174514770508, + "learning_rate": 1.754718993047899e-07, + "loss": 0.3708, + "step": 14194 + }, + { + "epoch": 2.9180799671086444, + "grad_norm": 0.23139835894107819, + "learning_rate": 1.7459180862636037e-07, + "loss": 0.3577, + "step": 14195 + }, + { + "epoch": 2.918285538082023, + "grad_norm": 0.11874835938215256, + "learning_rate": 1.7371392628628802e-07, + "loss": 0.4303, + "step": 14196 + }, + { + "epoch": 2.9184911090554015, + "grad_norm": 0.23826338350772858, + "learning_rate": 1.7283825232780825e-07, + "loss": 0.3858, + "step": 14197 + }, + { + "epoch": 2.9186966800287797, + "grad_norm": 0.13164587318897247, + "learning_rate": 1.7196478679406658e-07, + "loss": 0.447, + "step": 14198 + }, + { + "epoch": 2.9189022510021587, + "grad_norm": 0.22203896939754486, + "learning_rate": 1.7109352972809856e-07, + "loss": 0.3562, + "step": 14199 + }, + { + "epoch": 2.919107821975537, + "grad_norm": 0.24076960980892181, + "learning_rate": 1.7022448117281487e-07, + "loss": 0.3789, + "step": 14200 + }, + { + "epoch": 2.9193133929489155, + "grad_norm": 0.24343958497047424, + "learning_rate": 1.6935764117104125e-07, + "loss": 0.3937, + "step": 14201 + }, + { + "epoch": 2.919518963922294, + "grad_norm": 0.23502768576145172, + "learning_rate": 1.6849300976547856e-07, + "loss": 0.3881, + "step": 14202 + }, + { + "epoch": 2.9197245348956726, + "grad_norm": 0.24000953137874603, + "learning_rate": 1.6763058699872269e-07, + "loss": 0.376, + "step": 14203 + }, + { + "epoch": 2.919930105869051, + "grad_norm": 0.12082278728485107, + "learning_rate": 1.667703729132647e-07, + "loss": 0.4705, + "step": 14204 + }, + { + "epoch": 2.92013567684243, + "grad_norm": 0.22615081071853638, + "learning_rate": 1.6591236755148064e-07, + "loss": 0.3811, + "step": 14205 + }, + { + "epoch": 2.9203412478158084, + "grad_norm": 0.23181886970996857, + "learning_rate": 1.6505657095563675e-07, + "loss": 0.3656, + "step": 14206 + }, + { + "epoch": 2.920546818789187, + "grad_norm": 0.22532041370868683, + "learning_rate": 1.642029831678993e-07, + "loss": 0.375, + "step": 14207 + }, + { + "epoch": 2.9207523897625656, + "grad_norm": 0.11956392228603363, + "learning_rate": 1.633516042303196e-07, + "loss": 0.4629, + "step": 14208 + }, + { + "epoch": 2.920957960735944, + "grad_norm": 0.22714190185070038, + "learning_rate": 1.6250243418483412e-07, + "loss": 0.3596, + "step": 14209 + }, + { + "epoch": 2.9211635317093227, + "grad_norm": 0.2288563847541809, + "learning_rate": 1.6165547307328944e-07, + "loss": 0.3806, + "step": 14210 + }, + { + "epoch": 2.9213691026827013, + "grad_norm": 0.21944314241409302, + "learning_rate": 1.6081072093740711e-07, + "loss": 0.3687, + "step": 14211 + }, + { + "epoch": 2.92157467365608, + "grad_norm": 0.11778556555509567, + "learning_rate": 1.599681778187989e-07, + "loss": 0.4448, + "step": 14212 + }, + { + "epoch": 2.921780244629458, + "grad_norm": 0.24057716131210327, + "learning_rate": 1.591278437589816e-07, + "loss": 0.39, + "step": 14213 + }, + { + "epoch": 2.921985815602837, + "grad_norm": 0.11888077110052109, + "learning_rate": 1.5828971879934706e-07, + "loss": 0.4486, + "step": 14214 + }, + { + "epoch": 2.9221913865762152, + "grad_norm": 0.23414359986782074, + "learning_rate": 1.574538029811873e-07, + "loss": 0.3829, + "step": 14215 + }, + { + "epoch": 2.922396957549594, + "grad_norm": 0.2228407859802246, + "learning_rate": 1.5662009634568432e-07, + "loss": 0.3908, + "step": 14216 + }, + { + "epoch": 2.9226025285229724, + "grad_norm": 0.23232321441173553, + "learning_rate": 1.557885989339103e-07, + "loss": 0.3872, + "step": 14217 + }, + { + "epoch": 2.922808099496351, + "grad_norm": 0.23603259027004242, + "learning_rate": 1.5495931078683746e-07, + "loss": 0.3729, + "step": 14218 + }, + { + "epoch": 2.9230136704697296, + "grad_norm": 0.23010489344596863, + "learning_rate": 1.5413223194530813e-07, + "loss": 0.3702, + "step": 14219 + }, + { + "epoch": 2.923219241443108, + "grad_norm": 0.22785669565200806, + "learning_rate": 1.5330736245007972e-07, + "loss": 0.3773, + "step": 14220 + }, + { + "epoch": 2.9234248124164868, + "grad_norm": 0.2386084794998169, + "learning_rate": 1.524847023417797e-07, + "loss": 0.3803, + "step": 14221 + }, + { + "epoch": 2.9236303833898654, + "grad_norm": 0.23408401012420654, + "learning_rate": 1.5166425166094567e-07, + "loss": 0.3836, + "step": 14222 + }, + { + "epoch": 2.923835954363244, + "grad_norm": 0.23765285313129425, + "learning_rate": 1.508460104479903e-07, + "loss": 0.376, + "step": 14223 + }, + { + "epoch": 2.9240415253366225, + "grad_norm": 0.23104673624038696, + "learning_rate": 1.5002997874323134e-07, + "loss": 0.3758, + "step": 14224 + }, + { + "epoch": 2.924247096310001, + "grad_norm": 0.2328345626592636, + "learning_rate": 1.492161565868616e-07, + "loss": 0.3718, + "step": 14225 + }, + { + "epoch": 2.9244526672833797, + "grad_norm": 0.22445005178451538, + "learning_rate": 1.4840454401898407e-07, + "loss": 0.363, + "step": 14226 + }, + { + "epoch": 2.9246582382567583, + "grad_norm": 0.22506146132946014, + "learning_rate": 1.4759514107957673e-07, + "loss": 0.3583, + "step": 14227 + }, + { + "epoch": 2.9248638092301364, + "grad_norm": 0.22295387089252472, + "learning_rate": 1.4678794780852267e-07, + "loss": 0.3901, + "step": 14228 + }, + { + "epoch": 2.9250693802035155, + "grad_norm": 0.22863556444644928, + "learning_rate": 1.4598296424557512e-07, + "loss": 0.3554, + "step": 14229 + }, + { + "epoch": 2.9252749511768936, + "grad_norm": 0.23237614333629608, + "learning_rate": 1.4518019043040233e-07, + "loss": 0.3843, + "step": 14230 + }, + { + "epoch": 2.9254805221502727, + "grad_norm": 0.22150248289108276, + "learning_rate": 1.4437962640255264e-07, + "loss": 0.3708, + "step": 14231 + }, + { + "epoch": 2.925686093123651, + "grad_norm": 0.2305610030889511, + "learning_rate": 1.4358127220146456e-07, + "loss": 0.3922, + "step": 14232 + }, + { + "epoch": 2.9258916640970294, + "grad_norm": 0.2294863909482956, + "learning_rate": 1.4278512786646658e-07, + "loss": 0.3815, + "step": 14233 + }, + { + "epoch": 2.926097235070408, + "grad_norm": 0.22797244787216187, + "learning_rate": 1.4199119343678236e-07, + "loss": 0.3751, + "step": 14234 + }, + { + "epoch": 2.9263028060437866, + "grad_norm": 0.23715586960315704, + "learning_rate": 1.4119946895153058e-07, + "loss": 0.3842, + "step": 14235 + }, + { + "epoch": 2.926508377017165, + "grad_norm": 0.22145721316337585, + "learning_rate": 1.4040995444970505e-07, + "loss": 0.3793, + "step": 14236 + }, + { + "epoch": 2.9267139479905437, + "grad_norm": 0.2366815060377121, + "learning_rate": 1.396226499702097e-07, + "loss": 0.3848, + "step": 14237 + }, + { + "epoch": 2.9269195189639223, + "grad_norm": 0.24056269228458405, + "learning_rate": 1.3883755555183343e-07, + "loss": 0.3725, + "step": 14238 + }, + { + "epoch": 2.927125089937301, + "grad_norm": 0.22281573712825775, + "learning_rate": 1.3805467123325035e-07, + "loss": 0.3695, + "step": 14239 + }, + { + "epoch": 2.9273306609106795, + "grad_norm": 0.2354237586259842, + "learning_rate": 1.3727399705302458e-07, + "loss": 0.3834, + "step": 14240 + }, + { + "epoch": 2.927536231884058, + "grad_norm": 0.121092788875103, + "learning_rate": 1.3649553304962536e-07, + "loss": 0.4405, + "step": 14241 + }, + { + "epoch": 2.9277418028574367, + "grad_norm": 0.23735617101192474, + "learning_rate": 1.3571927926139705e-07, + "loss": 0.3735, + "step": 14242 + }, + { + "epoch": 2.927947373830815, + "grad_norm": 0.24596528708934784, + "learning_rate": 1.3494523572658402e-07, + "loss": 0.3967, + "step": 14243 + }, + { + "epoch": 2.928152944804194, + "grad_norm": 0.21962697803974152, + "learning_rate": 1.3417340248332578e-07, + "loss": 0.3747, + "step": 14244 + }, + { + "epoch": 2.928358515777572, + "grad_norm": 0.2285209596157074, + "learning_rate": 1.334037795696369e-07, + "loss": 0.3752, + "step": 14245 + }, + { + "epoch": 2.928564086750951, + "grad_norm": 0.22602157294750214, + "learning_rate": 1.3263636702344207e-07, + "loss": 0.3754, + "step": 14246 + }, + { + "epoch": 2.928769657724329, + "grad_norm": 0.22371745109558105, + "learning_rate": 1.3187116488254103e-07, + "loss": 0.3879, + "step": 14247 + }, + { + "epoch": 2.9289752286977078, + "grad_norm": 0.23446328938007355, + "learning_rate": 1.3110817318463365e-07, + "loss": 0.3569, + "step": 14248 + }, + { + "epoch": 2.9291807996710864, + "grad_norm": 0.22867922484874725, + "learning_rate": 1.3034739196730984e-07, + "loss": 0.397, + "step": 14249 + }, + { + "epoch": 2.929386370644465, + "grad_norm": 0.23485369980335236, + "learning_rate": 1.295888212680496e-07, + "loss": 0.3934, + "step": 14250 + }, + { + "epoch": 2.9295919416178435, + "grad_norm": 0.23388779163360596, + "learning_rate": 1.2883246112422808e-07, + "loss": 0.3878, + "step": 14251 + }, + { + "epoch": 2.929797512591222, + "grad_norm": 0.23058055341243744, + "learning_rate": 1.2807831157310046e-07, + "loss": 0.3728, + "step": 14252 + }, + { + "epoch": 2.9300030835646007, + "grad_norm": 0.12012367695569992, + "learning_rate": 1.2732637265182702e-07, + "loss": 0.428, + "step": 14253 + }, + { + "epoch": 2.9302086545379793, + "grad_norm": 0.11733004450798035, + "learning_rate": 1.265766443974431e-07, + "loss": 0.4467, + "step": 14254 + }, + { + "epoch": 2.930414225511358, + "grad_norm": 0.23373596370220184, + "learning_rate": 1.2582912684689418e-07, + "loss": 0.3774, + "step": 14255 + }, + { + "epoch": 2.9306197964847365, + "grad_norm": 0.22442536056041718, + "learning_rate": 1.250838200370008e-07, + "loss": 0.3723, + "step": 14256 + }, + { + "epoch": 2.930825367458115, + "grad_norm": 0.119273342192173, + "learning_rate": 1.243407240044836e-07, + "loss": 0.4466, + "step": 14257 + }, + { + "epoch": 2.931030938431493, + "grad_norm": 0.12564511597156525, + "learning_rate": 1.2359983878595329e-07, + "loss": 0.4453, + "step": 14258 + }, + { + "epoch": 2.9312365094048722, + "grad_norm": 0.2270507961511612, + "learning_rate": 1.2286116441790064e-07, + "loss": 0.3577, + "step": 14259 + }, + { + "epoch": 2.9314420803782504, + "grad_norm": 0.24136748909950256, + "learning_rate": 1.2212470093673155e-07, + "loss": 0.3874, + "step": 14260 + }, + { + "epoch": 2.9316476513516294, + "grad_norm": 0.22944435477256775, + "learning_rate": 1.2139044837871204e-07, + "loss": 0.3783, + "step": 14261 + }, + { + "epoch": 2.9318532223250076, + "grad_norm": 0.2328665405511856, + "learning_rate": 1.2065840678002815e-07, + "loss": 0.3704, + "step": 14262 + }, + { + "epoch": 2.932058793298386, + "grad_norm": 0.23235177993774414, + "learning_rate": 1.1992857617674103e-07, + "loss": 0.3927, + "step": 14263 + }, + { + "epoch": 2.9322643642717647, + "grad_norm": 0.22136935591697693, + "learning_rate": 1.1920095660479691e-07, + "loss": 0.3554, + "step": 14264 + }, + { + "epoch": 2.9324699352451433, + "grad_norm": 0.23103518784046173, + "learning_rate": 1.1847554810005212e-07, + "loss": 0.3751, + "step": 14265 + }, + { + "epoch": 2.932675506218522, + "grad_norm": 0.1267227828502655, + "learning_rate": 1.177523506982431e-07, + "loss": 0.4482, + "step": 14266 + }, + { + "epoch": 2.9328810771919005, + "grad_norm": 0.23589691519737244, + "learning_rate": 1.1703136443499629e-07, + "loss": 0.3912, + "step": 14267 + }, + { + "epoch": 2.933086648165279, + "grad_norm": 0.22941534221172333, + "learning_rate": 1.1631258934583333e-07, + "loss": 0.3815, + "step": 14268 + }, + { + "epoch": 2.9332922191386577, + "grad_norm": 0.2415175586938858, + "learning_rate": 1.1559602546616089e-07, + "loss": 0.3837, + "step": 14269 + }, + { + "epoch": 2.9334977901120363, + "grad_norm": 0.22201284766197205, + "learning_rate": 1.148816728312857e-07, + "loss": 0.3859, + "step": 14270 + }, + { + "epoch": 2.933703361085415, + "grad_norm": 0.23160016536712646, + "learning_rate": 1.1416953147639464e-07, + "loss": 0.378, + "step": 14271 + }, + { + "epoch": 2.9339089320587934, + "grad_norm": 0.23736536502838135, + "learning_rate": 1.1345960143657463e-07, + "loss": 0.363, + "step": 14272 + }, + { + "epoch": 2.9341145030321716, + "grad_norm": 0.12271010130643845, + "learning_rate": 1.127518827468027e-07, + "loss": 0.4492, + "step": 14273 + }, + { + "epoch": 2.9343200740055506, + "grad_norm": 0.2341691106557846, + "learning_rate": 1.1204637544194097e-07, + "loss": 0.3787, + "step": 14274 + }, + { + "epoch": 2.9345256449789288, + "grad_norm": 0.23392406105995178, + "learning_rate": 1.1134307955675161e-07, + "loss": 0.3873, + "step": 14275 + }, + { + "epoch": 2.934731215952308, + "grad_norm": 0.2216750532388687, + "learning_rate": 1.1064199512587692e-07, + "loss": 0.3777, + "step": 14276 + }, + { + "epoch": 2.934936786925686, + "grad_norm": 0.23249836266040802, + "learning_rate": 1.0994312218385927e-07, + "loss": 0.365, + "step": 14277 + }, + { + "epoch": 2.9351423578990645, + "grad_norm": 0.23422518372535706, + "learning_rate": 1.0924646076513112e-07, + "loss": 0.3889, + "step": 14278 + }, + { + "epoch": 2.935347928872443, + "grad_norm": 0.23376347124576569, + "learning_rate": 1.0855201090401002e-07, + "loss": 0.3868, + "step": 14279 + }, + { + "epoch": 2.9355534998458217, + "grad_norm": 0.21998612582683563, + "learning_rate": 1.078597726347086e-07, + "loss": 0.3909, + "step": 14280 + }, + { + "epoch": 2.9357590708192003, + "grad_norm": 0.23854362964630127, + "learning_rate": 1.0716974599132956e-07, + "loss": 0.3771, + "step": 14281 + }, + { + "epoch": 2.935964641792579, + "grad_norm": 0.11935044080018997, + "learning_rate": 1.0648193100787074e-07, + "loss": 0.4408, + "step": 14282 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 0.23328512907028198, + "learning_rate": 1.0579632771821502e-07, + "loss": 0.3917, + "step": 14283 + }, + { + "epoch": 2.936375783739336, + "grad_norm": 0.2255300134420395, + "learning_rate": 1.0511293615613539e-07, + "loss": 0.3756, + "step": 14284 + }, + { + "epoch": 2.9365813547127146, + "grad_norm": 0.2301304042339325, + "learning_rate": 1.0443175635530489e-07, + "loss": 0.3927, + "step": 14285 + }, + { + "epoch": 2.9367869256860932, + "grad_norm": 0.11829908192157745, + "learning_rate": 1.037527883492817e-07, + "loss": 0.4427, + "step": 14286 + }, + { + "epoch": 2.936992496659472, + "grad_norm": 0.23846930265426636, + "learning_rate": 1.0307603217151906e-07, + "loss": 0.3758, + "step": 14287 + }, + { + "epoch": 2.9371980676328504, + "grad_norm": 0.22976188361644745, + "learning_rate": 1.0240148785534532e-07, + "loss": 0.3702, + "step": 14288 + }, + { + "epoch": 2.937403638606229, + "grad_norm": 0.235699862241745, + "learning_rate": 1.0172915543400386e-07, + "loss": 0.3791, + "step": 14289 + }, + { + "epoch": 2.937609209579607, + "grad_norm": 0.23126575350761414, + "learning_rate": 1.0105903494060821e-07, + "loss": 0.3925, + "step": 14290 + }, + { + "epoch": 2.937814780552986, + "grad_norm": 0.12287239730358124, + "learning_rate": 1.0039112640818193e-07, + "loss": 0.4534, + "step": 14291 + }, + { + "epoch": 2.9380203515263643, + "grad_norm": 0.22776830196380615, + "learning_rate": 9.972542986961875e-08, + "loss": 0.3802, + "step": 14292 + }, + { + "epoch": 2.938225922499743, + "grad_norm": 0.23235289752483368, + "learning_rate": 9.906194535772739e-08, + "loss": 0.3922, + "step": 14293 + }, + { + "epoch": 2.9384314934731215, + "grad_norm": 0.12463247776031494, + "learning_rate": 9.840067290518173e-08, + "loss": 0.4412, + "step": 14294 + }, + { + "epoch": 2.9386370644465, + "grad_norm": 0.12007234990596771, + "learning_rate": 9.77416125445707e-08, + "loss": 0.4616, + "step": 14295 + }, + { + "epoch": 2.9388426354198787, + "grad_norm": 0.121745266020298, + "learning_rate": 9.708476430835333e-08, + "loss": 0.4576, + "step": 14296 + }, + { + "epoch": 2.9390482063932573, + "grad_norm": 0.23362316191196442, + "learning_rate": 9.643012822889375e-08, + "loss": 0.387, + "step": 14297 + }, + { + "epoch": 2.939253777366636, + "grad_norm": 0.21919940412044525, + "learning_rate": 9.577770433844613e-08, + "loss": 0.3776, + "step": 14298 + }, + { + "epoch": 2.9394593483400144, + "grad_norm": 0.23474140465259552, + "learning_rate": 9.512749266914978e-08, + "loss": 0.3797, + "step": 14299 + }, + { + "epoch": 2.939664919313393, + "grad_norm": 0.22480328381061554, + "learning_rate": 9.447949325303407e-08, + "loss": 0.3901, + "step": 14300 + }, + { + "epoch": 2.9398704902867716, + "grad_norm": 0.11820299923419952, + "learning_rate": 9.383370612202347e-08, + "loss": 0.4407, + "step": 14301 + }, + { + "epoch": 2.94007606126015, + "grad_norm": 0.23817752301692963, + "learning_rate": 9.319013130794252e-08, + "loss": 0.4009, + "step": 14302 + }, + { + "epoch": 2.940281632233529, + "grad_norm": 0.23195527493953705, + "learning_rate": 9.254876884248587e-08, + "loss": 0.373, + "step": 14303 + }, + { + "epoch": 2.9404872032069074, + "grad_norm": 0.22849521040916443, + "learning_rate": 9.190961875725324e-08, + "loss": 0.387, + "step": 14304 + }, + { + "epoch": 2.9406927741802855, + "grad_norm": 0.2443472295999527, + "learning_rate": 9.127268108373444e-08, + "loss": 0.395, + "step": 14305 + }, + { + "epoch": 2.9408983451536646, + "grad_norm": 0.12250496447086334, + "learning_rate": 9.063795585330937e-08, + "loss": 0.4541, + "step": 14306 + }, + { + "epoch": 2.9411039161270427, + "grad_norm": 0.23145142197608948, + "learning_rate": 9.000544309724302e-08, + "loss": 0.37, + "step": 14307 + }, + { + "epoch": 2.9413094871004213, + "grad_norm": 0.2310493439435959, + "learning_rate": 8.937514284670545e-08, + "loss": 0.3729, + "step": 14308 + }, + { + "epoch": 2.9415150580738, + "grad_norm": 0.2356126606464386, + "learning_rate": 8.874705513273685e-08, + "loss": 0.3647, + "step": 14309 + }, + { + "epoch": 2.9417206290471785, + "grad_norm": 0.1224084421992302, + "learning_rate": 8.812117998629244e-08, + "loss": 0.4391, + "step": 14310 + }, + { + "epoch": 2.941926200020557, + "grad_norm": 0.23388880491256714, + "learning_rate": 8.749751743819257e-08, + "loss": 0.4037, + "step": 14311 + }, + { + "epoch": 2.9421317709939356, + "grad_norm": 0.1348462849855423, + "learning_rate": 8.687606751917766e-08, + "loss": 0.4476, + "step": 14312 + }, + { + "epoch": 2.9423373419673142, + "grad_norm": 0.11941714584827423, + "learning_rate": 8.625683025984821e-08, + "loss": 0.4535, + "step": 14313 + }, + { + "epoch": 2.942542912940693, + "grad_norm": 0.2301827371120453, + "learning_rate": 8.563980569071983e-08, + "loss": 0.373, + "step": 14314 + }, + { + "epoch": 2.9427484839140714, + "grad_norm": 0.23292043805122375, + "learning_rate": 8.50249938421932e-08, + "loss": 0.3848, + "step": 14315 + }, + { + "epoch": 2.94295405488745, + "grad_norm": 0.23747049272060394, + "learning_rate": 8.44123947445491e-08, + "loss": 0.3767, + "step": 14316 + }, + { + "epoch": 2.9431596258608286, + "grad_norm": 0.23186716437339783, + "learning_rate": 8.380200842797336e-08, + "loss": 0.3821, + "step": 14317 + }, + { + "epoch": 2.943365196834207, + "grad_norm": 0.23163893818855286, + "learning_rate": 8.319383492253696e-08, + "loss": 0.3671, + "step": 14318 + }, + { + "epoch": 2.9435707678075858, + "grad_norm": 0.12333094328641891, + "learning_rate": 8.258787425819592e-08, + "loss": 0.4495, + "step": 14319 + }, + { + "epoch": 2.943776338780964, + "grad_norm": 0.12259241193532944, + "learning_rate": 8.198412646480636e-08, + "loss": 0.46, + "step": 14320 + }, + { + "epoch": 2.943981909754343, + "grad_norm": 0.23415526747703552, + "learning_rate": 8.138259157211447e-08, + "loss": 0.3829, + "step": 14321 + }, + { + "epoch": 2.944187480727721, + "grad_norm": 0.23561497032642365, + "learning_rate": 8.078326960975158e-08, + "loss": 0.3766, + "step": 14322 + }, + { + "epoch": 2.9443930517010997, + "grad_norm": 0.22486624121665955, + "learning_rate": 8.018616060724904e-08, + "loss": 0.3748, + "step": 14323 + }, + { + "epoch": 2.9445986226744782, + "grad_norm": 0.22078227996826172, + "learning_rate": 7.959126459401834e-08, + "loss": 0.3688, + "step": 14324 + }, + { + "epoch": 2.944804193647857, + "grad_norm": 0.23797355592250824, + "learning_rate": 7.899858159936601e-08, + "loss": 0.387, + "step": 14325 + }, + { + "epoch": 2.9450097646212354, + "grad_norm": 0.2293400913476944, + "learning_rate": 7.840811165249373e-08, + "loss": 0.373, + "step": 14326 + }, + { + "epoch": 2.945215335594614, + "grad_norm": 0.12413428723812103, + "learning_rate": 7.781985478249321e-08, + "loss": 0.4478, + "step": 14327 + }, + { + "epoch": 2.9454209065679926, + "grad_norm": 0.22216647863388062, + "learning_rate": 7.723381101834126e-08, + "loss": 0.3686, + "step": 14328 + }, + { + "epoch": 2.945626477541371, + "grad_norm": 0.23056413233280182, + "learning_rate": 7.66499803889098e-08, + "loss": 0.3739, + "step": 14329 + }, + { + "epoch": 2.94583204851475, + "grad_norm": 0.23194332420825958, + "learning_rate": 7.606836292296582e-08, + "loss": 0.3727, + "step": 14330 + }, + { + "epoch": 2.9460376194881284, + "grad_norm": 0.24576567113399506, + "learning_rate": 7.548895864915639e-08, + "loss": 0.3977, + "step": 14331 + }, + { + "epoch": 2.946243190461507, + "grad_norm": 0.22875289618968964, + "learning_rate": 7.491176759602869e-08, + "loss": 0.3842, + "step": 14332 + }, + { + "epoch": 2.9464487614348855, + "grad_norm": 0.2283722311258316, + "learning_rate": 7.433678979201997e-08, + "loss": 0.3824, + "step": 14333 + }, + { + "epoch": 2.946654332408264, + "grad_norm": 0.23309841752052307, + "learning_rate": 7.376402526545755e-08, + "loss": 0.3859, + "step": 14334 + }, + { + "epoch": 2.9468599033816423, + "grad_norm": 0.23801040649414062, + "learning_rate": 7.31934740445589e-08, + "loss": 0.3857, + "step": 14335 + }, + { + "epoch": 2.9470654743550213, + "grad_norm": 0.23424702882766724, + "learning_rate": 7.26251361574265e-08, + "loss": 0.3835, + "step": 14336 + }, + { + "epoch": 2.9472710453283995, + "grad_norm": 0.23089328408241272, + "learning_rate": 7.205901163206297e-08, + "loss": 0.3669, + "step": 14337 + }, + { + "epoch": 2.947476616301778, + "grad_norm": 0.22902965545654297, + "learning_rate": 7.149510049636099e-08, + "loss": 0.3738, + "step": 14338 + }, + { + "epoch": 2.9476821872751566, + "grad_norm": 0.22217592597007751, + "learning_rate": 7.093340277809834e-08, + "loss": 0.3853, + "step": 14339 + }, + { + "epoch": 2.947887758248535, + "grad_norm": 0.24186544120311737, + "learning_rate": 7.03739185049529e-08, + "loss": 0.3741, + "step": 14340 + }, + { + "epoch": 2.948093329221914, + "grad_norm": 0.2368420511484146, + "learning_rate": 6.98166477044826e-08, + "loss": 0.3809, + "step": 14341 + }, + { + "epoch": 2.9482989001952924, + "grad_norm": 0.11286085844039917, + "learning_rate": 6.926159040414049e-08, + "loss": 0.4515, + "step": 14342 + }, + { + "epoch": 2.948504471168671, + "grad_norm": 0.23017874360084534, + "learning_rate": 6.870874663127469e-08, + "loss": 0.3689, + "step": 14343 + }, + { + "epoch": 2.9487100421420496, + "grad_norm": 0.22851766645908356, + "learning_rate": 6.815811641312342e-08, + "loss": 0.3729, + "step": 14344 + }, + { + "epoch": 2.948915613115428, + "grad_norm": 0.22411444783210754, + "learning_rate": 6.760969977680498e-08, + "loss": 0.3666, + "step": 14345 + }, + { + "epoch": 2.9491211840888067, + "grad_norm": 0.22939811646938324, + "learning_rate": 6.706349674934776e-08, + "loss": 0.3816, + "step": 14346 + }, + { + "epoch": 2.9493267550621853, + "grad_norm": 0.2309289425611496, + "learning_rate": 6.651950735765522e-08, + "loss": 0.3866, + "step": 14347 + }, + { + "epoch": 2.949532326035564, + "grad_norm": 0.25561413168907166, + "learning_rate": 6.597773162853094e-08, + "loss": 0.4024, + "step": 14348 + }, + { + "epoch": 2.9497378970089425, + "grad_norm": 0.23126906156539917, + "learning_rate": 6.543816958865857e-08, + "loss": 0.3858, + "step": 14349 + }, + { + "epoch": 2.9499434679823207, + "grad_norm": 0.23696114122867584, + "learning_rate": 6.490082126462682e-08, + "loss": 0.3707, + "step": 14350 + }, + { + "epoch": 2.9501490389556997, + "grad_norm": 0.12179608643054962, + "learning_rate": 6.436568668290455e-08, + "loss": 0.4631, + "step": 14351 + }, + { + "epoch": 2.950354609929078, + "grad_norm": 0.22677427530288696, + "learning_rate": 6.383276586985565e-08, + "loss": 0.3663, + "step": 14352 + }, + { + "epoch": 2.9505601809024564, + "grad_norm": 0.23234906792640686, + "learning_rate": 6.330205885173413e-08, + "loss": 0.3811, + "step": 14353 + }, + { + "epoch": 2.950765751875835, + "grad_norm": 0.23495686054229736, + "learning_rate": 6.277356565468906e-08, + "loss": 0.3994, + "step": 14354 + }, + { + "epoch": 2.9509713228492136, + "grad_norm": 0.23368287086486816, + "learning_rate": 6.224728630474964e-08, + "loss": 0.3713, + "step": 14355 + }, + { + "epoch": 2.951176893822592, + "grad_norm": 0.11765862256288528, + "learning_rate": 6.17232208278551e-08, + "loss": 0.445, + "step": 14356 + }, + { + "epoch": 2.9513824647959708, + "grad_norm": 0.23425832390785217, + "learning_rate": 6.12013692498098e-08, + "loss": 0.3983, + "step": 14357 + }, + { + "epoch": 2.9515880357693494, + "grad_norm": 0.11689037829637527, + "learning_rate": 6.068173159633317e-08, + "loss": 0.4463, + "step": 14358 + }, + { + "epoch": 2.951793606742728, + "grad_norm": 0.2250240296125412, + "learning_rate": 6.016430789302474e-08, + "loss": 0.3852, + "step": 14359 + }, + { + "epoch": 2.9519991777161065, + "grad_norm": 0.23186476528644562, + "learning_rate": 5.964909816536912e-08, + "loss": 0.3659, + "step": 14360 + }, + { + "epoch": 2.952204748689485, + "grad_norm": 0.2178521603345871, + "learning_rate": 5.913610243875101e-08, + "loss": 0.3706, + "step": 14361 + }, + { + "epoch": 2.9524103196628637, + "grad_norm": 0.23056325316429138, + "learning_rate": 5.8625320738445176e-08, + "loss": 0.3751, + "step": 14362 + }, + { + "epoch": 2.9526158906362423, + "grad_norm": 0.2350500226020813, + "learning_rate": 5.811675308961151e-08, + "loss": 0.3784, + "step": 14363 + }, + { + "epoch": 2.952821461609621, + "grad_norm": 0.22323279082775116, + "learning_rate": 5.7610399517309956e-08, + "loss": 0.3732, + "step": 14364 + }, + { + "epoch": 2.953027032582999, + "grad_norm": 0.23257021605968475, + "learning_rate": 5.7106260046485564e-08, + "loss": 0.3833, + "step": 14365 + }, + { + "epoch": 2.953232603556378, + "grad_norm": 0.22943510115146637, + "learning_rate": 5.6604334701968466e-08, + "loss": 0.3664, + "step": 14366 + }, + { + "epoch": 2.953438174529756, + "grad_norm": 0.25284644961357117, + "learning_rate": 5.6104623508493883e-08, + "loss": 0.3844, + "step": 14367 + }, + { + "epoch": 2.953643745503135, + "grad_norm": 0.23901039361953735, + "learning_rate": 5.560712649067712e-08, + "loss": 0.3866, + "step": 14368 + }, + { + "epoch": 2.9538493164765134, + "grad_norm": 0.23246188461780548, + "learning_rate": 5.5111843673028574e-08, + "loss": 0.3791, + "step": 14369 + }, + { + "epoch": 2.954054887449892, + "grad_norm": 0.22920754551887512, + "learning_rate": 5.4618775079948725e-08, + "loss": 0.3846, + "step": 14370 + }, + { + "epoch": 2.9542604584232706, + "grad_norm": 0.23537150025367737, + "learning_rate": 5.412792073572315e-08, + "loss": 0.3787, + "step": 14371 + }, + { + "epoch": 2.954466029396649, + "grad_norm": 0.23101921379566193, + "learning_rate": 5.363928066454249e-08, + "loss": 0.3592, + "step": 14372 + }, + { + "epoch": 2.9546716003700277, + "grad_norm": 0.11517384648323059, + "learning_rate": 5.31528548904775e-08, + "loss": 0.4418, + "step": 14373 + }, + { + "epoch": 2.9548771713434063, + "grad_norm": 0.24331872165203094, + "learning_rate": 5.266864343748401e-08, + "loss": 0.3696, + "step": 14374 + }, + { + "epoch": 2.955082742316785, + "grad_norm": 0.12557660043239594, + "learning_rate": 5.218664632942794e-08, + "loss": 0.4405, + "step": 14375 + }, + { + "epoch": 2.9552883132901635, + "grad_norm": 0.2361089289188385, + "learning_rate": 5.170686359005028e-08, + "loss": 0.399, + "step": 14376 + }, + { + "epoch": 2.955493884263542, + "grad_norm": 0.23642629384994507, + "learning_rate": 5.122929524298215e-08, + "loss": 0.3766, + "step": 14377 + }, + { + "epoch": 2.9556994552369207, + "grad_norm": 0.22410228848457336, + "learning_rate": 5.07539413117647e-08, + "loss": 0.369, + "step": 14378 + }, + { + "epoch": 2.9559050262102993, + "grad_norm": 0.11725395172834396, + "learning_rate": 5.028080181980421e-08, + "loss": 0.4579, + "step": 14379 + }, + { + "epoch": 2.9561105971836774, + "grad_norm": 0.11997832357883453, + "learning_rate": 4.9809876790412045e-08, + "loss": 0.4409, + "step": 14380 + }, + { + "epoch": 2.9563161681570564, + "grad_norm": 0.22873012721538544, + "learning_rate": 4.9341166246794635e-08, + "loss": 0.3836, + "step": 14381 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.24191080033779144, + "learning_rate": 4.8874670212033516e-08, + "loss": 0.408, + "step": 14382 + }, + { + "epoch": 2.956727310103813, + "grad_norm": 0.23186847567558289, + "learning_rate": 4.841038870912029e-08, + "loss": 0.4031, + "step": 14383 + }, + { + "epoch": 2.9569328810771918, + "grad_norm": 0.22565621137619019, + "learning_rate": 4.7948321760926675e-08, + "loss": 0.3672, + "step": 14384 + }, + { + "epoch": 2.9571384520505704, + "grad_norm": 0.23073460161685944, + "learning_rate": 4.748846939020946e-08, + "loss": 0.3797, + "step": 14385 + }, + { + "epoch": 2.957344023023949, + "grad_norm": 0.23532749712467194, + "learning_rate": 4.703083161963051e-08, + "loss": 0.3721, + "step": 14386 + }, + { + "epoch": 2.9575495939973275, + "grad_norm": 0.27987563610076904, + "learning_rate": 4.657540847173181e-08, + "loss": 0.368, + "step": 14387 + }, + { + "epoch": 2.957755164970706, + "grad_norm": 0.2355928122997284, + "learning_rate": 4.61221999689504e-08, + "loss": 0.3909, + "step": 14388 + }, + { + "epoch": 2.9579607359440847, + "grad_norm": 0.22605665028095245, + "learning_rate": 4.567120613361342e-08, + "loss": 0.3669, + "step": 14389 + }, + { + "epoch": 2.9581663069174633, + "grad_norm": 0.22839273512363434, + "learning_rate": 4.52224269879431e-08, + "loss": 0.3821, + "step": 14390 + }, + { + "epoch": 2.958371877890842, + "grad_norm": 0.23111465573310852, + "learning_rate": 4.477586255404176e-08, + "loss": 0.3917, + "step": 14391 + }, + { + "epoch": 2.9585774488642205, + "grad_norm": 0.2226291000843048, + "learning_rate": 4.433151285391679e-08, + "loss": 0.374, + "step": 14392 + }, + { + "epoch": 2.958783019837599, + "grad_norm": 0.235224187374115, + "learning_rate": 4.388937790945569e-08, + "loss": 0.3761, + "step": 14393 + }, + { + "epoch": 2.9589885908109776, + "grad_norm": 0.22255218029022217, + "learning_rate": 4.3449457742441025e-08, + "loss": 0.3554, + "step": 14394 + }, + { + "epoch": 2.959194161784356, + "grad_norm": 0.23567332327365875, + "learning_rate": 4.3011752374545464e-08, + "loss": 0.3757, + "step": 14395 + }, + { + "epoch": 2.959399732757735, + "grad_norm": 0.2175855040550232, + "learning_rate": 4.257626182732677e-08, + "loss": 0.3868, + "step": 14396 + }, + { + "epoch": 2.959605303731113, + "grad_norm": 0.22269536554813385, + "learning_rate": 4.214298612225276e-08, + "loss": 0.3789, + "step": 14397 + }, + { + "epoch": 2.959810874704492, + "grad_norm": 0.2323738932609558, + "learning_rate": 4.1711925280656376e-08, + "loss": 0.3916, + "step": 14398 + }, + { + "epoch": 2.96001644567787, + "grad_norm": 0.23786410689353943, + "learning_rate": 4.1283079323780616e-08, + "loss": 0.3622, + "step": 14399 + }, + { + "epoch": 2.9602220166512487, + "grad_norm": 0.22747676074504852, + "learning_rate": 4.085644827275359e-08, + "loss": 0.3803, + "step": 14400 + }, + { + "epoch": 2.9604275876246273, + "grad_norm": 0.23511525988578796, + "learning_rate": 4.043203214858848e-08, + "loss": 0.3931, + "step": 14401 + }, + { + "epoch": 2.960633158598006, + "grad_norm": 0.225963294506073, + "learning_rate": 4.000983097219358e-08, + "loss": 0.3679, + "step": 14402 + }, + { + "epoch": 2.9608387295713845, + "grad_norm": 0.23282590508460999, + "learning_rate": 3.958984476437722e-08, + "loss": 0.3793, + "step": 14403 + }, + { + "epoch": 2.961044300544763, + "grad_norm": 0.2279476523399353, + "learning_rate": 3.917207354581787e-08, + "loss": 0.3647, + "step": 14404 + }, + { + "epoch": 2.9612498715181417, + "grad_norm": 0.239571213722229, + "learning_rate": 3.875651733710906e-08, + "loss": 0.3865, + "step": 14405 + }, + { + "epoch": 2.9614554424915203, + "grad_norm": 0.2309008538722992, + "learning_rate": 3.834317615871941e-08, + "loss": 0.3594, + "step": 14406 + }, + { + "epoch": 2.961661013464899, + "grad_norm": 0.22634616494178772, + "learning_rate": 3.793205003100764e-08, + "loss": 0.3762, + "step": 14407 + }, + { + "epoch": 2.9618665844382774, + "grad_norm": 0.24357974529266357, + "learning_rate": 3.752313897423754e-08, + "loss": 0.3808, + "step": 14408 + }, + { + "epoch": 2.962072155411656, + "grad_norm": 0.2598305642604828, + "learning_rate": 3.7116443008543e-08, + "loss": 0.3751, + "step": 14409 + }, + { + "epoch": 2.962277726385034, + "grad_norm": 0.2315262258052826, + "learning_rate": 3.6711962153963e-08, + "loss": 0.3667, + "step": 14410 + }, + { + "epoch": 2.962483297358413, + "grad_norm": 0.22729608416557312, + "learning_rate": 3.6309696430431586e-08, + "loss": 0.3758, + "step": 14411 + }, + { + "epoch": 2.9626888683317913, + "grad_norm": 0.23362228274345398, + "learning_rate": 3.590964585776291e-08, + "loss": 0.402, + "step": 14412 + }, + { + "epoch": 2.9628944393051704, + "grad_norm": 0.24321232736110687, + "learning_rate": 3.551181045566121e-08, + "loss": 0.3867, + "step": 14413 + }, + { + "epoch": 2.9631000102785485, + "grad_norm": 0.2254071682691574, + "learning_rate": 3.511619024373081e-08, + "loss": 0.3674, + "step": 14414 + }, + { + "epoch": 2.963305581251927, + "grad_norm": 0.23968133330345154, + "learning_rate": 3.472278524145611e-08, + "loss": 0.3507, + "step": 14415 + }, + { + "epoch": 2.9635111522253057, + "grad_norm": 0.22927747666835785, + "learning_rate": 3.433159546822662e-08, + "loss": 0.3699, + "step": 14416 + }, + { + "epoch": 2.9637167231986843, + "grad_norm": 0.4465451240539551, + "learning_rate": 3.394262094331191e-08, + "loss": 0.3874, + "step": 14417 + }, + { + "epoch": 2.963922294172063, + "grad_norm": 0.23466768860816956, + "learning_rate": 3.355586168587166e-08, + "loss": 0.376, + "step": 14418 + }, + { + "epoch": 2.9641278651454415, + "grad_norm": 0.11790206283330917, + "learning_rate": 3.3171317714960624e-08, + "loss": 0.4474, + "step": 14419 + }, + { + "epoch": 2.96433343611882, + "grad_norm": 0.23741118609905243, + "learning_rate": 3.278898904952366e-08, + "loss": 0.3897, + "step": 14420 + }, + { + "epoch": 2.9645390070921986, + "grad_norm": 0.2253805696964264, + "learning_rate": 3.240887570840068e-08, + "loss": 0.3498, + "step": 14421 + }, + { + "epoch": 2.9647445780655772, + "grad_norm": 0.22657155990600586, + "learning_rate": 3.203097771031172e-08, + "loss": 0.3677, + "step": 14422 + }, + { + "epoch": 2.964950149038956, + "grad_norm": 0.22919400036334991, + "learning_rate": 3.165529507387188e-08, + "loss": 0.3756, + "step": 14423 + }, + { + "epoch": 2.9651557200123344, + "grad_norm": 0.12317074835300446, + "learning_rate": 3.128182781760136e-08, + "loss": 0.4487, + "step": 14424 + }, + { + "epoch": 2.9653612909857126, + "grad_norm": 0.12205608189105988, + "learning_rate": 3.0910575959890444e-08, + "loss": 0.4527, + "step": 14425 + }, + { + "epoch": 2.9655668619590916, + "grad_norm": 0.22888796031475067, + "learning_rate": 3.0541539519029495e-08, + "loss": 0.3703, + "step": 14426 + }, + { + "epoch": 2.9657724329324697, + "grad_norm": 0.24040739238262177, + "learning_rate": 3.017471851319897e-08, + "loss": 0.3859, + "step": 14427 + }, + { + "epoch": 2.9659780039058488, + "grad_norm": 0.120720773935318, + "learning_rate": 2.9810112960474425e-08, + "loss": 0.4494, + "step": 14428 + }, + { + "epoch": 2.966183574879227, + "grad_norm": 0.23405267298221588, + "learning_rate": 2.944772287881148e-08, + "loss": 0.3848, + "step": 14429 + }, + { + "epoch": 2.9663891458526055, + "grad_norm": 0.23550044000148773, + "learning_rate": 2.9087548286070853e-08, + "loss": 0.38, + "step": 14430 + }, + { + "epoch": 2.966594716825984, + "grad_norm": 0.22151748836040497, + "learning_rate": 2.8729589199993357e-08, + "loss": 0.3612, + "step": 14431 + }, + { + "epoch": 2.9668002877993627, + "grad_norm": 0.2364836186170578, + "learning_rate": 2.837384563821488e-08, + "loss": 0.3892, + "step": 14432 + }, + { + "epoch": 2.9670058587727413, + "grad_norm": 0.23630112409591675, + "learning_rate": 2.802031761825641e-08, + "loss": 0.3829, + "step": 14433 + }, + { + "epoch": 2.96721142974612, + "grad_norm": 0.23327064514160156, + "learning_rate": 2.766900515753901e-08, + "loss": 0.3951, + "step": 14434 + }, + { + "epoch": 2.9674170007194984, + "grad_norm": 0.12371329218149185, + "learning_rate": 2.7319908273373828e-08, + "loss": 0.4328, + "step": 14435 + }, + { + "epoch": 2.967622571692877, + "grad_norm": 0.11786897480487823, + "learning_rate": 2.697302698295212e-08, + "loss": 0.4559, + "step": 14436 + }, + { + "epoch": 2.9678281426662556, + "grad_norm": 0.23964469134807587, + "learning_rate": 2.6628361303365212e-08, + "loss": 0.3775, + "step": 14437 + }, + { + "epoch": 2.968033713639634, + "grad_norm": 0.11859652400016785, + "learning_rate": 2.628591125159452e-08, + "loss": 0.4539, + "step": 14438 + }, + { + "epoch": 2.968239284613013, + "grad_norm": 0.24552220106124878, + "learning_rate": 2.594567684450655e-08, + "loss": 0.3925, + "step": 14439 + }, + { + "epoch": 2.968444855586391, + "grad_norm": 0.2301911562681198, + "learning_rate": 2.560765809887289e-08, + "loss": 0.3591, + "step": 14440 + }, + { + "epoch": 2.96865042655977, + "grad_norm": 0.23525011539459229, + "learning_rate": 2.527185503134022e-08, + "loss": 0.3905, + "step": 14441 + }, + { + "epoch": 2.968855997533148, + "grad_norm": 0.123292475938797, + "learning_rate": 2.493826765845031e-08, + "loss": 0.4365, + "step": 14442 + }, + { + "epoch": 2.969061568506527, + "grad_norm": 0.23423157632350922, + "learning_rate": 2.4606895996635016e-08, + "loss": 0.3867, + "step": 14443 + }, + { + "epoch": 2.9692671394799053, + "grad_norm": 0.2355274260044098, + "learning_rate": 2.4277740062226274e-08, + "loss": 0.392, + "step": 14444 + }, + { + "epoch": 2.969472710453284, + "grad_norm": 0.12355451285839081, + "learning_rate": 2.395079987144111e-08, + "loss": 0.4473, + "step": 14445 + }, + { + "epoch": 2.9696782814266625, + "grad_norm": 0.23346541821956635, + "learning_rate": 2.362607544037665e-08, + "loss": 0.3761, + "step": 14446 + }, + { + "epoch": 2.969883852400041, + "grad_norm": 0.23978963494300842, + "learning_rate": 2.3303566785040087e-08, + "loss": 0.366, + "step": 14447 + }, + { + "epoch": 2.9700894233734196, + "grad_norm": 0.22105932235717773, + "learning_rate": 2.298327392131372e-08, + "loss": 0.3697, + "step": 14448 + }, + { + "epoch": 2.9702949943467982, + "grad_norm": 0.23754067718982697, + "learning_rate": 2.2665196864984918e-08, + "loss": 0.3773, + "step": 14449 + }, + { + "epoch": 2.970500565320177, + "grad_norm": 0.23977546393871307, + "learning_rate": 2.2349335631711155e-08, + "loss": 0.4008, + "step": 14450 + }, + { + "epoch": 2.9707061362935554, + "grad_norm": 0.22875571250915527, + "learning_rate": 2.2035690237064977e-08, + "loss": 0.3623, + "step": 14451 + }, + { + "epoch": 2.970911707266934, + "grad_norm": 0.23122116923332214, + "learning_rate": 2.1724260696494027e-08, + "loss": 0.369, + "step": 14452 + }, + { + "epoch": 2.9711172782403126, + "grad_norm": 0.12258761376142502, + "learning_rate": 2.141504702533603e-08, + "loss": 0.4432, + "step": 14453 + }, + { + "epoch": 2.971322849213691, + "grad_norm": 0.2527145445346832, + "learning_rate": 2.1108049238833806e-08, + "loss": 0.3691, + "step": 14454 + }, + { + "epoch": 2.9715284201870698, + "grad_norm": 0.22957132756710052, + "learning_rate": 2.080326735210525e-08, + "loss": 0.3863, + "step": 14455 + }, + { + "epoch": 2.9717339911604483, + "grad_norm": 0.11908449977636337, + "learning_rate": 2.050070138016835e-08, + "loss": 0.4381, + "step": 14456 + }, + { + "epoch": 2.9719395621338265, + "grad_norm": 0.23832279443740845, + "learning_rate": 2.020035133793119e-08, + "loss": 0.3762, + "step": 14457 + }, + { + "epoch": 2.9721451331072055, + "grad_norm": 0.22900496423244476, + "learning_rate": 1.990221724018193e-08, + "loss": 0.3593, + "step": 14458 + }, + { + "epoch": 2.9723507040805837, + "grad_norm": 0.23259581625461578, + "learning_rate": 1.960629910161882e-08, + "loss": 0.3722, + "step": 14459 + }, + { + "epoch": 2.9725562750539622, + "grad_norm": 0.2373165637254715, + "learning_rate": 1.93125969368102e-08, + "loss": 0.3792, + "step": 14460 + }, + { + "epoch": 2.972761846027341, + "grad_norm": 0.23144948482513428, + "learning_rate": 1.9021110760234494e-08, + "loss": 0.3921, + "step": 14461 + }, + { + "epoch": 2.9729674170007194, + "grad_norm": 0.22227592766284943, + "learning_rate": 1.8731840586250217e-08, + "loss": 0.3614, + "step": 14462 + }, + { + "epoch": 2.973172987974098, + "grad_norm": 0.231735497713089, + "learning_rate": 1.844478642910097e-08, + "loss": 0.39, + "step": 14463 + }, + { + "epoch": 2.9733785589474766, + "grad_norm": 0.23499642312526703, + "learning_rate": 1.8159948302940432e-08, + "loss": 0.3636, + "step": 14464 + }, + { + "epoch": 2.973584129920855, + "grad_norm": 0.12244053184986115, + "learning_rate": 1.7877326221787395e-08, + "loss": 0.449, + "step": 14465 + }, + { + "epoch": 2.973789700894234, + "grad_norm": 0.2361563742160797, + "learning_rate": 1.7596920199575706e-08, + "loss": 0.3727, + "step": 14466 + }, + { + "epoch": 2.9739952718676124, + "grad_norm": 0.22725822031497955, + "learning_rate": 1.731873025011932e-08, + "loss": 0.3663, + "step": 14467 + }, + { + "epoch": 2.974200842840991, + "grad_norm": 0.23438185453414917, + "learning_rate": 1.7042756387117275e-08, + "loss": 0.3793, + "step": 14468 + }, + { + "epoch": 2.9744064138143695, + "grad_norm": 0.24105405807495117, + "learning_rate": 1.6768998624168698e-08, + "loss": 0.393, + "step": 14469 + }, + { + "epoch": 2.974611984787748, + "grad_norm": 0.23841865360736847, + "learning_rate": 1.6497456974762794e-08, + "loss": 0.389, + "step": 14470 + }, + { + "epoch": 2.9748175557611267, + "grad_norm": 0.23006348311901093, + "learning_rate": 1.6228131452273864e-08, + "loss": 0.3752, + "step": 14471 + }, + { + "epoch": 2.975023126734505, + "grad_norm": 0.22141233086585999, + "learning_rate": 1.5961022069971298e-08, + "loss": 0.3466, + "step": 14472 + }, + { + "epoch": 2.975228697707884, + "grad_norm": 0.2314436435699463, + "learning_rate": 1.5696128841014568e-08, + "loss": 0.3913, + "step": 14473 + }, + { + "epoch": 2.975434268681262, + "grad_norm": 0.2326936423778534, + "learning_rate": 1.5433451778448238e-08, + "loss": 0.3836, + "step": 14474 + }, + { + "epoch": 2.9756398396546406, + "grad_norm": 0.22499439120292664, + "learning_rate": 1.5172990895226948e-08, + "loss": 0.3827, + "step": 14475 + }, + { + "epoch": 2.975845410628019, + "grad_norm": 0.23209989070892334, + "learning_rate": 1.4914746204165443e-08, + "loss": 0.3626, + "step": 14476 + }, + { + "epoch": 2.976050981601398, + "grad_norm": 0.2376868724822998, + "learning_rate": 1.4658717718003535e-08, + "loss": 0.3645, + "step": 14477 + }, + { + "epoch": 2.9762565525747764, + "grad_norm": 0.2345684915781021, + "learning_rate": 1.4404905449336149e-08, + "loss": 0.3789, + "step": 14478 + }, + { + "epoch": 2.976462123548155, + "grad_norm": 0.11508353054523468, + "learning_rate": 1.415330941068327e-08, + "loss": 0.4672, + "step": 14479 + }, + { + "epoch": 2.9766676945215336, + "grad_norm": 0.11564578115940094, + "learning_rate": 1.3903929614434986e-08, + "loss": 0.444, + "step": 14480 + }, + { + "epoch": 2.976873265494912, + "grad_norm": 0.22484390437602997, + "learning_rate": 1.3656766072871475e-08, + "loss": 0.3825, + "step": 14481 + }, + { + "epoch": 2.9770788364682907, + "grad_norm": 0.22326096892356873, + "learning_rate": 1.3411818798172993e-08, + "loss": 0.3728, + "step": 14482 + }, + { + "epoch": 2.9772844074416693, + "grad_norm": 0.22746115922927856, + "learning_rate": 1.3169087802409885e-08, + "loss": 0.3882, + "step": 14483 + }, + { + "epoch": 2.977489978415048, + "grad_norm": 0.23284806311130524, + "learning_rate": 1.2928573097537588e-08, + "loss": 0.3927, + "step": 14484 + }, + { + "epoch": 2.9776955493884265, + "grad_norm": 0.23383115231990814, + "learning_rate": 1.2690274695406623e-08, + "loss": 0.3897, + "step": 14485 + }, + { + "epoch": 2.977901120361805, + "grad_norm": 0.23091500997543335, + "learning_rate": 1.2454192607752602e-08, + "loss": 0.3783, + "step": 14486 + }, + { + "epoch": 2.9781066913351832, + "grad_norm": 0.23954810202121735, + "learning_rate": 1.2220326846211217e-08, + "loss": 0.3963, + "step": 14487 + }, + { + "epoch": 2.9783122623085623, + "grad_norm": 0.23919789493083954, + "learning_rate": 1.1988677422303251e-08, + "loss": 0.3777, + "step": 14488 + }, + { + "epoch": 2.9785178332819404, + "grad_norm": 0.2225130796432495, + "learning_rate": 1.1759244347434584e-08, + "loss": 0.3658, + "step": 14489 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.2334238588809967, + "learning_rate": 1.153202763292116e-08, + "loss": 0.3906, + "step": 14490 + }, + { + "epoch": 2.9789289752286976, + "grad_norm": 0.2455325573682785, + "learning_rate": 1.1307027289944038e-08, + "loss": 0.3829, + "step": 14491 + }, + { + "epoch": 2.979134546202076, + "grad_norm": 0.2270069569349289, + "learning_rate": 1.1084243329594347e-08, + "loss": 0.3802, + "step": 14492 + }, + { + "epoch": 2.9793401171754548, + "grad_norm": 0.22187288105487823, + "learning_rate": 1.0863675762843306e-08, + "loss": 0.3832, + "step": 14493 + }, + { + "epoch": 2.9795456881488334, + "grad_norm": 0.2417239248752594, + "learning_rate": 1.0645324600562223e-08, + "loss": 0.3624, + "step": 14494 + }, + { + "epoch": 2.979751259122212, + "grad_norm": 0.11916260421276093, + "learning_rate": 1.0429189853507493e-08, + "loss": 0.4491, + "step": 14495 + }, + { + "epoch": 2.9799568300955905, + "grad_norm": 0.23340509831905365, + "learning_rate": 1.02152715323256e-08, + "loss": 0.3903, + "step": 14496 + }, + { + "epoch": 2.980162401068969, + "grad_norm": 0.2338993400335312, + "learning_rate": 1.0003569647558109e-08, + "loss": 0.3733, + "step": 14497 + }, + { + "epoch": 2.9803679720423477, + "grad_norm": 0.23980024456977844, + "learning_rate": 9.794084209626687e-09, + "loss": 0.3745, + "step": 14498 + }, + { + "epoch": 2.9805735430157263, + "grad_norm": 0.22634217143058777, + "learning_rate": 9.58681522885807e-09, + "loss": 0.3821, + "step": 14499 + }, + { + "epoch": 2.980779113989105, + "grad_norm": 0.2323930710554123, + "learning_rate": 9.381762715464093e-09, + "loss": 0.3854, + "step": 14500 + }, + { + "epoch": 2.9809846849624835, + "grad_norm": 0.23172056674957275, + "learning_rate": 9.178926679546673e-09, + "loss": 0.3735, + "step": 14501 + }, + { + "epoch": 2.9811902559358616, + "grad_norm": 0.23236456513404846, + "learning_rate": 8.978307131097818e-09, + "loss": 0.3731, + "step": 14502 + }, + { + "epoch": 2.9813958269092407, + "grad_norm": 0.23568768799304962, + "learning_rate": 8.779904079994628e-09, + "loss": 0.3654, + "step": 14503 + }, + { + "epoch": 2.981601397882619, + "grad_norm": 0.22454003989696503, + "learning_rate": 8.583717536019276e-09, + "loss": 0.4069, + "step": 14504 + }, + { + "epoch": 2.9818069688559974, + "grad_norm": 0.2187877893447876, + "learning_rate": 8.38974750883903e-09, + "loss": 0.3814, + "step": 14505 + }, + { + "epoch": 2.982012539829376, + "grad_norm": 0.2329930067062378, + "learning_rate": 8.197994008001253e-09, + "loss": 0.3876, + "step": 14506 + }, + { + "epoch": 2.9822181108027546, + "grad_norm": 0.22449320554733276, + "learning_rate": 8.008457042958384e-09, + "loss": 0.3766, + "step": 14507 + }, + { + "epoch": 2.982423681776133, + "grad_norm": 0.12521809339523315, + "learning_rate": 7.821136623047953e-09, + "loss": 0.4432, + "step": 14508 + }, + { + "epoch": 2.9826292527495117, + "grad_norm": 0.12057320028543472, + "learning_rate": 7.636032757492583e-09, + "loss": 0.4466, + "step": 14509 + }, + { + "epoch": 2.9828348237228903, + "grad_norm": 0.22894617915153503, + "learning_rate": 7.453145455419975e-09, + "loss": 0.378, + "step": 14510 + }, + { + "epoch": 2.983040394696269, + "grad_norm": 0.22911518812179565, + "learning_rate": 7.272474725837919e-09, + "loss": 0.3645, + "step": 14511 + }, + { + "epoch": 2.9832459656696475, + "grad_norm": 0.23008479177951813, + "learning_rate": 7.0940205776443004e-09, + "loss": 0.3793, + "step": 14512 + }, + { + "epoch": 2.983451536643026, + "grad_norm": 0.23247113823890686, + "learning_rate": 6.917783019627089e-09, + "loss": 0.3744, + "step": 14513 + }, + { + "epoch": 2.9836571076164047, + "grad_norm": 0.12385281175374985, + "learning_rate": 6.7437620604793304e-09, + "loss": 0.4215, + "step": 14514 + }, + { + "epoch": 2.9838626785897833, + "grad_norm": 0.23346978425979614, + "learning_rate": 6.571957708764176e-09, + "loss": 0.3934, + "step": 14515 + }, + { + "epoch": 2.984068249563162, + "grad_norm": 0.24097535014152527, + "learning_rate": 6.402369972954847e-09, + "loss": 0.3913, + "step": 14516 + }, + { + "epoch": 2.98427382053654, + "grad_norm": 0.2249852418899536, + "learning_rate": 6.234998861399666e-09, + "loss": 0.3629, + "step": 14517 + }, + { + "epoch": 2.984479391509919, + "grad_norm": 0.22656574845314026, + "learning_rate": 6.069844382342038e-09, + "loss": 0.3682, + "step": 14518 + }, + { + "epoch": 2.984684962483297, + "grad_norm": 0.2244207262992859, + "learning_rate": 5.90690654392545e-09, + "loss": 0.3743, + "step": 14519 + }, + { + "epoch": 2.9848905334566758, + "grad_norm": 0.2316739857196808, + "learning_rate": 5.746185354173484e-09, + "loss": 0.3774, + "step": 14520 + }, + { + "epoch": 2.9850961044300544, + "grad_norm": 0.1228955090045929, + "learning_rate": 5.587680821004803e-09, + "loss": 0.4535, + "step": 14521 + }, + { + "epoch": 2.985301675403433, + "grad_norm": 0.2285931259393692, + "learning_rate": 5.431392952228165e-09, + "loss": 0.3802, + "step": 14522 + }, + { + "epoch": 2.9855072463768115, + "grad_norm": 0.21995897591114044, + "learning_rate": 5.2773217555424086e-09, + "loss": 0.3836, + "step": 14523 + }, + { + "epoch": 2.98571281735019, + "grad_norm": 0.23392610251903534, + "learning_rate": 5.125467238536463e-09, + "loss": 0.3909, + "step": 14524 + }, + { + "epoch": 2.9859183883235687, + "grad_norm": 0.23672394454479218, + "learning_rate": 4.975829408694344e-09, + "loss": 0.3759, + "step": 14525 + }, + { + "epoch": 2.9861239592969473, + "grad_norm": 0.12258664518594742, + "learning_rate": 4.828408273385154e-09, + "loss": 0.4437, + "step": 14526 + }, + { + "epoch": 2.986329530270326, + "grad_norm": 0.23568691313266754, + "learning_rate": 4.683203839878081e-09, + "loss": 0.3702, + "step": 14527 + }, + { + "epoch": 2.9865351012437045, + "grad_norm": 0.2551220953464508, + "learning_rate": 4.540216115317409e-09, + "loss": 0.3785, + "step": 14528 + }, + { + "epoch": 2.986740672217083, + "grad_norm": 0.22651293873786926, + "learning_rate": 4.399445106752498e-09, + "loss": 0.3663, + "step": 14529 + }, + { + "epoch": 2.9869462431904616, + "grad_norm": 0.2356937676668167, + "learning_rate": 4.260890821117802e-09, + "loss": 0.3979, + "step": 14530 + }, + { + "epoch": 2.9871518141638402, + "grad_norm": 0.2306637316942215, + "learning_rate": 4.124553265242859e-09, + "loss": 0.3723, + "step": 14531 + }, + { + "epoch": 2.9873573851372184, + "grad_norm": 0.1231551244854927, + "learning_rate": 3.9904324458373e-09, + "loss": 0.4615, + "step": 14532 + }, + { + "epoch": 2.9875629561105974, + "grad_norm": 0.23342475295066833, + "learning_rate": 3.8585283695158345e-09, + "loss": 0.376, + "step": 14533 + }, + { + "epoch": 2.9877685270839756, + "grad_norm": 0.23796530067920685, + "learning_rate": 3.728841042768272e-09, + "loss": 0.3981, + "step": 14534 + }, + { + "epoch": 2.987974098057354, + "grad_norm": 0.2273947149515152, + "learning_rate": 3.601370471994492e-09, + "loss": 0.3752, + "step": 14535 + }, + { + "epoch": 2.9881796690307327, + "grad_norm": 0.22759398818016052, + "learning_rate": 3.4761166634644795e-09, + "loss": 0.3688, + "step": 14536 + }, + { + "epoch": 2.9883852400041113, + "grad_norm": 0.22332710027694702, + "learning_rate": 3.353079623353295e-09, + "loss": 0.3744, + "step": 14537 + }, + { + "epoch": 2.98859081097749, + "grad_norm": 0.22932596504688263, + "learning_rate": 3.232259357726086e-09, + "loss": 0.3762, + "step": 14538 + }, + { + "epoch": 2.9887963819508685, + "grad_norm": 0.22627981007099152, + "learning_rate": 3.1136558725280986e-09, + "loss": 0.37, + "step": 14539 + }, + { + "epoch": 2.989001952924247, + "grad_norm": 0.23626869916915894, + "learning_rate": 2.9972691736046556e-09, + "loss": 0.3708, + "step": 14540 + }, + { + "epoch": 2.9892075238976257, + "grad_norm": 0.2292552888393402, + "learning_rate": 2.8830992666911696e-09, + "loss": 0.3854, + "step": 14541 + }, + { + "epoch": 2.9894130948710043, + "grad_norm": 0.13139550387859344, + "learning_rate": 2.7711461574081443e-09, + "loss": 0.4664, + "step": 14542 + }, + { + "epoch": 2.989618665844383, + "grad_norm": 0.2349810004234314, + "learning_rate": 2.6614098512811603e-09, + "loss": 0.394, + "step": 14543 + }, + { + "epoch": 2.9898242368177614, + "grad_norm": 0.23067308962345123, + "learning_rate": 2.553890353700905e-09, + "loss": 0.3803, + "step": 14544 + }, + { + "epoch": 2.99002980779114, + "grad_norm": 0.238302543759346, + "learning_rate": 2.448587669978131e-09, + "loss": 0.3568, + "step": 14545 + }, + { + "epoch": 2.9902353787645186, + "grad_norm": 0.11440926790237427, + "learning_rate": 2.345501805298689e-09, + "loss": 0.4259, + "step": 14546 + }, + { + "epoch": 2.9904409497378968, + "grad_norm": 0.23391030728816986, + "learning_rate": 2.244632764733523e-09, + "loss": 0.3822, + "step": 14547 + }, + { + "epoch": 2.990646520711276, + "grad_norm": 0.2159079611301422, + "learning_rate": 2.145980553253657e-09, + "loss": 0.3729, + "step": 14548 + }, + { + "epoch": 2.990852091684654, + "grad_norm": 0.2323864847421646, + "learning_rate": 2.0495451757251983e-09, + "loss": 0.3891, + "step": 14549 + }, + { + "epoch": 2.9910576626580325, + "grad_norm": 0.23049965500831604, + "learning_rate": 1.955326636899346e-09, + "loss": 0.3736, + "step": 14550 + }, + { + "epoch": 2.991263233631411, + "grad_norm": 0.227107435464859, + "learning_rate": 1.8633249414073963e-09, + "loss": 0.3669, + "step": 14551 + }, + { + "epoch": 2.9914688046047897, + "grad_norm": 0.22962632775306702, + "learning_rate": 1.7735400937957114e-09, + "loss": 0.3931, + "step": 14552 + }, + { + "epoch": 2.9916743755781683, + "grad_norm": 0.22868715226650238, + "learning_rate": 1.6859720984757631e-09, + "loss": 0.3688, + "step": 14553 + }, + { + "epoch": 2.991879946551547, + "grad_norm": 0.22757934033870697, + "learning_rate": 1.6006209597640986e-09, + "loss": 0.3572, + "step": 14554 + }, + { + "epoch": 2.9920855175249255, + "grad_norm": 0.22525522112846375, + "learning_rate": 1.5174866818723487e-09, + "loss": 0.3889, + "step": 14555 + }, + { + "epoch": 2.992291088498304, + "grad_norm": 0.22742381691932678, + "learning_rate": 1.4365692688922405e-09, + "loss": 0.3883, + "step": 14556 + }, + { + "epoch": 2.9924966594716826, + "grad_norm": 0.2328689843416214, + "learning_rate": 1.3578687248055888e-09, + "loss": 0.3777, + "step": 14557 + }, + { + "epoch": 2.9927022304450612, + "grad_norm": 0.2241593301296234, + "learning_rate": 1.2813850534992843e-09, + "loss": 0.373, + "step": 14558 + }, + { + "epoch": 2.99290780141844, + "grad_norm": 0.14074623584747314, + "learning_rate": 1.207118258730322e-09, + "loss": 0.4593, + "step": 14559 + }, + { + "epoch": 2.9931133723918184, + "grad_norm": 0.22643250226974487, + "learning_rate": 1.1350683441657684e-09, + "loss": 0.3729, + "step": 14560 + }, + { + "epoch": 2.993318943365197, + "grad_norm": 0.23651181161403656, + "learning_rate": 1.06523531334779e-09, + "loss": 0.3741, + "step": 14561 + }, + { + "epoch": 2.993524514338575, + "grad_norm": 0.23334497213363647, + "learning_rate": 9.976191697286253e-10, + "loss": 0.3901, + "step": 14562 + }, + { + "epoch": 2.993730085311954, + "grad_norm": 0.2361806035041809, + "learning_rate": 9.322199166256207e-10, + "loss": 0.3956, + "step": 14563 + }, + { + "epoch": 2.9939356562853323, + "grad_norm": 0.23388999700546265, + "learning_rate": 8.690375572711906e-10, + "loss": 0.3776, + "step": 14564 + }, + { + "epoch": 2.9941412272587113, + "grad_norm": 0.12321368604898453, + "learning_rate": 8.080720947678533e-10, + "loss": 0.4526, + "step": 14565 + }, + { + "epoch": 2.9943467982320895, + "grad_norm": 0.23372013866901398, + "learning_rate": 7.493235321331948e-10, + "loss": 0.3964, + "step": 14566 + }, + { + "epoch": 2.994552369205468, + "grad_norm": 0.22259008884429932, + "learning_rate": 6.927918722499093e-10, + "loss": 0.3632, + "step": 14567 + }, + { + "epoch": 2.9947579401788467, + "grad_norm": 0.22694145143032074, + "learning_rate": 6.384771179057669e-10, + "loss": 0.3826, + "step": 14568 + }, + { + "epoch": 2.9949635111522253, + "grad_norm": 0.22071406245231628, + "learning_rate": 5.863792717736293e-10, + "loss": 0.3715, + "step": 14569 + }, + { + "epoch": 2.995169082125604, + "grad_norm": 0.2295757234096527, + "learning_rate": 5.364983364314347e-10, + "loss": 0.348, + "step": 14570 + }, + { + "epoch": 2.9953746530989824, + "grad_norm": 0.11967863142490387, + "learning_rate": 4.888343143222285e-10, + "loss": 0.4525, + "step": 14571 + }, + { + "epoch": 2.995580224072361, + "grad_norm": 0.23142680525779724, + "learning_rate": 4.4338720780412456e-10, + "loss": 0.3912, + "step": 14572 + }, + { + "epoch": 2.9957857950457396, + "grad_norm": 0.12113110721111298, + "learning_rate": 4.0015701911533256e-10, + "loss": 0.4394, + "step": 14573 + }, + { + "epoch": 2.995991366019118, + "grad_norm": 0.1214088723063469, + "learning_rate": 3.591437503791539e-10, + "loss": 0.4441, + "step": 14574 + }, + { + "epoch": 2.996196936992497, + "grad_norm": 0.24138882756233215, + "learning_rate": 3.203474036239662e-10, + "loss": 0.381, + "step": 14575 + }, + { + "epoch": 2.9964025079658754, + "grad_norm": 0.22098585963249207, + "learning_rate": 2.8376798075324673e-10, + "loss": 0.379, + "step": 14576 + }, + { + "epoch": 2.9966080789392535, + "grad_norm": 0.22628125548362732, + "learning_rate": 2.4940548357554884e-10, + "loss": 0.385, + "step": 14577 + }, + { + "epoch": 2.9968136499126325, + "grad_norm": 0.23406754434108734, + "learning_rate": 2.1725991378451772e-10, + "loss": 0.3906, + "step": 14578 + }, + { + "epoch": 2.9970192208860107, + "grad_norm": 0.2328203022480011, + "learning_rate": 1.8733127295389452e-10, + "loss": 0.393, + "step": 14579 + }, + { + "epoch": 2.9972247918593897, + "grad_norm": 0.11977335065603256, + "learning_rate": 1.5961956256749233e-10, + "loss": 0.4311, + "step": 14580 + }, + { + "epoch": 2.997430362832768, + "grad_norm": 0.22989055514335632, + "learning_rate": 1.3412478398922012e-10, + "loss": 0.3797, + "step": 14581 + }, + { + "epoch": 2.9976359338061465, + "grad_norm": 0.23267367482185364, + "learning_rate": 1.1084693847307482e-10, + "loss": 0.3734, + "step": 14582 + }, + { + "epoch": 2.997841504779525, + "grad_norm": 0.22674784064292908, + "learning_rate": 8.978602716813722e-11, + "loss": 0.385, + "step": 14583 + }, + { + "epoch": 2.9980470757529036, + "grad_norm": 0.24478791654109955, + "learning_rate": 7.09420511085801e-11, + "loss": 0.3813, + "step": 14584 + }, + { + "epoch": 2.9982526467262822, + "grad_norm": 0.23882992565631866, + "learning_rate": 5.431501122366012e-11, + "loss": 0.3949, + "step": 14585 + }, + { + "epoch": 2.998458217699661, + "grad_norm": 0.2423866242170334, + "learning_rate": 3.990490833771787e-11, + "loss": 0.3893, + "step": 14586 + }, + { + "epoch": 2.9986637886730394, + "grad_norm": 0.24024717509746552, + "learning_rate": 2.771174315019387e-11, + "loss": 0.3785, + "step": 14587 + }, + { + "epoch": 2.998869359646418, + "grad_norm": 0.23211045563220978, + "learning_rate": 1.773551627060055e-11, + "loss": 0.3623, + "step": 14588 + }, + { + "epoch": 2.9990749306197966, + "grad_norm": 0.22888019680976868, + "learning_rate": 9.976228188546265e-12, + "loss": 0.3982, + "step": 14589 + }, + { + "epoch": 2.999280501593175, + "grad_norm": 0.23154133558273315, + "learning_rate": 4.433879288723298e-12, + "loss": 0.3689, + "step": 14590 + }, + { + "epoch": 2.9994860725665538, + "grad_norm": 0.22858844697475433, + "learning_rate": 1.1084698359198341e-12, + "loss": 0.3763, + "step": 14591 + }, + { + "epoch": 2.999691643539932, + "grad_norm": 0.22756846249103546, + "learning_rate": 0.0, + "loss": 0.3625, + "step": 14592 + }, + { + "epoch": 2.999691643539932, + "step": 14592, + "total_flos": 6.439866310377226e+20, + "train_loss": 0.5454843599418701, + "train_runtime": 158902.6427, + "train_samples_per_second": 188.085, + "train_steps_per_second": 0.092 + } + ], + "logging_steps": 1.0, + "max_steps": 14592, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.439866310377226e+20, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}