{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 34104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008796622097114708, "grad_norm": 262.1760559082031, "learning_rate": 2.6385224274406335e-08, "loss": 6.2682, "step": 10 }, { "epoch": 0.0017593244194229415, "grad_norm": 323.3883972167969, "learning_rate": 5.570214013485782e-08, "loss": 6.9076, "step": 20 }, { "epoch": 0.0026389866291344124, "grad_norm": 253.25384521484375, "learning_rate": 8.50190559953093e-08, "loss": 6.8341, "step": 30 }, { "epoch": 0.003518648838845883, "grad_norm": 227.13699340820312, "learning_rate": 1.1433597185576079e-07, "loss": 6.3699, "step": 40 }, { "epoch": 0.004398311048557354, "grad_norm": 205.7606658935547, "learning_rate": 1.4365288771621227e-07, "loss": 6.6831, "step": 50 }, { "epoch": 0.005277973258268825, "grad_norm": 144.705322265625, "learning_rate": 1.7296980357666373e-07, "loss": 6.113, "step": 60 }, { "epoch": 0.006157635467980296, "grad_norm": 136.02186584472656, "learning_rate": 2.0228671943711525e-07, "loss": 6.1772, "step": 70 }, { "epoch": 0.007037297677691766, "grad_norm": 137.09791564941406, "learning_rate": 2.316036352975667e-07, "loss": 5.6768, "step": 80 }, { "epoch": 0.007916959887403237, "grad_norm": 124.78844451904297, "learning_rate": 2.609205511580182e-07, "loss": 5.3898, "step": 90 }, { "epoch": 0.008796622097114707, "grad_norm": 69.67615509033203, "learning_rate": 2.9023746701846967e-07, "loss": 4.8849, "step": 100 }, { "epoch": 0.00967628430682618, "grad_norm": 60.815521240234375, "learning_rate": 3.195543828789212e-07, "loss": 4.4642, "step": 110 }, { "epoch": 0.01055594651653765, "grad_norm": 49.65742874145508, "learning_rate": 3.488712987393726e-07, "loss": 4.1646, "step": 120 }, { "epoch": 0.01143560872624912, "grad_norm": 35.808448791503906, "learning_rate": 3.781882145998241e-07, "loss": 4.004, "step": 130 }, { "epoch": 0.012315270935960592, "grad_norm": 45.43000793457031, "learning_rate": 4.0750513046027563e-07, "loss": 3.9696, "step": 140 }, { "epoch": 0.013194933145672062, "grad_norm": 29.55954360961914, "learning_rate": 4.3682204632072715e-07, "loss": 3.6616, "step": 150 }, { "epoch": 0.014074595355383532, "grad_norm": 46.206443786621094, "learning_rate": 4.6613896218117856e-07, "loss": 3.4734, "step": 160 }, { "epoch": 0.014954257565095004, "grad_norm": 32.600677490234375, "learning_rate": 4.954558780416301e-07, "loss": 3.1366, "step": 170 }, { "epoch": 0.015833919774806474, "grad_norm": 38.132198333740234, "learning_rate": 5.247727939020815e-07, "loss": 2.8418, "step": 180 }, { "epoch": 0.016713581984517945, "grad_norm": 32.470184326171875, "learning_rate": 5.54089709762533e-07, "loss": 2.3633, "step": 190 }, { "epoch": 0.017593244194229415, "grad_norm": 28.02766990661621, "learning_rate": 5.834066256229845e-07, "loss": 2.4361, "step": 200 }, { "epoch": 0.01847290640394089, "grad_norm": 55.18531036376953, "learning_rate": 6.12723541483436e-07, "loss": 2.4857, "step": 210 }, { "epoch": 0.01935256861365236, "grad_norm": 30.021434783935547, "learning_rate": 6.420404573438875e-07, "loss": 2.3343, "step": 220 }, { "epoch": 0.02023223082336383, "grad_norm": 37.81367492675781, "learning_rate": 6.71357373204339e-07, "loss": 2.3231, "step": 230 }, { "epoch": 0.0211118930330753, "grad_norm": 29.83020782470703, "learning_rate": 7.006742890647903e-07, "loss": 2.2503, "step": 240 }, { "epoch": 0.02199155524278677, "grad_norm": 28.74652671813965, "learning_rate": 7.299912049252419e-07, "loss": 2.2216, "step": 250 }, { "epoch": 0.02287121745249824, "grad_norm": 28.369401931762695, "learning_rate": 7.593081207856934e-07, "loss": 2.2488, "step": 260 }, { "epoch": 0.023750879662209713, "grad_norm": 24.429798126220703, "learning_rate": 7.886250366461449e-07, "loss": 2.0616, "step": 270 }, { "epoch": 0.024630541871921183, "grad_norm": 22.8094539642334, "learning_rate": 8.179419525065964e-07, "loss": 2.1623, "step": 280 }, { "epoch": 0.025510204081632654, "grad_norm": 48.9775276184082, "learning_rate": 8.472588683670479e-07, "loss": 1.9412, "step": 290 }, { "epoch": 0.026389866291344124, "grad_norm": 34.55386734008789, "learning_rate": 8.765757842274994e-07, "loss": 2.1784, "step": 300 }, { "epoch": 0.027269528501055594, "grad_norm": 22.984296798706055, "learning_rate": 9.058927000879508e-07, "loss": 1.9406, "step": 310 }, { "epoch": 0.028149190710767064, "grad_norm": 18.96654510498047, "learning_rate": 9.352096159484023e-07, "loss": 1.9368, "step": 320 }, { "epoch": 0.029028852920478538, "grad_norm": 26.90477180480957, "learning_rate": 9.645265318088538e-07, "loss": 1.841, "step": 330 }, { "epoch": 0.029908515130190008, "grad_norm": 22.920488357543945, "learning_rate": 9.938434476693052e-07, "loss": 1.9696, "step": 340 }, { "epoch": 0.03078817733990148, "grad_norm": 20.121156692504883, "learning_rate": 1.0231603635297568e-06, "loss": 2.123, "step": 350 }, { "epoch": 0.03166783954961295, "grad_norm": 21.188480377197266, "learning_rate": 1.0524772793902083e-06, "loss": 2.0194, "step": 360 }, { "epoch": 0.03254750175932442, "grad_norm": 28.45834732055664, "learning_rate": 1.0817941952506597e-06, "loss": 1.9965, "step": 370 }, { "epoch": 0.03342716396903589, "grad_norm": 23.34012222290039, "learning_rate": 1.111111111111111e-06, "loss": 1.93, "step": 380 }, { "epoch": 0.03430682617874736, "grad_norm": 18.770872116088867, "learning_rate": 1.1404280269715626e-06, "loss": 1.8596, "step": 390 }, { "epoch": 0.03518648838845883, "grad_norm": 18.523412704467773, "learning_rate": 1.1697449428320142e-06, "loss": 1.8164, "step": 400 }, { "epoch": 0.0360661505981703, "grad_norm": 20.44288444519043, "learning_rate": 1.1990618586924657e-06, "loss": 1.7736, "step": 410 }, { "epoch": 0.03694581280788178, "grad_norm": 23.071937561035156, "learning_rate": 1.2283787745529171e-06, "loss": 1.7809, "step": 420 }, { "epoch": 0.037825475017593244, "grad_norm": 24.457965850830078, "learning_rate": 1.2576956904133687e-06, "loss": 1.9271, "step": 430 }, { "epoch": 0.03870513722730472, "grad_norm": 24.47222137451172, "learning_rate": 1.28701260627382e-06, "loss": 1.8768, "step": 440 }, { "epoch": 0.039584799437016184, "grad_norm": 20.773462295532227, "learning_rate": 1.3163295221342714e-06, "loss": 1.8113, "step": 450 }, { "epoch": 0.04046446164672766, "grad_norm": 21.063812255859375, "learning_rate": 1.3456464379947232e-06, "loss": 1.7225, "step": 460 }, { "epoch": 0.041344123856439124, "grad_norm": 24.167491912841797, "learning_rate": 1.3749633538551745e-06, "loss": 1.9576, "step": 470 }, { "epoch": 0.0422237860661506, "grad_norm": 18.631242752075195, "learning_rate": 1.404280269715626e-06, "loss": 1.804, "step": 480 }, { "epoch": 0.04310344827586207, "grad_norm": 21.79433250427246, "learning_rate": 1.4335971855760775e-06, "loss": 1.703, "step": 490 }, { "epoch": 0.04398311048557354, "grad_norm": 21.351579666137695, "learning_rate": 1.4629141014365288e-06, "loss": 1.8406, "step": 500 }, { "epoch": 0.04486277269528501, "grad_norm": 19.412641525268555, "learning_rate": 1.4922310172969806e-06, "loss": 1.7077, "step": 510 }, { "epoch": 0.04574243490499648, "grad_norm": 24.742679595947266, "learning_rate": 1.521547933157432e-06, "loss": 1.6575, "step": 520 }, { "epoch": 0.04662209711470795, "grad_norm": 23.542964935302734, "learning_rate": 1.5508648490178835e-06, "loss": 1.7801, "step": 530 }, { "epoch": 0.047501759324419426, "grad_norm": 22.598318099975586, "learning_rate": 1.5801817648783349e-06, "loss": 1.5925, "step": 540 }, { "epoch": 0.04838142153413089, "grad_norm": 22.325820922851562, "learning_rate": 1.6094986807387862e-06, "loss": 1.7227, "step": 550 }, { "epoch": 0.04926108374384237, "grad_norm": 27.58997917175293, "learning_rate": 1.6388155965992378e-06, "loss": 1.7547, "step": 560 }, { "epoch": 0.050140745953553834, "grad_norm": 14.812804222106934, "learning_rate": 1.6681325124596894e-06, "loss": 1.6236, "step": 570 }, { "epoch": 0.05102040816326531, "grad_norm": 23.730682373046875, "learning_rate": 1.697449428320141e-06, "loss": 1.6797, "step": 580 }, { "epoch": 0.051900070372976774, "grad_norm": 18.236164093017578, "learning_rate": 1.7267663441805923e-06, "loss": 1.6295, "step": 590 }, { "epoch": 0.05277973258268825, "grad_norm": 16.426822662353516, "learning_rate": 1.7560832600410439e-06, "loss": 1.564, "step": 600 }, { "epoch": 0.05365939479239972, "grad_norm": 21.215673446655273, "learning_rate": 1.7854001759014952e-06, "loss": 1.5373, "step": 610 }, { "epoch": 0.05453905700211119, "grad_norm": 20.760395050048828, "learning_rate": 1.8147170917619466e-06, "loss": 1.7478, "step": 620 }, { "epoch": 0.05541871921182266, "grad_norm": 21.460323333740234, "learning_rate": 1.8440340076223984e-06, "loss": 1.6497, "step": 630 }, { "epoch": 0.05629838142153413, "grad_norm": 21.76240348815918, "learning_rate": 1.8733509234828497e-06, "loss": 1.6779, "step": 640 }, { "epoch": 0.0571780436312456, "grad_norm": 16.459440231323242, "learning_rate": 1.9026678393433013e-06, "loss": 1.3396, "step": 650 }, { "epoch": 0.058057705840957076, "grad_norm": 25.99327850341797, "learning_rate": 1.9319847552037527e-06, "loss": 1.5017, "step": 660 }, { "epoch": 0.05893736805066854, "grad_norm": 25.01409149169922, "learning_rate": 1.9613016710642042e-06, "loss": 1.5857, "step": 670 }, { "epoch": 0.059817030260380016, "grad_norm": 19.677595138549805, "learning_rate": 1.990618586924656e-06, "loss": 1.7021, "step": 680 }, { "epoch": 0.06069669247009148, "grad_norm": 20.639846801757812, "learning_rate": 2.0199355027851074e-06, "loss": 1.5138, "step": 690 }, { "epoch": 0.06157635467980296, "grad_norm": 22.84880256652832, "learning_rate": 2.0492524186455585e-06, "loss": 1.4933, "step": 700 }, { "epoch": 0.062456016889514424, "grad_norm": 21.447385787963867, "learning_rate": 2.07856933450601e-06, "loss": 1.4625, "step": 710 }, { "epoch": 0.0633356790992259, "grad_norm": 17.13872718811035, "learning_rate": 2.1078862503664617e-06, "loss": 1.5431, "step": 720 }, { "epoch": 0.06421534130893737, "grad_norm": 26.783641815185547, "learning_rate": 2.137203166226913e-06, "loss": 1.42, "step": 730 }, { "epoch": 0.06509500351864884, "grad_norm": 22.746036529541016, "learning_rate": 2.166520082087365e-06, "loss": 1.5796, "step": 740 }, { "epoch": 0.0659746657283603, "grad_norm": 21.493179321289062, "learning_rate": 2.195836997947816e-06, "loss": 1.4996, "step": 750 }, { "epoch": 0.06685432793807178, "grad_norm": 18.741416931152344, "learning_rate": 2.2251539138082675e-06, "loss": 1.4586, "step": 760 }, { "epoch": 0.06773399014778325, "grad_norm": 19.20665168762207, "learning_rate": 2.254470829668719e-06, "loss": 1.5389, "step": 770 }, { "epoch": 0.06861365235749473, "grad_norm": 19.739431381225586, "learning_rate": 2.2837877455291702e-06, "loss": 1.4391, "step": 780 }, { "epoch": 0.0694933145672062, "grad_norm": 19.61296272277832, "learning_rate": 2.313104661389622e-06, "loss": 1.4974, "step": 790 }, { "epoch": 0.07037297677691766, "grad_norm": 22.252182006835938, "learning_rate": 2.3424215772500734e-06, "loss": 1.5388, "step": 800 }, { "epoch": 0.07125263898662913, "grad_norm": 22.615907669067383, "learning_rate": 2.371738493110525e-06, "loss": 1.4398, "step": 810 }, { "epoch": 0.0721323011963406, "grad_norm": 24.084150314331055, "learning_rate": 2.4010554089709765e-06, "loss": 1.4164, "step": 820 }, { "epoch": 0.07301196340605208, "grad_norm": 19.723674774169922, "learning_rate": 2.4303723248314277e-06, "loss": 1.5883, "step": 830 }, { "epoch": 0.07389162561576355, "grad_norm": 17.95233154296875, "learning_rate": 2.4596892406918792e-06, "loss": 1.4096, "step": 840 }, { "epoch": 0.07477128782547501, "grad_norm": 19.566635131835938, "learning_rate": 2.489006156552331e-06, "loss": 1.3947, "step": 850 }, { "epoch": 0.07565095003518649, "grad_norm": 19.242162704467773, "learning_rate": 2.5183230724127824e-06, "loss": 1.548, "step": 860 }, { "epoch": 0.07653061224489796, "grad_norm": 20.805145263671875, "learning_rate": 2.547639988273234e-06, "loss": 1.4036, "step": 870 }, { "epoch": 0.07741027445460943, "grad_norm": 18.14611053466797, "learning_rate": 2.5769569041336855e-06, "loss": 1.366, "step": 880 }, { "epoch": 0.0782899366643209, "grad_norm": 18.771162033081055, "learning_rate": 2.6062738199941367e-06, "loss": 1.5219, "step": 890 }, { "epoch": 0.07916959887403237, "grad_norm": 19.77147674560547, "learning_rate": 2.6355907358545887e-06, "loss": 1.4985, "step": 900 }, { "epoch": 0.08004926108374384, "grad_norm": 24.383596420288086, "learning_rate": 2.66490765171504e-06, "loss": 1.4096, "step": 910 }, { "epoch": 0.08092892329345532, "grad_norm": 19.269201278686523, "learning_rate": 2.6942245675754914e-06, "loss": 1.3552, "step": 920 }, { "epoch": 0.08180858550316679, "grad_norm": 16.00967025756836, "learning_rate": 2.7235414834359425e-06, "loss": 1.3755, "step": 930 }, { "epoch": 0.08268824771287825, "grad_norm": 16.601350784301758, "learning_rate": 2.752858399296394e-06, "loss": 1.5203, "step": 940 }, { "epoch": 0.08356790992258972, "grad_norm": 19.964109420776367, "learning_rate": 2.782175315156846e-06, "loss": 1.7827, "step": 950 }, { "epoch": 0.0844475721323012, "grad_norm": 26.811420440673828, "learning_rate": 2.8114922310172972e-06, "loss": 1.4739, "step": 960 }, { "epoch": 0.08532723434201267, "grad_norm": 18.643861770629883, "learning_rate": 2.840809146877749e-06, "loss": 1.4381, "step": 970 }, { "epoch": 0.08620689655172414, "grad_norm": 23.95145034790039, "learning_rate": 2.8701260627382e-06, "loss": 1.4946, "step": 980 }, { "epoch": 0.0870865587614356, "grad_norm": 22.238962173461914, "learning_rate": 2.8994429785986515e-06, "loss": 1.5961, "step": 990 }, { "epoch": 0.08796622097114708, "grad_norm": 14.25161075592041, "learning_rate": 2.9287598944591035e-06, "loss": 1.4539, "step": 1000 }, { "epoch": 0.08884588318085855, "grad_norm": 18.404861450195312, "learning_rate": 2.9580768103195547e-06, "loss": 1.4144, "step": 1010 }, { "epoch": 0.08972554539057002, "grad_norm": 19.307069778442383, "learning_rate": 2.9873937261800062e-06, "loss": 1.4566, "step": 1020 }, { "epoch": 0.0906052076002815, "grad_norm": 16.81838607788086, "learning_rate": 3.0167106420404574e-06, "loss": 1.4981, "step": 1030 }, { "epoch": 0.09148486980999296, "grad_norm": 20.26487922668457, "learning_rate": 3.046027557900909e-06, "loss": 1.474, "step": 1040 }, { "epoch": 0.09236453201970443, "grad_norm": 19.396745681762695, "learning_rate": 3.075344473761361e-06, "loss": 1.3927, "step": 1050 }, { "epoch": 0.0932441942294159, "grad_norm": 17.18834114074707, "learning_rate": 3.104661389621812e-06, "loss": 1.376, "step": 1060 }, { "epoch": 0.09412385643912738, "grad_norm": 18.5443115234375, "learning_rate": 3.1339783054822637e-06, "loss": 1.394, "step": 1070 }, { "epoch": 0.09500351864883885, "grad_norm": 14.487037658691406, "learning_rate": 3.163295221342715e-06, "loss": 1.4654, "step": 1080 }, { "epoch": 0.09588318085855031, "grad_norm": 17.511178970336914, "learning_rate": 3.1926121372031664e-06, "loss": 1.395, "step": 1090 }, { "epoch": 0.09676284306826179, "grad_norm": 14.83525276184082, "learning_rate": 3.2219290530636175e-06, "loss": 1.4335, "step": 1100 }, { "epoch": 0.09764250527797326, "grad_norm": 15.082141876220703, "learning_rate": 3.2512459689240695e-06, "loss": 1.2765, "step": 1110 }, { "epoch": 0.09852216748768473, "grad_norm": 17.110946655273438, "learning_rate": 3.280562884784521e-06, "loss": 1.3999, "step": 1120 }, { "epoch": 0.0994018296973962, "grad_norm": 19.255508422851562, "learning_rate": 3.3098798006449722e-06, "loss": 1.2893, "step": 1130 }, { "epoch": 0.10028149190710767, "grad_norm": 17.872526168823242, "learning_rate": 3.339196716505424e-06, "loss": 1.3282, "step": 1140 }, { "epoch": 0.10116115411681914, "grad_norm": 15.967557907104492, "learning_rate": 3.368513632365875e-06, "loss": 1.3216, "step": 1150 }, { "epoch": 0.10204081632653061, "grad_norm": 16.5362548828125, "learning_rate": 3.397830548226327e-06, "loss": 1.2726, "step": 1160 }, { "epoch": 0.10292047853624209, "grad_norm": 16.26555633544922, "learning_rate": 3.4271474640867785e-06, "loss": 1.321, "step": 1170 }, { "epoch": 0.10380014074595355, "grad_norm": 22.551742553710938, "learning_rate": 3.4564643799472297e-06, "loss": 1.3278, "step": 1180 }, { "epoch": 0.10467980295566502, "grad_norm": 21.43628692626953, "learning_rate": 3.4857812958076812e-06, "loss": 1.3922, "step": 1190 }, { "epoch": 0.1055594651653765, "grad_norm": 17.226224899291992, "learning_rate": 3.5150982116681324e-06, "loss": 1.1025, "step": 1200 }, { "epoch": 0.10643912737508797, "grad_norm": 17.258224487304688, "learning_rate": 3.5444151275285844e-06, "loss": 1.4021, "step": 1210 }, { "epoch": 0.10731878958479944, "grad_norm": 16.31353759765625, "learning_rate": 3.573732043389036e-06, "loss": 1.3324, "step": 1220 }, { "epoch": 0.1081984517945109, "grad_norm": 20.6882266998291, "learning_rate": 3.603048959249487e-06, "loss": 1.3249, "step": 1230 }, { "epoch": 0.10907811400422238, "grad_norm": 15.782238960266113, "learning_rate": 3.632365875109939e-06, "loss": 1.4017, "step": 1240 }, { "epoch": 0.10995777621393385, "grad_norm": 16.771081924438477, "learning_rate": 3.6616827909703902e-06, "loss": 1.3511, "step": 1250 }, { "epoch": 0.11083743842364532, "grad_norm": 18.71357536315918, "learning_rate": 3.690999706830842e-06, "loss": 1.3135, "step": 1260 }, { "epoch": 0.1117171006333568, "grad_norm": 15.347713470458984, "learning_rate": 3.7203166226912934e-06, "loss": 1.4325, "step": 1270 }, { "epoch": 0.11259676284306826, "grad_norm": 20.1947078704834, "learning_rate": 3.7496335385517445e-06, "loss": 1.3169, "step": 1280 }, { "epoch": 0.11347642505277973, "grad_norm": 14.431214332580566, "learning_rate": 3.7789504544121965e-06, "loss": 1.4052, "step": 1290 }, { "epoch": 0.1143560872624912, "grad_norm": 14.113228797912598, "learning_rate": 3.8082673702726477e-06, "loss": 1.2481, "step": 1300 }, { "epoch": 0.11523574947220268, "grad_norm": 18.45258140563965, "learning_rate": 3.837584286133099e-06, "loss": 1.2381, "step": 1310 }, { "epoch": 0.11611541168191415, "grad_norm": 16.940608978271484, "learning_rate": 3.86690120199355e-06, "loss": 1.4093, "step": 1320 }, { "epoch": 0.11699507389162561, "grad_norm": 16.57616424560547, "learning_rate": 3.896218117854002e-06, "loss": 1.231, "step": 1330 }, { "epoch": 0.11787473610133709, "grad_norm": 15.40613842010498, "learning_rate": 3.925535033714454e-06, "loss": 1.3423, "step": 1340 }, { "epoch": 0.11875439831104856, "grad_norm": 17.347049713134766, "learning_rate": 3.954851949574905e-06, "loss": 1.2983, "step": 1350 }, { "epoch": 0.11963406052076003, "grad_norm": 19.42605972290039, "learning_rate": 3.984168865435356e-06, "loss": 1.2736, "step": 1360 }, { "epoch": 0.12051372273047149, "grad_norm": 15.976557731628418, "learning_rate": 4.013485781295807e-06, "loss": 1.3701, "step": 1370 }, { "epoch": 0.12139338494018297, "grad_norm": 14.91614818572998, "learning_rate": 4.042802697156259e-06, "loss": 1.3014, "step": 1380 }, { "epoch": 0.12227304714989444, "grad_norm": 16.48415756225586, "learning_rate": 4.072119613016711e-06, "loss": 1.389, "step": 1390 }, { "epoch": 0.12315270935960591, "grad_norm": 15.029520034790039, "learning_rate": 4.1014365288771625e-06, "loss": 1.3466, "step": 1400 }, { "epoch": 0.12403237156931739, "grad_norm": 23.690950393676758, "learning_rate": 4.130753444737614e-06, "loss": 1.4034, "step": 1410 }, { "epoch": 0.12491203377902885, "grad_norm": 15.507283210754395, "learning_rate": 4.160070360598066e-06, "loss": 1.2795, "step": 1420 }, { "epoch": 0.12579169598874032, "grad_norm": 15.00433349609375, "learning_rate": 4.189387276458517e-06, "loss": 1.3223, "step": 1430 }, { "epoch": 0.1266713581984518, "grad_norm": 18.70124053955078, "learning_rate": 4.218704192318969e-06, "loss": 1.2934, "step": 1440 }, { "epoch": 0.12755102040816327, "grad_norm": 16.58974266052246, "learning_rate": 4.24802110817942e-06, "loss": 1.2413, "step": 1450 }, { "epoch": 0.12843068261787474, "grad_norm": 20.73488998413086, "learning_rate": 4.277338024039872e-06, "loss": 1.1708, "step": 1460 }, { "epoch": 0.12931034482758622, "grad_norm": 16.752227783203125, "learning_rate": 4.306654939900323e-06, "loss": 1.266, "step": 1470 }, { "epoch": 0.1301900070372977, "grad_norm": 16.215299606323242, "learning_rate": 4.335971855760774e-06, "loss": 1.2736, "step": 1480 }, { "epoch": 0.13106966924700914, "grad_norm": 16.61784553527832, "learning_rate": 4.365288771621225e-06, "loss": 1.3166, "step": 1490 }, { "epoch": 0.1319493314567206, "grad_norm": 17.6920223236084, "learning_rate": 4.394605687481677e-06, "loss": 1.2262, "step": 1500 }, { "epoch": 0.13282899366643208, "grad_norm": 15.113412857055664, "learning_rate": 4.423922603342129e-06, "loss": 1.4344, "step": 1510 }, { "epoch": 0.13370865587614356, "grad_norm": 16.263534545898438, "learning_rate": 4.4532395192025805e-06, "loss": 1.2966, "step": 1520 }, { "epoch": 0.13458831808585503, "grad_norm": 16.857311248779297, "learning_rate": 4.482556435063032e-06, "loss": 1.1726, "step": 1530 }, { "epoch": 0.1354679802955665, "grad_norm": 16.24165916442871, "learning_rate": 4.511873350923483e-06, "loss": 1.2074, "step": 1540 }, { "epoch": 0.13634764250527798, "grad_norm": 15.621249198913574, "learning_rate": 4.541190266783935e-06, "loss": 1.3464, "step": 1550 }, { "epoch": 0.13722730471498945, "grad_norm": 11.050832748413086, "learning_rate": 4.570507182644387e-06, "loss": 1.2498, "step": 1560 }, { "epoch": 0.13810696692470092, "grad_norm": 16.142757415771484, "learning_rate": 4.599824098504838e-06, "loss": 1.2116, "step": 1570 }, { "epoch": 0.1389866291344124, "grad_norm": 14.682723045349121, "learning_rate": 4.629141014365289e-06, "loss": 1.1946, "step": 1580 }, { "epoch": 0.13986629134412384, "grad_norm": 14.219841003417969, "learning_rate": 4.65845793022574e-06, "loss": 1.25, "step": 1590 }, { "epoch": 0.14074595355383532, "grad_norm": 14.822240829467773, "learning_rate": 4.687774846086192e-06, "loss": 1.1992, "step": 1600 }, { "epoch": 0.1416256157635468, "grad_norm": 14.733646392822266, "learning_rate": 4.717091761946644e-06, "loss": 1.2514, "step": 1610 }, { "epoch": 0.14250527797325827, "grad_norm": 15.016263961791992, "learning_rate": 4.746408677807095e-06, "loss": 1.2582, "step": 1620 }, { "epoch": 0.14338494018296974, "grad_norm": 14.12942886352539, "learning_rate": 4.7757255936675465e-06, "loss": 1.3339, "step": 1630 }, { "epoch": 0.1442646023926812, "grad_norm": 14.603706359863281, "learning_rate": 4.805042509527998e-06, "loss": 1.237, "step": 1640 }, { "epoch": 0.1451442646023927, "grad_norm": 14.178276062011719, "learning_rate": 4.83435942538845e-06, "loss": 1.3766, "step": 1650 }, { "epoch": 0.14602392681210416, "grad_norm": 17.086511611938477, "learning_rate": 4.863676341248901e-06, "loss": 1.1745, "step": 1660 }, { "epoch": 0.14690358902181563, "grad_norm": 14.563751220703125, "learning_rate": 4.892993257109353e-06, "loss": 1.3524, "step": 1670 }, { "epoch": 0.1477832512315271, "grad_norm": 12.278364181518555, "learning_rate": 4.922310172969804e-06, "loss": 1.2397, "step": 1680 }, { "epoch": 0.14866291344123855, "grad_norm": 15.517844200134277, "learning_rate": 4.951627088830255e-06, "loss": 1.1976, "step": 1690 }, { "epoch": 0.14954257565095003, "grad_norm": 12.885600090026855, "learning_rate": 4.980944004690707e-06, "loss": 1.283, "step": 1700 }, { "epoch": 0.1504222378606615, "grad_norm": 13.734346389770508, "learning_rate": 5.010260920551159e-06, "loss": 1.3349, "step": 1710 }, { "epoch": 0.15130190007037297, "grad_norm": 13.589200019836426, "learning_rate": 5.03957783641161e-06, "loss": 1.2697, "step": 1720 }, { "epoch": 0.15218156228008445, "grad_norm": 16.66176414489746, "learning_rate": 5.068894752272061e-06, "loss": 1.4065, "step": 1730 }, { "epoch": 0.15306122448979592, "grad_norm": 16.76392936706543, "learning_rate": 5.098211668132513e-06, "loss": 1.3097, "step": 1740 }, { "epoch": 0.1539408866995074, "grad_norm": 16.201507568359375, "learning_rate": 5.1275285839929645e-06, "loss": 1.1635, "step": 1750 }, { "epoch": 0.15482054890921887, "grad_norm": 15.215567588806152, "learning_rate": 5.156845499853416e-06, "loss": 1.2834, "step": 1760 }, { "epoch": 0.15570021111893034, "grad_norm": 13.790794372558594, "learning_rate": 5.186162415713867e-06, "loss": 1.1772, "step": 1770 }, { "epoch": 0.1565798733286418, "grad_norm": 18.381227493286133, "learning_rate": 5.215479331574319e-06, "loss": 1.2672, "step": 1780 }, { "epoch": 0.15745953553835326, "grad_norm": 14.035795211791992, "learning_rate": 5.24479624743477e-06, "loss": 1.1744, "step": 1790 }, { "epoch": 0.15833919774806474, "grad_norm": 14.319401741027832, "learning_rate": 5.274113163295221e-06, "loss": 1.0595, "step": 1800 }, { "epoch": 0.1592188599577762, "grad_norm": 15.694960594177246, "learning_rate": 5.303430079155674e-06, "loss": 1.1347, "step": 1810 }, { "epoch": 0.16009852216748768, "grad_norm": 12.619365692138672, "learning_rate": 5.332746995016125e-06, "loss": 1.0912, "step": 1820 }, { "epoch": 0.16097818437719916, "grad_norm": 14.923580169677734, "learning_rate": 5.362063910876576e-06, "loss": 1.3536, "step": 1830 }, { "epoch": 0.16185784658691063, "grad_norm": 16.85942840576172, "learning_rate": 5.391380826737028e-06, "loss": 1.2401, "step": 1840 }, { "epoch": 0.1627375087966221, "grad_norm": 15.572540283203125, "learning_rate": 5.420697742597479e-06, "loss": 1.1673, "step": 1850 }, { "epoch": 0.16361717100633358, "grad_norm": 20.235065460205078, "learning_rate": 5.4500146584579305e-06, "loss": 1.3264, "step": 1860 }, { "epoch": 0.16449683321604505, "grad_norm": 11.374720573425293, "learning_rate": 5.479331574318382e-06, "loss": 1.1374, "step": 1870 }, { "epoch": 0.1653764954257565, "grad_norm": 16.37449836730957, "learning_rate": 5.508648490178834e-06, "loss": 1.1043, "step": 1880 }, { "epoch": 0.16625615763546797, "grad_norm": 13.774438858032227, "learning_rate": 5.537965406039285e-06, "loss": 1.1298, "step": 1890 }, { "epoch": 0.16713581984517945, "grad_norm": 14.551349639892578, "learning_rate": 5.567282321899736e-06, "loss": 1.2198, "step": 1900 }, { "epoch": 0.16801548205489092, "grad_norm": 14.524474143981934, "learning_rate": 5.596599237760189e-06, "loss": 1.2173, "step": 1910 }, { "epoch": 0.1688951442646024, "grad_norm": 11.871665954589844, "learning_rate": 5.62591615362064e-06, "loss": 1.1606, "step": 1920 }, { "epoch": 0.16977480647431387, "grad_norm": 15.624194145202637, "learning_rate": 5.655233069481091e-06, "loss": 1.2706, "step": 1930 }, { "epoch": 0.17065446868402534, "grad_norm": 14.724122047424316, "learning_rate": 5.684549985341542e-06, "loss": 1.2692, "step": 1940 }, { "epoch": 0.1715341308937368, "grad_norm": 14.390944480895996, "learning_rate": 5.713866901201994e-06, "loss": 1.2557, "step": 1950 }, { "epoch": 0.1724137931034483, "grad_norm": 16.356054306030273, "learning_rate": 5.743183817062445e-06, "loss": 1.1988, "step": 1960 }, { "epoch": 0.17329345531315973, "grad_norm": 14.825990676879883, "learning_rate": 5.7725007329228965e-06, "loss": 1.1601, "step": 1970 }, { "epoch": 0.1741731175228712, "grad_norm": 11.761281967163086, "learning_rate": 5.8018176487833485e-06, "loss": 1.3344, "step": 1980 }, { "epoch": 0.17505277973258268, "grad_norm": 20.761951446533203, "learning_rate": 5.8311345646438e-06, "loss": 1.2788, "step": 1990 }, { "epoch": 0.17593244194229415, "grad_norm": 13.08662223815918, "learning_rate": 5.860451480504251e-06, "loss": 1.201, "step": 2000 }, { "epoch": 0.17681210415200563, "grad_norm": 15.15190315246582, "learning_rate": 5.889768396364704e-06, "loss": 1.2159, "step": 2010 }, { "epoch": 0.1776917663617171, "grad_norm": 10.957923889160156, "learning_rate": 5.919085312225155e-06, "loss": 1.2474, "step": 2020 }, { "epoch": 0.17857142857142858, "grad_norm": 12.449853897094727, "learning_rate": 5.948402228085606e-06, "loss": 1.323, "step": 2030 }, { "epoch": 0.17945109078114005, "grad_norm": 17.315832138061523, "learning_rate": 5.977719143946057e-06, "loss": 1.2125, "step": 2040 }, { "epoch": 0.18033075299085152, "grad_norm": 16.270519256591797, "learning_rate": 6.007036059806509e-06, "loss": 1.2414, "step": 2050 }, { "epoch": 0.181210415200563, "grad_norm": 15.238424301147461, "learning_rate": 6.03635297566696e-06, "loss": 1.2184, "step": 2060 }, { "epoch": 0.18209007741027444, "grad_norm": 15.908957481384277, "learning_rate": 6.065669891527411e-06, "loss": 1.1877, "step": 2070 }, { "epoch": 0.18296973961998592, "grad_norm": 14.928387641906738, "learning_rate": 6.094986807387863e-06, "loss": 1.1586, "step": 2080 }, { "epoch": 0.1838494018296974, "grad_norm": 12.790017127990723, "learning_rate": 6.1243037232483145e-06, "loss": 1.1595, "step": 2090 }, { "epoch": 0.18472906403940886, "grad_norm": 12.074016571044922, "learning_rate": 6.153620639108766e-06, "loss": 1.2129, "step": 2100 }, { "epoch": 0.18560872624912034, "grad_norm": 14.084776878356934, "learning_rate": 6.182937554969217e-06, "loss": 1.2958, "step": 2110 }, { "epoch": 0.1864883884588318, "grad_norm": 15.492996215820312, "learning_rate": 6.21225447082967e-06, "loss": 1.1681, "step": 2120 }, { "epoch": 0.18736805066854328, "grad_norm": 17.43244743347168, "learning_rate": 6.241571386690121e-06, "loss": 1.194, "step": 2130 }, { "epoch": 0.18824771287825476, "grad_norm": 11.239168167114258, "learning_rate": 6.270888302550572e-06, "loss": 1.1242, "step": 2140 }, { "epoch": 0.18912737508796623, "grad_norm": 13.062403678894043, "learning_rate": 6.300205218411024e-06, "loss": 1.152, "step": 2150 }, { "epoch": 0.1900070372976777, "grad_norm": 14.466598510742188, "learning_rate": 6.329522134271475e-06, "loss": 1.2566, "step": 2160 }, { "epoch": 0.19088669950738915, "grad_norm": 13.60856819152832, "learning_rate": 6.358839050131926e-06, "loss": 1.2309, "step": 2170 }, { "epoch": 0.19176636171710063, "grad_norm": 12.29806900024414, "learning_rate": 6.388155965992379e-06, "loss": 1.1628, "step": 2180 }, { "epoch": 0.1926460239268121, "grad_norm": 14.52897834777832, "learning_rate": 6.41747288185283e-06, "loss": 1.2374, "step": 2190 }, { "epoch": 0.19352568613652357, "grad_norm": 13.283951759338379, "learning_rate": 6.446789797713281e-06, "loss": 1.1467, "step": 2200 }, { "epoch": 0.19440534834623505, "grad_norm": 15.547257423400879, "learning_rate": 6.4761067135737325e-06, "loss": 1.2536, "step": 2210 }, { "epoch": 0.19528501055594652, "grad_norm": 14.070216178894043, "learning_rate": 6.5054236294341845e-06, "loss": 1.116, "step": 2220 }, { "epoch": 0.196164672765658, "grad_norm": 12.693365097045898, "learning_rate": 6.534740545294636e-06, "loss": 1.1277, "step": 2230 }, { "epoch": 0.19704433497536947, "grad_norm": 11.60954761505127, "learning_rate": 6.564057461155087e-06, "loss": 1.1181, "step": 2240 }, { "epoch": 0.19792399718508094, "grad_norm": 13.145277976989746, "learning_rate": 6.593374377015539e-06, "loss": 1.2523, "step": 2250 }, { "epoch": 0.1988036593947924, "grad_norm": 12.094047546386719, "learning_rate": 6.62269129287599e-06, "loss": 1.2017, "step": 2260 }, { "epoch": 0.19968332160450386, "grad_norm": 17.52699089050293, "learning_rate": 6.652008208736441e-06, "loss": 1.1949, "step": 2270 }, { "epoch": 0.20056298381421533, "grad_norm": 11.833756446838379, "learning_rate": 6.681325124596892e-06, "loss": 1.0758, "step": 2280 }, { "epoch": 0.2014426460239268, "grad_norm": 12.572530746459961, "learning_rate": 6.710642040457345e-06, "loss": 1.3271, "step": 2290 }, { "epoch": 0.20232230823363828, "grad_norm": 12.044607162475586, "learning_rate": 6.739958956317796e-06, "loss": 1.2352, "step": 2300 }, { "epoch": 0.20320197044334976, "grad_norm": 16.23038673400879, "learning_rate": 6.769275872178247e-06, "loss": 1.221, "step": 2310 }, { "epoch": 0.20408163265306123, "grad_norm": 13.465204238891602, "learning_rate": 6.798592788038699e-06, "loss": 1.1752, "step": 2320 }, { "epoch": 0.2049612948627727, "grad_norm": 11.482861518859863, "learning_rate": 6.8279097038991505e-06, "loss": 1.1661, "step": 2330 }, { "epoch": 0.20584095707248418, "grad_norm": 11.33665657043457, "learning_rate": 6.857226619759602e-06, "loss": 1.2341, "step": 2340 }, { "epoch": 0.20672061928219565, "grad_norm": 12.15077018737793, "learning_rate": 6.886543535620054e-06, "loss": 1.1703, "step": 2350 }, { "epoch": 0.2076002814919071, "grad_norm": 11.533472061157227, "learning_rate": 6.915860451480505e-06, "loss": 1.1736, "step": 2360 }, { "epoch": 0.20847994370161857, "grad_norm": 13.217687606811523, "learning_rate": 6.945177367340956e-06, "loss": 1.1276, "step": 2370 }, { "epoch": 0.20935960591133004, "grad_norm": 11.44119644165039, "learning_rate": 6.974494283201407e-06, "loss": 1.0852, "step": 2380 }, { "epoch": 0.21023926812104152, "grad_norm": 12.549988746643066, "learning_rate": 7.00381119906186e-06, "loss": 1.2711, "step": 2390 }, { "epoch": 0.211118930330753, "grad_norm": 14.484488487243652, "learning_rate": 7.033128114922311e-06, "loss": 1.2166, "step": 2400 }, { "epoch": 0.21199859254046446, "grad_norm": 13.215435028076172, "learning_rate": 7.062445030782762e-06, "loss": 1.0524, "step": 2410 }, { "epoch": 0.21287825475017594, "grad_norm": 15.921116828918457, "learning_rate": 7.091761946643214e-06, "loss": 1.201, "step": 2420 }, { "epoch": 0.2137579169598874, "grad_norm": 10.600140571594238, "learning_rate": 7.121078862503665e-06, "loss": 1.1271, "step": 2430 }, { "epoch": 0.21463757916959889, "grad_norm": 12.720752716064453, "learning_rate": 7.1503957783641165e-06, "loss": 1.1159, "step": 2440 }, { "epoch": 0.21551724137931033, "grad_norm": 12.836406707763672, "learning_rate": 7.179712694224568e-06, "loss": 1.2268, "step": 2450 }, { "epoch": 0.2163969035890218, "grad_norm": 14.484479904174805, "learning_rate": 7.20902961008502e-06, "loss": 1.1669, "step": 2460 }, { "epoch": 0.21727656579873328, "grad_norm": 14.128315925598145, "learning_rate": 7.238346525945471e-06, "loss": 1.3046, "step": 2470 }, { "epoch": 0.21815622800844475, "grad_norm": 12.605624198913574, "learning_rate": 7.267663441805922e-06, "loss": 0.9977, "step": 2480 }, { "epoch": 0.21903589021815623, "grad_norm": 13.015382766723633, "learning_rate": 7.296980357666375e-06, "loss": 1.2426, "step": 2490 }, { "epoch": 0.2199155524278677, "grad_norm": 17.670608520507812, "learning_rate": 7.326297273526826e-06, "loss": 1.0869, "step": 2500 }, { "epoch": 0.22079521463757917, "grad_norm": 12.759334564208984, "learning_rate": 7.355614189387277e-06, "loss": 1.0916, "step": 2510 }, { "epoch": 0.22167487684729065, "grad_norm": 12.832584381103516, "learning_rate": 7.384931105247729e-06, "loss": 1.2396, "step": 2520 }, { "epoch": 0.22255453905700212, "grad_norm": 11.229182243347168, "learning_rate": 7.41424802110818e-06, "loss": 1.104, "step": 2530 }, { "epoch": 0.2234342012667136, "grad_norm": 12.83469295501709, "learning_rate": 7.443564936968631e-06, "loss": 1.1415, "step": 2540 }, { "epoch": 0.22431386347642504, "grad_norm": 15.180788040161133, "learning_rate": 7.4728818528290825e-06, "loss": 1.2042, "step": 2550 }, { "epoch": 0.22519352568613651, "grad_norm": 10.464702606201172, "learning_rate": 7.5021987686895345e-06, "loss": 1.1659, "step": 2560 }, { "epoch": 0.226073187895848, "grad_norm": 14.164366722106934, "learning_rate": 7.531515684549986e-06, "loss": 1.1985, "step": 2570 }, { "epoch": 0.22695285010555946, "grad_norm": 13.014163970947266, "learning_rate": 7.560832600410437e-06, "loss": 1.245, "step": 2580 }, { "epoch": 0.22783251231527094, "grad_norm": 15.117788314819336, "learning_rate": 7.59014951627089e-06, "loss": 1.2198, "step": 2590 }, { "epoch": 0.2287121745249824, "grad_norm": 15.694009780883789, "learning_rate": 7.619466432131341e-06, "loss": 1.1022, "step": 2600 }, { "epoch": 0.22959183673469388, "grad_norm": 12.361466407775879, "learning_rate": 7.648783347991791e-06, "loss": 1.102, "step": 2610 }, { "epoch": 0.23047149894440536, "grad_norm": 12.566007614135742, "learning_rate": 7.678100263852242e-06, "loss": 1.1521, "step": 2620 }, { "epoch": 0.23135116115411683, "grad_norm": 13.049711227416992, "learning_rate": 7.707417179712695e-06, "loss": 1.0875, "step": 2630 }, { "epoch": 0.2322308233638283, "grad_norm": 12.188079833984375, "learning_rate": 7.736734095573146e-06, "loss": 1.1523, "step": 2640 }, { "epoch": 0.23311048557353975, "grad_norm": 11.403328895568848, "learning_rate": 7.766051011433597e-06, "loss": 1.0564, "step": 2650 }, { "epoch": 0.23399014778325122, "grad_norm": 14.727110862731934, "learning_rate": 7.79536792729405e-06, "loss": 1.2645, "step": 2660 }, { "epoch": 0.2348698099929627, "grad_norm": 13.83519458770752, "learning_rate": 7.824684843154501e-06, "loss": 1.1435, "step": 2670 }, { "epoch": 0.23574947220267417, "grad_norm": 12.854756355285645, "learning_rate": 7.854001759014952e-06, "loss": 1.0667, "step": 2680 }, { "epoch": 0.23662913441238564, "grad_norm": 12.459912300109863, "learning_rate": 7.883318674875404e-06, "loss": 1.1102, "step": 2690 }, { "epoch": 0.23750879662209712, "grad_norm": 12.490076065063477, "learning_rate": 7.912635590735855e-06, "loss": 1.2083, "step": 2700 }, { "epoch": 0.2383884588318086, "grad_norm": 12.75175666809082, "learning_rate": 7.941952506596306e-06, "loss": 0.9822, "step": 2710 }, { "epoch": 0.23926812104152007, "grad_norm": 10.826375961303711, "learning_rate": 7.971269422456757e-06, "loss": 1.1581, "step": 2720 }, { "epoch": 0.24014778325123154, "grad_norm": 12.048887252807617, "learning_rate": 8.00058633831721e-06, "loss": 1.2013, "step": 2730 }, { "epoch": 0.24102744546094299, "grad_norm": 10.787705421447754, "learning_rate": 8.029903254177661e-06, "loss": 1.1202, "step": 2740 }, { "epoch": 0.24190710767065446, "grad_norm": 13.242773056030273, "learning_rate": 8.059220170038112e-06, "loss": 1.1705, "step": 2750 }, { "epoch": 0.24278676988036593, "grad_norm": 12.419435501098633, "learning_rate": 8.088537085898565e-06, "loss": 1.1079, "step": 2760 }, { "epoch": 0.2436664320900774, "grad_norm": 13.091764450073242, "learning_rate": 8.117854001759016e-06, "loss": 1.2423, "step": 2770 }, { "epoch": 0.24454609429978888, "grad_norm": 11.197342872619629, "learning_rate": 8.147170917619467e-06, "loss": 1.0998, "step": 2780 }, { "epoch": 0.24542575650950035, "grad_norm": 19.820266723632812, "learning_rate": 8.176487833479918e-06, "loss": 0.9263, "step": 2790 }, { "epoch": 0.24630541871921183, "grad_norm": 13.465862274169922, "learning_rate": 8.20580474934037e-06, "loss": 1.1605, "step": 2800 }, { "epoch": 0.2471850809289233, "grad_norm": 12.306135177612305, "learning_rate": 8.23512166520082e-06, "loss": 1.0059, "step": 2810 }, { "epoch": 0.24806474313863477, "grad_norm": 11.260857582092285, "learning_rate": 8.264438581061272e-06, "loss": 1.0149, "step": 2820 }, { "epoch": 0.24894440534834625, "grad_norm": 12.011026382446289, "learning_rate": 8.293755496921725e-06, "loss": 1.0423, "step": 2830 }, { "epoch": 0.2498240675580577, "grad_norm": 15.562149047851562, "learning_rate": 8.323072412782176e-06, "loss": 1.0356, "step": 2840 }, { "epoch": 0.25070372976776917, "grad_norm": 13.29050064086914, "learning_rate": 8.352389328642627e-06, "loss": 1.0861, "step": 2850 }, { "epoch": 0.25158339197748064, "grad_norm": 10.4749755859375, "learning_rate": 8.38170624450308e-06, "loss": 1.0453, "step": 2860 }, { "epoch": 0.2524630541871921, "grad_norm": 13.092935562133789, "learning_rate": 8.411023160363531e-06, "loss": 1.0006, "step": 2870 }, { "epoch": 0.2533427163969036, "grad_norm": 13.1531982421875, "learning_rate": 8.440340076223982e-06, "loss": 1.1732, "step": 2880 }, { "epoch": 0.25422237860661506, "grad_norm": 11.765358924865723, "learning_rate": 8.469656992084433e-06, "loss": 1.0683, "step": 2890 }, { "epoch": 0.25510204081632654, "grad_norm": 11.169476509094238, "learning_rate": 8.498973907944884e-06, "loss": 1.0856, "step": 2900 }, { "epoch": 0.255981703026038, "grad_norm": 16.152873992919922, "learning_rate": 8.528290823805336e-06, "loss": 1.0748, "step": 2910 }, { "epoch": 0.2568613652357495, "grad_norm": 11.662906646728516, "learning_rate": 8.557607739665787e-06, "loss": 1.2749, "step": 2920 }, { "epoch": 0.25774102744546096, "grad_norm": 10.607311248779297, "learning_rate": 8.58692465552624e-06, "loss": 1.0822, "step": 2930 }, { "epoch": 0.25862068965517243, "grad_norm": 11.558359146118164, "learning_rate": 8.61624157138669e-06, "loss": 1.0474, "step": 2940 }, { "epoch": 0.2595003518648839, "grad_norm": 12.36270523071289, "learning_rate": 8.645558487247142e-06, "loss": 1.0887, "step": 2950 }, { "epoch": 0.2603800140745954, "grad_norm": 15.062859535217285, "learning_rate": 8.674875403107593e-06, "loss": 1.1176, "step": 2960 }, { "epoch": 0.26125967628430685, "grad_norm": 11.461206436157227, "learning_rate": 8.704192318968046e-06, "loss": 1.1192, "step": 2970 }, { "epoch": 0.26213933849401827, "grad_norm": 12.725966453552246, "learning_rate": 8.733509234828497e-06, "loss": 1.0521, "step": 2980 }, { "epoch": 0.26301900070372974, "grad_norm": 9.574695587158203, "learning_rate": 8.762826150688948e-06, "loss": 1.1007, "step": 2990 }, { "epoch": 0.2638986629134412, "grad_norm": 13.303370475769043, "learning_rate": 8.7921430665494e-06, "loss": 1.0855, "step": 3000 }, { "epoch": 0.2647783251231527, "grad_norm": 10.365307807922363, "learning_rate": 8.82145998240985e-06, "loss": 1.2149, "step": 3010 }, { "epoch": 0.26565798733286416, "grad_norm": 12.124460220336914, "learning_rate": 8.850776898270302e-06, "loss": 1.1033, "step": 3020 }, { "epoch": 0.26653764954257564, "grad_norm": 13.476741790771484, "learning_rate": 8.880093814130754e-06, "loss": 1.0375, "step": 3030 }, { "epoch": 0.2674173117522871, "grad_norm": 14.757545471191406, "learning_rate": 8.909410729991206e-06, "loss": 1.1503, "step": 3040 }, { "epoch": 0.2682969739619986, "grad_norm": 13.446517944335938, "learning_rate": 8.938727645851657e-06, "loss": 1.2299, "step": 3050 }, { "epoch": 0.26917663617171006, "grad_norm": 12.106247901916504, "learning_rate": 8.968044561712108e-06, "loss": 1.101, "step": 3060 }, { "epoch": 0.27005629838142153, "grad_norm": 10.631941795349121, "learning_rate": 8.99736147757256e-06, "loss": 1.039, "step": 3070 }, { "epoch": 0.270935960591133, "grad_norm": 11.573443412780762, "learning_rate": 9.026678393433012e-06, "loss": 0.994, "step": 3080 }, { "epoch": 0.2718156228008445, "grad_norm": 10.621591567993164, "learning_rate": 9.055995309293463e-06, "loss": 1.0645, "step": 3090 }, { "epoch": 0.27269528501055595, "grad_norm": 14.310229301452637, "learning_rate": 9.085312225153914e-06, "loss": 1.1814, "step": 3100 }, { "epoch": 0.27357494722026743, "grad_norm": 8.858597755432129, "learning_rate": 9.114629141014365e-06, "loss": 1.1205, "step": 3110 }, { "epoch": 0.2744546094299789, "grad_norm": 15.615384101867676, "learning_rate": 9.143946056874816e-06, "loss": 1.1866, "step": 3120 }, { "epoch": 0.2753342716396904, "grad_norm": 16.38617515563965, "learning_rate": 9.173262972735268e-06, "loss": 1.1143, "step": 3130 }, { "epoch": 0.27621393384940185, "grad_norm": 11.487483978271484, "learning_rate": 9.20257988859572e-06, "loss": 0.9593, "step": 3140 }, { "epoch": 0.2770935960591133, "grad_norm": 13.102459907531738, "learning_rate": 9.231896804456172e-06, "loss": 1.1213, "step": 3150 }, { "epoch": 0.2779732582688248, "grad_norm": 10.634673118591309, "learning_rate": 9.261213720316623e-06, "loss": 1.1014, "step": 3160 }, { "epoch": 0.2788529204785362, "grad_norm": 9.22297477722168, "learning_rate": 9.290530636177076e-06, "loss": 1.0624, "step": 3170 }, { "epoch": 0.2797325826882477, "grad_norm": 12.681438446044922, "learning_rate": 9.319847552037527e-06, "loss": 0.9878, "step": 3180 }, { "epoch": 0.28061224489795916, "grad_norm": 12.782941818237305, "learning_rate": 9.349164467897978e-06, "loss": 1.0876, "step": 3190 }, { "epoch": 0.28149190710767064, "grad_norm": 13.165761947631836, "learning_rate": 9.378481383758429e-06, "loss": 1.1196, "step": 3200 }, { "epoch": 0.2823715693173821, "grad_norm": 11.872429847717285, "learning_rate": 9.40779829961888e-06, "loss": 1.1865, "step": 3210 }, { "epoch": 0.2832512315270936, "grad_norm": 13.013513565063477, "learning_rate": 9.437115215479331e-06, "loss": 0.9666, "step": 3220 }, { "epoch": 0.28413089373680506, "grad_norm": 10.483717918395996, "learning_rate": 9.466432131339782e-06, "loss": 1.0365, "step": 3230 }, { "epoch": 0.28501055594651653, "grad_norm": 12.517560005187988, "learning_rate": 9.495749047200235e-06, "loss": 1.0031, "step": 3240 }, { "epoch": 0.285890218156228, "grad_norm": 12.200453758239746, "learning_rate": 9.525065963060686e-06, "loss": 1.1135, "step": 3250 }, { "epoch": 0.2867698803659395, "grad_norm": 12.472599029541016, "learning_rate": 9.554382878921138e-06, "loss": 1.1071, "step": 3260 }, { "epoch": 0.28764954257565095, "grad_norm": 10.924927711486816, "learning_rate": 9.58369979478159e-06, "loss": 1.0792, "step": 3270 }, { "epoch": 0.2885292047853624, "grad_norm": 13.987113952636719, "learning_rate": 9.613016710642042e-06, "loss": 1.0664, "step": 3280 }, { "epoch": 0.2894088669950739, "grad_norm": 12.691108703613281, "learning_rate": 9.642333626502493e-06, "loss": 1.1788, "step": 3290 }, { "epoch": 0.2902885292047854, "grad_norm": 10.253110885620117, "learning_rate": 9.671650542362946e-06, "loss": 0.9673, "step": 3300 }, { "epoch": 0.29116819141449685, "grad_norm": 11.537460327148438, "learning_rate": 9.700967458223397e-06, "loss": 0.8759, "step": 3310 }, { "epoch": 0.2920478536242083, "grad_norm": 10.295685768127441, "learning_rate": 9.730284374083848e-06, "loss": 1.1286, "step": 3320 }, { "epoch": 0.2929275158339198, "grad_norm": 11.083233833312988, "learning_rate": 9.759601289944299e-06, "loss": 1.1529, "step": 3330 }, { "epoch": 0.29380717804363127, "grad_norm": 12.451109886169434, "learning_rate": 9.78891820580475e-06, "loss": 1.182, "step": 3340 }, { "epoch": 0.29468684025334274, "grad_norm": 10.300433158874512, "learning_rate": 9.818235121665201e-06, "loss": 1.0944, "step": 3350 }, { "epoch": 0.2955665024630542, "grad_norm": 9.956304550170898, "learning_rate": 9.847552037525652e-06, "loss": 1.0671, "step": 3360 }, { "epoch": 0.29644616467276563, "grad_norm": 10.990047454833984, "learning_rate": 9.876868953386105e-06, "loss": 0.9921, "step": 3370 }, { "epoch": 0.2973258268824771, "grad_norm": 9.533073425292969, "learning_rate": 9.906185869246556e-06, "loss": 0.9605, "step": 3380 }, { "epoch": 0.2982054890921886, "grad_norm": 10.649511337280273, "learning_rate": 9.935502785107008e-06, "loss": 1.135, "step": 3390 }, { "epoch": 0.29908515130190005, "grad_norm": 13.161002159118652, "learning_rate": 9.964819700967459e-06, "loss": 1.1917, "step": 3400 }, { "epoch": 0.2999648135116115, "grad_norm": 10.739681243896484, "learning_rate": 9.994136616827912e-06, "loss": 1.1955, "step": 3410 }, { "epoch": 0.300844475721323, "grad_norm": 13.053327560424805, "learning_rate": 9.999998323741378e-06, "loss": 1.0039, "step": 3420 }, { "epoch": 0.3017241379310345, "grad_norm": 11.151718139648438, "learning_rate": 9.999991513942648e-06, "loss": 1.115, "step": 3430 }, { "epoch": 0.30260380014074595, "grad_norm": 10.794220924377441, "learning_rate": 9.999979465844775e-06, "loss": 1.0616, "step": 3440 }, { "epoch": 0.3034834623504574, "grad_norm": 10.326006889343262, "learning_rate": 9.999962179460381e-06, "loss": 0.9356, "step": 3450 }, { "epoch": 0.3043631245601689, "grad_norm": 10.241646766662598, "learning_rate": 9.99993965480758e-06, "loss": 1.1083, "step": 3460 }, { "epoch": 0.30524278676988037, "grad_norm": 8.389451026916504, "learning_rate": 9.999911891909966e-06, "loss": 1.0242, "step": 3470 }, { "epoch": 0.30612244897959184, "grad_norm": 12.456686019897461, "learning_rate": 9.999878890796628e-06, "loss": 1.1305, "step": 3480 }, { "epoch": 0.3070021111893033, "grad_norm": 8.337313652038574, "learning_rate": 9.999840651502137e-06, "loss": 1.0297, "step": 3490 }, { "epoch": 0.3078817733990148, "grad_norm": 11.263201713562012, "learning_rate": 9.999797174066557e-06, "loss": 1.0444, "step": 3500 }, { "epoch": 0.30876143560872626, "grad_norm": 11.168819427490234, "learning_rate": 9.999748458535436e-06, "loss": 1.0273, "step": 3510 }, { "epoch": 0.30964109781843774, "grad_norm": 10.733668327331543, "learning_rate": 9.999694504959815e-06, "loss": 1.1723, "step": 3520 }, { "epoch": 0.3105207600281492, "grad_norm": 8.827539443969727, "learning_rate": 9.999635313396216e-06, "loss": 1.08, "step": 3530 }, { "epoch": 0.3114004222378607, "grad_norm": 8.322421073913574, "learning_rate": 9.99957088390665e-06, "loss": 0.9503, "step": 3540 }, { "epoch": 0.31228008444757216, "grad_norm": 15.681316375732422, "learning_rate": 9.999501216558622e-06, "loss": 1.0776, "step": 3550 }, { "epoch": 0.3131597466572836, "grad_norm": 11.645968437194824, "learning_rate": 9.999426311425118e-06, "loss": 1.0032, "step": 3560 }, { "epoch": 0.31403940886699505, "grad_norm": 9.008444786071777, "learning_rate": 9.999346168584612e-06, "loss": 1.1273, "step": 3570 }, { "epoch": 0.3149190710767065, "grad_norm": 12.491524696350098, "learning_rate": 9.999260788121065e-06, "loss": 1.1179, "step": 3580 }, { "epoch": 0.315798733286418, "grad_norm": 10.441204071044922, "learning_rate": 9.999170170123932e-06, "loss": 1.0278, "step": 3590 }, { "epoch": 0.3166783954961295, "grad_norm": 13.264538764953613, "learning_rate": 9.999074314688147e-06, "loss": 1.0553, "step": 3600 }, { "epoch": 0.31755805770584095, "grad_norm": 11.106444358825684, "learning_rate": 9.998973221914133e-06, "loss": 1.1241, "step": 3610 }, { "epoch": 0.3184377199155524, "grad_norm": 11.616222381591797, "learning_rate": 9.998866891907801e-06, "loss": 1.065, "step": 3620 }, { "epoch": 0.3193173821252639, "grad_norm": 10.281294822692871, "learning_rate": 9.998755324780552e-06, "loss": 1.0243, "step": 3630 }, { "epoch": 0.32019704433497537, "grad_norm": 11.477418899536133, "learning_rate": 9.998638520649269e-06, "loss": 1.0601, "step": 3640 }, { "epoch": 0.32107670654468684, "grad_norm": 9.725064277648926, "learning_rate": 9.998516479636323e-06, "loss": 1.0161, "step": 3650 }, { "epoch": 0.3219563687543983, "grad_norm": 13.830681800842285, "learning_rate": 9.99838920186957e-06, "loss": 0.9999, "step": 3660 }, { "epoch": 0.3228360309641098, "grad_norm": 9.759856224060059, "learning_rate": 9.998256687482355e-06, "loss": 0.9791, "step": 3670 }, { "epoch": 0.32371569317382126, "grad_norm": 12.975741386413574, "learning_rate": 9.99811893661351e-06, "loss": 1.1133, "step": 3680 }, { "epoch": 0.32459535538353274, "grad_norm": 10.318453788757324, "learning_rate": 9.99797594940735e-06, "loss": 1.0685, "step": 3690 }, { "epoch": 0.3254750175932442, "grad_norm": 15.920031547546387, "learning_rate": 9.997827726013677e-06, "loss": 1.0642, "step": 3700 }, { "epoch": 0.3263546798029557, "grad_norm": 13.000662803649902, "learning_rate": 9.99767426658778e-06, "loss": 1.046, "step": 3710 }, { "epoch": 0.32723434201266716, "grad_norm": 10.599923133850098, "learning_rate": 9.997515571290433e-06, "loss": 1.1599, "step": 3720 }, { "epoch": 0.32811400422237863, "grad_norm": 11.85610294342041, "learning_rate": 9.997351640287891e-06, "loss": 1.1138, "step": 3730 }, { "epoch": 0.3289936664320901, "grad_norm": 10.28419303894043, "learning_rate": 9.997182473751903e-06, "loss": 1.0783, "step": 3740 }, { "epoch": 0.3298733286418015, "grad_norm": 9.328727722167969, "learning_rate": 9.997008071859695e-06, "loss": 1.0209, "step": 3750 }, { "epoch": 0.330752990851513, "grad_norm": 10.38059139251709, "learning_rate": 9.996828434793984e-06, "loss": 1.0384, "step": 3760 }, { "epoch": 0.33163265306122447, "grad_norm": 11.14818286895752, "learning_rate": 9.996643562742965e-06, "loss": 1.0098, "step": 3770 }, { "epoch": 0.33251231527093594, "grad_norm": 9.787508010864258, "learning_rate": 9.996453455900325e-06, "loss": 1.0181, "step": 3780 }, { "epoch": 0.3333919774806474, "grad_norm": 12.561637878417969, "learning_rate": 9.99625811446523e-06, "loss": 1.0987, "step": 3790 }, { "epoch": 0.3342716396903589, "grad_norm": 12.490092277526855, "learning_rate": 9.996057538642332e-06, "loss": 1.1024, "step": 3800 }, { "epoch": 0.33515130190007036, "grad_norm": 9.78665828704834, "learning_rate": 9.995851728641768e-06, "loss": 1.1153, "step": 3810 }, { "epoch": 0.33603096410978184, "grad_norm": 14.388655662536621, "learning_rate": 9.995640684679155e-06, "loss": 1.0211, "step": 3820 }, { "epoch": 0.3369106263194933, "grad_norm": 11.715775489807129, "learning_rate": 9.995424406975595e-06, "loss": 1.0443, "step": 3830 }, { "epoch": 0.3377902885292048, "grad_norm": 9.612485885620117, "learning_rate": 9.995202895757677e-06, "loss": 1.0198, "step": 3840 }, { "epoch": 0.33866995073891626, "grad_norm": 7.9821343421936035, "learning_rate": 9.994976151257468e-06, "loss": 1.0654, "step": 3850 }, { "epoch": 0.33954961294862773, "grad_norm": 10.505115509033203, "learning_rate": 9.994744173712518e-06, "loss": 1.0687, "step": 3860 }, { "epoch": 0.3404292751583392, "grad_norm": 12.617406845092773, "learning_rate": 9.994506963365864e-06, "loss": 1.0577, "step": 3870 }, { "epoch": 0.3413089373680507, "grad_norm": 8.71716022491455, "learning_rate": 9.994264520466022e-06, "loss": 1.0437, "step": 3880 }, { "epoch": 0.34218859957776215, "grad_norm": 10.009693145751953, "learning_rate": 9.994016845266987e-06, "loss": 1.0229, "step": 3890 }, { "epoch": 0.3430682617874736, "grad_norm": 10.682689666748047, "learning_rate": 9.993763938028241e-06, "loss": 0.9702, "step": 3900 }, { "epoch": 0.3439479239971851, "grad_norm": 8.888628959655762, "learning_rate": 9.993505799014745e-06, "loss": 0.9716, "step": 3910 }, { "epoch": 0.3448275862068966, "grad_norm": 7.225987434387207, "learning_rate": 9.993242428496942e-06, "loss": 0.9291, "step": 3920 }, { "epoch": 0.34570724841660805, "grad_norm": 13.761811256408691, "learning_rate": 9.992973826750754e-06, "loss": 1.0815, "step": 3930 }, { "epoch": 0.34658691062631947, "grad_norm": 9.00239372253418, "learning_rate": 9.992699994057585e-06, "loss": 1.1518, "step": 3940 }, { "epoch": 0.34746657283603094, "grad_norm": 12.397631645202637, "learning_rate": 9.99242093070432e-06, "loss": 1.0395, "step": 3950 }, { "epoch": 0.3483462350457424, "grad_norm": 8.662168502807617, "learning_rate": 9.99213663698332e-06, "loss": 0.963, "step": 3960 }, { "epoch": 0.3492258972554539, "grad_norm": 11.596122741699219, "learning_rate": 9.991847113192431e-06, "loss": 1.0155, "step": 3970 }, { "epoch": 0.35010555946516536, "grad_norm": 14.204232215881348, "learning_rate": 9.99155235963498e-06, "loss": 1.0473, "step": 3980 }, { "epoch": 0.35098522167487683, "grad_norm": 10.088661193847656, "learning_rate": 9.99125237661976e-06, "loss": 1.019, "step": 3990 }, { "epoch": 0.3518648838845883, "grad_norm": 13.183466911315918, "learning_rate": 9.990947164461059e-06, "loss": 1.1294, "step": 4000 }, { "epoch": 0.3527445460942998, "grad_norm": 12.320446968078613, "learning_rate": 9.990636723478632e-06, "loss": 0.9713, "step": 4010 }, { "epoch": 0.35362420830401126, "grad_norm": 11.564430236816406, "learning_rate": 9.990321053997717e-06, "loss": 0.9787, "step": 4020 }, { "epoch": 0.35450387051372273, "grad_norm": 9.023523330688477, "learning_rate": 9.990000156349032e-06, "loss": 1.0821, "step": 4030 }, { "epoch": 0.3553835327234342, "grad_norm": 9.896099090576172, "learning_rate": 9.989674030868763e-06, "loss": 1.0809, "step": 4040 }, { "epoch": 0.3562631949331457, "grad_norm": 9.932551383972168, "learning_rate": 9.989342677898583e-06, "loss": 1.0559, "step": 4050 }, { "epoch": 0.35714285714285715, "grad_norm": 10.387862205505371, "learning_rate": 9.98900609778564e-06, "loss": 1.0401, "step": 4060 }, { "epoch": 0.3580225193525686, "grad_norm": 10.969353675842285, "learning_rate": 9.988664290882552e-06, "loss": 0.9847, "step": 4070 }, { "epoch": 0.3589021815622801, "grad_norm": 10.323173522949219, "learning_rate": 9.988317257547419e-06, "loss": 1.0195, "step": 4080 }, { "epoch": 0.35978184377199157, "grad_norm": 9.365883827209473, "learning_rate": 9.987964998143813e-06, "loss": 0.9584, "step": 4090 }, { "epoch": 0.36066150598170305, "grad_norm": 11.00229263305664, "learning_rate": 9.987607513040785e-06, "loss": 1.1176, "step": 4100 }, { "epoch": 0.3615411681914145, "grad_norm": 9.333796501159668, "learning_rate": 9.987244802612856e-06, "loss": 0.9281, "step": 4110 }, { "epoch": 0.362420830401126, "grad_norm": 11.6054105758667, "learning_rate": 9.986876867240025e-06, "loss": 0.9994, "step": 4120 }, { "epoch": 0.3633004926108374, "grad_norm": 11.267964363098145, "learning_rate": 9.986503707307763e-06, "loss": 0.8779, "step": 4130 }, { "epoch": 0.3641801548205489, "grad_norm": 12.111215591430664, "learning_rate": 9.986125323207016e-06, "loss": 1.0083, "step": 4140 }, { "epoch": 0.36505981703026036, "grad_norm": 11.004450798034668, "learning_rate": 9.985741715334203e-06, "loss": 1.0628, "step": 4150 }, { "epoch": 0.36593947923997183, "grad_norm": 10.322732925415039, "learning_rate": 9.985352884091214e-06, "loss": 0.9945, "step": 4160 }, { "epoch": 0.3668191414496833, "grad_norm": 10.437068939208984, "learning_rate": 9.984958829885414e-06, "loss": 1.0392, "step": 4170 }, { "epoch": 0.3676988036593948, "grad_norm": 9.923577308654785, "learning_rate": 9.984559553129635e-06, "loss": 0.97, "step": 4180 }, { "epoch": 0.36857846586910625, "grad_norm": 9.024016380310059, "learning_rate": 9.984155054242191e-06, "loss": 1.0928, "step": 4190 }, { "epoch": 0.3694581280788177, "grad_norm": 9.614293098449707, "learning_rate": 9.983745333646853e-06, "loss": 1.024, "step": 4200 }, { "epoch": 0.3703377902885292, "grad_norm": 10.24779224395752, "learning_rate": 9.983330391772872e-06, "loss": 1.0444, "step": 4210 }, { "epoch": 0.3712174524982407, "grad_norm": 8.353437423706055, "learning_rate": 9.982910229054968e-06, "loss": 0.9928, "step": 4220 }, { "epoch": 0.37209711470795215, "grad_norm": 10.955916404724121, "learning_rate": 9.982484845933327e-06, "loss": 1.0345, "step": 4230 }, { "epoch": 0.3729767769176636, "grad_norm": 8.678750991821289, "learning_rate": 9.982054242853607e-06, "loss": 0.9988, "step": 4240 }, { "epoch": 0.3738564391273751, "grad_norm": 9.16463851928711, "learning_rate": 9.981618420266937e-06, "loss": 1.0278, "step": 4250 }, { "epoch": 0.37473610133708657, "grad_norm": 9.056244850158691, "learning_rate": 9.981177378629906e-06, "loss": 0.9173, "step": 4260 }, { "epoch": 0.37561576354679804, "grad_norm": 9.28425407409668, "learning_rate": 9.980731118404583e-06, "loss": 1.0504, "step": 4270 }, { "epoch": 0.3764954257565095, "grad_norm": 10.124418258666992, "learning_rate": 9.980279640058494e-06, "loss": 0.9631, "step": 4280 }, { "epoch": 0.377375087966221, "grad_norm": 12.416176795959473, "learning_rate": 9.979822944064636e-06, "loss": 0.983, "step": 4290 }, { "epoch": 0.37825475017593246, "grad_norm": 9.329833030700684, "learning_rate": 9.979361030901473e-06, "loss": 0.9594, "step": 4300 }, { "epoch": 0.37913441238564394, "grad_norm": 8.761341094970703, "learning_rate": 9.978893901052934e-06, "loss": 0.8836, "step": 4310 }, { "epoch": 0.3800140745953554, "grad_norm": 8.562820434570312, "learning_rate": 9.978421555008408e-06, "loss": 0.9207, "step": 4320 }, { "epoch": 0.38089373680506683, "grad_norm": 12.097746849060059, "learning_rate": 9.97794399326276e-06, "loss": 1.086, "step": 4330 }, { "epoch": 0.3817733990147783, "grad_norm": 9.359067916870117, "learning_rate": 9.977461216316308e-06, "loss": 0.8639, "step": 4340 }, { "epoch": 0.3826530612244898, "grad_norm": 8.671854019165039, "learning_rate": 9.976973224674844e-06, "loss": 0.9629, "step": 4350 }, { "epoch": 0.38353272343420125, "grad_norm": 10.71906566619873, "learning_rate": 9.976480018849614e-06, "loss": 0.9302, "step": 4360 }, { "epoch": 0.3844123856439127, "grad_norm": 10.00148868560791, "learning_rate": 9.975981599357336e-06, "loss": 1.0365, "step": 4370 }, { "epoch": 0.3852920478536242, "grad_norm": 10.513381004333496, "learning_rate": 9.975477966720177e-06, "loss": 0.9118, "step": 4380 }, { "epoch": 0.38617171006333567, "grad_norm": 10.212895393371582, "learning_rate": 9.97496912146578e-06, "loss": 0.9626, "step": 4390 }, { "epoch": 0.38705137227304715, "grad_norm": 11.755858421325684, "learning_rate": 9.974455064127243e-06, "loss": 0.9496, "step": 4400 }, { "epoch": 0.3879310344827586, "grad_norm": 12.238123893737793, "learning_rate": 9.973935795243117e-06, "loss": 0.934, "step": 4410 }, { "epoch": 0.3888106966924701, "grad_norm": 7.337192535400391, "learning_rate": 9.973411315357429e-06, "loss": 0.9074, "step": 4420 }, { "epoch": 0.38969035890218157, "grad_norm": 11.591019630432129, "learning_rate": 9.972881625019649e-06, "loss": 0.9605, "step": 4430 }, { "epoch": 0.39057002111189304, "grad_norm": 8.648358345031738, "learning_rate": 9.97234672478472e-06, "loss": 1.0047, "step": 4440 }, { "epoch": 0.3914496833216045, "grad_norm": 8.357699394226074, "learning_rate": 9.97180661521303e-06, "loss": 0.9763, "step": 4450 }, { "epoch": 0.392329345531316, "grad_norm": 10.63962173461914, "learning_rate": 9.971261296870434e-06, "loss": 0.9212, "step": 4460 }, { "epoch": 0.39320900774102746, "grad_norm": 11.577670097351074, "learning_rate": 9.970710770328242e-06, "loss": 0.8091, "step": 4470 }, { "epoch": 0.39408866995073893, "grad_norm": 9.100838661193848, "learning_rate": 9.970155036163218e-06, "loss": 0.9371, "step": 4480 }, { "epoch": 0.3949683321604504, "grad_norm": 8.898874282836914, "learning_rate": 9.969594094957586e-06, "loss": 0.9604, "step": 4490 }, { "epoch": 0.3958479943701619, "grad_norm": 12.011385917663574, "learning_rate": 9.969027947299018e-06, "loss": 0.888, "step": 4500 }, { "epoch": 0.39672765657987336, "grad_norm": 10.310260772705078, "learning_rate": 9.96845659378065e-06, "loss": 0.9567, "step": 4510 }, { "epoch": 0.3976073187895848, "grad_norm": 7.497164249420166, "learning_rate": 9.967880035001063e-06, "loss": 0.995, "step": 4520 }, { "epoch": 0.39848698099929625, "grad_norm": 8.190948486328125, "learning_rate": 9.967298271564299e-06, "loss": 0.8371, "step": 4530 }, { "epoch": 0.3993666432090077, "grad_norm": 10.805998802185059, "learning_rate": 9.966711304079847e-06, "loss": 0.9042, "step": 4540 }, { "epoch": 0.4002463054187192, "grad_norm": 11.24285888671875, "learning_rate": 9.966119133162649e-06, "loss": 0.9718, "step": 4550 }, { "epoch": 0.40112596762843067, "grad_norm": 10.961018562316895, "learning_rate": 9.965521759433104e-06, "loss": 0.996, "step": 4560 }, { "epoch": 0.40200562983814214, "grad_norm": 8.875455856323242, "learning_rate": 9.964919183517053e-06, "loss": 0.9603, "step": 4570 }, { "epoch": 0.4028852920478536, "grad_norm": 13.168818473815918, "learning_rate": 9.964311406045795e-06, "loss": 0.9137, "step": 4580 }, { "epoch": 0.4037649542575651, "grad_norm": 10.952852249145508, "learning_rate": 9.963698427656073e-06, "loss": 0.9997, "step": 4590 }, { "epoch": 0.40464461646727656, "grad_norm": 8.820040702819824, "learning_rate": 9.963080248990083e-06, "loss": 0.9849, "step": 4600 }, { "epoch": 0.40552427867698804, "grad_norm": 8.575933456420898, "learning_rate": 9.962456870695464e-06, "loss": 1.0349, "step": 4610 }, { "epoch": 0.4064039408866995, "grad_norm": 8.898137092590332, "learning_rate": 9.961828293425311e-06, "loss": 0.9845, "step": 4620 }, { "epoch": 0.407283603096411, "grad_norm": 8.59233570098877, "learning_rate": 9.961194517838153e-06, "loss": 0.9536, "step": 4630 }, { "epoch": 0.40816326530612246, "grad_norm": 9.783610343933105, "learning_rate": 9.960555544597978e-06, "loss": 0.8659, "step": 4640 }, { "epoch": 0.40904292751583393, "grad_norm": 8.054505348205566, "learning_rate": 9.959911374374212e-06, "loss": 0.9274, "step": 4650 }, { "epoch": 0.4099225897255454, "grad_norm": 12.414542198181152, "learning_rate": 9.959262007841725e-06, "loss": 1.0907, "step": 4660 }, { "epoch": 0.4108022519352569, "grad_norm": 9.977113723754883, "learning_rate": 9.958607445680836e-06, "loss": 1.001, "step": 4670 }, { "epoch": 0.41168191414496835, "grad_norm": 12.538162231445312, "learning_rate": 9.957947688577306e-06, "loss": 1.0911, "step": 4680 }, { "epoch": 0.4125615763546798, "grad_norm": 9.131693840026855, "learning_rate": 9.957282737222335e-06, "loss": 0.8972, "step": 4690 }, { "epoch": 0.4134412385643913, "grad_norm": 9.522241592407227, "learning_rate": 9.956612592312567e-06, "loss": 0.9498, "step": 4700 }, { "epoch": 0.4143209007741027, "grad_norm": 9.385578155517578, "learning_rate": 9.955937254550086e-06, "loss": 0.9699, "step": 4710 }, { "epoch": 0.4152005629838142, "grad_norm": 11.004072189331055, "learning_rate": 9.955256724642421e-06, "loss": 0.9947, "step": 4720 }, { "epoch": 0.41608022519352567, "grad_norm": 11.52591323852539, "learning_rate": 9.954571003302533e-06, "loss": 1.1164, "step": 4730 }, { "epoch": 0.41695988740323714, "grad_norm": 9.63169002532959, "learning_rate": 9.95388009124883e-06, "loss": 0.972, "step": 4740 }, { "epoch": 0.4178395496129486, "grad_norm": 9.158751487731934, "learning_rate": 9.95318398920515e-06, "loss": 0.9123, "step": 4750 }, { "epoch": 0.4187192118226601, "grad_norm": 9.21484375, "learning_rate": 9.952482697900777e-06, "loss": 0.8619, "step": 4760 }, { "epoch": 0.41959887403237156, "grad_norm": 10.825579643249512, "learning_rate": 9.95177621807042e-06, "loss": 1.0093, "step": 4770 }, { "epoch": 0.42047853624208303, "grad_norm": 9.59325122833252, "learning_rate": 9.951064550454237e-06, "loss": 0.8661, "step": 4780 }, { "epoch": 0.4213581984517945, "grad_norm": 10.431618690490723, "learning_rate": 9.950347695797815e-06, "loss": 0.9122, "step": 4790 }, { "epoch": 0.422237860661506, "grad_norm": 8.886575698852539, "learning_rate": 9.949625654852172e-06, "loss": 1.0007, "step": 4800 }, { "epoch": 0.42311752287121746, "grad_norm": 9.109271049499512, "learning_rate": 9.94889842837376e-06, "loss": 0.9439, "step": 4810 }, { "epoch": 0.42399718508092893, "grad_norm": 11.527764320373535, "learning_rate": 9.948166017124475e-06, "loss": 1.0115, "step": 4820 }, { "epoch": 0.4248768472906404, "grad_norm": 8.62426471710205, "learning_rate": 9.947428421871628e-06, "loss": 0.9333, "step": 4830 }, { "epoch": 0.4257565095003519, "grad_norm": 10.130982398986816, "learning_rate": 9.946685643387973e-06, "loss": 0.9771, "step": 4840 }, { "epoch": 0.42663617171006335, "grad_norm": 10.091684341430664, "learning_rate": 9.94593768245169e-06, "loss": 0.9274, "step": 4850 }, { "epoch": 0.4275158339197748, "grad_norm": 10.478256225585938, "learning_rate": 9.945184539846388e-06, "loss": 0.952, "step": 4860 }, { "epoch": 0.4283954961294863, "grad_norm": 7.2524800300598145, "learning_rate": 9.944426216361106e-06, "loss": 0.966, "step": 4870 }, { "epoch": 0.42927515833919777, "grad_norm": 8.953201293945312, "learning_rate": 9.943662712790311e-06, "loss": 0.8657, "step": 4880 }, { "epoch": 0.43015482054890924, "grad_norm": 9.544113159179688, "learning_rate": 9.942894029933897e-06, "loss": 0.9092, "step": 4890 }, { "epoch": 0.43103448275862066, "grad_norm": 12.270858764648438, "learning_rate": 9.94212016859718e-06, "loss": 0.9553, "step": 4900 }, { "epoch": 0.43191414496833214, "grad_norm": 12.792739868164062, "learning_rate": 9.941341129590911e-06, "loss": 0.9497, "step": 4910 }, { "epoch": 0.4327938071780436, "grad_norm": 7.731476306915283, "learning_rate": 9.940556913731253e-06, "loss": 0.7901, "step": 4920 }, { "epoch": 0.4336734693877551, "grad_norm": 8.014240264892578, "learning_rate": 9.939767521839803e-06, "loss": 0.9377, "step": 4930 }, { "epoch": 0.43455313159746656, "grad_norm": 11.198125839233398, "learning_rate": 9.938972954743574e-06, "loss": 0.9506, "step": 4940 }, { "epoch": 0.43543279380717803, "grad_norm": 8.004776000976562, "learning_rate": 9.938173213275004e-06, "loss": 0.9339, "step": 4950 }, { "epoch": 0.4363124560168895, "grad_norm": 9.408461570739746, "learning_rate": 9.937368298271953e-06, "loss": 0.9133, "step": 4960 }, { "epoch": 0.437192118226601, "grad_norm": 11.402220726013184, "learning_rate": 9.936558210577697e-06, "loss": 1.0579, "step": 4970 }, { "epoch": 0.43807178043631245, "grad_norm": 10.015883445739746, "learning_rate": 9.935742951040938e-06, "loss": 0.9876, "step": 4980 }, { "epoch": 0.4389514426460239, "grad_norm": 7.819204330444336, "learning_rate": 9.934922520515787e-06, "loss": 0.7873, "step": 4990 }, { "epoch": 0.4398311048557354, "grad_norm": 10.86324691772461, "learning_rate": 9.934096919861781e-06, "loss": 0.8714, "step": 5000 }, { "epoch": 0.4407107670654469, "grad_norm": 8.572742462158203, "learning_rate": 9.933266149943867e-06, "loss": 0.8984, "step": 5010 }, { "epoch": 0.44159042927515835, "grad_norm": 7.583752155303955, "learning_rate": 9.932430211632416e-06, "loss": 0.965, "step": 5020 }, { "epoch": 0.4424700914848698, "grad_norm": 10.178061485290527, "learning_rate": 9.931589105803204e-06, "loss": 0.9515, "step": 5030 }, { "epoch": 0.4433497536945813, "grad_norm": 12.359633445739746, "learning_rate": 9.930742833337425e-06, "loss": 0.9732, "step": 5040 }, { "epoch": 0.44422941590429277, "grad_norm": 11.506929397583008, "learning_rate": 9.92989139512169e-06, "loss": 1.0227, "step": 5050 }, { "epoch": 0.44510907811400424, "grad_norm": 8.714112281799316, "learning_rate": 9.929034792048014e-06, "loss": 0.9438, "step": 5060 }, { "epoch": 0.4459887403237157, "grad_norm": 9.602289199829102, "learning_rate": 9.92817302501383e-06, "loss": 0.8739, "step": 5070 }, { "epoch": 0.4468684025334272, "grad_norm": 9.212960243225098, "learning_rate": 9.927306094921977e-06, "loss": 0.8688, "step": 5080 }, { "epoch": 0.4477480647431386, "grad_norm": 7.6222991943359375, "learning_rate": 9.926434002680705e-06, "loss": 0.8989, "step": 5090 }, { "epoch": 0.4486277269528501, "grad_norm": 11.568135261535645, "learning_rate": 9.925556749203669e-06, "loss": 0.8704, "step": 5100 }, { "epoch": 0.44950738916256155, "grad_norm": 9.766641616821289, "learning_rate": 9.924674335409938e-06, "loss": 0.8407, "step": 5110 }, { "epoch": 0.45038705137227303, "grad_norm": 8.87972354888916, "learning_rate": 9.923786762223982e-06, "loss": 0.9181, "step": 5120 }, { "epoch": 0.4512667135819845, "grad_norm": 8.690655708312988, "learning_rate": 9.922894030575673e-06, "loss": 0.9242, "step": 5130 }, { "epoch": 0.452146375791696, "grad_norm": 11.117728233337402, "learning_rate": 9.921996141400297e-06, "loss": 0.9705, "step": 5140 }, { "epoch": 0.45302603800140745, "grad_norm": 10.333867073059082, "learning_rate": 9.921093095638537e-06, "loss": 0.8959, "step": 5150 }, { "epoch": 0.4539057002111189, "grad_norm": 12.482640266418457, "learning_rate": 9.920184894236477e-06, "loss": 0.9119, "step": 5160 }, { "epoch": 0.4547853624208304, "grad_norm": 9.493209838867188, "learning_rate": 9.919271538145606e-06, "loss": 0.956, "step": 5170 }, { "epoch": 0.45566502463054187, "grad_norm": 9.672574996948242, "learning_rate": 9.918353028322813e-06, "loss": 1.0357, "step": 5180 }, { "epoch": 0.45654468684025334, "grad_norm": 8.850768089294434, "learning_rate": 9.917429365730384e-06, "loss": 0.9487, "step": 5190 }, { "epoch": 0.4574243490499648, "grad_norm": 9.222759246826172, "learning_rate": 9.916500551336006e-06, "loss": 0.9475, "step": 5200 }, { "epoch": 0.4583040112596763, "grad_norm": 8.751997947692871, "learning_rate": 9.915566586112761e-06, "loss": 0.9689, "step": 5210 }, { "epoch": 0.45918367346938777, "grad_norm": 7.887462615966797, "learning_rate": 9.91462747103913e-06, "loss": 0.901, "step": 5220 }, { "epoch": 0.46006333567909924, "grad_norm": 11.55186939239502, "learning_rate": 9.913683207098988e-06, "loss": 0.8562, "step": 5230 }, { "epoch": 0.4609429978888107, "grad_norm": 10.75290584564209, "learning_rate": 9.912733795281604e-06, "loss": 0.9911, "step": 5240 }, { "epoch": 0.4618226600985222, "grad_norm": 9.355575561523438, "learning_rate": 9.911779236581635e-06, "loss": 0.9777, "step": 5250 }, { "epoch": 0.46270232230823366, "grad_norm": 9.284628868103027, "learning_rate": 9.910819531999143e-06, "loss": 1.1211, "step": 5260 }, { "epoch": 0.46358198451794513, "grad_norm": 9.622716903686523, "learning_rate": 9.90985468253957e-06, "loss": 0.9345, "step": 5270 }, { "epoch": 0.4644616467276566, "grad_norm": 9.834339141845703, "learning_rate": 9.908884689213754e-06, "loss": 0.9341, "step": 5280 }, { "epoch": 0.465341308937368, "grad_norm": 8.11144733428955, "learning_rate": 9.907909553037915e-06, "loss": 0.9419, "step": 5290 }, { "epoch": 0.4662209711470795, "grad_norm": 10.797053337097168, "learning_rate": 9.90692927503367e-06, "loss": 0.9043, "step": 5300 }, { "epoch": 0.467100633356791, "grad_norm": 10.640527725219727, "learning_rate": 9.905943856228019e-06, "loss": 0.9212, "step": 5310 }, { "epoch": 0.46798029556650245, "grad_norm": 11.997906684875488, "learning_rate": 9.904953297653345e-06, "loss": 0.9535, "step": 5320 }, { "epoch": 0.4688599577762139, "grad_norm": 8.011302947998047, "learning_rate": 9.903957600347418e-06, "loss": 1.001, "step": 5330 }, { "epoch": 0.4697396199859254, "grad_norm": 8.905049324035645, "learning_rate": 9.902956765353394e-06, "loss": 1.0155, "step": 5340 }, { "epoch": 0.47061928219563687, "grad_norm": 7.630199909210205, "learning_rate": 9.901950793719809e-06, "loss": 0.8157, "step": 5350 }, { "epoch": 0.47149894440534834, "grad_norm": 9.227375030517578, "learning_rate": 9.900939686500578e-06, "loss": 0.8586, "step": 5360 }, { "epoch": 0.4723786066150598, "grad_norm": 9.324642181396484, "learning_rate": 9.899923444755003e-06, "loss": 0.8911, "step": 5370 }, { "epoch": 0.4732582688247713, "grad_norm": 7.593076705932617, "learning_rate": 9.89890206954776e-06, "loss": 0.9162, "step": 5380 }, { "epoch": 0.47413793103448276, "grad_norm": 10.917118072509766, "learning_rate": 9.897875561948903e-06, "loss": 0.8577, "step": 5390 }, { "epoch": 0.47501759324419424, "grad_norm": 10.649130821228027, "learning_rate": 9.896843923033867e-06, "loss": 1.0152, "step": 5400 }, { "epoch": 0.4758972554539057, "grad_norm": 8.077325820922852, "learning_rate": 9.89580715388346e-06, "loss": 0.8744, "step": 5410 }, { "epoch": 0.4767769176636172, "grad_norm": 8.86250114440918, "learning_rate": 9.894765255583865e-06, "loss": 0.8892, "step": 5420 }, { "epoch": 0.47765657987332866, "grad_norm": 9.019150733947754, "learning_rate": 9.893718229226637e-06, "loss": 0.9025, "step": 5430 }, { "epoch": 0.47853624208304013, "grad_norm": 11.940291404724121, "learning_rate": 9.892666075908707e-06, "loss": 0.9445, "step": 5440 }, { "epoch": 0.4794159042927516, "grad_norm": 10.155437469482422, "learning_rate": 9.891608796732377e-06, "loss": 0.8555, "step": 5450 }, { "epoch": 0.4802955665024631, "grad_norm": 9.400674819946289, "learning_rate": 9.890546392805316e-06, "loss": 0.8875, "step": 5460 }, { "epoch": 0.48117522871217455, "grad_norm": 9.062864303588867, "learning_rate": 9.889478865240565e-06, "loss": 0.836, "step": 5470 }, { "epoch": 0.48205489092188597, "grad_norm": 9.854737281799316, "learning_rate": 9.888406215156528e-06, "loss": 0.9879, "step": 5480 }, { "epoch": 0.48293455313159744, "grad_norm": 8.348380088806152, "learning_rate": 9.887328443676985e-06, "loss": 0.921, "step": 5490 }, { "epoch": 0.4838142153413089, "grad_norm": 9.241813659667969, "learning_rate": 9.886245551931071e-06, "loss": 0.8929, "step": 5500 }, { "epoch": 0.4846938775510204, "grad_norm": 9.166839599609375, "learning_rate": 9.885157541053293e-06, "loss": 0.8821, "step": 5510 }, { "epoch": 0.48557353976073186, "grad_norm": 9.224221229553223, "learning_rate": 9.884064412183517e-06, "loss": 0.9506, "step": 5520 }, { "epoch": 0.48645320197044334, "grad_norm": 11.064276695251465, "learning_rate": 9.882966166466972e-06, "loss": 0.8765, "step": 5530 }, { "epoch": 0.4873328641801548, "grad_norm": 6.989987850189209, "learning_rate": 9.88186280505425e-06, "loss": 0.8828, "step": 5540 }, { "epoch": 0.4882125263898663, "grad_norm": 9.981297492980957, "learning_rate": 9.880754329101297e-06, "loss": 0.9198, "step": 5550 }, { "epoch": 0.48909218859957776, "grad_norm": 8.18032455444336, "learning_rate": 9.879640739769423e-06, "loss": 0.928, "step": 5560 }, { "epoch": 0.48997185080928923, "grad_norm": 9.6151123046875, "learning_rate": 9.878522038225291e-06, "loss": 0.8979, "step": 5570 }, { "epoch": 0.4908515130190007, "grad_norm": 7.064568042755127, "learning_rate": 9.877398225640925e-06, "loss": 0.8581, "step": 5580 }, { "epoch": 0.4917311752287122, "grad_norm": 7.825985908508301, "learning_rate": 9.876269303193694e-06, "loss": 0.9277, "step": 5590 }, { "epoch": 0.49261083743842365, "grad_norm": 8.200712203979492, "learning_rate": 9.875135272066334e-06, "loss": 0.9164, "step": 5600 }, { "epoch": 0.49349049964813513, "grad_norm": 9.67455768585205, "learning_rate": 9.873996133446923e-06, "loss": 0.8901, "step": 5610 }, { "epoch": 0.4943701618578466, "grad_norm": 7.104532241821289, "learning_rate": 9.872851888528892e-06, "loss": 0.9154, "step": 5620 }, { "epoch": 0.4952498240675581, "grad_norm": 8.988975524902344, "learning_rate": 9.871702538511023e-06, "loss": 0.8775, "step": 5630 }, { "epoch": 0.49612948627726955, "grad_norm": 8.053529739379883, "learning_rate": 9.870548084597444e-06, "loss": 0.923, "step": 5640 }, { "epoch": 0.497009148486981, "grad_norm": 10.321267127990723, "learning_rate": 9.869388527997637e-06, "loss": 0.8879, "step": 5650 }, { "epoch": 0.4978888106966925, "grad_norm": 8.375530242919922, "learning_rate": 9.86822386992642e-06, "loss": 0.8516, "step": 5660 }, { "epoch": 0.4987684729064039, "grad_norm": 9.078594207763672, "learning_rate": 9.867054111603962e-06, "loss": 1.0023, "step": 5670 }, { "epoch": 0.4996481351161154, "grad_norm": 8.705409049987793, "learning_rate": 9.865879254255774e-06, "loss": 0.9063, "step": 5680 }, { "epoch": 0.5005277973258269, "grad_norm": 8.25261402130127, "learning_rate": 9.86469929911271e-06, "loss": 0.9972, "step": 5690 }, { "epoch": 0.5014074595355383, "grad_norm": 10.761588096618652, "learning_rate": 9.863514247410963e-06, "loss": 0.9451, "step": 5700 }, { "epoch": 0.5022871217452498, "grad_norm": 10.379746437072754, "learning_rate": 9.862324100392067e-06, "loss": 0.8619, "step": 5710 }, { "epoch": 0.5031667839549613, "grad_norm": 8.09485912322998, "learning_rate": 9.86112885930289e-06, "loss": 0.8554, "step": 5720 }, { "epoch": 0.5040464461646728, "grad_norm": 9.348891258239746, "learning_rate": 9.859928525395643e-06, "loss": 0.961, "step": 5730 }, { "epoch": 0.5049261083743842, "grad_norm": 7.628345012664795, "learning_rate": 9.858723099927872e-06, "loss": 0.848, "step": 5740 }, { "epoch": 0.5058057705840957, "grad_norm": 9.415760040283203, "learning_rate": 9.85751258416245e-06, "loss": 0.9213, "step": 5750 }, { "epoch": 0.5066854327938072, "grad_norm": 8.605192184448242, "learning_rate": 9.85629697936759e-06, "loss": 0.8741, "step": 5760 }, { "epoch": 0.5075650950035187, "grad_norm": 8.190037727355957, "learning_rate": 9.855076286816834e-06, "loss": 0.962, "step": 5770 }, { "epoch": 0.5084447572132301, "grad_norm": 9.804182052612305, "learning_rate": 9.853850507789058e-06, "loss": 0.9339, "step": 5780 }, { "epoch": 0.5093244194229416, "grad_norm": 9.240817070007324, "learning_rate": 9.852619643568459e-06, "loss": 0.8581, "step": 5790 }, { "epoch": 0.5102040816326531, "grad_norm": 8.73692512512207, "learning_rate": 9.85138369544457e-06, "loss": 0.8975, "step": 5800 }, { "epoch": 0.5110837438423645, "grad_norm": 7.048705577850342, "learning_rate": 9.850142664712242e-06, "loss": 0.8853, "step": 5810 }, { "epoch": 0.511963406052076, "grad_norm": 8.303788185119629, "learning_rate": 9.84889655267166e-06, "loss": 0.8825, "step": 5820 }, { "epoch": 0.5128430682617875, "grad_norm": 7.651731014251709, "learning_rate": 9.847645360628325e-06, "loss": 0.8677, "step": 5830 }, { "epoch": 0.513722730471499, "grad_norm": 8.871259689331055, "learning_rate": 9.846389089893063e-06, "loss": 0.9445, "step": 5840 }, { "epoch": 0.5146023926812104, "grad_norm": 9.450345993041992, "learning_rate": 9.845127741782021e-06, "loss": 0.9385, "step": 5850 }, { "epoch": 0.5154820548909219, "grad_norm": 10.148348808288574, "learning_rate": 9.843861317616665e-06, "loss": 0.9069, "step": 5860 }, { "epoch": 0.5163617171006334, "grad_norm": 8.99698257446289, "learning_rate": 9.842589818723781e-06, "loss": 0.8902, "step": 5870 }, { "epoch": 0.5172413793103449, "grad_norm": 8.666962623596191, "learning_rate": 9.841313246435465e-06, "loss": 0.9497, "step": 5880 }, { "epoch": 0.5181210415200563, "grad_norm": 8.387866973876953, "learning_rate": 9.840031602089137e-06, "loss": 0.8334, "step": 5890 }, { "epoch": 0.5190007037297678, "grad_norm": 9.039457321166992, "learning_rate": 9.838744887027525e-06, "loss": 0.9046, "step": 5900 }, { "epoch": 0.5198803659394793, "grad_norm": 10.287925720214844, "learning_rate": 9.83745310259867e-06, "loss": 0.8715, "step": 5910 }, { "epoch": 0.5207600281491908, "grad_norm": 11.173867225646973, "learning_rate": 9.836156250155926e-06, "loss": 0.9704, "step": 5920 }, { "epoch": 0.5216396903589022, "grad_norm": 8.375786781311035, "learning_rate": 9.834854331057956e-06, "loss": 0.8505, "step": 5930 }, { "epoch": 0.5225193525686137, "grad_norm": 8.591401100158691, "learning_rate": 9.833547346668729e-06, "loss": 0.8349, "step": 5940 }, { "epoch": 0.5233990147783252, "grad_norm": 8.323572158813477, "learning_rate": 9.832235298357525e-06, "loss": 0.8088, "step": 5950 }, { "epoch": 0.5242786769880365, "grad_norm": 7.735118865966797, "learning_rate": 9.830918187498924e-06, "loss": 0.956, "step": 5960 }, { "epoch": 0.525158339197748, "grad_norm": 7.055757522583008, "learning_rate": 9.829596015472812e-06, "loss": 0.815, "step": 5970 }, { "epoch": 0.5260380014074595, "grad_norm": 11.238337516784668, "learning_rate": 9.828268783664383e-06, "loss": 0.9222, "step": 5980 }, { "epoch": 0.526917663617171, "grad_norm": 7.617655277252197, "learning_rate": 9.826936493464122e-06, "loss": 0.8789, "step": 5990 }, { "epoch": 0.5277973258268824, "grad_norm": 8.086779594421387, "learning_rate": 9.825599146267818e-06, "loss": 0.9163, "step": 6000 }, { "epoch": 0.5286769880365939, "grad_norm": 10.217026710510254, "learning_rate": 9.82425674347656e-06, "loss": 0.8468, "step": 6010 }, { "epoch": 0.5295566502463054, "grad_norm": 9.59310245513916, "learning_rate": 9.822909286496733e-06, "loss": 0.8564, "step": 6020 }, { "epoch": 0.5304363124560169, "grad_norm": 11.02243709564209, "learning_rate": 9.821556776740013e-06, "loss": 0.8908, "step": 6030 }, { "epoch": 0.5313159746657283, "grad_norm": 7.990871906280518, "learning_rate": 9.820199215623375e-06, "loss": 0.8073, "step": 6040 }, { "epoch": 0.5321956368754398, "grad_norm": 8.923064231872559, "learning_rate": 9.818836604569083e-06, "loss": 0.8686, "step": 6050 }, { "epoch": 0.5330752990851513, "grad_norm": 10.167866706848145, "learning_rate": 9.817468945004694e-06, "loss": 0.8953, "step": 6060 }, { "epoch": 0.5339549612948628, "grad_norm": 8.682450294494629, "learning_rate": 9.816096238363047e-06, "loss": 0.8751, "step": 6070 }, { "epoch": 0.5348346235045742, "grad_norm": 7.401353359222412, "learning_rate": 9.81471848608228e-06, "loss": 0.9015, "step": 6080 }, { "epoch": 0.5357142857142857, "grad_norm": 9.571507453918457, "learning_rate": 9.813335689605808e-06, "loss": 0.9125, "step": 6090 }, { "epoch": 0.5365939479239972, "grad_norm": 10.077348709106445, "learning_rate": 9.811947850382334e-06, "loss": 0.8091, "step": 6100 }, { "epoch": 0.5374736101337086, "grad_norm": 8.748892784118652, "learning_rate": 9.810554969865845e-06, "loss": 0.8701, "step": 6110 }, { "epoch": 0.5383532723434201, "grad_norm": 9.354782104492188, "learning_rate": 9.809157049515608e-06, "loss": 0.8984, "step": 6120 }, { "epoch": 0.5392329345531316, "grad_norm": 7.895491123199463, "learning_rate": 9.807754090796169e-06, "loss": 0.9003, "step": 6130 }, { "epoch": 0.5401125967628431, "grad_norm": 10.28200912475586, "learning_rate": 9.806346095177357e-06, "loss": 0.8136, "step": 6140 }, { "epoch": 0.5409922589725545, "grad_norm": 10.314196586608887, "learning_rate": 9.804933064134272e-06, "loss": 0.9095, "step": 6150 }, { "epoch": 0.541871921182266, "grad_norm": 9.489459991455078, "learning_rate": 9.803514999147295e-06, "loss": 0.9234, "step": 6160 }, { "epoch": 0.5427515833919775, "grad_norm": 10.702459335327148, "learning_rate": 9.802091901702076e-06, "loss": 0.952, "step": 6170 }, { "epoch": 0.543631245601689, "grad_norm": 10.034884452819824, "learning_rate": 9.80066377328954e-06, "loss": 0.8464, "step": 6180 }, { "epoch": 0.5445109078114004, "grad_norm": 6.61371374130249, "learning_rate": 9.799230615405884e-06, "loss": 0.8106, "step": 6190 }, { "epoch": 0.5453905700211119, "grad_norm": 6.795487403869629, "learning_rate": 9.797792429552569e-06, "loss": 0.8233, "step": 6200 }, { "epoch": 0.5462702322308234, "grad_norm": 7.195446491241455, "learning_rate": 9.796349217236331e-06, "loss": 0.9891, "step": 6210 }, { "epoch": 0.5471498944405349, "grad_norm": 10.341032028198242, "learning_rate": 9.794900979969166e-06, "loss": 0.8532, "step": 6220 }, { "epoch": 0.5480295566502463, "grad_norm": 8.160317420959473, "learning_rate": 9.793447719268338e-06, "loss": 0.897, "step": 6230 }, { "epoch": 0.5489092188599578, "grad_norm": 10.810538291931152, "learning_rate": 9.79198943665637e-06, "loss": 0.848, "step": 6240 }, { "epoch": 0.5497888810696693, "grad_norm": 9.731918334960938, "learning_rate": 9.790526133661052e-06, "loss": 0.8237, "step": 6250 }, { "epoch": 0.5506685432793808, "grad_norm": 8.14372730255127, "learning_rate": 9.789057811815426e-06, "loss": 0.8714, "step": 6260 }, { "epoch": 0.5515482054890922, "grad_norm": 8.74637222290039, "learning_rate": 9.7875844726578e-06, "loss": 0.84, "step": 6270 }, { "epoch": 0.5524278676988037, "grad_norm": 8.433090209960938, "learning_rate": 9.786106117731736e-06, "loss": 0.8399, "step": 6280 }, { "epoch": 0.5533075299085152, "grad_norm": 7.956887245178223, "learning_rate": 9.784622748586045e-06, "loss": 0.8831, "step": 6290 }, { "epoch": 0.5541871921182266, "grad_norm": 8.694855690002441, "learning_rate": 9.7831343667748e-06, "loss": 0.8336, "step": 6300 }, { "epoch": 0.5550668543279381, "grad_norm": 12.63241195678711, "learning_rate": 9.781640973857318e-06, "loss": 1.0338, "step": 6310 }, { "epoch": 0.5559465165376496, "grad_norm": 9.15165901184082, "learning_rate": 9.780142571398172e-06, "loss": 0.939, "step": 6320 }, { "epoch": 0.5568261787473611, "grad_norm": 11.450407028198242, "learning_rate": 9.77863916096718e-06, "loss": 0.8425, "step": 6330 }, { "epoch": 0.5577058409570724, "grad_norm": 9.864909172058105, "learning_rate": 9.777130744139409e-06, "loss": 0.8486, "step": 6340 }, { "epoch": 0.5585855031667839, "grad_norm": 9.060372352600098, "learning_rate": 9.775617322495167e-06, "loss": 0.879, "step": 6350 }, { "epoch": 0.5594651653764954, "grad_norm": 9.10094165802002, "learning_rate": 9.774098897620008e-06, "loss": 0.8005, "step": 6360 }, { "epoch": 0.5603448275862069, "grad_norm": 8.315040588378906, "learning_rate": 9.772575471104728e-06, "loss": 0.7812, "step": 6370 }, { "epoch": 0.5612244897959183, "grad_norm": 8.96319580078125, "learning_rate": 9.771047044545363e-06, "loss": 0.7771, "step": 6380 }, { "epoch": 0.5621041520056298, "grad_norm": 6.997859477996826, "learning_rate": 9.769513619543187e-06, "loss": 0.816, "step": 6390 }, { "epoch": 0.5629838142153413, "grad_norm": 7.817956924438477, "learning_rate": 9.767975197704709e-06, "loss": 0.8535, "step": 6400 }, { "epoch": 0.5638634764250527, "grad_norm": 8.165514945983887, "learning_rate": 9.766431780641676e-06, "loss": 0.8303, "step": 6410 }, { "epoch": 0.5647431386347642, "grad_norm": 9.206972122192383, "learning_rate": 9.764883369971066e-06, "loss": 0.8837, "step": 6420 }, { "epoch": 0.5656228008444757, "grad_norm": 7.747913837432861, "learning_rate": 9.76332996731509e-06, "loss": 0.8167, "step": 6430 }, { "epoch": 0.5665024630541872, "grad_norm": 6.604056358337402, "learning_rate": 9.761771574301187e-06, "loss": 0.9194, "step": 6440 }, { "epoch": 0.5673821252638986, "grad_norm": 7.950380802154541, "learning_rate": 9.760208192562024e-06, "loss": 0.9754, "step": 6450 }, { "epoch": 0.5682617874736101, "grad_norm": 8.020041465759277, "learning_rate": 9.758639823735503e-06, "loss": 0.7761, "step": 6460 }, { "epoch": 0.5691414496833216, "grad_norm": 8.705816268920898, "learning_rate": 9.757066469464737e-06, "loss": 0.8317, "step": 6470 }, { "epoch": 0.5700211118930331, "grad_norm": 8.763314247131348, "learning_rate": 9.75548813139807e-06, "loss": 0.8266, "step": 6480 }, { "epoch": 0.5709007741027445, "grad_norm": 7.762413501739502, "learning_rate": 9.75390481118907e-06, "loss": 0.9142, "step": 6490 }, { "epoch": 0.571780436312456, "grad_norm": 8.363484382629395, "learning_rate": 9.752316510496515e-06, "loss": 0.8855, "step": 6500 }, { "epoch": 0.5726600985221675, "grad_norm": 10.269872665405273, "learning_rate": 9.750723230984412e-06, "loss": 0.8588, "step": 6510 }, { "epoch": 0.573539760731879, "grad_norm": 9.479391098022461, "learning_rate": 9.749124974321976e-06, "loss": 0.7432, "step": 6520 }, { "epoch": 0.5744194229415904, "grad_norm": 11.011748313903809, "learning_rate": 9.74752174218364e-06, "loss": 0.8173, "step": 6530 }, { "epoch": 0.5752990851513019, "grad_norm": 8.476012229919434, "learning_rate": 9.745913536249047e-06, "loss": 0.9301, "step": 6540 }, { "epoch": 0.5761787473610134, "grad_norm": 7.840052127838135, "learning_rate": 9.744300358203057e-06, "loss": 0.8488, "step": 6550 }, { "epoch": 0.5770584095707249, "grad_norm": 9.826251983642578, "learning_rate": 9.742682209735727e-06, "loss": 0.9519, "step": 6560 }, { "epoch": 0.5779380717804363, "grad_norm": 7.668666362762451, "learning_rate": 9.741059092542337e-06, "loss": 0.8141, "step": 6570 }, { "epoch": 0.5788177339901478, "grad_norm": 8.202250480651855, "learning_rate": 9.739431008323361e-06, "loss": 0.8588, "step": 6580 }, { "epoch": 0.5796973961998593, "grad_norm": 11.660720825195312, "learning_rate": 9.737797958784481e-06, "loss": 0.8915, "step": 6590 }, { "epoch": 0.5805770584095707, "grad_norm": 8.381614685058594, "learning_rate": 9.736159945636579e-06, "loss": 0.8976, "step": 6600 }, { "epoch": 0.5814567206192822, "grad_norm": 7.899594306945801, "learning_rate": 9.73451697059574e-06, "loss": 0.8303, "step": 6610 }, { "epoch": 0.5823363828289937, "grad_norm": 9.074773788452148, "learning_rate": 9.732869035383246e-06, "loss": 0.9444, "step": 6620 }, { "epoch": 0.5832160450387052, "grad_norm": 7.67896842956543, "learning_rate": 9.731216141725573e-06, "loss": 0.923, "step": 6630 }, { "epoch": 0.5840957072484166, "grad_norm": 7.329408645629883, "learning_rate": 9.729558291354398e-06, "loss": 0.8239, "step": 6640 }, { "epoch": 0.5849753694581281, "grad_norm": 8.678596496582031, "learning_rate": 9.727895486006584e-06, "loss": 0.7532, "step": 6650 }, { "epoch": 0.5858550316678396, "grad_norm": 8.41541862487793, "learning_rate": 9.72622772742419e-06, "loss": 0.8772, "step": 6660 }, { "epoch": 0.5867346938775511, "grad_norm": 10.35283088684082, "learning_rate": 9.72455501735446e-06, "loss": 0.7326, "step": 6670 }, { "epoch": 0.5876143560872625, "grad_norm": 9.190284729003906, "learning_rate": 9.722877357549832e-06, "loss": 0.8505, "step": 6680 }, { "epoch": 0.588494018296974, "grad_norm": 8.029476165771484, "learning_rate": 9.721194749767923e-06, "loss": 0.8352, "step": 6690 }, { "epoch": 0.5893736805066855, "grad_norm": 8.175151824951172, "learning_rate": 9.71950719577154e-06, "loss": 0.9284, "step": 6700 }, { "epoch": 0.590253342716397, "grad_norm": 8.200882911682129, "learning_rate": 9.717814697328665e-06, "loss": 0.792, "step": 6710 }, { "epoch": 0.5911330049261084, "grad_norm": 7.801442623138428, "learning_rate": 9.716117256212464e-06, "loss": 0.9092, "step": 6720 }, { "epoch": 0.5920126671358198, "grad_norm": 7.0838541984558105, "learning_rate": 9.714414874201281e-06, "loss": 0.8431, "step": 6730 }, { "epoch": 0.5928923293455313, "grad_norm": 8.283254623413086, "learning_rate": 9.712707553078639e-06, "loss": 0.778, "step": 6740 }, { "epoch": 0.5937719915552427, "grad_norm": 6.856544494628906, "learning_rate": 9.71099529463323e-06, "loss": 0.8816, "step": 6750 }, { "epoch": 0.5946516537649542, "grad_norm": 8.955995559692383, "learning_rate": 9.70927810065892e-06, "loss": 0.9204, "step": 6760 }, { "epoch": 0.5955313159746657, "grad_norm": 7.296191692352295, "learning_rate": 9.707555972954753e-06, "loss": 0.7788, "step": 6770 }, { "epoch": 0.5964109781843772, "grad_norm": 7.9858198165893555, "learning_rate": 9.70582891332493e-06, "loss": 0.8069, "step": 6780 }, { "epoch": 0.5972906403940886, "grad_norm": 7.6095075607299805, "learning_rate": 9.704096923578827e-06, "loss": 0.7612, "step": 6790 }, { "epoch": 0.5981703026038001, "grad_norm": 10.005577087402344, "learning_rate": 9.702360005530986e-06, "loss": 0.7562, "step": 6800 }, { "epoch": 0.5990499648135116, "grad_norm": 9.731851577758789, "learning_rate": 9.700618161001105e-06, "loss": 0.8888, "step": 6810 }, { "epoch": 0.599929627023223, "grad_norm": 8.549162864685059, "learning_rate": 9.69887139181405e-06, "loss": 0.8365, "step": 6820 }, { "epoch": 0.6008092892329345, "grad_norm": 12.076580047607422, "learning_rate": 9.697119699799843e-06, "loss": 0.8823, "step": 6830 }, { "epoch": 0.601688951442646, "grad_norm": 9.179803848266602, "learning_rate": 9.695363086793666e-06, "loss": 0.795, "step": 6840 }, { "epoch": 0.6025686136523575, "grad_norm": 10.121969223022461, "learning_rate": 9.693601554635853e-06, "loss": 0.8377, "step": 6850 }, { "epoch": 0.603448275862069, "grad_norm": 6.614415645599365, "learning_rate": 9.691835105171898e-06, "loss": 0.8412, "step": 6860 }, { "epoch": 0.6043279380717804, "grad_norm": 11.840957641601562, "learning_rate": 9.690063740252436e-06, "loss": 0.8925, "step": 6870 }, { "epoch": 0.6052076002814919, "grad_norm": 8.668222427368164, "learning_rate": 9.688287461733263e-06, "loss": 0.8432, "step": 6880 }, { "epoch": 0.6060872624912034, "grad_norm": 7.619955062866211, "learning_rate": 9.686506271475314e-06, "loss": 0.8689, "step": 6890 }, { "epoch": 0.6069669247009148, "grad_norm": 6.729368686676025, "learning_rate": 9.684720171344676e-06, "loss": 0.8371, "step": 6900 }, { "epoch": 0.6078465869106263, "grad_norm": 7.7951273918151855, "learning_rate": 9.682929163212577e-06, "loss": 0.9214, "step": 6910 }, { "epoch": 0.6087262491203378, "grad_norm": 10.345414161682129, "learning_rate": 9.681133248955387e-06, "loss": 0.8864, "step": 6920 }, { "epoch": 0.6096059113300493, "grad_norm": 7.489892959594727, "learning_rate": 9.679332430454617e-06, "loss": 0.7666, "step": 6930 }, { "epoch": 0.6104855735397607, "grad_norm": 7.49118709564209, "learning_rate": 9.677526709596918e-06, "loss": 0.7249, "step": 6940 }, { "epoch": 0.6113652357494722, "grad_norm": 10.09367561340332, "learning_rate": 9.675716088274069e-06, "loss": 0.8842, "step": 6950 }, { "epoch": 0.6122448979591837, "grad_norm": 9.200807571411133, "learning_rate": 9.673900568382993e-06, "loss": 0.9375, "step": 6960 }, { "epoch": 0.6131245601688952, "grad_norm": 9.60831069946289, "learning_rate": 9.672080151825735e-06, "loss": 0.7734, "step": 6970 }, { "epoch": 0.6140042223786066, "grad_norm": 7.435789108276367, "learning_rate": 9.670254840509483e-06, "loss": 0.7844, "step": 6980 }, { "epoch": 0.6148838845883181, "grad_norm": 8.604950904846191, "learning_rate": 9.66842463634654e-06, "loss": 0.8755, "step": 6990 }, { "epoch": 0.6157635467980296, "grad_norm": 10.382131576538086, "learning_rate": 9.666589541254343e-06, "loss": 0.8727, "step": 7000 }, { "epoch": 0.616643209007741, "grad_norm": 8.562626838684082, "learning_rate": 9.664749557155448e-06, "loss": 0.8758, "step": 7010 }, { "epoch": 0.6175228712174525, "grad_norm": 9.81380844116211, "learning_rate": 9.66290468597754e-06, "loss": 0.9356, "step": 7020 }, { "epoch": 0.618402533427164, "grad_norm": 7.5896711349487305, "learning_rate": 9.661054929653417e-06, "loss": 0.812, "step": 7030 }, { "epoch": 0.6192821956368755, "grad_norm": 6.505224704742432, "learning_rate": 9.659200290120998e-06, "loss": 0.8296, "step": 7040 }, { "epoch": 0.620161857846587, "grad_norm": 8.78367805480957, "learning_rate": 9.657340769323318e-06, "loss": 0.7464, "step": 7050 }, { "epoch": 0.6210415200562984, "grad_norm": 7.540207862854004, "learning_rate": 9.655476369208525e-06, "loss": 0.8415, "step": 7060 }, { "epoch": 0.6219211822660099, "grad_norm": 6.474719524383545, "learning_rate": 9.65360709172988e-06, "loss": 0.8156, "step": 7070 }, { "epoch": 0.6228008444757214, "grad_norm": 9.622965812683105, "learning_rate": 9.651732938845752e-06, "loss": 0.8143, "step": 7080 }, { "epoch": 0.6236805066854328, "grad_norm": 6.9456939697265625, "learning_rate": 9.649853912519623e-06, "loss": 0.8725, "step": 7090 }, { "epoch": 0.6245601688951443, "grad_norm": 7.3917646408081055, "learning_rate": 9.64797001472007e-06, "loss": 0.7313, "step": 7100 }, { "epoch": 0.6254398311048557, "grad_norm": 9.239513397216797, "learning_rate": 9.646081247420786e-06, "loss": 0.8793, "step": 7110 }, { "epoch": 0.6263194933145672, "grad_norm": 8.829631805419922, "learning_rate": 9.644187612600558e-06, "loss": 0.885, "step": 7120 }, { "epoch": 0.6271991555242786, "grad_norm": 7.849456310272217, "learning_rate": 9.642289112243274e-06, "loss": 0.7979, "step": 7130 }, { "epoch": 0.6280788177339901, "grad_norm": 9.476936340332031, "learning_rate": 9.640385748337921e-06, "loss": 0.9006, "step": 7140 }, { "epoch": 0.6289584799437016, "grad_norm": 7.645656585693359, "learning_rate": 9.63847752287858e-06, "loss": 0.7716, "step": 7150 }, { "epoch": 0.629838142153413, "grad_norm": 9.10317325592041, "learning_rate": 9.636564437864426e-06, "loss": 0.8832, "step": 7160 }, { "epoch": 0.6307178043631245, "grad_norm": 9.49273681640625, "learning_rate": 9.634646495299722e-06, "loss": 0.8279, "step": 7170 }, { "epoch": 0.631597466572836, "grad_norm": 12.500299453735352, "learning_rate": 9.632723697193827e-06, "loss": 0.8685, "step": 7180 }, { "epoch": 0.6324771287825475, "grad_norm": 11.512235641479492, "learning_rate": 9.63079604556118e-06, "loss": 0.9801, "step": 7190 }, { "epoch": 0.633356790992259, "grad_norm": 6.832698822021484, "learning_rate": 9.628863542421308e-06, "loss": 0.7926, "step": 7200 }, { "epoch": 0.6342364532019704, "grad_norm": 7.662810325622559, "learning_rate": 9.626926189798822e-06, "loss": 0.8031, "step": 7210 }, { "epoch": 0.6351161154116819, "grad_norm": 11.209434509277344, "learning_rate": 9.624983989723408e-06, "loss": 0.9028, "step": 7220 }, { "epoch": 0.6359957776213934, "grad_norm": 8.559063911437988, "learning_rate": 9.62303694422984e-06, "loss": 0.8928, "step": 7230 }, { "epoch": 0.6368754398311048, "grad_norm": 7.452171325683594, "learning_rate": 9.621085055357958e-06, "loss": 0.8438, "step": 7240 }, { "epoch": 0.6377551020408163, "grad_norm": 8.112951278686523, "learning_rate": 9.619128325152683e-06, "loss": 0.7481, "step": 7250 }, { "epoch": 0.6386347642505278, "grad_norm": 8.643758773803711, "learning_rate": 9.617166755664005e-06, "loss": 0.7169, "step": 7260 }, { "epoch": 0.6395144264602393, "grad_norm": 8.620161056518555, "learning_rate": 9.615200348946987e-06, "loss": 0.9023, "step": 7270 }, { "epoch": 0.6403940886699507, "grad_norm": 8.80849552154541, "learning_rate": 9.613229107061758e-06, "loss": 0.8177, "step": 7280 }, { "epoch": 0.6412737508796622, "grad_norm": 8.693267822265625, "learning_rate": 9.61125303207351e-06, "loss": 0.8308, "step": 7290 }, { "epoch": 0.6421534130893737, "grad_norm": 7.6343207359313965, "learning_rate": 9.609272126052502e-06, "loss": 0.771, "step": 7300 }, { "epoch": 0.6430330752990852, "grad_norm": 7.107234477996826, "learning_rate": 9.607286391074053e-06, "loss": 0.829, "step": 7310 }, { "epoch": 0.6439127375087966, "grad_norm": 8.754098892211914, "learning_rate": 9.605295829218544e-06, "loss": 0.8824, "step": 7320 }, { "epoch": 0.6447923997185081, "grad_norm": 7.867997169494629, "learning_rate": 9.603300442571406e-06, "loss": 0.9062, "step": 7330 }, { "epoch": 0.6456720619282196, "grad_norm": 9.911584854125977, "learning_rate": 9.601300233223132e-06, "loss": 0.8352, "step": 7340 }, { "epoch": 0.646551724137931, "grad_norm": 11.04414176940918, "learning_rate": 9.599295203269262e-06, "loss": 0.7546, "step": 7350 }, { "epoch": 0.6474313863476425, "grad_norm": 6.120754241943359, "learning_rate": 9.59728535481039e-06, "loss": 0.9023, "step": 7360 }, { "epoch": 0.648311048557354, "grad_norm": 7.1041951179504395, "learning_rate": 9.595270689952158e-06, "loss": 0.8699, "step": 7370 }, { "epoch": 0.6491907107670655, "grad_norm": 5.963545799255371, "learning_rate": 9.593251210805253e-06, "loss": 0.7495, "step": 7380 }, { "epoch": 0.650070372976777, "grad_norm": 7.9249958992004395, "learning_rate": 9.591226919485405e-06, "loss": 0.8865, "step": 7390 }, { "epoch": 0.6509500351864884, "grad_norm": 9.75959587097168, "learning_rate": 9.589197818113385e-06, "loss": 0.9168, "step": 7400 }, { "epoch": 0.6518296973961999, "grad_norm": 7.746891021728516, "learning_rate": 9.587163908815008e-06, "loss": 0.7464, "step": 7410 }, { "epoch": 0.6527093596059114, "grad_norm": 7.9630937576293945, "learning_rate": 9.585125193721122e-06, "loss": 0.8401, "step": 7420 }, { "epoch": 0.6535890218156228, "grad_norm": 9.030978202819824, "learning_rate": 9.583081674967606e-06, "loss": 0.8668, "step": 7430 }, { "epoch": 0.6544686840253343, "grad_norm": 7.150374412536621, "learning_rate": 9.58103335469538e-06, "loss": 0.8146, "step": 7440 }, { "epoch": 0.6553483462350458, "grad_norm": 6.40443754196167, "learning_rate": 9.57898023505039e-06, "loss": 0.754, "step": 7450 }, { "epoch": 0.6562280084447573, "grad_norm": 8.477450370788574, "learning_rate": 9.576922318183612e-06, "loss": 0.8449, "step": 7460 }, { "epoch": 0.6571076706544687, "grad_norm": 8.087319374084473, "learning_rate": 9.574859606251043e-06, "loss": 0.8175, "step": 7470 }, { "epoch": 0.6579873328641802, "grad_norm": 7.874214172363281, "learning_rate": 9.572792101413712e-06, "loss": 0.8129, "step": 7480 }, { "epoch": 0.6588669950738916, "grad_norm": 8.9036865234375, "learning_rate": 9.570719805837659e-06, "loss": 0.8342, "step": 7490 }, { "epoch": 0.659746657283603, "grad_norm": 8.470335006713867, "learning_rate": 9.568642721693953e-06, "loss": 0.8244, "step": 7500 }, { "epoch": 0.6606263194933145, "grad_norm": 7.9585113525390625, "learning_rate": 9.566560851158673e-06, "loss": 0.8152, "step": 7510 }, { "epoch": 0.661505981703026, "grad_norm": 7.093564510345459, "learning_rate": 9.564474196412914e-06, "loss": 0.8059, "step": 7520 }, { "epoch": 0.6623856439127375, "grad_norm": 9.737990379333496, "learning_rate": 9.562382759642785e-06, "loss": 0.8814, "step": 7530 }, { "epoch": 0.6632653061224489, "grad_norm": 9.444591522216797, "learning_rate": 9.560286543039405e-06, "loss": 0.8959, "step": 7540 }, { "epoch": 0.6641449683321604, "grad_norm": 9.10556411743164, "learning_rate": 9.5581855487989e-06, "loss": 0.7762, "step": 7550 }, { "epoch": 0.6650246305418719, "grad_norm": 10.021120071411133, "learning_rate": 9.5560797791224e-06, "loss": 0.7864, "step": 7560 }, { "epoch": 0.6659042927515834, "grad_norm": 9.083039283752441, "learning_rate": 9.553969236216038e-06, "loss": 0.7863, "step": 7570 }, { "epoch": 0.6667839549612948, "grad_norm": 7.715065002441406, "learning_rate": 9.55185392229095e-06, "loss": 0.7447, "step": 7580 }, { "epoch": 0.6676636171710063, "grad_norm": 12.059564590454102, "learning_rate": 9.549733839563268e-06, "loss": 0.8146, "step": 7590 }, { "epoch": 0.6685432793807178, "grad_norm": 8.38671875, "learning_rate": 9.547608990254123e-06, "loss": 0.7995, "step": 7600 }, { "epoch": 0.6694229415904293, "grad_norm": 7.5856499671936035, "learning_rate": 9.545479376589637e-06, "loss": 0.817, "step": 7610 }, { "epoch": 0.6703026038001407, "grad_norm": 7.718897819519043, "learning_rate": 9.543345000800926e-06, "loss": 0.9146, "step": 7620 }, { "epoch": 0.6711822660098522, "grad_norm": 11.376896858215332, "learning_rate": 9.54120586512409e-06, "loss": 0.9217, "step": 7630 }, { "epoch": 0.6720619282195637, "grad_norm": 7.99937105178833, "learning_rate": 9.539061971800224e-06, "loss": 0.781, "step": 7640 }, { "epoch": 0.6729415904292751, "grad_norm": 8.043183326721191, "learning_rate": 9.536913323075399e-06, "loss": 0.7407, "step": 7650 }, { "epoch": 0.6738212526389866, "grad_norm": 7.811465740203857, "learning_rate": 9.534759921200673e-06, "loss": 0.8649, "step": 7660 }, { "epoch": 0.6747009148486981, "grad_norm": 10.074837684631348, "learning_rate": 9.532601768432083e-06, "loss": 0.7237, "step": 7670 }, { "epoch": 0.6755805770584096, "grad_norm": 8.032349586486816, "learning_rate": 9.530438867030643e-06, "loss": 0.8189, "step": 7680 }, { "epoch": 0.676460239268121, "grad_norm": 8.724042892456055, "learning_rate": 9.528271219262341e-06, "loss": 0.783, "step": 7690 }, { "epoch": 0.6773399014778325, "grad_norm": 7.887787342071533, "learning_rate": 9.526098827398139e-06, "loss": 0.8114, "step": 7700 }, { "epoch": 0.678219563687544, "grad_norm": 8.21603775024414, "learning_rate": 9.523921693713966e-06, "loss": 0.8636, "step": 7710 }, { "epoch": 0.6790992258972555, "grad_norm": 8.545001983642578, "learning_rate": 9.521739820490725e-06, "loss": 0.8978, "step": 7720 }, { "epoch": 0.6799788881069669, "grad_norm": 6.848452091217041, "learning_rate": 9.51955321001428e-06, "loss": 0.6845, "step": 7730 }, { "epoch": 0.6808585503166784, "grad_norm": 6.654182434082031, "learning_rate": 9.517361864575457e-06, "loss": 0.8713, "step": 7740 }, { "epoch": 0.6817382125263899, "grad_norm": 8.70547103881836, "learning_rate": 9.515165786470045e-06, "loss": 0.7682, "step": 7750 }, { "epoch": 0.6826178747361014, "grad_norm": 7.17990779876709, "learning_rate": 9.512964977998791e-06, "loss": 0.8055, "step": 7760 }, { "epoch": 0.6834975369458128, "grad_norm": 10.440848350524902, "learning_rate": 9.510759441467399e-06, "loss": 0.8459, "step": 7770 }, { "epoch": 0.6843771991555243, "grad_norm": 10.012944221496582, "learning_rate": 9.508549179186523e-06, "loss": 0.816, "step": 7780 }, { "epoch": 0.6852568613652358, "grad_norm": 6.773433208465576, "learning_rate": 9.506334193471772e-06, "loss": 0.7717, "step": 7790 }, { "epoch": 0.6861365235749473, "grad_norm": 8.382229804992676, "learning_rate": 9.504114486643699e-06, "loss": 0.7676, "step": 7800 }, { "epoch": 0.6870161857846587, "grad_norm": 9.567729949951172, "learning_rate": 9.501890061027806e-06, "loss": 0.8659, "step": 7810 }, { "epoch": 0.6878958479943702, "grad_norm": 8.186829566955566, "learning_rate": 9.49966091895454e-06, "loss": 0.9244, "step": 7820 }, { "epoch": 0.6887755102040817, "grad_norm": 6.6288228034973145, "learning_rate": 9.497427062759287e-06, "loss": 0.7456, "step": 7830 }, { "epoch": 0.6896551724137931, "grad_norm": 7.061506271362305, "learning_rate": 9.495188494782372e-06, "loss": 0.8204, "step": 7840 }, { "epoch": 0.6905348346235046, "grad_norm": 8.610881805419922, "learning_rate": 9.492945217369056e-06, "loss": 0.7966, "step": 7850 }, { "epoch": 0.6914144968332161, "grad_norm": 7.624958515167236, "learning_rate": 9.490697232869538e-06, "loss": 0.8695, "step": 7860 }, { "epoch": 0.6922941590429276, "grad_norm": 9.236664772033691, "learning_rate": 9.48844454363894e-06, "loss": 0.7768, "step": 7870 }, { "epoch": 0.6931738212526389, "grad_norm": 8.62597370147705, "learning_rate": 9.48618715203732e-06, "loss": 0.8265, "step": 7880 }, { "epoch": 0.6940534834623504, "grad_norm": 7.9439496994018555, "learning_rate": 9.483925060429662e-06, "loss": 0.7533, "step": 7890 }, { "epoch": 0.6949331456720619, "grad_norm": 9.846586227416992, "learning_rate": 9.481658271185871e-06, "loss": 0.8971, "step": 7900 }, { "epoch": 0.6958128078817734, "grad_norm": 9.006646156311035, "learning_rate": 9.479386786680777e-06, "loss": 0.8148, "step": 7910 }, { "epoch": 0.6966924700914848, "grad_norm": 8.610021591186523, "learning_rate": 9.477110609294125e-06, "loss": 0.8064, "step": 7920 }, { "epoch": 0.6975721323011963, "grad_norm": 9.597954750061035, "learning_rate": 9.474829741410579e-06, "loss": 0.8419, "step": 7930 }, { "epoch": 0.6984517945109078, "grad_norm": 8.44272232055664, "learning_rate": 9.472544185419717e-06, "loss": 0.8279, "step": 7940 }, { "epoch": 0.6993314567206192, "grad_norm": 7.719616413116455, "learning_rate": 9.470253943716027e-06, "loss": 0.8429, "step": 7950 }, { "epoch": 0.7002111189303307, "grad_norm": 6.653249263763428, "learning_rate": 9.46795901869891e-06, "loss": 0.7885, "step": 7960 }, { "epoch": 0.7010907811400422, "grad_norm": 7.007891654968262, "learning_rate": 9.46565941277267e-06, "loss": 0.8265, "step": 7970 }, { "epoch": 0.7019704433497537, "grad_norm": 8.830856323242188, "learning_rate": 9.463355128346515e-06, "loss": 0.7717, "step": 7980 }, { "epoch": 0.7028501055594651, "grad_norm": 8.637545585632324, "learning_rate": 9.461046167834553e-06, "loss": 0.7849, "step": 7990 }, { "epoch": 0.7037297677691766, "grad_norm": 8.184865951538086, "learning_rate": 9.458732533655799e-06, "loss": 0.7094, "step": 8000 }, { "epoch": 0.7046094299788881, "grad_norm": 8.39356803894043, "learning_rate": 9.456414228234154e-06, "loss": 0.8463, "step": 8010 }, { "epoch": 0.7054890921885996, "grad_norm": 6.039584159851074, "learning_rate": 9.454091253998422e-06, "loss": 0.8252, "step": 8020 }, { "epoch": 0.706368754398311, "grad_norm": 8.922965049743652, "learning_rate": 9.451763613382287e-06, "loss": 0.852, "step": 8030 }, { "epoch": 0.7072484166080225, "grad_norm": 8.706902503967285, "learning_rate": 9.449431308824334e-06, "loss": 0.7387, "step": 8040 }, { "epoch": 0.708128078817734, "grad_norm": 7.026686191558838, "learning_rate": 9.447094342768028e-06, "loss": 0.7921, "step": 8050 }, { "epoch": 0.7090077410274455, "grad_norm": 9.48546314239502, "learning_rate": 9.444752717661721e-06, "loss": 0.8221, "step": 8060 }, { "epoch": 0.7098874032371569, "grad_norm": 9.276078224182129, "learning_rate": 9.442406435958638e-06, "loss": 0.7466, "step": 8070 }, { "epoch": 0.7107670654468684, "grad_norm": 9.209023475646973, "learning_rate": 9.440055500116892e-06, "loss": 0.8153, "step": 8080 }, { "epoch": 0.7116467276565799, "grad_norm": 8.606512069702148, "learning_rate": 9.437699912599467e-06, "loss": 0.8973, "step": 8090 }, { "epoch": 0.7125263898662914, "grad_norm": 9.037758827209473, "learning_rate": 9.435339675874224e-06, "loss": 0.8713, "step": 8100 }, { "epoch": 0.7134060520760028, "grad_norm": 6.894113063812256, "learning_rate": 9.43297479241389e-06, "loss": 0.7138, "step": 8110 }, { "epoch": 0.7142857142857143, "grad_norm": 10.912918090820312, "learning_rate": 9.43060526469606e-06, "loss": 0.8228, "step": 8120 }, { "epoch": 0.7151653764954258, "grad_norm": 7.983969688415527, "learning_rate": 9.428231095203204e-06, "loss": 0.8182, "step": 8130 }, { "epoch": 0.7160450387051372, "grad_norm": 9.229507446289062, "learning_rate": 9.425852286422644e-06, "loss": 0.855, "step": 8140 }, { "epoch": 0.7169247009148487, "grad_norm": 7.9691925048828125, "learning_rate": 9.423468840846566e-06, "loss": 0.8196, "step": 8150 }, { "epoch": 0.7178043631245602, "grad_norm": 8.147778511047363, "learning_rate": 9.421080760972017e-06, "loss": 0.7582, "step": 8160 }, { "epoch": 0.7186840253342717, "grad_norm": 7.8743181228637695, "learning_rate": 9.418688049300894e-06, "loss": 0.8042, "step": 8170 }, { "epoch": 0.7195636875439831, "grad_norm": 8.109112739562988, "learning_rate": 9.41629070833995e-06, "loss": 0.7323, "step": 8180 }, { "epoch": 0.7204433497536946, "grad_norm": 10.20501708984375, "learning_rate": 9.41388874060079e-06, "loss": 0.9114, "step": 8190 }, { "epoch": 0.7213230119634061, "grad_norm": 10.174378395080566, "learning_rate": 9.411482148599857e-06, "loss": 0.8971, "step": 8200 }, { "epoch": 0.7222026741731176, "grad_norm": 8.876524925231934, "learning_rate": 9.409070934858452e-06, "loss": 0.7041, "step": 8210 }, { "epoch": 0.723082336382829, "grad_norm": 9.58246898651123, "learning_rate": 9.406655101902706e-06, "loss": 0.8552, "step": 8220 }, { "epoch": 0.7239619985925405, "grad_norm": 8.027246475219727, "learning_rate": 9.404234652263598e-06, "loss": 0.7372, "step": 8230 }, { "epoch": 0.724841660802252, "grad_norm": 7.485148906707764, "learning_rate": 9.40180958847694e-06, "loss": 0.8339, "step": 8240 }, { "epoch": 0.7257213230119635, "grad_norm": 7.074976921081543, "learning_rate": 9.399379913083373e-06, "loss": 0.8107, "step": 8250 }, { "epoch": 0.7266009852216748, "grad_norm": 8.13673210144043, "learning_rate": 9.39694562862838e-06, "loss": 0.7893, "step": 8260 }, { "epoch": 0.7274806474313863, "grad_norm": 8.402352333068848, "learning_rate": 9.394506737662267e-06, "loss": 0.8581, "step": 8270 }, { "epoch": 0.7283603096410978, "grad_norm": 8.268866539001465, "learning_rate": 9.392063242740164e-06, "loss": 0.7842, "step": 8280 }, { "epoch": 0.7292399718508092, "grad_norm": 11.204652786254883, "learning_rate": 9.389615146422027e-06, "loss": 0.8043, "step": 8290 }, { "epoch": 0.7301196340605207, "grad_norm": 10.28194522857666, "learning_rate": 9.387162451272635e-06, "loss": 0.753, "step": 8300 }, { "epoch": 0.7309992962702322, "grad_norm": 9.557822227478027, "learning_rate": 9.38470515986158e-06, "loss": 0.685, "step": 8310 }, { "epoch": 0.7318789584799437, "grad_norm": 7.463655471801758, "learning_rate": 9.382243274763274e-06, "loss": 0.7779, "step": 8320 }, { "epoch": 0.7327586206896551, "grad_norm": 7.287647724151611, "learning_rate": 9.379776798556939e-06, "loss": 0.7764, "step": 8330 }, { "epoch": 0.7336382828993666, "grad_norm": 6.4871368408203125, "learning_rate": 9.377305733826605e-06, "loss": 0.6988, "step": 8340 }, { "epoch": 0.7345179451090781, "grad_norm": 7.432406902313232, "learning_rate": 9.374830083161115e-06, "loss": 0.7582, "step": 8350 }, { "epoch": 0.7353976073187896, "grad_norm": 7.673295021057129, "learning_rate": 9.372349849154112e-06, "loss": 0.8054, "step": 8360 }, { "epoch": 0.736277269528501, "grad_norm": 7.746471405029297, "learning_rate": 9.36986503440404e-06, "loss": 0.7927, "step": 8370 }, { "epoch": 0.7371569317382125, "grad_norm": 9.473104476928711, "learning_rate": 9.367375641514146e-06, "loss": 0.7578, "step": 8380 }, { "epoch": 0.738036593947924, "grad_norm": 8.808333396911621, "learning_rate": 9.364881673092472e-06, "loss": 0.7831, "step": 8390 }, { "epoch": 0.7389162561576355, "grad_norm": 8.259716033935547, "learning_rate": 9.36238313175185e-06, "loss": 0.7607, "step": 8400 }, { "epoch": 0.7397959183673469, "grad_norm": 7.279810905456543, "learning_rate": 9.35988002010991e-06, "loss": 0.8107, "step": 8410 }, { "epoch": 0.7406755805770584, "grad_norm": 5.456779956817627, "learning_rate": 9.35737234078906e-06, "loss": 0.6634, "step": 8420 }, { "epoch": 0.7415552427867699, "grad_norm": 8.289407730102539, "learning_rate": 9.354860096416508e-06, "loss": 0.7099, "step": 8430 }, { "epoch": 0.7424349049964813, "grad_norm": 6.928791046142578, "learning_rate": 9.352343289624229e-06, "loss": 0.7464, "step": 8440 }, { "epoch": 0.7433145672061928, "grad_norm": 8.174041748046875, "learning_rate": 9.349821923048988e-06, "loss": 0.8491, "step": 8450 }, { "epoch": 0.7441942294159043, "grad_norm": 7.487947463989258, "learning_rate": 9.34729599933232e-06, "loss": 0.7366, "step": 8460 }, { "epoch": 0.7450738916256158, "grad_norm": 6.59562873840332, "learning_rate": 9.344765521120543e-06, "loss": 0.7463, "step": 8470 }, { "epoch": 0.7459535538353272, "grad_norm": 8.68041706085205, "learning_rate": 9.34223049106474e-06, "loss": 0.7614, "step": 8480 }, { "epoch": 0.7468332160450387, "grad_norm": 8.748992919921875, "learning_rate": 9.339690911820765e-06, "loss": 0.797, "step": 8490 }, { "epoch": 0.7477128782547502, "grad_norm": 6.120482921600342, "learning_rate": 9.337146786049236e-06, "loss": 0.7939, "step": 8500 }, { "epoch": 0.7485925404644617, "grad_norm": 9.666031837463379, "learning_rate": 9.334598116415539e-06, "loss": 0.725, "step": 8510 }, { "epoch": 0.7494722026741731, "grad_norm": 8.329330444335938, "learning_rate": 9.332044905589814e-06, "loss": 0.8021, "step": 8520 }, { "epoch": 0.7503518648838846, "grad_norm": 6.977663040161133, "learning_rate": 9.329487156246964e-06, "loss": 0.8229, "step": 8530 }, { "epoch": 0.7512315270935961, "grad_norm": 8.702689170837402, "learning_rate": 9.326924871066644e-06, "loss": 0.7865, "step": 8540 }, { "epoch": 0.7521111893033076, "grad_norm": 5.336197853088379, "learning_rate": 9.324358052733263e-06, "loss": 0.7817, "step": 8550 }, { "epoch": 0.752990851513019, "grad_norm": 4.893182277679443, "learning_rate": 9.321786703935975e-06, "loss": 0.7314, "step": 8560 }, { "epoch": 0.7538705137227305, "grad_norm": 9.089276313781738, "learning_rate": 9.319210827368687e-06, "loss": 0.8245, "step": 8570 }, { "epoch": 0.754750175932442, "grad_norm": 8.750150680541992, "learning_rate": 9.316630425730047e-06, "loss": 0.8367, "step": 8580 }, { "epoch": 0.7556298381421535, "grad_norm": 8.3826322555542, "learning_rate": 9.314045501723438e-06, "loss": 0.7081, "step": 8590 }, { "epoch": 0.7565095003518649, "grad_norm": 10.014968872070312, "learning_rate": 9.311456058056987e-06, "loss": 0.8809, "step": 8600 }, { "epoch": 0.7573891625615764, "grad_norm": 6.504092693328857, "learning_rate": 9.308862097443556e-06, "loss": 0.7557, "step": 8610 }, { "epoch": 0.7582688247712879, "grad_norm": 8.150071144104004, "learning_rate": 9.30626362260074e-06, "loss": 0.7952, "step": 8620 }, { "epoch": 0.7591484869809993, "grad_norm": 9.587328910827637, "learning_rate": 9.303660636250856e-06, "loss": 0.7827, "step": 8630 }, { "epoch": 0.7600281491907108, "grad_norm": 7.53125, "learning_rate": 9.301053141120957e-06, "loss": 0.7944, "step": 8640 }, { "epoch": 0.7609078114004222, "grad_norm": 9.448511123657227, "learning_rate": 9.298441139942814e-06, "loss": 0.689, "step": 8650 }, { "epoch": 0.7617874736101337, "grad_norm": 9.0913724899292, "learning_rate": 9.29582463545292e-06, "loss": 0.7853, "step": 8660 }, { "epoch": 0.7626671358198451, "grad_norm": 10.192832946777344, "learning_rate": 9.293203630392488e-06, "loss": 0.8427, "step": 8670 }, { "epoch": 0.7635467980295566, "grad_norm": 6.865830898284912, "learning_rate": 9.290578127507444e-06, "loss": 0.7097, "step": 8680 }, { "epoch": 0.7644264602392681, "grad_norm": 6.482701778411865, "learning_rate": 9.287948129548423e-06, "loss": 0.775, "step": 8690 }, { "epoch": 0.7653061224489796, "grad_norm": 10.071352005004883, "learning_rate": 9.28531363927078e-06, "loss": 0.7983, "step": 8700 }, { "epoch": 0.766185784658691, "grad_norm": 7.81582498550415, "learning_rate": 9.28267465943456e-06, "loss": 0.8618, "step": 8710 }, { "epoch": 0.7670654468684025, "grad_norm": 7.497014045715332, "learning_rate": 9.280031192804529e-06, "loss": 0.804, "step": 8720 }, { "epoch": 0.767945109078114, "grad_norm": 8.15945816040039, "learning_rate": 9.277383242150142e-06, "loss": 0.7927, "step": 8730 }, { "epoch": 0.7688247712878254, "grad_norm": 8.403365135192871, "learning_rate": 9.274730810245556e-06, "loss": 0.7279, "step": 8740 }, { "epoch": 0.7697044334975369, "grad_norm": 9.012005805969238, "learning_rate": 9.27207389986962e-06, "loss": 0.6788, "step": 8750 }, { "epoch": 0.7705840957072484, "grad_norm": 9.01651382446289, "learning_rate": 9.26941251380588e-06, "loss": 0.7736, "step": 8760 }, { "epoch": 0.7714637579169599, "grad_norm": 6.3782196044921875, "learning_rate": 9.266746654842567e-06, "loss": 0.7347, "step": 8770 }, { "epoch": 0.7723434201266713, "grad_norm": 11.832535743713379, "learning_rate": 9.2640763257726e-06, "loss": 0.7189, "step": 8780 }, { "epoch": 0.7732230823363828, "grad_norm": 8.17848014831543, "learning_rate": 9.261401529393576e-06, "loss": 0.7586, "step": 8790 }, { "epoch": 0.7741027445460943, "grad_norm": 9.217689514160156, "learning_rate": 9.258722268507783e-06, "loss": 0.7506, "step": 8800 }, { "epoch": 0.7749824067558058, "grad_norm": 9.250201225280762, "learning_rate": 9.256038545922174e-06, "loss": 0.7252, "step": 8810 }, { "epoch": 0.7758620689655172, "grad_norm": 8.302064895629883, "learning_rate": 9.253350364448384e-06, "loss": 0.7661, "step": 8820 }, { "epoch": 0.7767417311752287, "grad_norm": 7.476352691650391, "learning_rate": 9.250657726902718e-06, "loss": 0.7591, "step": 8830 }, { "epoch": 0.7776213933849402, "grad_norm": 8.846946716308594, "learning_rate": 9.24796063610615e-06, "loss": 0.8086, "step": 8840 }, { "epoch": 0.7785010555946517, "grad_norm": 7.876168251037598, "learning_rate": 9.245259094884318e-06, "loss": 0.8188, "step": 8850 }, { "epoch": 0.7793807178043631, "grad_norm": 11.111193656921387, "learning_rate": 9.242553106067522e-06, "loss": 0.8176, "step": 8860 }, { "epoch": 0.7802603800140746, "grad_norm": 5.99713134765625, "learning_rate": 9.239842672490722e-06, "loss": 0.7392, "step": 8870 }, { "epoch": 0.7811400422237861, "grad_norm": 8.269925117492676, "learning_rate": 9.237127796993536e-06, "loss": 0.833, "step": 8880 }, { "epoch": 0.7820197044334976, "grad_norm": 7.15533447265625, "learning_rate": 9.234408482420237e-06, "loss": 0.7854, "step": 8890 }, { "epoch": 0.782899366643209, "grad_norm": 6.329888343811035, "learning_rate": 9.231684731619743e-06, "loss": 0.7128, "step": 8900 }, { "epoch": 0.7837790288529205, "grad_norm": 11.493359565734863, "learning_rate": 9.228956547445624e-06, "loss": 0.804, "step": 8910 }, { "epoch": 0.784658691062632, "grad_norm": 7.547720432281494, "learning_rate": 9.226223932756096e-06, "loss": 0.8624, "step": 8920 }, { "epoch": 0.7855383532723434, "grad_norm": 10.298832893371582, "learning_rate": 9.223486890414011e-06, "loss": 0.7306, "step": 8930 }, { "epoch": 0.7864180154820549, "grad_norm": 8.054140090942383, "learning_rate": 9.220745423286866e-06, "loss": 0.7832, "step": 8940 }, { "epoch": 0.7872976776917664, "grad_norm": 8.354339599609375, "learning_rate": 9.217999534246791e-06, "loss": 0.7966, "step": 8950 }, { "epoch": 0.7881773399014779, "grad_norm": 8.832682609558105, "learning_rate": 9.215249226170547e-06, "loss": 0.8607, "step": 8960 }, { "epoch": 0.7890570021111893, "grad_norm": 7.608147621154785, "learning_rate": 9.212494501939526e-06, "loss": 0.8073, "step": 8970 }, { "epoch": 0.7899366643209008, "grad_norm": 6.915885925292969, "learning_rate": 9.209735364439747e-06, "loss": 0.8535, "step": 8980 }, { "epoch": 0.7908163265306123, "grad_norm": 9.60535717010498, "learning_rate": 9.206971816561854e-06, "loss": 0.7968, "step": 8990 }, { "epoch": 0.7916959887403238, "grad_norm": 7.743350505828857, "learning_rate": 9.204203861201109e-06, "loss": 0.8255, "step": 9000 }, { "epoch": 0.7925756509500352, "grad_norm": 10.528352737426758, "learning_rate": 9.201431501257392e-06, "loss": 0.7204, "step": 9010 }, { "epoch": 0.7934553131597467, "grad_norm": 8.415822982788086, "learning_rate": 9.198654739635198e-06, "loss": 0.8931, "step": 9020 }, { "epoch": 0.7943349753694581, "grad_norm": 8.116988182067871, "learning_rate": 9.195873579243634e-06, "loss": 0.7341, "step": 9030 }, { "epoch": 0.7952146375791695, "grad_norm": 8.536517143249512, "learning_rate": 9.193088022996417e-06, "loss": 0.7757, "step": 9040 }, { "epoch": 0.796094299788881, "grad_norm": 8.559906959533691, "learning_rate": 9.190298073811866e-06, "loss": 0.7718, "step": 9050 }, { "epoch": 0.7969739619985925, "grad_norm": 9.921978950500488, "learning_rate": 9.1875037346129e-06, "loss": 0.6428, "step": 9060 }, { "epoch": 0.797853624208304, "grad_norm": 10.68593978881836, "learning_rate": 9.184705008327044e-06, "loss": 0.7395, "step": 9070 }, { "epoch": 0.7987332864180154, "grad_norm": 9.009956359863281, "learning_rate": 9.181901897886419e-06, "loss": 0.8694, "step": 9080 }, { "epoch": 0.7996129486277269, "grad_norm": 7.755526542663574, "learning_rate": 9.179094406227732e-06, "loss": 0.7189, "step": 9090 }, { "epoch": 0.8004926108374384, "grad_norm": 8.564069747924805, "learning_rate": 9.176282536292283e-06, "loss": 0.7235, "step": 9100 }, { "epoch": 0.8013722730471499, "grad_norm": 6.200893878936768, "learning_rate": 9.173466291025965e-06, "loss": 0.732, "step": 9110 }, { "epoch": 0.8022519352568613, "grad_norm": 7.958987712860107, "learning_rate": 9.170645673379246e-06, "loss": 0.7627, "step": 9120 }, { "epoch": 0.8031315974665728, "grad_norm": 6.860930442810059, "learning_rate": 9.167820686307182e-06, "loss": 0.6328, "step": 9130 }, { "epoch": 0.8040112596762843, "grad_norm": 8.58984088897705, "learning_rate": 9.164991332769402e-06, "loss": 0.7454, "step": 9140 }, { "epoch": 0.8048909218859958, "grad_norm": 6.154870986938477, "learning_rate": 9.162157615730109e-06, "loss": 0.7299, "step": 9150 }, { "epoch": 0.8057705840957072, "grad_norm": 9.521571159362793, "learning_rate": 9.159319538158083e-06, "loss": 0.7782, "step": 9160 }, { "epoch": 0.8066502463054187, "grad_norm": 7.995107650756836, "learning_rate": 9.156477103026669e-06, "loss": 0.8827, "step": 9170 }, { "epoch": 0.8075299085151302, "grad_norm": 7.557469367980957, "learning_rate": 9.153630313313772e-06, "loss": 0.7425, "step": 9180 }, { "epoch": 0.8084095707248417, "grad_norm": 9.642196655273438, "learning_rate": 9.150779172001871e-06, "loss": 0.824, "step": 9190 }, { "epoch": 0.8092892329345531, "grad_norm": 9.468195915222168, "learning_rate": 9.147923682077993e-06, "loss": 0.7111, "step": 9200 }, { "epoch": 0.8101688951442646, "grad_norm": 8.43470287322998, "learning_rate": 9.145063846533727e-06, "loss": 0.6951, "step": 9210 }, { "epoch": 0.8110485573539761, "grad_norm": 8.960837364196777, "learning_rate": 9.14219966836521e-06, "loss": 0.85, "step": 9220 }, { "epoch": 0.8119282195636875, "grad_norm": 8.39845085144043, "learning_rate": 9.139331150573136e-06, "loss": 0.7862, "step": 9230 }, { "epoch": 0.812807881773399, "grad_norm": 6.740390777587891, "learning_rate": 9.136458296162739e-06, "loss": 0.8078, "step": 9240 }, { "epoch": 0.8136875439831105, "grad_norm": 10.019824981689453, "learning_rate": 9.133581108143796e-06, "loss": 0.7248, "step": 9250 }, { "epoch": 0.814567206192822, "grad_norm": 8.650683403015137, "learning_rate": 9.130699589530631e-06, "loss": 0.7335, "step": 9260 }, { "epoch": 0.8154468684025334, "grad_norm": 6.107051372528076, "learning_rate": 9.127813743342096e-06, "loss": 0.7728, "step": 9270 }, { "epoch": 0.8163265306122449, "grad_norm": 8.076821327209473, "learning_rate": 9.124923572601583e-06, "loss": 0.7022, "step": 9280 }, { "epoch": 0.8172061928219564, "grad_norm": 7.888184547424316, "learning_rate": 9.122029080337014e-06, "loss": 0.8476, "step": 9290 }, { "epoch": 0.8180858550316679, "grad_norm": 7.596648216247559, "learning_rate": 9.119130269580835e-06, "loss": 0.7129, "step": 9300 }, { "epoch": 0.8189655172413793, "grad_norm": 6.49446439743042, "learning_rate": 9.116227143370023e-06, "loss": 0.7806, "step": 9310 }, { "epoch": 0.8198451794510908, "grad_norm": 8.388823509216309, "learning_rate": 9.113319704746066e-06, "loss": 0.6904, "step": 9320 }, { "epoch": 0.8207248416608023, "grad_norm": 8.161612510681152, "learning_rate": 9.11040795675498e-06, "loss": 0.7672, "step": 9330 }, { "epoch": 0.8216045038705138, "grad_norm": 9.374138832092285, "learning_rate": 9.10749190244729e-06, "loss": 0.7318, "step": 9340 }, { "epoch": 0.8224841660802252, "grad_norm": 7.946638584136963, "learning_rate": 9.104571544878035e-06, "loss": 0.8448, "step": 9350 }, { "epoch": 0.8233638282899367, "grad_norm": 11.471941947937012, "learning_rate": 9.10164688710676e-06, "loss": 0.806, "step": 9360 }, { "epoch": 0.8242434904996482, "grad_norm": 6.580000877380371, "learning_rate": 9.098717932197519e-06, "loss": 0.6831, "step": 9370 }, { "epoch": 0.8251231527093597, "grad_norm": 7.027739524841309, "learning_rate": 9.095784683218864e-06, "loss": 0.7345, "step": 9380 }, { "epoch": 0.8260028149190711, "grad_norm": 7.663942813873291, "learning_rate": 9.092847143243848e-06, "loss": 0.737, "step": 9390 }, { "epoch": 0.8268824771287826, "grad_norm": 6.098499298095703, "learning_rate": 9.089905315350018e-06, "loss": 0.7342, "step": 9400 }, { "epoch": 0.827762139338494, "grad_norm": 7.582835674285889, "learning_rate": 9.086959202619414e-06, "loss": 0.7421, "step": 9410 }, { "epoch": 0.8286418015482054, "grad_norm": 9.823530197143555, "learning_rate": 9.084008808138566e-06, "loss": 0.9282, "step": 9420 }, { "epoch": 0.8295214637579169, "grad_norm": 6.794120788574219, "learning_rate": 9.081054134998491e-06, "loss": 0.662, "step": 9430 }, { "epoch": 0.8304011259676284, "grad_norm": 7.99787712097168, "learning_rate": 9.078095186294682e-06, "loss": 0.6913, "step": 9440 }, { "epoch": 0.8312807881773399, "grad_norm": 7.146722793579102, "learning_rate": 9.07513196512712e-06, "loss": 0.7615, "step": 9450 }, { "epoch": 0.8321604503870513, "grad_norm": 6.964779853820801, "learning_rate": 9.072164474600256e-06, "loss": 0.6893, "step": 9460 }, { "epoch": 0.8330401125967628, "grad_norm": 9.680728912353516, "learning_rate": 9.069192717823017e-06, "loss": 0.7561, "step": 9470 }, { "epoch": 0.8339197748064743, "grad_norm": 8.709808349609375, "learning_rate": 9.066216697908796e-06, "loss": 0.7322, "step": 9480 }, { "epoch": 0.8347994370161858, "grad_norm": 8.266434669494629, "learning_rate": 9.063236417975459e-06, "loss": 0.8218, "step": 9490 }, { "epoch": 0.8356790992258972, "grad_norm": 7.716861248016357, "learning_rate": 9.060251881145325e-06, "loss": 0.8326, "step": 9500 }, { "epoch": 0.8365587614356087, "grad_norm": 9.795207977294922, "learning_rate": 9.057263090545185e-06, "loss": 0.6207, "step": 9510 }, { "epoch": 0.8374384236453202, "grad_norm": 7.342216968536377, "learning_rate": 9.054270049306278e-06, "loss": 0.7673, "step": 9520 }, { "epoch": 0.8383180858550316, "grad_norm": 7.0540337562561035, "learning_rate": 9.051272760564296e-06, "loss": 0.696, "step": 9530 }, { "epoch": 0.8391977480647431, "grad_norm": 6.300555229187012, "learning_rate": 9.048271227459383e-06, "loss": 0.6729, "step": 9540 }, { "epoch": 0.8400774102744546, "grad_norm": 7.150109767913818, "learning_rate": 9.045265453136136e-06, "loss": 0.6921, "step": 9550 }, { "epoch": 0.8409570724841661, "grad_norm": 8.599748611450195, "learning_rate": 9.042255440743584e-06, "loss": 0.7684, "step": 9560 }, { "epoch": 0.8418367346938775, "grad_norm": 5.929365158081055, "learning_rate": 9.039241193435203e-06, "loss": 0.6846, "step": 9570 }, { "epoch": 0.842716396903589, "grad_norm": 12.04484748840332, "learning_rate": 9.036222714368904e-06, "loss": 0.785, "step": 9580 }, { "epoch": 0.8435960591133005, "grad_norm": 6.564620494842529, "learning_rate": 9.03320000670703e-06, "loss": 0.8618, "step": 9590 }, { "epoch": 0.844475721323012, "grad_norm": 7.741371154785156, "learning_rate": 9.03017307361636e-06, "loss": 0.8507, "step": 9600 }, { "epoch": 0.8453553835327234, "grad_norm": 7.401696681976318, "learning_rate": 9.027141918268092e-06, "loss": 0.872, "step": 9610 }, { "epoch": 0.8462350457424349, "grad_norm": 9.099974632263184, "learning_rate": 9.024106543837851e-06, "loss": 0.6011, "step": 9620 }, { "epoch": 0.8471147079521464, "grad_norm": 9.46039867401123, "learning_rate": 9.021066953505686e-06, "loss": 0.8136, "step": 9630 }, { "epoch": 0.8479943701618579, "grad_norm": 8.888017654418945, "learning_rate": 9.018023150456054e-06, "loss": 0.7202, "step": 9640 }, { "epoch": 0.8488740323715693, "grad_norm": 8.756048202514648, "learning_rate": 9.014975137877834e-06, "loss": 0.6908, "step": 9650 }, { "epoch": 0.8497536945812808, "grad_norm": 8.997354507446289, "learning_rate": 9.01192291896431e-06, "loss": 0.7611, "step": 9660 }, { "epoch": 0.8506333567909923, "grad_norm": 6.13082218170166, "learning_rate": 9.008866496913176e-06, "loss": 0.695, "step": 9670 }, { "epoch": 0.8515130190007038, "grad_norm": 10.839118003845215, "learning_rate": 9.005805874926528e-06, "loss": 0.7981, "step": 9680 }, { "epoch": 0.8523926812104152, "grad_norm": 7.716192722320557, "learning_rate": 9.00274105621086e-06, "loss": 0.737, "step": 9690 }, { "epoch": 0.8532723434201267, "grad_norm": 9.150269508361816, "learning_rate": 8.999672043977068e-06, "loss": 0.7857, "step": 9700 }, { "epoch": 0.8541520056298382, "grad_norm": 7.613675594329834, "learning_rate": 8.996598841440433e-06, "loss": 0.7347, "step": 9710 }, { "epoch": 0.8550316678395496, "grad_norm": 6.028810501098633, "learning_rate": 8.993521451820639e-06, "loss": 0.7073, "step": 9720 }, { "epoch": 0.8559113300492611, "grad_norm": 13.024445533752441, "learning_rate": 8.990439878341744e-06, "loss": 0.7872, "step": 9730 }, { "epoch": 0.8567909922589726, "grad_norm": 7.548077583312988, "learning_rate": 8.987354124232195e-06, "loss": 0.7567, "step": 9740 }, { "epoch": 0.8576706544686841, "grad_norm": 8.584639549255371, "learning_rate": 8.984264192724818e-06, "loss": 0.739, "step": 9750 }, { "epoch": 0.8585503166783955, "grad_norm": 10.334716796875, "learning_rate": 8.981170087056816e-06, "loss": 0.7454, "step": 9760 }, { "epoch": 0.859429978888107, "grad_norm": 6.247286796569824, "learning_rate": 8.978071810469764e-06, "loss": 0.7252, "step": 9770 }, { "epoch": 0.8603096410978185, "grad_norm": 8.511188507080078, "learning_rate": 8.974969366209609e-06, "loss": 0.7146, "step": 9780 }, { "epoch": 0.86118930330753, "grad_norm": 7.914990425109863, "learning_rate": 8.971862757526661e-06, "loss": 0.7937, "step": 9790 }, { "epoch": 0.8620689655172413, "grad_norm": 6.710818290710449, "learning_rate": 8.968751987675596e-06, "loss": 0.7638, "step": 9800 }, { "epoch": 0.8629486277269528, "grad_norm": 6.160813331604004, "learning_rate": 8.965637059915449e-06, "loss": 0.6664, "step": 9810 }, { "epoch": 0.8638282899366643, "grad_norm": 7.085774898529053, "learning_rate": 8.962517977509607e-06, "loss": 0.7359, "step": 9820 }, { "epoch": 0.8647079521463757, "grad_norm": 9.522568702697754, "learning_rate": 8.959394743725813e-06, "loss": 0.7142, "step": 9830 }, { "epoch": 0.8655876143560872, "grad_norm": 8.389387130737305, "learning_rate": 8.956267361836164e-06, "loss": 0.7795, "step": 9840 }, { "epoch": 0.8664672765657987, "grad_norm": 8.361845970153809, "learning_rate": 8.953135835117093e-06, "loss": 0.6928, "step": 9850 }, { "epoch": 0.8673469387755102, "grad_norm": 12.468467712402344, "learning_rate": 8.950000166849382e-06, "loss": 0.8521, "step": 9860 }, { "epoch": 0.8682266009852216, "grad_norm": 6.4986252784729, "learning_rate": 8.94686036031815e-06, "loss": 0.7235, "step": 9870 }, { "epoch": 0.8691062631949331, "grad_norm": 8.024114608764648, "learning_rate": 8.943716418812855e-06, "loss": 0.6955, "step": 9880 }, { "epoch": 0.8699859254046446, "grad_norm": 8.791252136230469, "learning_rate": 8.940568345627279e-06, "loss": 0.7846, "step": 9890 }, { "epoch": 0.8708655876143561, "grad_norm": 9.020142555236816, "learning_rate": 8.937416144059539e-06, "loss": 0.8361, "step": 9900 }, { "epoch": 0.8717452498240675, "grad_norm": 8.047307014465332, "learning_rate": 8.934259817412074e-06, "loss": 0.7759, "step": 9910 }, { "epoch": 0.872624912033779, "grad_norm": 12.083381652832031, "learning_rate": 8.93109936899165e-06, "loss": 0.7619, "step": 9920 }, { "epoch": 0.8735045742434905, "grad_norm": 7.030168533325195, "learning_rate": 8.927934802109346e-06, "loss": 0.7087, "step": 9930 }, { "epoch": 0.874384236453202, "grad_norm": 7.393320083618164, "learning_rate": 8.924766120080557e-06, "loss": 0.7452, "step": 9940 }, { "epoch": 0.8752638986629134, "grad_norm": 7.278824806213379, "learning_rate": 8.921593326224987e-06, "loss": 0.7452, "step": 9950 }, { "epoch": 0.8761435608726249, "grad_norm": 10.035475730895996, "learning_rate": 8.918416423866654e-06, "loss": 0.7524, "step": 9960 }, { "epoch": 0.8770232230823364, "grad_norm": 12.176562309265137, "learning_rate": 8.915235416333876e-06, "loss": 0.8861, "step": 9970 }, { "epoch": 0.8779028852920479, "grad_norm": 7.649777889251709, "learning_rate": 8.91205030695927e-06, "loss": 0.7478, "step": 9980 }, { "epoch": 0.8787825475017593, "grad_norm": 7.653356075286865, "learning_rate": 8.908861099079755e-06, "loss": 0.7606, "step": 9990 }, { "epoch": 0.8796622097114708, "grad_norm": 7.386388301849365, "learning_rate": 8.905667796036542e-06, "loss": 0.7097, "step": 10000 }, { "epoch": 0.8805418719211823, "grad_norm": 7.162853240966797, "learning_rate": 8.90247040117513e-06, "loss": 0.747, "step": 10010 }, { "epoch": 0.8814215341308937, "grad_norm": 8.986380577087402, "learning_rate": 8.89926891784531e-06, "loss": 0.7139, "step": 10020 }, { "epoch": 0.8823011963406052, "grad_norm": 9.102167129516602, "learning_rate": 8.89606334940115e-06, "loss": 0.7173, "step": 10030 }, { "epoch": 0.8831808585503167, "grad_norm": 7.2762532234191895, "learning_rate": 8.892853699201e-06, "loss": 0.7484, "step": 10040 }, { "epoch": 0.8840605207600282, "grad_norm": 8.314363479614258, "learning_rate": 8.88963997060749e-06, "loss": 0.7338, "step": 10050 }, { "epoch": 0.8849401829697396, "grad_norm": 5.87384557723999, "learning_rate": 8.886422166987523e-06, "loss": 0.7719, "step": 10060 }, { "epoch": 0.8858198451794511, "grad_norm": 9.233144760131836, "learning_rate": 8.883200291712262e-06, "loss": 0.8207, "step": 10070 }, { "epoch": 0.8866995073891626, "grad_norm": 9.142370223999023, "learning_rate": 8.879974348157145e-06, "loss": 0.7962, "step": 10080 }, { "epoch": 0.8875791695988741, "grad_norm": 8.25483512878418, "learning_rate": 8.876744339701869e-06, "loss": 0.6996, "step": 10090 }, { "epoch": 0.8884588318085855, "grad_norm": 11.007930755615234, "learning_rate": 8.87351026973039e-06, "loss": 0.6429, "step": 10100 }, { "epoch": 0.889338494018297, "grad_norm": 6.018333911895752, "learning_rate": 8.87027214163092e-06, "loss": 0.6753, "step": 10110 }, { "epoch": 0.8902181562280085, "grad_norm": 8.66531753540039, "learning_rate": 8.867029958795918e-06, "loss": 0.7439, "step": 10120 }, { "epoch": 0.89109781843772, "grad_norm": 8.623414039611816, "learning_rate": 8.863783724622099e-06, "loss": 0.6784, "step": 10130 }, { "epoch": 0.8919774806474314, "grad_norm": 7.1045308113098145, "learning_rate": 8.860533442510415e-06, "loss": 0.7212, "step": 10140 }, { "epoch": 0.8928571428571429, "grad_norm": 8.625452995300293, "learning_rate": 8.85727911586606e-06, "loss": 0.7201, "step": 10150 }, { "epoch": 0.8937368050668544, "grad_norm": 9.561033248901367, "learning_rate": 8.854020748098472e-06, "loss": 0.7584, "step": 10160 }, { "epoch": 0.8946164672765659, "grad_norm": 8.765721321105957, "learning_rate": 8.850758342621316e-06, "loss": 0.7954, "step": 10170 }, { "epoch": 0.8954961294862772, "grad_norm": 7.071775913238525, "learning_rate": 8.847491902852485e-06, "loss": 0.7367, "step": 10180 }, { "epoch": 0.8963757916959887, "grad_norm": 5.22135066986084, "learning_rate": 8.844221432214108e-06, "loss": 0.7515, "step": 10190 }, { "epoch": 0.8972554539057002, "grad_norm": 6.593540668487549, "learning_rate": 8.84094693413253e-06, "loss": 0.784, "step": 10200 }, { "epoch": 0.8981351161154116, "grad_norm": 9.937409400939941, "learning_rate": 8.837668412038316e-06, "loss": 0.7996, "step": 10210 }, { "epoch": 0.8990147783251231, "grad_norm": 7.780577659606934, "learning_rate": 8.834385869366247e-06, "loss": 0.7414, "step": 10220 }, { "epoch": 0.8998944405348346, "grad_norm": 6.657134532928467, "learning_rate": 8.83109930955532e-06, "loss": 0.7351, "step": 10230 }, { "epoch": 0.9007741027445461, "grad_norm": 9.44941520690918, "learning_rate": 8.827808736048733e-06, "loss": 0.6348, "step": 10240 }, { "epoch": 0.9016537649542575, "grad_norm": 8.691306114196777, "learning_rate": 8.824514152293896e-06, "loss": 0.7076, "step": 10250 }, { "epoch": 0.902533427163969, "grad_norm": 8.27644157409668, "learning_rate": 8.82121556174242e-06, "loss": 0.717, "step": 10260 }, { "epoch": 0.9034130893736805, "grad_norm": 6.8982110023498535, "learning_rate": 8.817912967850109e-06, "loss": 0.7198, "step": 10270 }, { "epoch": 0.904292751583392, "grad_norm": 9.621940612792969, "learning_rate": 8.814606374076963e-06, "loss": 0.7937, "step": 10280 }, { "epoch": 0.9051724137931034, "grad_norm": 9.83552360534668, "learning_rate": 8.811295783887178e-06, "loss": 0.6955, "step": 10290 }, { "epoch": 0.9060520760028149, "grad_norm": 9.032546997070312, "learning_rate": 8.807981200749125e-06, "loss": 0.7225, "step": 10300 }, { "epoch": 0.9069317382125264, "grad_norm": 10.130562782287598, "learning_rate": 8.804662628135371e-06, "loss": 0.7918, "step": 10310 }, { "epoch": 0.9078114004222378, "grad_norm": 7.586170673370361, "learning_rate": 8.801340069522656e-06, "loss": 0.6736, "step": 10320 }, { "epoch": 0.9086910626319493, "grad_norm": 11.851020812988281, "learning_rate": 8.798013528391895e-06, "loss": 0.7744, "step": 10330 }, { "epoch": 0.9095707248416608, "grad_norm": 7.826239585876465, "learning_rate": 8.794683008228182e-06, "loss": 0.6979, "step": 10340 }, { "epoch": 0.9104503870513723, "grad_norm": 6.738382339477539, "learning_rate": 8.79134851252077e-06, "loss": 0.7042, "step": 10350 }, { "epoch": 0.9113300492610837, "grad_norm": 8.692420959472656, "learning_rate": 8.788010044763084e-06, "loss": 0.7894, "step": 10360 }, { "epoch": 0.9122097114707952, "grad_norm": 9.076848983764648, "learning_rate": 8.784667608452711e-06, "loss": 0.82, "step": 10370 }, { "epoch": 0.9130893736805067, "grad_norm": 8.452138900756836, "learning_rate": 8.781321207091391e-06, "loss": 0.6725, "step": 10380 }, { "epoch": 0.9139690358902182, "grad_norm": 6.465821266174316, "learning_rate": 8.777970844185018e-06, "loss": 0.6503, "step": 10390 }, { "epoch": 0.9148486980999296, "grad_norm": 8.333083152770996, "learning_rate": 8.774616523243642e-06, "loss": 0.6471, "step": 10400 }, { "epoch": 0.9157283603096411, "grad_norm": 7.966302871704102, "learning_rate": 8.771258247781456e-06, "loss": 0.7432, "step": 10410 }, { "epoch": 0.9166080225193526, "grad_norm": 8.33949089050293, "learning_rate": 8.767896021316794e-06, "loss": 0.8728, "step": 10420 }, { "epoch": 0.9174876847290641, "grad_norm": 5.788053512573242, "learning_rate": 8.764529847372135e-06, "loss": 0.6957, "step": 10430 }, { "epoch": 0.9183673469387755, "grad_norm": 8.360697746276855, "learning_rate": 8.761159729474086e-06, "loss": 0.6835, "step": 10440 }, { "epoch": 0.919247009148487, "grad_norm": 8.22715950012207, "learning_rate": 8.757785671153393e-06, "loss": 0.6398, "step": 10450 }, { "epoch": 0.9201266713581985, "grad_norm": 9.2582368850708, "learning_rate": 8.754407675944927e-06, "loss": 0.7132, "step": 10460 }, { "epoch": 0.92100633356791, "grad_norm": 12.438268661499023, "learning_rate": 8.751025747387683e-06, "loss": 0.7723, "step": 10470 }, { "epoch": 0.9218859957776214, "grad_norm": 6.32668399810791, "learning_rate": 8.747639889024777e-06, "loss": 0.7751, "step": 10480 }, { "epoch": 0.9227656579873329, "grad_norm": 6.322526454925537, "learning_rate": 8.744250104403447e-06, "loss": 0.7451, "step": 10490 }, { "epoch": 0.9236453201970444, "grad_norm": 7.263008117675781, "learning_rate": 8.740856397075035e-06, "loss": 0.7049, "step": 10500 }, { "epoch": 0.9245249824067558, "grad_norm": 6.6787190437316895, "learning_rate": 8.737458770595002e-06, "loss": 0.7071, "step": 10510 }, { "epoch": 0.9254046446164673, "grad_norm": 6.972539901733398, "learning_rate": 8.734057228522907e-06, "loss": 0.6764, "step": 10520 }, { "epoch": 0.9262843068261788, "grad_norm": 7.731930732727051, "learning_rate": 8.730651774422419e-06, "loss": 0.7338, "step": 10530 }, { "epoch": 0.9271639690358903, "grad_norm": 7.598321914672852, "learning_rate": 8.727242411861297e-06, "loss": 0.7395, "step": 10540 }, { "epoch": 0.9280436312456017, "grad_norm": 7.145493507385254, "learning_rate": 8.723829144411405e-06, "loss": 0.7811, "step": 10550 }, { "epoch": 0.9289232934553132, "grad_norm": 8.996188163757324, "learning_rate": 8.720411975648685e-06, "loss": 0.7278, "step": 10560 }, { "epoch": 0.9298029556650246, "grad_norm": 6.978206634521484, "learning_rate": 8.71699090915318e-06, "loss": 0.7035, "step": 10570 }, { "epoch": 0.930682617874736, "grad_norm": 8.745279312133789, "learning_rate": 8.713565948509007e-06, "loss": 0.8125, "step": 10580 }, { "epoch": 0.9315622800844475, "grad_norm": 8.094949722290039, "learning_rate": 8.710137097304368e-06, "loss": 0.7907, "step": 10590 }, { "epoch": 0.932441942294159, "grad_norm": 7.811434268951416, "learning_rate": 8.706704359131534e-06, "loss": 0.8114, "step": 10600 }, { "epoch": 0.9333216045038705, "grad_norm": 8.119482040405273, "learning_rate": 8.703267737586858e-06, "loss": 0.7189, "step": 10610 }, { "epoch": 0.934201266713582, "grad_norm": 8.612527847290039, "learning_rate": 8.699827236270755e-06, "loss": 0.7346, "step": 10620 }, { "epoch": 0.9350809289232934, "grad_norm": 6.899227619171143, "learning_rate": 8.696382858787704e-06, "loss": 0.7851, "step": 10630 }, { "epoch": 0.9359605911330049, "grad_norm": 8.343822479248047, "learning_rate": 8.69293460874625e-06, "loss": 0.6502, "step": 10640 }, { "epoch": 0.9368402533427164, "grad_norm": 6.429286003112793, "learning_rate": 8.689482489758991e-06, "loss": 0.8024, "step": 10650 }, { "epoch": 0.9377199155524278, "grad_norm": 8.054033279418945, "learning_rate": 8.686026505442578e-06, "loss": 0.7156, "step": 10660 }, { "epoch": 0.9385995777621393, "grad_norm": 9.162810325622559, "learning_rate": 8.682566659417717e-06, "loss": 0.6797, "step": 10670 }, { "epoch": 0.9394792399718508, "grad_norm": 8.006098747253418, "learning_rate": 8.67910295530915e-06, "loss": 0.6806, "step": 10680 }, { "epoch": 0.9403589021815623, "grad_norm": 9.223005294799805, "learning_rate": 8.675635396745672e-06, "loss": 0.6287, "step": 10690 }, { "epoch": 0.9412385643912737, "grad_norm": 9.053372383117676, "learning_rate": 8.67216398736011e-06, "loss": 0.6899, "step": 10700 }, { "epoch": 0.9421182266009852, "grad_norm": 7.232842922210693, "learning_rate": 8.668688730789323e-06, "loss": 0.6515, "step": 10710 }, { "epoch": 0.9429978888106967, "grad_norm": 7.271060466766357, "learning_rate": 8.665209630674208e-06, "loss": 0.7473, "step": 10720 }, { "epoch": 0.9438775510204082, "grad_norm": 8.02649974822998, "learning_rate": 8.661726690659684e-06, "loss": 0.7896, "step": 10730 }, { "epoch": 0.9447572132301196, "grad_norm": 6.75419282913208, "learning_rate": 8.65823991439469e-06, "loss": 0.7863, "step": 10740 }, { "epoch": 0.9456368754398311, "grad_norm": 6.9798583984375, "learning_rate": 8.654749305532192e-06, "loss": 0.6771, "step": 10750 }, { "epoch": 0.9465165376495426, "grad_norm": 9.252596855163574, "learning_rate": 8.651254867729164e-06, "loss": 0.7299, "step": 10760 }, { "epoch": 0.947396199859254, "grad_norm": 7.67595911026001, "learning_rate": 8.647756604646595e-06, "loss": 0.698, "step": 10770 }, { "epoch": 0.9482758620689655, "grad_norm": 10.133106231689453, "learning_rate": 8.644254519949483e-06, "loss": 0.7188, "step": 10780 }, { "epoch": 0.949155524278677, "grad_norm": 5.079830169677734, "learning_rate": 8.640748617306824e-06, "loss": 0.7027, "step": 10790 }, { "epoch": 0.9500351864883885, "grad_norm": 7.989641189575195, "learning_rate": 8.637238900391622e-06, "loss": 0.802, "step": 10800 }, { "epoch": 0.9509148486981, "grad_norm": 7.518815994262695, "learning_rate": 8.633725372880868e-06, "loss": 0.7613, "step": 10810 }, { "epoch": 0.9517945109078114, "grad_norm": 6.369951248168945, "learning_rate": 8.630208038455553e-06, "loss": 0.7842, "step": 10820 }, { "epoch": 0.9526741731175229, "grad_norm": 8.687597274780273, "learning_rate": 8.626686900800652e-06, "loss": 0.6909, "step": 10830 }, { "epoch": 0.9535538353272344, "grad_norm": 8.654245376586914, "learning_rate": 8.623161963605124e-06, "loss": 0.6502, "step": 10840 }, { "epoch": 0.9544334975369458, "grad_norm": 10.234760284423828, "learning_rate": 8.619633230561914e-06, "loss": 0.7122, "step": 10850 }, { "epoch": 0.9553131597466573, "grad_norm": 10.36835765838623, "learning_rate": 8.61610070536794e-06, "loss": 0.7216, "step": 10860 }, { "epoch": 0.9561928219563688, "grad_norm": 7.112572193145752, "learning_rate": 8.612564391724089e-06, "loss": 0.7615, "step": 10870 }, { "epoch": 0.9570724841660803, "grad_norm": 7.011812210083008, "learning_rate": 8.609024293335227e-06, "loss": 0.6868, "step": 10880 }, { "epoch": 0.9579521463757917, "grad_norm": 7.943332195281982, "learning_rate": 8.605480413910172e-06, "loss": 0.6793, "step": 10890 }, { "epoch": 0.9588318085855032, "grad_norm": 7.330551624298096, "learning_rate": 8.601932757161715e-06, "loss": 0.752, "step": 10900 }, { "epoch": 0.9597114707952147, "grad_norm": 6.544751167297363, "learning_rate": 8.5983813268066e-06, "loss": 0.7478, "step": 10910 }, { "epoch": 0.9605911330049262, "grad_norm": 8.834061622619629, "learning_rate": 8.594826126565524e-06, "loss": 0.6962, "step": 10920 }, { "epoch": 0.9614707952146376, "grad_norm": 4.996640682220459, "learning_rate": 8.59126716016313e-06, "loss": 0.6644, "step": 10930 }, { "epoch": 0.9623504574243491, "grad_norm": 7.231828212738037, "learning_rate": 8.587704431328017e-06, "loss": 0.6785, "step": 10940 }, { "epoch": 0.9632301196340605, "grad_norm": 7.557430267333984, "learning_rate": 8.584137943792713e-06, "loss": 0.7687, "step": 10950 }, { "epoch": 0.9641097818437719, "grad_norm": 9.059426307678223, "learning_rate": 8.580567701293694e-06, "loss": 0.7031, "step": 10960 }, { "epoch": 0.9649894440534834, "grad_norm": 9.240041732788086, "learning_rate": 8.576993707571363e-06, "loss": 0.7531, "step": 10970 }, { "epoch": 0.9658691062631949, "grad_norm": 6.775343418121338, "learning_rate": 8.57341596637006e-06, "loss": 0.7068, "step": 10980 }, { "epoch": 0.9667487684729064, "grad_norm": 7.629881381988525, "learning_rate": 8.569834481438044e-06, "loss": 0.762, "step": 10990 }, { "epoch": 0.9676284306826178, "grad_norm": 8.589273452758789, "learning_rate": 8.5662492565275e-06, "loss": 0.7053, "step": 11000 }, { "epoch": 0.9685080928923293, "grad_norm": 6.629649639129639, "learning_rate": 8.56266029539453e-06, "loss": 0.8166, "step": 11010 }, { "epoch": 0.9693877551020408, "grad_norm": 7.592010974884033, "learning_rate": 8.55906760179915e-06, "loss": 0.6967, "step": 11020 }, { "epoch": 0.9702674173117523, "grad_norm": 7.8819990158081055, "learning_rate": 8.555471179505291e-06, "loss": 0.8582, "step": 11030 }, { "epoch": 0.9711470795214637, "grad_norm": 7.372311592102051, "learning_rate": 8.551871032280782e-06, "loss": 0.7682, "step": 11040 }, { "epoch": 0.9720267417311752, "grad_norm": 7.092881202697754, "learning_rate": 8.548267163897362e-06, "loss": 0.7261, "step": 11050 }, { "epoch": 0.9729064039408867, "grad_norm": 9.39966106414795, "learning_rate": 8.544659578130665e-06, "loss": 0.7271, "step": 11060 }, { "epoch": 0.9737860661505982, "grad_norm": 7.689900875091553, "learning_rate": 8.54104827876022e-06, "loss": 0.7835, "step": 11070 }, { "epoch": 0.9746657283603096, "grad_norm": 6.972972869873047, "learning_rate": 8.537433269569444e-06, "loss": 0.7624, "step": 11080 }, { "epoch": 0.9755453905700211, "grad_norm": 7.2631096839904785, "learning_rate": 8.533814554345647e-06, "loss": 0.6916, "step": 11090 }, { "epoch": 0.9764250527797326, "grad_norm": 7.043187141418457, "learning_rate": 8.530192136880018e-06, "loss": 0.6882, "step": 11100 }, { "epoch": 0.977304714989444, "grad_norm": 8.489876747131348, "learning_rate": 8.526566020967622e-06, "loss": 0.7554, "step": 11110 }, { "epoch": 0.9781843771991555, "grad_norm": 9.777521133422852, "learning_rate": 8.522936210407405e-06, "loss": 0.706, "step": 11120 }, { "epoch": 0.979064039408867, "grad_norm": 7.380918979644775, "learning_rate": 8.519302709002175e-06, "loss": 0.6616, "step": 11130 }, { "epoch": 0.9799437016185785, "grad_norm": 7.215766429901123, "learning_rate": 8.515665520558616e-06, "loss": 0.5657, "step": 11140 }, { "epoch": 0.9808233638282899, "grad_norm": 8.048717498779297, "learning_rate": 8.51202464888727e-06, "loss": 0.6622, "step": 11150 }, { "epoch": 0.9817030260380014, "grad_norm": 7.910508155822754, "learning_rate": 8.508380097802537e-06, "loss": 0.7277, "step": 11160 }, { "epoch": 0.9825826882477129, "grad_norm": 6.619856834411621, "learning_rate": 8.504731871122676e-06, "loss": 0.7284, "step": 11170 }, { "epoch": 0.9834623504574244, "grad_norm": 9.114152908325195, "learning_rate": 8.501079972669792e-06, "loss": 0.7737, "step": 11180 }, { "epoch": 0.9843420126671358, "grad_norm": 5.7373948097229, "learning_rate": 8.497424406269838e-06, "loss": 0.6783, "step": 11190 }, { "epoch": 0.9852216748768473, "grad_norm": 8.340964317321777, "learning_rate": 8.493765175752614e-06, "loss": 0.6953, "step": 11200 }, { "epoch": 0.9861013370865588, "grad_norm": 10.152695655822754, "learning_rate": 8.490102284951749e-06, "loss": 0.7342, "step": 11210 }, { "epoch": 0.9869809992962703, "grad_norm": 7.171577453613281, "learning_rate": 8.48643573770472e-06, "loss": 0.7416, "step": 11220 }, { "epoch": 0.9878606615059817, "grad_norm": 7.594709396362305, "learning_rate": 8.482765537852823e-06, "loss": 0.6867, "step": 11230 }, { "epoch": 0.9887403237156932, "grad_norm": 7.022674560546875, "learning_rate": 8.47909168924119e-06, "loss": 0.6354, "step": 11240 }, { "epoch": 0.9896199859254047, "grad_norm": 7.873165607452393, "learning_rate": 8.475414195718767e-06, "loss": 0.7307, "step": 11250 }, { "epoch": 0.9904996481351162, "grad_norm": 7.258177757263184, "learning_rate": 8.471733061138326e-06, "loss": 0.6531, "step": 11260 }, { "epoch": 0.9913793103448276, "grad_norm": 7.031017780303955, "learning_rate": 8.468048289356445e-06, "loss": 0.6346, "step": 11270 }, { "epoch": 0.9922589725545391, "grad_norm": 8.345564842224121, "learning_rate": 8.464359884233524e-06, "loss": 0.6212, "step": 11280 }, { "epoch": 0.9931386347642506, "grad_norm": 8.445239067077637, "learning_rate": 8.460667849633762e-06, "loss": 0.6968, "step": 11290 }, { "epoch": 0.994018296973962, "grad_norm": 7.1240668296813965, "learning_rate": 8.456972189425158e-06, "loss": 0.712, "step": 11300 }, { "epoch": 0.9948979591836735, "grad_norm": 7.91703987121582, "learning_rate": 8.453272907479516e-06, "loss": 0.7828, "step": 11310 }, { "epoch": 0.995777621393385, "grad_norm": 8.545646667480469, "learning_rate": 8.449570007672437e-06, "loss": 0.7759, "step": 11320 }, { "epoch": 0.9966572836030964, "grad_norm": 7.506185054779053, "learning_rate": 8.445863493883297e-06, "loss": 0.7521, "step": 11330 }, { "epoch": 0.9975369458128078, "grad_norm": 7.2790303230285645, "learning_rate": 8.442153369995274e-06, "loss": 0.6509, "step": 11340 }, { "epoch": 0.9984166080225193, "grad_norm": 8.780603408813477, "learning_rate": 8.438439639895323e-06, "loss": 0.7984, "step": 11350 }, { "epoch": 0.9992962702322308, "grad_norm": 8.218255996704102, "learning_rate": 8.434722307474172e-06, "loss": 0.695, "step": 11360 }, { "epoch": 1.0001759324419424, "grad_norm": 7.299861907958984, "learning_rate": 8.431001376626333e-06, "loss": 0.7434, "step": 11370 }, { "epoch": 1.0010555946516537, "grad_norm": 7.662354469299316, "learning_rate": 8.42727685125008e-06, "loss": 0.627, "step": 11380 }, { "epoch": 1.0019352568613653, "grad_norm": 7.784764289855957, "learning_rate": 8.42354873524745e-06, "loss": 0.633, "step": 11390 }, { "epoch": 1.0028149190710767, "grad_norm": 9.307229042053223, "learning_rate": 8.419817032524256e-06, "loss": 0.6104, "step": 11400 }, { "epoch": 1.0036945812807883, "grad_norm": 7.236617565155029, "learning_rate": 8.416081746990057e-06, "loss": 0.5761, "step": 11410 }, { "epoch": 1.0045742434904996, "grad_norm": 9.721209526062012, "learning_rate": 8.412342882558164e-06, "loss": 0.6077, "step": 11420 }, { "epoch": 1.0054539057002112, "grad_norm": 7.9085164070129395, "learning_rate": 8.408600443145645e-06, "loss": 0.6288, "step": 11430 }, { "epoch": 1.0063335679099226, "grad_norm": 6.918266296386719, "learning_rate": 8.40485443267331e-06, "loss": 0.5624, "step": 11440 }, { "epoch": 1.0072132301196342, "grad_norm": 8.29351806640625, "learning_rate": 8.401104855065709e-06, "loss": 0.6042, "step": 11450 }, { "epoch": 1.0080928923293455, "grad_norm": 9.199606895446777, "learning_rate": 8.397351714251132e-06, "loss": 0.5893, "step": 11460 }, { "epoch": 1.008972554539057, "grad_norm": 13.07867431640625, "learning_rate": 8.393595014161602e-06, "loss": 0.6139, "step": 11470 }, { "epoch": 1.0098522167487685, "grad_norm": 7.085753917694092, "learning_rate": 8.389834758732865e-06, "loss": 0.5429, "step": 11480 }, { "epoch": 1.01073187895848, "grad_norm": 10.914121627807617, "learning_rate": 8.386070951904398e-06, "loss": 0.5695, "step": 11490 }, { "epoch": 1.0116115411681914, "grad_norm": 9.367453575134277, "learning_rate": 8.382303597619401e-06, "loss": 0.6127, "step": 11500 }, { "epoch": 1.012491203377903, "grad_norm": 14.202057838439941, "learning_rate": 8.37853269982478e-06, "loss": 0.6074, "step": 11510 }, { "epoch": 1.0133708655876144, "grad_norm": 6.286341667175293, "learning_rate": 8.374758262471163e-06, "loss": 0.6441, "step": 11520 }, { "epoch": 1.0142505277973257, "grad_norm": 7.156232833862305, "learning_rate": 8.370980289512885e-06, "loss": 0.6019, "step": 11530 }, { "epoch": 1.0151301900070373, "grad_norm": 8.409750938415527, "learning_rate": 8.367198784907981e-06, "loss": 0.6395, "step": 11540 }, { "epoch": 1.0160098522167487, "grad_norm": 8.260435104370117, "learning_rate": 8.363413752618187e-06, "loss": 0.5761, "step": 11550 }, { "epoch": 1.0168895144264603, "grad_norm": 8.668198585510254, "learning_rate": 8.359625196608938e-06, "loss": 0.6378, "step": 11560 }, { "epoch": 1.0177691766361716, "grad_norm": 9.917679786682129, "learning_rate": 8.355833120849358e-06, "loss": 0.6335, "step": 11570 }, { "epoch": 1.0186488388458832, "grad_norm": 6.2154717445373535, "learning_rate": 8.35203752931226e-06, "loss": 0.5384, "step": 11580 }, { "epoch": 1.0195285010555946, "grad_norm": 6.8232951164245605, "learning_rate": 8.348238425974138e-06, "loss": 0.5734, "step": 11590 }, { "epoch": 1.0204081632653061, "grad_norm": 11.689498901367188, "learning_rate": 8.344435814815167e-06, "loss": 0.626, "step": 11600 }, { "epoch": 1.0212878254750175, "grad_norm": 8.21432113647461, "learning_rate": 8.340629699819197e-06, "loss": 0.5231, "step": 11610 }, { "epoch": 1.022167487684729, "grad_norm": 9.142352104187012, "learning_rate": 8.336820084973752e-06, "loss": 0.5708, "step": 11620 }, { "epoch": 1.0230471498944405, "grad_norm": 9.584098815917969, "learning_rate": 8.333006974270013e-06, "loss": 0.6387, "step": 11630 }, { "epoch": 1.023926812104152, "grad_norm": 7.535806179046631, "learning_rate": 8.329190371702832e-06, "loss": 0.5226, "step": 11640 }, { "epoch": 1.0248064743138634, "grad_norm": 7.852949142456055, "learning_rate": 8.325370281270719e-06, "loss": 0.607, "step": 11650 }, { "epoch": 1.025686136523575, "grad_norm": 7.776589393615723, "learning_rate": 8.321546706975837e-06, "loss": 0.5703, "step": 11660 }, { "epoch": 1.0265657987332863, "grad_norm": 8.064594268798828, "learning_rate": 8.317719652823992e-06, "loss": 0.567, "step": 11670 }, { "epoch": 1.027445460942998, "grad_norm": 7.828836917877197, "learning_rate": 8.313889122824646e-06, "loss": 0.4814, "step": 11680 }, { "epoch": 1.0283251231527093, "grad_norm": 8.234220504760742, "learning_rate": 8.310055120990898e-06, "loss": 0.6079, "step": 11690 }, { "epoch": 1.0292047853624209, "grad_norm": 7.304616928100586, "learning_rate": 8.306217651339485e-06, "loss": 0.6696, "step": 11700 }, { "epoch": 1.0300844475721322, "grad_norm": 8.828968048095703, "learning_rate": 8.302376717890775e-06, "loss": 0.6264, "step": 11710 }, { "epoch": 1.0309641097818438, "grad_norm": 7.40019416809082, "learning_rate": 8.298532324668768e-06, "loss": 0.5975, "step": 11720 }, { "epoch": 1.0318437719915552, "grad_norm": 7.610756874084473, "learning_rate": 8.294684475701088e-06, "loss": 0.6914, "step": 11730 }, { "epoch": 1.0327234342012668, "grad_norm": 7.077138900756836, "learning_rate": 8.290833175018973e-06, "loss": 0.5989, "step": 11740 }, { "epoch": 1.0336030964109781, "grad_norm": 6.756237506866455, "learning_rate": 8.286978426657289e-06, "loss": 0.5957, "step": 11750 }, { "epoch": 1.0344827586206897, "grad_norm": 7.757540225982666, "learning_rate": 8.283120234654506e-06, "loss": 0.5136, "step": 11760 }, { "epoch": 1.035362420830401, "grad_norm": 8.682188987731934, "learning_rate": 8.279258603052705e-06, "loss": 0.5851, "step": 11770 }, { "epoch": 1.0362420830401127, "grad_norm": 8.324477195739746, "learning_rate": 8.275393535897564e-06, "loss": 0.5916, "step": 11780 }, { "epoch": 1.037121745249824, "grad_norm": 8.735583305358887, "learning_rate": 8.27152503723837e-06, "loss": 0.6071, "step": 11790 }, { "epoch": 1.0380014074595356, "grad_norm": 6.919816493988037, "learning_rate": 8.267653111128002e-06, "loss": 0.5697, "step": 11800 }, { "epoch": 1.038881069669247, "grad_norm": 8.725064277648926, "learning_rate": 8.263777761622925e-06, "loss": 0.5579, "step": 11810 }, { "epoch": 1.0397607318789586, "grad_norm": 8.754441261291504, "learning_rate": 8.259898992783192e-06, "loss": 0.5737, "step": 11820 }, { "epoch": 1.04064039408867, "grad_norm": 8.889179229736328, "learning_rate": 8.256016808672447e-06, "loss": 0.6098, "step": 11830 }, { "epoch": 1.0415200562983815, "grad_norm": 6.862189292907715, "learning_rate": 8.252131213357899e-06, "loss": 0.5268, "step": 11840 }, { "epoch": 1.0423997185080929, "grad_norm": 6.9862775802612305, "learning_rate": 8.24824221091034e-06, "loss": 0.6252, "step": 11850 }, { "epoch": 1.0432793807178045, "grad_norm": 8.504034042358398, "learning_rate": 8.244349805404128e-06, "loss": 0.6579, "step": 11860 }, { "epoch": 1.0441590429275158, "grad_norm": 7.545679092407227, "learning_rate": 8.240454000917187e-06, "loss": 0.5987, "step": 11870 }, { "epoch": 1.0450387051372274, "grad_norm": 8.153464317321777, "learning_rate": 8.236554801531e-06, "loss": 0.5593, "step": 11880 }, { "epoch": 1.0459183673469388, "grad_norm": 9.322982788085938, "learning_rate": 8.232652211330611e-06, "loss": 0.5709, "step": 11890 }, { "epoch": 1.0467980295566504, "grad_norm": 5.574365139007568, "learning_rate": 8.228746234404612e-06, "loss": 0.5577, "step": 11900 }, { "epoch": 1.0476776917663617, "grad_norm": 8.885111808776855, "learning_rate": 8.224836874845145e-06, "loss": 0.5764, "step": 11910 }, { "epoch": 1.048557353976073, "grad_norm": 7.374355316162109, "learning_rate": 8.2209241367479e-06, "loss": 0.5101, "step": 11920 }, { "epoch": 1.0494370161857847, "grad_norm": 6.532994270324707, "learning_rate": 8.217008024212096e-06, "loss": 0.5798, "step": 11930 }, { "epoch": 1.050316678395496, "grad_norm": 9.485751152038574, "learning_rate": 8.2130885413405e-06, "loss": 0.5889, "step": 11940 }, { "epoch": 1.0511963406052076, "grad_norm": 8.639615058898926, "learning_rate": 8.2091656922394e-06, "loss": 0.6308, "step": 11950 }, { "epoch": 1.052076002814919, "grad_norm": 8.031490325927734, "learning_rate": 8.205239481018614e-06, "loss": 0.5923, "step": 11960 }, { "epoch": 1.0529556650246306, "grad_norm": 5.582894802093506, "learning_rate": 8.201309911791483e-06, "loss": 0.5693, "step": 11970 }, { "epoch": 1.053835327234342, "grad_norm": 8.799117088317871, "learning_rate": 8.197376988674869e-06, "loss": 0.5984, "step": 11980 }, { "epoch": 1.0547149894440535, "grad_norm": 7.974264621734619, "learning_rate": 8.19344071578914e-06, "loss": 0.6447, "step": 11990 }, { "epoch": 1.0555946516537649, "grad_norm": 8.130180358886719, "learning_rate": 8.189501097258183e-06, "loss": 0.6096, "step": 12000 }, { "epoch": 1.0564743138634765, "grad_norm": 7.474098205566406, "learning_rate": 8.18555813720938e-06, "loss": 0.6145, "step": 12010 }, { "epoch": 1.0573539760731878, "grad_norm": 6.392549991607666, "learning_rate": 8.181611839773622e-06, "loss": 0.5189, "step": 12020 }, { "epoch": 1.0582336382828994, "grad_norm": 7.145114421844482, "learning_rate": 8.177662209085293e-06, "loss": 0.5139, "step": 12030 }, { "epoch": 1.0591133004926108, "grad_norm": 11.989012718200684, "learning_rate": 8.17370924928227e-06, "loss": 0.587, "step": 12040 }, { "epoch": 1.0599929627023223, "grad_norm": 7.2929534912109375, "learning_rate": 8.169752964505912e-06, "loss": 0.5993, "step": 12050 }, { "epoch": 1.0608726249120337, "grad_norm": 6.527358531951904, "learning_rate": 8.165793358901075e-06, "loss": 0.5863, "step": 12060 }, { "epoch": 1.0617522871217453, "grad_norm": 8.895418167114258, "learning_rate": 8.161830436616081e-06, "loss": 0.6293, "step": 12070 }, { "epoch": 1.0626319493314567, "grad_norm": 7.628481864929199, "learning_rate": 8.15786420180273e-06, "loss": 0.6312, "step": 12080 }, { "epoch": 1.0635116115411682, "grad_norm": 7.998660087585449, "learning_rate": 8.153894658616298e-06, "loss": 0.5658, "step": 12090 }, { "epoch": 1.0643912737508796, "grad_norm": 9.488481521606445, "learning_rate": 8.14992181121552e-06, "loss": 0.6084, "step": 12100 }, { "epoch": 1.0652709359605912, "grad_norm": 6.971285820007324, "learning_rate": 8.145945663762596e-06, "loss": 0.6251, "step": 12110 }, { "epoch": 1.0661505981703026, "grad_norm": 9.219334602355957, "learning_rate": 8.141966220423185e-06, "loss": 0.5314, "step": 12120 }, { "epoch": 1.0670302603800141, "grad_norm": 8.80787181854248, "learning_rate": 8.137983485366397e-06, "loss": 0.5832, "step": 12130 }, { "epoch": 1.0679099225897255, "grad_norm": 7.758588790893555, "learning_rate": 8.133997462764786e-06, "loss": 0.5706, "step": 12140 }, { "epoch": 1.068789584799437, "grad_norm": 7.051955699920654, "learning_rate": 8.13000815679436e-06, "loss": 0.4779, "step": 12150 }, { "epoch": 1.0696692470091484, "grad_norm": 5.3071136474609375, "learning_rate": 8.12601557163456e-06, "loss": 0.5841, "step": 12160 }, { "epoch": 1.07054890921886, "grad_norm": 9.553003311157227, "learning_rate": 8.122019711468265e-06, "loss": 0.5782, "step": 12170 }, { "epoch": 1.0714285714285714, "grad_norm": 7.60118293762207, "learning_rate": 8.118020580481781e-06, "loss": 0.5491, "step": 12180 }, { "epoch": 1.072308233638283, "grad_norm": 6.619450092315674, "learning_rate": 8.114018182864848e-06, "loss": 0.5403, "step": 12190 }, { "epoch": 1.0731878958479943, "grad_norm": 8.343246459960938, "learning_rate": 8.110012522810624e-06, "loss": 0.5307, "step": 12200 }, { "epoch": 1.074067558057706, "grad_norm": 7.139920711517334, "learning_rate": 8.106003604515681e-06, "loss": 0.6341, "step": 12210 }, { "epoch": 1.0749472202674173, "grad_norm": 6.857558250427246, "learning_rate": 8.101991432180015e-06, "loss": 0.5552, "step": 12220 }, { "epoch": 1.0758268824771289, "grad_norm": 9.11231517791748, "learning_rate": 8.097976010007019e-06, "loss": 0.5449, "step": 12230 }, { "epoch": 1.0767065446868402, "grad_norm": 7.857626914978027, "learning_rate": 8.0939573422035e-06, "loss": 0.5247, "step": 12240 }, { "epoch": 1.0775862068965518, "grad_norm": 7.160214424133301, "learning_rate": 8.08993543297966e-06, "loss": 0.5767, "step": 12250 }, { "epoch": 1.0784658691062632, "grad_norm": 7.842446804046631, "learning_rate": 8.085910286549102e-06, "loss": 0.6195, "step": 12260 }, { "epoch": 1.0793455313159748, "grad_norm": 10.639360427856445, "learning_rate": 8.081881907128814e-06, "loss": 0.5986, "step": 12270 }, { "epoch": 1.0802251935256861, "grad_norm": 7.386841297149658, "learning_rate": 8.077850298939178e-06, "loss": 0.5685, "step": 12280 }, { "epoch": 1.0811048557353975, "grad_norm": 8.508967399597168, "learning_rate": 8.07381546620395e-06, "loss": 0.5815, "step": 12290 }, { "epoch": 1.081984517945109, "grad_norm": 7.58867883682251, "learning_rate": 8.069777413150277e-06, "loss": 0.5495, "step": 12300 }, { "epoch": 1.0828641801548207, "grad_norm": 9.593918800354004, "learning_rate": 8.065736144008664e-06, "loss": 0.5886, "step": 12310 }, { "epoch": 1.083743842364532, "grad_norm": 9.208586692810059, "learning_rate": 8.061691663012998e-06, "loss": 0.5843, "step": 12320 }, { "epoch": 1.0846235045742434, "grad_norm": 6.734734058380127, "learning_rate": 8.057643974400526e-06, "loss": 0.5526, "step": 12330 }, { "epoch": 1.085503166783955, "grad_norm": 6.331991195678711, "learning_rate": 8.053593082411855e-06, "loss": 0.6122, "step": 12340 }, { "epoch": 1.0863828289936663, "grad_norm": 7.558201789855957, "learning_rate": 8.04953899129095e-06, "loss": 0.5305, "step": 12350 }, { "epoch": 1.087262491203378, "grad_norm": 7.840267658233643, "learning_rate": 8.045481705285125e-06, "loss": 0.585, "step": 12360 }, { "epoch": 1.0881421534130893, "grad_norm": 11.667179107666016, "learning_rate": 8.041421228645046e-06, "loss": 0.5981, "step": 12370 }, { "epoch": 1.0890218156228009, "grad_norm": 7.44216775894165, "learning_rate": 8.037357565624714e-06, "loss": 0.5926, "step": 12380 }, { "epoch": 1.0899014778325122, "grad_norm": 7.676124572753906, "learning_rate": 8.033290720481476e-06, "loss": 0.6029, "step": 12390 }, { "epoch": 1.0907811400422238, "grad_norm": 7.487218856811523, "learning_rate": 8.029220697476008e-06, "loss": 0.6144, "step": 12400 }, { "epoch": 1.0916608022519352, "grad_norm": 6.962316036224365, "learning_rate": 8.025147500872319e-06, "loss": 0.5631, "step": 12410 }, { "epoch": 1.0925404644616468, "grad_norm": 7.8070783615112305, "learning_rate": 8.021071134937736e-06, "loss": 0.5465, "step": 12420 }, { "epoch": 1.0934201266713581, "grad_norm": 8.603740692138672, "learning_rate": 8.016991603942918e-06, "loss": 0.6571, "step": 12430 }, { "epoch": 1.0942997888810697, "grad_norm": 11.006976127624512, "learning_rate": 8.012908912161828e-06, "loss": 0.6228, "step": 12440 }, { "epoch": 1.095179451090781, "grad_norm": 7.108590126037598, "learning_rate": 8.008823063871746e-06, "loss": 0.6146, "step": 12450 }, { "epoch": 1.0960591133004927, "grad_norm": 6.553457260131836, "learning_rate": 8.00473406335326e-06, "loss": 0.5278, "step": 12460 }, { "epoch": 1.096938775510204, "grad_norm": 6.721995830535889, "learning_rate": 8.000641914890257e-06, "loss": 0.4821, "step": 12470 }, { "epoch": 1.0978184377199156, "grad_norm": 7.7367844581604, "learning_rate": 7.996546622769925e-06, "loss": 0.6186, "step": 12480 }, { "epoch": 1.098698099929627, "grad_norm": 7.608863830566406, "learning_rate": 7.992448191282745e-06, "loss": 0.5306, "step": 12490 }, { "epoch": 1.0995777621393386, "grad_norm": 9.00650691986084, "learning_rate": 7.988346624722484e-06, "loss": 0.5853, "step": 12500 }, { "epoch": 1.10045742434905, "grad_norm": 8.452534675598145, "learning_rate": 7.984241927386199e-06, "loss": 0.5575, "step": 12510 }, { "epoch": 1.1013370865587615, "grad_norm": 7.744530200958252, "learning_rate": 7.98013410357422e-06, "loss": 0.6019, "step": 12520 }, { "epoch": 1.1022167487684729, "grad_norm": 5.935032844543457, "learning_rate": 7.976023157590159e-06, "loss": 0.5471, "step": 12530 }, { "epoch": 1.1030964109781844, "grad_norm": 9.631288528442383, "learning_rate": 7.971909093740894e-06, "loss": 0.6476, "step": 12540 }, { "epoch": 1.1039760731878958, "grad_norm": 7.360828876495361, "learning_rate": 7.967791916336575e-06, "loss": 0.5209, "step": 12550 }, { "epoch": 1.1048557353976074, "grad_norm": 7.481033802032471, "learning_rate": 7.96367162969061e-06, "loss": 0.6603, "step": 12560 }, { "epoch": 1.1057353976073188, "grad_norm": 7.018675327301025, "learning_rate": 7.959548238119663e-06, "loss": 0.5143, "step": 12570 }, { "epoch": 1.1066150598170303, "grad_norm": 7.054087162017822, "learning_rate": 7.955421745943654e-06, "loss": 0.5922, "step": 12580 }, { "epoch": 1.1074947220267417, "grad_norm": 10.1622953414917, "learning_rate": 7.95129215748575e-06, "loss": 0.5627, "step": 12590 }, { "epoch": 1.1083743842364533, "grad_norm": 6.5921430587768555, "learning_rate": 7.947159477072363e-06, "loss": 0.561, "step": 12600 }, { "epoch": 1.1092540464461647, "grad_norm": 7.595539569854736, "learning_rate": 7.943023709033146e-06, "loss": 0.6111, "step": 12610 }, { "epoch": 1.1101337086558762, "grad_norm": 6.488556861877441, "learning_rate": 7.938884857700979e-06, "loss": 0.5683, "step": 12620 }, { "epoch": 1.1110133708655876, "grad_norm": 7.177140712738037, "learning_rate": 7.934742927411983e-06, "loss": 0.5708, "step": 12630 }, { "epoch": 1.1118930330752992, "grad_norm": 8.59157657623291, "learning_rate": 7.930597922505494e-06, "loss": 0.6142, "step": 12640 }, { "epoch": 1.1127726952850105, "grad_norm": 10.4617338180542, "learning_rate": 7.92644984732408e-06, "loss": 0.5552, "step": 12650 }, { "epoch": 1.113652357494722, "grad_norm": 6.717123031616211, "learning_rate": 7.922298706213515e-06, "loss": 0.5886, "step": 12660 }, { "epoch": 1.1145320197044335, "grad_norm": 5.260331153869629, "learning_rate": 7.918144503522793e-06, "loss": 0.4847, "step": 12670 }, { "epoch": 1.115411681914145, "grad_norm": 10.307119369506836, "learning_rate": 7.913987243604113e-06, "loss": 0.5736, "step": 12680 }, { "epoch": 1.1162913441238564, "grad_norm": 7.090002536773682, "learning_rate": 7.909826930812876e-06, "loss": 0.6336, "step": 12690 }, { "epoch": 1.1171710063335678, "grad_norm": 11.009613037109375, "learning_rate": 7.905663569507683e-06, "loss": 0.6331, "step": 12700 }, { "epoch": 1.1180506685432794, "grad_norm": 9.685070037841797, "learning_rate": 7.901497164050325e-06, "loss": 0.602, "step": 12710 }, { "epoch": 1.1189303307529908, "grad_norm": 8.219822883605957, "learning_rate": 7.897327718805786e-06, "loss": 0.5365, "step": 12720 }, { "epoch": 1.1198099929627023, "grad_norm": 10.204384803771973, "learning_rate": 7.893155238142237e-06, "loss": 0.659, "step": 12730 }, { "epoch": 1.1206896551724137, "grad_norm": 7.885805130004883, "learning_rate": 7.88897972643102e-06, "loss": 0.4733, "step": 12740 }, { "epoch": 1.1215693173821253, "grad_norm": 8.030442237854004, "learning_rate": 7.884801188046663e-06, "loss": 0.546, "step": 12750 }, { "epoch": 1.1224489795918366, "grad_norm": 9.007575988769531, "learning_rate": 7.88061962736686e-06, "loss": 0.6353, "step": 12760 }, { "epoch": 1.1233286418015482, "grad_norm": 8.007272720336914, "learning_rate": 7.876435048772468e-06, "loss": 0.5255, "step": 12770 }, { "epoch": 1.1242083040112596, "grad_norm": 7.643534183502197, "learning_rate": 7.872247456647513e-06, "loss": 0.574, "step": 12780 }, { "epoch": 1.1250879662209712, "grad_norm": 13.174545288085938, "learning_rate": 7.868056855379174e-06, "loss": 0.5865, "step": 12790 }, { "epoch": 1.1259676284306825, "grad_norm": 8.439684867858887, "learning_rate": 7.86386324935778e-06, "loss": 0.5648, "step": 12800 }, { "epoch": 1.1268472906403941, "grad_norm": 7.971238136291504, "learning_rate": 7.859666642976816e-06, "loss": 0.5409, "step": 12810 }, { "epoch": 1.1277269528501055, "grad_norm": 7.609548568725586, "learning_rate": 7.8554670406329e-06, "loss": 0.5177, "step": 12820 }, { "epoch": 1.128606615059817, "grad_norm": 7.3401336669921875, "learning_rate": 7.851264446725797e-06, "loss": 0.6537, "step": 12830 }, { "epoch": 1.1294862772695284, "grad_norm": 7.238247871398926, "learning_rate": 7.847058865658404e-06, "loss": 0.5086, "step": 12840 }, { "epoch": 1.13036593947924, "grad_norm": 7.037383079528809, "learning_rate": 7.842850301836746e-06, "loss": 0.5665, "step": 12850 }, { "epoch": 1.1312456016889514, "grad_norm": 7.42608118057251, "learning_rate": 7.838638759669971e-06, "loss": 0.5069, "step": 12860 }, { "epoch": 1.132125263898663, "grad_norm": 7.664800643920898, "learning_rate": 7.834424243570356e-06, "loss": 0.5323, "step": 12870 }, { "epoch": 1.1330049261083743, "grad_norm": 8.97972297668457, "learning_rate": 7.830206757953284e-06, "loss": 0.5803, "step": 12880 }, { "epoch": 1.133884588318086, "grad_norm": 10.487667083740234, "learning_rate": 7.82598630723725e-06, "loss": 0.6117, "step": 12890 }, { "epoch": 1.1347642505277973, "grad_norm": 7.155311584472656, "learning_rate": 7.821762895843864e-06, "loss": 0.5883, "step": 12900 }, { "epoch": 1.1356439127375089, "grad_norm": 6.260738372802734, "learning_rate": 7.81753652819783e-06, "loss": 0.5673, "step": 12910 }, { "epoch": 1.1365235749472202, "grad_norm": 9.291387557983398, "learning_rate": 7.813307208726948e-06, "loss": 0.6405, "step": 12920 }, { "epoch": 1.1374032371569318, "grad_norm": 8.622729301452637, "learning_rate": 7.809074941862116e-06, "loss": 0.5111, "step": 12930 }, { "epoch": 1.1382828993666432, "grad_norm": 7.473614692687988, "learning_rate": 7.804839732037319e-06, "loss": 0.5049, "step": 12940 }, { "epoch": 1.1391625615763548, "grad_norm": 7.946528434753418, "learning_rate": 7.800601583689622e-06, "loss": 0.557, "step": 12950 }, { "epoch": 1.1400422237860661, "grad_norm": 6.729836463928223, "learning_rate": 7.79636050125917e-06, "loss": 0.6601, "step": 12960 }, { "epoch": 1.1409218859957777, "grad_norm": 6.728960990905762, "learning_rate": 7.792116489189183e-06, "loss": 0.5384, "step": 12970 }, { "epoch": 1.141801548205489, "grad_norm": 9.298859596252441, "learning_rate": 7.787869551925949e-06, "loss": 0.5254, "step": 12980 }, { "epoch": 1.1426812104152007, "grad_norm": 8.573859214782715, "learning_rate": 7.78361969391882e-06, "loss": 0.5768, "step": 12990 }, { "epoch": 1.143560872624912, "grad_norm": 9.560981750488281, "learning_rate": 7.779366919620212e-06, "loss": 0.5626, "step": 13000 }, { "epoch": 1.1444405348346236, "grad_norm": 7.249106407165527, "learning_rate": 7.775111233485592e-06, "loss": 0.5071, "step": 13010 }, { "epoch": 1.145320197044335, "grad_norm": 8.450703620910645, "learning_rate": 7.770852639973477e-06, "loss": 0.6001, "step": 13020 }, { "epoch": 1.1461998592540463, "grad_norm": 7.013330936431885, "learning_rate": 7.766591143545435e-06, "loss": 0.6528, "step": 13030 }, { "epoch": 1.147079521463758, "grad_norm": 8.423206329345703, "learning_rate": 7.762326748666072e-06, "loss": 0.5328, "step": 13040 }, { "epoch": 1.1479591836734695, "grad_norm": 8.476350784301758, "learning_rate": 7.758059459803027e-06, "loss": 0.5754, "step": 13050 }, { "epoch": 1.1488388458831809, "grad_norm": 9.691259384155273, "learning_rate": 7.75378928142698e-06, "loss": 0.6044, "step": 13060 }, { "epoch": 1.1497185080928922, "grad_norm": 8.391997337341309, "learning_rate": 7.749516218011628e-06, "loss": 0.5349, "step": 13070 }, { "epoch": 1.1505981703026038, "grad_norm": 7.1731390953063965, "learning_rate": 7.7452402740337e-06, "loss": 0.5121, "step": 13080 }, { "epoch": 1.1514778325123154, "grad_norm": 9.786884307861328, "learning_rate": 7.740961453972933e-06, "loss": 0.5895, "step": 13090 }, { "epoch": 1.1523574947220268, "grad_norm": 7.553300857543945, "learning_rate": 7.73667976231209e-06, "loss": 0.5268, "step": 13100 }, { "epoch": 1.1532371569317381, "grad_norm": 7.397988796234131, "learning_rate": 7.732395203536927e-06, "loss": 0.529, "step": 13110 }, { "epoch": 1.1541168191414497, "grad_norm": 11.817912101745605, "learning_rate": 7.728107782136216e-06, "loss": 0.634, "step": 13120 }, { "epoch": 1.154996481351161, "grad_norm": 10.07180118560791, "learning_rate": 7.723817502601724e-06, "loss": 0.5845, "step": 13130 }, { "epoch": 1.1558761435608726, "grad_norm": 6.0853047370910645, "learning_rate": 7.719524369428211e-06, "loss": 0.493, "step": 13140 }, { "epoch": 1.156755805770584, "grad_norm": 8.719619750976562, "learning_rate": 7.715228387113427e-06, "loss": 0.5451, "step": 13150 }, { "epoch": 1.1576354679802956, "grad_norm": 7.722339630126953, "learning_rate": 7.710929560158111e-06, "loss": 0.5851, "step": 13160 }, { "epoch": 1.158515130190007, "grad_norm": 6.445685386657715, "learning_rate": 7.706627893065975e-06, "loss": 0.4925, "step": 13170 }, { "epoch": 1.1593947923997185, "grad_norm": 7.203913688659668, "learning_rate": 7.702323390343713e-06, "loss": 0.5924, "step": 13180 }, { "epoch": 1.16027445460943, "grad_norm": 6.914217472076416, "learning_rate": 7.698016056500986e-06, "loss": 0.6313, "step": 13190 }, { "epoch": 1.1611541168191415, "grad_norm": 7.762572765350342, "learning_rate": 7.693705896050423e-06, "loss": 0.5338, "step": 13200 }, { "epoch": 1.1620337790288529, "grad_norm": 9.136201858520508, "learning_rate": 7.689392913507611e-06, "loss": 0.5982, "step": 13210 }, { "epoch": 1.1629134412385644, "grad_norm": 7.919384002685547, "learning_rate": 7.685077113391102e-06, "loss": 0.5711, "step": 13220 }, { "epoch": 1.1637931034482758, "grad_norm": 7.14493989944458, "learning_rate": 7.68075850022239e-06, "loss": 0.5815, "step": 13230 }, { "epoch": 1.1646727656579874, "grad_norm": 9.953307151794434, "learning_rate": 7.67643707852592e-06, "loss": 0.5266, "step": 13240 }, { "epoch": 1.1655524278676987, "grad_norm": 7.297393798828125, "learning_rate": 7.67211285282908e-06, "loss": 0.5356, "step": 13250 }, { "epoch": 1.1664320900774103, "grad_norm": 10.925657272338867, "learning_rate": 7.667785827662197e-06, "loss": 0.5708, "step": 13260 }, { "epoch": 1.1673117522871217, "grad_norm": 9.23708438873291, "learning_rate": 7.663456007558527e-06, "loss": 0.6129, "step": 13270 }, { "epoch": 1.1681914144968333, "grad_norm": 7.488320827484131, "learning_rate": 7.659123397054259e-06, "loss": 0.5516, "step": 13280 }, { "epoch": 1.1690710767065446, "grad_norm": 6.500202655792236, "learning_rate": 7.654788000688498e-06, "loss": 0.5716, "step": 13290 }, { "epoch": 1.1699507389162562, "grad_norm": 10.763008117675781, "learning_rate": 7.650449823003274e-06, "loss": 0.5808, "step": 13300 }, { "epoch": 1.1708304011259676, "grad_norm": 5.932322025299072, "learning_rate": 7.646108868543533e-06, "loss": 0.6027, "step": 13310 }, { "epoch": 1.1717100633356792, "grad_norm": 9.5794677734375, "learning_rate": 7.641765141857125e-06, "loss": 0.6596, "step": 13320 }, { "epoch": 1.1725897255453905, "grad_norm": 7.075094223022461, "learning_rate": 7.6374186474948e-06, "loss": 0.497, "step": 13330 }, { "epoch": 1.1734693877551021, "grad_norm": 7.587216854095459, "learning_rate": 7.633069390010222e-06, "loss": 0.6251, "step": 13340 }, { "epoch": 1.1743490499648135, "grad_norm": 8.407641410827637, "learning_rate": 7.628717373959935e-06, "loss": 0.5375, "step": 13350 }, { "epoch": 1.175228712174525, "grad_norm": 7.672147274017334, "learning_rate": 7.62436260390338e-06, "loss": 0.6586, "step": 13360 }, { "epoch": 1.1761083743842364, "grad_norm": 6.667666435241699, "learning_rate": 7.620005084402885e-06, "loss": 0.4677, "step": 13370 }, { "epoch": 1.176988036593948, "grad_norm": 9.85606861114502, "learning_rate": 7.6156448200236525e-06, "loss": 0.5932, "step": 13380 }, { "epoch": 1.1778676988036594, "grad_norm": 6.786891460418701, "learning_rate": 7.611281815333767e-06, "loss": 0.6476, "step": 13390 }, { "epoch": 1.1787473610133707, "grad_norm": 10.463642120361328, "learning_rate": 7.6069160749041785e-06, "loss": 0.6459, "step": 13400 }, { "epoch": 1.1796270232230823, "grad_norm": 6.578385829925537, "learning_rate": 7.602547603308707e-06, "loss": 0.5859, "step": 13410 }, { "epoch": 1.180506685432794, "grad_norm": 7.430306911468506, "learning_rate": 7.598176405124031e-06, "loss": 0.4914, "step": 13420 }, { "epoch": 1.1813863476425053, "grad_norm": 8.293041229248047, "learning_rate": 7.59380248492969e-06, "loss": 0.4977, "step": 13430 }, { "epoch": 1.1822660098522166, "grad_norm": 8.912906646728516, "learning_rate": 7.589425847308071e-06, "loss": 0.6837, "step": 13440 }, { "epoch": 1.1831456720619282, "grad_norm": 8.757073402404785, "learning_rate": 7.5850464968444075e-06, "loss": 0.543, "step": 13450 }, { "epoch": 1.1840253342716398, "grad_norm": 11.60096263885498, "learning_rate": 7.580664438126779e-06, "loss": 0.6809, "step": 13460 }, { "epoch": 1.1849049964813512, "grad_norm": 8.979601860046387, "learning_rate": 7.576279675746098e-06, "loss": 0.5879, "step": 13470 }, { "epoch": 1.1857846586910625, "grad_norm": 9.390915870666504, "learning_rate": 7.571892214296114e-06, "loss": 0.5895, "step": 13480 }, { "epoch": 1.1866643209007741, "grad_norm": 10.91619873046875, "learning_rate": 7.567502058373401e-06, "loss": 0.5718, "step": 13490 }, { "epoch": 1.1875439831104855, "grad_norm": 8.704005241394043, "learning_rate": 7.563109212577355e-06, "loss": 0.615, "step": 13500 }, { "epoch": 1.188423645320197, "grad_norm": 7.740281105041504, "learning_rate": 7.558713681510196e-06, "loss": 0.5194, "step": 13510 }, { "epoch": 1.1893033075299084, "grad_norm": 9.596470832824707, "learning_rate": 7.554315469776949e-06, "loss": 0.5329, "step": 13520 }, { "epoch": 1.19018296973962, "grad_norm": 8.664963722229004, "learning_rate": 7.549914581985454e-06, "loss": 0.5284, "step": 13530 }, { "epoch": 1.1910626319493314, "grad_norm": 7.481260299682617, "learning_rate": 7.54551102274635e-06, "loss": 0.6122, "step": 13540 }, { "epoch": 1.191942294159043, "grad_norm": 5.42224645614624, "learning_rate": 7.541104796673081e-06, "loss": 0.5872, "step": 13550 }, { "epoch": 1.1928219563687543, "grad_norm": 8.704337120056152, "learning_rate": 7.5366959083818765e-06, "loss": 0.5439, "step": 13560 }, { "epoch": 1.193701618578466, "grad_norm": 8.045351028442383, "learning_rate": 7.532284362491762e-06, "loss": 0.5889, "step": 13570 }, { "epoch": 1.1945812807881773, "grad_norm": 9.120620727539062, "learning_rate": 7.527870163624544e-06, "loss": 0.5312, "step": 13580 }, { "epoch": 1.1954609429978889, "grad_norm": 10.681914329528809, "learning_rate": 7.523453316404809e-06, "loss": 0.5903, "step": 13590 }, { "epoch": 1.1963406052076002, "grad_norm": 7.801657199859619, "learning_rate": 7.519033825459918e-06, "loss": 0.5843, "step": 13600 }, { "epoch": 1.1972202674173118, "grad_norm": 6.025331974029541, "learning_rate": 7.5146116954200044e-06, "loss": 0.4867, "step": 13610 }, { "epoch": 1.1980999296270232, "grad_norm": 7.54993200302124, "learning_rate": 7.51018693091796e-06, "loss": 0.5933, "step": 13620 }, { "epoch": 1.1989795918367347, "grad_norm": 6.8038129806518555, "learning_rate": 7.505759536589442e-06, "loss": 0.511, "step": 13630 }, { "epoch": 1.199859254046446, "grad_norm": 5.675104141235352, "learning_rate": 7.501329517072865e-06, "loss": 0.5355, "step": 13640 }, { "epoch": 1.2007389162561577, "grad_norm": 10.02097225189209, "learning_rate": 7.496896877009385e-06, "loss": 0.6155, "step": 13650 }, { "epoch": 1.201618578465869, "grad_norm": 9.393360137939453, "learning_rate": 7.492461621042913e-06, "loss": 0.4977, "step": 13660 }, { "epoch": 1.2024982406755806, "grad_norm": 6.623519420623779, "learning_rate": 7.488023753820095e-06, "loss": 0.5443, "step": 13670 }, { "epoch": 1.203377902885292, "grad_norm": 9.665759086608887, "learning_rate": 7.483583279990314e-06, "loss": 0.5608, "step": 13680 }, { "epoch": 1.2042575650950036, "grad_norm": 9.989501953125, "learning_rate": 7.479140204205682e-06, "loss": 0.5231, "step": 13690 }, { "epoch": 1.205137227304715, "grad_norm": 8.187620162963867, "learning_rate": 7.474694531121043e-06, "loss": 0.5041, "step": 13700 }, { "epoch": 1.2060168895144265, "grad_norm": 6.390125274658203, "learning_rate": 7.470246265393955e-06, "loss": 0.611, "step": 13710 }, { "epoch": 1.206896551724138, "grad_norm": 8.835799217224121, "learning_rate": 7.465795411684695e-06, "loss": 0.5408, "step": 13720 }, { "epoch": 1.2077762139338495, "grad_norm": 5.5417985916137695, "learning_rate": 7.461341974656254e-06, "loss": 0.542, "step": 13730 }, { "epoch": 1.2086558761435608, "grad_norm": 7.760707855224609, "learning_rate": 7.456885958974325e-06, "loss": 0.5048, "step": 13740 }, { "epoch": 1.2095355383532724, "grad_norm": 10.017494201660156, "learning_rate": 7.452427369307303e-06, "loss": 0.5496, "step": 13750 }, { "epoch": 1.2104152005629838, "grad_norm": 6.603514194488525, "learning_rate": 7.447966210326287e-06, "loss": 0.5063, "step": 13760 }, { "epoch": 1.2112948627726954, "grad_norm": 7.037947177886963, "learning_rate": 7.4435024867050566e-06, "loss": 0.6553, "step": 13770 }, { "epoch": 1.2121745249824067, "grad_norm": 7.217723369598389, "learning_rate": 7.439036203120084e-06, "loss": 0.5322, "step": 13780 }, { "epoch": 1.2130541871921183, "grad_norm": 6.5982866287231445, "learning_rate": 7.434567364250527e-06, "loss": 0.5723, "step": 13790 }, { "epoch": 1.2139338494018297, "grad_norm": 10.606630325317383, "learning_rate": 7.4300959747782135e-06, "loss": 0.6335, "step": 13800 }, { "epoch": 1.214813511611541, "grad_norm": 7.576580047607422, "learning_rate": 7.425622039387645e-06, "loss": 0.5184, "step": 13810 }, { "epoch": 1.2156931738212526, "grad_norm": 7.089601039886475, "learning_rate": 7.4211455627659955e-06, "loss": 0.5333, "step": 13820 }, { "epoch": 1.2165728360309642, "grad_norm": 11.016584396362305, "learning_rate": 7.416666549603094e-06, "loss": 0.5541, "step": 13830 }, { "epoch": 1.2174524982406756, "grad_norm": 10.275322914123535, "learning_rate": 7.412185004591434e-06, "loss": 0.5585, "step": 13840 }, { "epoch": 1.218332160450387, "grad_norm": 9.521339416503906, "learning_rate": 7.407700932426158e-06, "loss": 0.4978, "step": 13850 }, { "epoch": 1.2192118226600985, "grad_norm": 11.003255844116211, "learning_rate": 7.4032143378050535e-06, "loss": 0.5305, "step": 13860 }, { "epoch": 1.2200914848698101, "grad_norm": 10.373151779174805, "learning_rate": 7.398725225428555e-06, "loss": 0.5589, "step": 13870 }, { "epoch": 1.2209711470795215, "grad_norm": 7.177501678466797, "learning_rate": 7.394233599999734e-06, "loss": 0.5243, "step": 13880 }, { "epoch": 1.2218508092892328, "grad_norm": 9.694533348083496, "learning_rate": 7.389739466224293e-06, "loss": 0.599, "step": 13890 }, { "epoch": 1.2227304714989444, "grad_norm": 9.523486137390137, "learning_rate": 7.385242828810561e-06, "loss": 0.4966, "step": 13900 }, { "epoch": 1.2236101337086558, "grad_norm": 7.345489978790283, "learning_rate": 7.3807436924694985e-06, "loss": 0.5615, "step": 13910 }, { "epoch": 1.2244897959183674, "grad_norm": 8.574580192565918, "learning_rate": 7.376242061914673e-06, "loss": 0.5285, "step": 13920 }, { "epoch": 1.2253694581280787, "grad_norm": 9.671597480773926, "learning_rate": 7.3717379418622715e-06, "loss": 0.6096, "step": 13930 }, { "epoch": 1.2262491203377903, "grad_norm": 7.396388053894043, "learning_rate": 7.367231337031087e-06, "loss": 0.5025, "step": 13940 }, { "epoch": 1.2271287825475017, "grad_norm": 9.664523124694824, "learning_rate": 7.362722252142517e-06, "loss": 0.5488, "step": 13950 }, { "epoch": 1.2280084447572133, "grad_norm": 9.227258682250977, "learning_rate": 7.358210691920556e-06, "loss": 0.6528, "step": 13960 }, { "epoch": 1.2288881069669246, "grad_norm": 7.006105422973633, "learning_rate": 7.353696661091795e-06, "loss": 0.5488, "step": 13970 }, { "epoch": 1.2297677691766362, "grad_norm": 7.40347957611084, "learning_rate": 7.349180164385405e-06, "loss": 0.4632, "step": 13980 }, { "epoch": 1.2306474313863476, "grad_norm": 9.056800842285156, "learning_rate": 7.3446612065331524e-06, "loss": 0.5722, "step": 13990 }, { "epoch": 1.2315270935960592, "grad_norm": 8.078840255737305, "learning_rate": 7.340139792269374e-06, "loss": 0.6522, "step": 14000 }, { "epoch": 1.2324067558057705, "grad_norm": 8.293356895446777, "learning_rate": 7.33561592633098e-06, "loss": 0.5734, "step": 14010 }, { "epoch": 1.233286418015482, "grad_norm": 6.099889278411865, "learning_rate": 7.3310896134574515e-06, "loss": 0.526, "step": 14020 }, { "epoch": 1.2341660802251935, "grad_norm": 13.082859992980957, "learning_rate": 7.326560858390836e-06, "loss": 0.5421, "step": 14030 }, { "epoch": 1.235045742434905, "grad_norm": 8.52652645111084, "learning_rate": 7.322029665875732e-06, "loss": 0.552, "step": 14040 }, { "epoch": 1.2359254046446164, "grad_norm": 7.029118537902832, "learning_rate": 7.317496040659297e-06, "loss": 0.5574, "step": 14050 }, { "epoch": 1.236805066854328, "grad_norm": 7.787009239196777, "learning_rate": 7.312959987491239e-06, "loss": 0.5494, "step": 14060 }, { "epoch": 1.2376847290640394, "grad_norm": 6.9556379318237305, "learning_rate": 7.308421511123803e-06, "loss": 0.5104, "step": 14070 }, { "epoch": 1.238564391273751, "grad_norm": 7.409199237823486, "learning_rate": 7.303880616311781e-06, "loss": 0.5121, "step": 14080 }, { "epoch": 1.2394440534834623, "grad_norm": 8.319873809814453, "learning_rate": 7.299337307812489e-06, "loss": 0.4897, "step": 14090 }, { "epoch": 1.240323715693174, "grad_norm": 9.54175853729248, "learning_rate": 7.294791590385777e-06, "loss": 0.5331, "step": 14100 }, { "epoch": 1.2412033779028853, "grad_norm": 6.889533519744873, "learning_rate": 7.290243468794023e-06, "loss": 0.5544, "step": 14110 }, { "epoch": 1.2420830401125968, "grad_norm": 5.9160966873168945, "learning_rate": 7.285692947802117e-06, "loss": 0.5554, "step": 14120 }, { "epoch": 1.2429627023223082, "grad_norm": 8.187971115112305, "learning_rate": 7.281140032177465e-06, "loss": 0.4593, "step": 14130 }, { "epoch": 1.2438423645320198, "grad_norm": 9.446537971496582, "learning_rate": 7.276584726689982e-06, "loss": 0.5031, "step": 14140 }, { "epoch": 1.2447220267417312, "grad_norm": 7.918715476989746, "learning_rate": 7.2720270361120856e-06, "loss": 0.5518, "step": 14150 }, { "epoch": 1.2456016889514427, "grad_norm": 8.579546928405762, "learning_rate": 7.267466965218696e-06, "loss": 0.5681, "step": 14160 }, { "epoch": 1.246481351161154, "grad_norm": 11.405834197998047, "learning_rate": 7.262904518787222e-06, "loss": 0.5278, "step": 14170 }, { "epoch": 1.2473610133708655, "grad_norm": 6.060388088226318, "learning_rate": 7.258339701597565e-06, "loss": 0.501, "step": 14180 }, { "epoch": 1.248240675580577, "grad_norm": 6.146483421325684, "learning_rate": 7.253772518432108e-06, "loss": 0.5905, "step": 14190 }, { "epoch": 1.2491203377902886, "grad_norm": 6.23621129989624, "learning_rate": 7.249202974075713e-06, "loss": 0.5848, "step": 14200 }, { "epoch": 1.25, "grad_norm": 7.694369316101074, "learning_rate": 7.244631073315718e-06, "loss": 0.4971, "step": 14210 }, { "epoch": 1.2508796622097114, "grad_norm": 7.256680965423584, "learning_rate": 7.240056820941925e-06, "loss": 0.5098, "step": 14220 }, { "epoch": 1.251759324419423, "grad_norm": 9.356908798217773, "learning_rate": 7.235480221746605e-06, "loss": 0.5646, "step": 14230 }, { "epoch": 1.2526389866291345, "grad_norm": 9.248125076293945, "learning_rate": 7.2309012805244846e-06, "loss": 0.5893, "step": 14240 }, { "epoch": 1.253518648838846, "grad_norm": 14.058982849121094, "learning_rate": 7.226320002072744e-06, "loss": 0.5554, "step": 14250 }, { "epoch": 1.2543983110485573, "grad_norm": 12.598773956298828, "learning_rate": 7.221736391191015e-06, "loss": 0.536, "step": 14260 }, { "epoch": 1.2552779732582688, "grad_norm": 8.334681510925293, "learning_rate": 7.217150452681367e-06, "loss": 0.5628, "step": 14270 }, { "epoch": 1.2561576354679804, "grad_norm": 5.621058464050293, "learning_rate": 7.2125621913483134e-06, "loss": 0.5731, "step": 14280 }, { "epoch": 1.2570372976776918, "grad_norm": 5.561543941497803, "learning_rate": 7.207971611998801e-06, "loss": 0.5752, "step": 14290 }, { "epoch": 1.2579169598874032, "grad_norm": 7.596503257751465, "learning_rate": 7.203378719442204e-06, "loss": 0.6809, "step": 14300 }, { "epoch": 1.2587966220971147, "grad_norm": 6.681259632110596, "learning_rate": 7.1987835184903145e-06, "loss": 0.5136, "step": 14310 }, { "epoch": 1.259676284306826, "grad_norm": 9.608114242553711, "learning_rate": 7.194186013957353e-06, "loss": 0.5113, "step": 14320 }, { "epoch": 1.2605559465165377, "grad_norm": 8.33874225616455, "learning_rate": 7.1895862106599466e-06, "loss": 0.6041, "step": 14330 }, { "epoch": 1.261435608726249, "grad_norm": 6.769319534301758, "learning_rate": 7.184984113417135e-06, "loss": 0.5517, "step": 14340 }, { "epoch": 1.2623152709359606, "grad_norm": 8.972335815429688, "learning_rate": 7.180379727050355e-06, "loss": 0.6017, "step": 14350 }, { "epoch": 1.263194933145672, "grad_norm": 7.16944694519043, "learning_rate": 7.175773056383447e-06, "loss": 0.5768, "step": 14360 }, { "epoch": 1.2640745953553836, "grad_norm": 9.27467155456543, "learning_rate": 7.171164106242645e-06, "loss": 0.5658, "step": 14370 }, { "epoch": 1.264954257565095, "grad_norm": 7.159074783325195, "learning_rate": 7.166552881456566e-06, "loss": 0.4748, "step": 14380 }, { "epoch": 1.2658339197748065, "grad_norm": 8.539411544799805, "learning_rate": 7.161939386856215e-06, "loss": 0.5034, "step": 14390 }, { "epoch": 1.266713581984518, "grad_norm": 11.004217147827148, "learning_rate": 7.157323627274971e-06, "loss": 0.5615, "step": 14400 }, { "epoch": 1.2675932441942295, "grad_norm": 8.612276077270508, "learning_rate": 7.152705607548593e-06, "loss": 0.6203, "step": 14410 }, { "epoch": 1.2684729064039408, "grad_norm": 8.808526039123535, "learning_rate": 7.148085332515198e-06, "loss": 0.5827, "step": 14420 }, { "epoch": 1.2693525686136524, "grad_norm": 7.379049777984619, "learning_rate": 7.143462807015271e-06, "loss": 0.4869, "step": 14430 }, { "epoch": 1.2702322308233638, "grad_norm": 8.35226058959961, "learning_rate": 7.138838035891657e-06, "loss": 0.538, "step": 14440 }, { "epoch": 1.2711118930330754, "grad_norm": 8.90818977355957, "learning_rate": 7.134211023989552e-06, "loss": 0.625, "step": 14450 }, { "epoch": 1.2719915552427867, "grad_norm": 9.669974327087402, "learning_rate": 7.1295817761564955e-06, "loss": 0.5994, "step": 14460 }, { "epoch": 1.2728712174524983, "grad_norm": 8.275327682495117, "learning_rate": 7.124950297242373e-06, "loss": 0.5222, "step": 14470 }, { "epoch": 1.2737508796622097, "grad_norm": 9.682085037231445, "learning_rate": 7.12031659209941e-06, "loss": 0.538, "step": 14480 }, { "epoch": 1.2746305418719213, "grad_norm": 6.858287334442139, "learning_rate": 7.115680665582159e-06, "loss": 0.5198, "step": 14490 }, { "epoch": 1.2755102040816326, "grad_norm": 9.664053916931152, "learning_rate": 7.1110425225475035e-06, "loss": 0.544, "step": 14500 }, { "epoch": 1.276389866291344, "grad_norm": 6.8382792472839355, "learning_rate": 7.106402167854647e-06, "loss": 0.5439, "step": 14510 }, { "epoch": 1.2772695285010556, "grad_norm": 9.257858276367188, "learning_rate": 7.101759606365111e-06, "loss": 0.5271, "step": 14520 }, { "epoch": 1.2781491907107672, "grad_norm": 6.733691215515137, "learning_rate": 7.0971148429427295e-06, "loss": 0.6445, "step": 14530 }, { "epoch": 1.2790288529204785, "grad_norm": 7.269659996032715, "learning_rate": 7.092467882453643e-06, "loss": 0.5537, "step": 14540 }, { "epoch": 1.2799085151301899, "grad_norm": 8.498445510864258, "learning_rate": 7.087818729766292e-06, "loss": 0.4857, "step": 14550 }, { "epoch": 1.2807881773399015, "grad_norm": 9.398307800292969, "learning_rate": 7.083167389751415e-06, "loss": 0.5163, "step": 14560 }, { "epoch": 1.281667839549613, "grad_norm": 7.600684642791748, "learning_rate": 7.078513867282046e-06, "loss": 0.533, "step": 14570 }, { "epoch": 1.2825475017593244, "grad_norm": 7.804446697235107, "learning_rate": 7.0738581672335004e-06, "loss": 0.5335, "step": 14580 }, { "epoch": 1.2834271639690358, "grad_norm": 10.307950019836426, "learning_rate": 7.069200294483373e-06, "loss": 0.584, "step": 14590 }, { "epoch": 1.2843068261787474, "grad_norm": 9.784930229187012, "learning_rate": 7.064540253911542e-06, "loss": 0.5448, "step": 14600 }, { "epoch": 1.285186488388459, "grad_norm": 8.289177894592285, "learning_rate": 7.059878050400152e-06, "loss": 0.5126, "step": 14610 }, { "epoch": 1.2860661505981703, "grad_norm": 9.050064086914062, "learning_rate": 7.055213688833615e-06, "loss": 0.5567, "step": 14620 }, { "epoch": 1.2869458128078817, "grad_norm": 7.646144390106201, "learning_rate": 7.050547174098602e-06, "loss": 0.6066, "step": 14630 }, { "epoch": 1.2878254750175933, "grad_norm": 6.3253068923950195, "learning_rate": 7.045878511084041e-06, "loss": 0.6073, "step": 14640 }, { "epoch": 1.2887051372273048, "grad_norm": 8.745948791503906, "learning_rate": 7.041207704681113e-06, "loss": 0.5857, "step": 14650 }, { "epoch": 1.2895847994370162, "grad_norm": 9.107210159301758, "learning_rate": 7.0365347597832445e-06, "loss": 0.5208, "step": 14660 }, { "epoch": 1.2904644616467276, "grad_norm": 9.712482452392578, "learning_rate": 7.031859681286095e-06, "loss": 0.6517, "step": 14670 }, { "epoch": 1.2913441238564392, "grad_norm": 8.603989601135254, "learning_rate": 7.027182474087568e-06, "loss": 0.5312, "step": 14680 }, { "epoch": 1.2922237860661507, "grad_norm": 8.763587951660156, "learning_rate": 7.0225031430877934e-06, "loss": 0.5175, "step": 14690 }, { "epoch": 1.293103448275862, "grad_norm": 7.060293674468994, "learning_rate": 7.017821693189128e-06, "loss": 0.4756, "step": 14700 }, { "epoch": 1.2939831104855735, "grad_norm": 7.492372512817383, "learning_rate": 7.0131381292961434e-06, "loss": 0.5915, "step": 14710 }, { "epoch": 1.294862772695285, "grad_norm": 6.9808855056762695, "learning_rate": 7.0084524563156335e-06, "loss": 0.5481, "step": 14720 }, { "epoch": 1.2957424349049964, "grad_norm": 6.862091064453125, "learning_rate": 7.003764679156596e-06, "loss": 0.5307, "step": 14730 }, { "epoch": 1.296622097114708, "grad_norm": 7.117900848388672, "learning_rate": 6.999074802730236e-06, "loss": 0.6312, "step": 14740 }, { "epoch": 1.2975017593244194, "grad_norm": 10.261190414428711, "learning_rate": 6.994382831949957e-06, "loss": 0.5123, "step": 14750 }, { "epoch": 1.298381421534131, "grad_norm": 14.309590339660645, "learning_rate": 6.989688771731355e-06, "loss": 0.524, "step": 14760 }, { "epoch": 1.2992610837438423, "grad_norm": 10.256165504455566, "learning_rate": 6.984992626992217e-06, "loss": 0.5084, "step": 14770 }, { "epoch": 1.300140745953554, "grad_norm": 7.309422969818115, "learning_rate": 6.9802944026525175e-06, "loss": 0.6247, "step": 14780 }, { "epoch": 1.3010204081632653, "grad_norm": 7.722208499908447, "learning_rate": 6.9755941036344e-06, "loss": 0.6064, "step": 14790 }, { "epoch": 1.3019000703729768, "grad_norm": 7.995462894439697, "learning_rate": 6.970891734862191e-06, "loss": 0.513, "step": 14800 }, { "epoch": 1.3027797325826882, "grad_norm": 8.496228218078613, "learning_rate": 6.966187301262382e-06, "loss": 0.5503, "step": 14810 }, { "epoch": 1.3036593947923998, "grad_norm": 8.70747184753418, "learning_rate": 6.9614808077636245e-06, "loss": 0.6265, "step": 14820 }, { "epoch": 1.3045390570021111, "grad_norm": 9.090606689453125, "learning_rate": 6.956772259296732e-06, "loss": 0.6422, "step": 14830 }, { "epoch": 1.3054187192118227, "grad_norm": 9.655423164367676, "learning_rate": 6.952061660794672e-06, "loss": 0.531, "step": 14840 }, { "epoch": 1.306298381421534, "grad_norm": 6.987698554992676, "learning_rate": 6.947349017192555e-06, "loss": 0.5835, "step": 14850 }, { "epoch": 1.3071780436312457, "grad_norm": 15.035801887512207, "learning_rate": 6.94263433342764e-06, "loss": 0.5921, "step": 14860 }, { "epoch": 1.308057705840957, "grad_norm": 7.486433506011963, "learning_rate": 6.937917614439318e-06, "loss": 0.542, "step": 14870 }, { "epoch": 1.3089373680506686, "grad_norm": 8.21220588684082, "learning_rate": 6.933198865169113e-06, "loss": 0.519, "step": 14880 }, { "epoch": 1.30981703026038, "grad_norm": 10.023938179016113, "learning_rate": 6.92847809056068e-06, "loss": 0.5649, "step": 14890 }, { "epoch": 1.3106966924700916, "grad_norm": 6.025881290435791, "learning_rate": 6.923755295559793e-06, "loss": 0.5193, "step": 14900 }, { "epoch": 1.311576354679803, "grad_norm": 7.6829118728637695, "learning_rate": 6.919030485114342e-06, "loss": 0.5407, "step": 14910 }, { "epoch": 1.3124560168895143, "grad_norm": 8.238287925720215, "learning_rate": 6.91430366417433e-06, "loss": 0.5257, "step": 14920 }, { "epoch": 1.3133356790992259, "grad_norm": 9.790637969970703, "learning_rate": 6.909574837691866e-06, "loss": 0.4932, "step": 14930 }, { "epoch": 1.3142153413089375, "grad_norm": 7.681717395782471, "learning_rate": 6.90484401062116e-06, "loss": 0.5126, "step": 14940 }, { "epoch": 1.3150950035186488, "grad_norm": 8.97265625, "learning_rate": 6.900111187918517e-06, "loss": 0.5485, "step": 14950 }, { "epoch": 1.3159746657283602, "grad_norm": 8.764293670654297, "learning_rate": 6.895376374542334e-06, "loss": 0.5249, "step": 14960 }, { "epoch": 1.3168543279380718, "grad_norm": 7.354798793792725, "learning_rate": 6.890639575453093e-06, "loss": 0.5318, "step": 14970 }, { "epoch": 1.3177339901477834, "grad_norm": 11.188117027282715, "learning_rate": 6.885900795613359e-06, "loss": 0.5956, "step": 14980 }, { "epoch": 1.3186136523574947, "grad_norm": 6.459826946258545, "learning_rate": 6.8811600399877665e-06, "loss": 0.6102, "step": 14990 }, { "epoch": 1.319493314567206, "grad_norm": 6.988093852996826, "learning_rate": 6.876417313543022e-06, "loss": 0.5408, "step": 15000 }, { "epoch": 1.3203729767769177, "grad_norm": 10.815760612487793, "learning_rate": 6.871672621247902e-06, "loss": 0.5887, "step": 15010 }, { "epoch": 1.3212526389866293, "grad_norm": 10.898590087890625, "learning_rate": 6.866925968073238e-06, "loss": 0.5325, "step": 15020 }, { "epoch": 1.3221323011963406, "grad_norm": 6.513483047485352, "learning_rate": 6.862177358991915e-06, "loss": 0.6317, "step": 15030 }, { "epoch": 1.323011963406052, "grad_norm": 8.982474327087402, "learning_rate": 6.857426798978866e-06, "loss": 0.5548, "step": 15040 }, { "epoch": 1.3238916256157636, "grad_norm": 9.006377220153809, "learning_rate": 6.852674293011074e-06, "loss": 0.6166, "step": 15050 }, { "epoch": 1.3247712878254752, "grad_norm": 8.068791389465332, "learning_rate": 6.8479198460675554e-06, "loss": 0.5655, "step": 15060 }, { "epoch": 1.3256509500351865, "grad_norm": 6.957041263580322, "learning_rate": 6.843163463129363e-06, "loss": 0.5345, "step": 15070 }, { "epoch": 1.3265306122448979, "grad_norm": 8.510924339294434, "learning_rate": 6.838405149179576e-06, "loss": 0.5239, "step": 15080 }, { "epoch": 1.3274102744546095, "grad_norm": 9.153879165649414, "learning_rate": 6.833644909203295e-06, "loss": 0.4789, "step": 15090 }, { "epoch": 1.3282899366643208, "grad_norm": 7.760932445526123, "learning_rate": 6.828882748187643e-06, "loss": 0.5925, "step": 15100 }, { "epoch": 1.3291695988740324, "grad_norm": 6.752501487731934, "learning_rate": 6.824118671121755e-06, "loss": 0.5011, "step": 15110 }, { "epoch": 1.3300492610837438, "grad_norm": 8.323711395263672, "learning_rate": 6.819352682996767e-06, "loss": 0.5845, "step": 15120 }, { "epoch": 1.3309289232934554, "grad_norm": 9.068470001220703, "learning_rate": 6.814584788805825e-06, "loss": 0.5085, "step": 15130 }, { "epoch": 1.3318085855031667, "grad_norm": 8.132193565368652, "learning_rate": 6.809814993544068e-06, "loss": 0.5165, "step": 15140 }, { "epoch": 1.3326882477128783, "grad_norm": 8.18858814239502, "learning_rate": 6.805043302208628e-06, "loss": 0.5939, "step": 15150 }, { "epoch": 1.3335679099225897, "grad_norm": 7.564757823944092, "learning_rate": 6.8002697197986215e-06, "loss": 0.5709, "step": 15160 }, { "epoch": 1.3344475721323013, "grad_norm": 6.14793062210083, "learning_rate": 6.795494251315149e-06, "loss": 0.5259, "step": 15170 }, { "epoch": 1.3353272343420126, "grad_norm": 7.86462926864624, "learning_rate": 6.790716901761283e-06, "loss": 0.4928, "step": 15180 }, { "epoch": 1.3362068965517242, "grad_norm": 9.957202911376953, "learning_rate": 6.7859376761420716e-06, "loss": 0.5749, "step": 15190 }, { "epoch": 1.3370865587614356, "grad_norm": 9.700116157531738, "learning_rate": 6.781156579464525e-06, "loss": 0.5514, "step": 15200 }, { "epoch": 1.3379662209711471, "grad_norm": 7.27429723739624, "learning_rate": 6.776373616737615e-06, "loss": 0.5982, "step": 15210 }, { "epoch": 1.3388458831808585, "grad_norm": 8.676167488098145, "learning_rate": 6.7715887929722664e-06, "loss": 0.4785, "step": 15220 }, { "epoch": 1.33972554539057, "grad_norm": 9.936002731323242, "learning_rate": 6.76680211318136e-06, "loss": 0.5325, "step": 15230 }, { "epoch": 1.3406052076002815, "grad_norm": 8.34602165222168, "learning_rate": 6.762013582379711e-06, "loss": 0.611, "step": 15240 }, { "epoch": 1.341484869809993, "grad_norm": 6.763606548309326, "learning_rate": 6.7572232055840805e-06, "loss": 0.5424, "step": 15250 }, { "epoch": 1.3423645320197044, "grad_norm": 10.236185073852539, "learning_rate": 6.752430987813166e-06, "loss": 0.5477, "step": 15260 }, { "epoch": 1.343244194229416, "grad_norm": 8.191959381103516, "learning_rate": 6.747636934087586e-06, "loss": 0.491, "step": 15270 }, { "epoch": 1.3441238564391274, "grad_norm": 7.856494903564453, "learning_rate": 6.742841049429888e-06, "loss": 0.5592, "step": 15280 }, { "epoch": 1.3450035186488387, "grad_norm": 7.523510932922363, "learning_rate": 6.738043338864536e-06, "loss": 0.5192, "step": 15290 }, { "epoch": 1.3458831808585503, "grad_norm": 9.194759368896484, "learning_rate": 6.733243807417908e-06, "loss": 0.5172, "step": 15300 }, { "epoch": 1.3467628430682619, "grad_norm": 7.090651988983154, "learning_rate": 6.728442460118287e-06, "loss": 0.553, "step": 15310 }, { "epoch": 1.3476425052779732, "grad_norm": 6.06172513961792, "learning_rate": 6.723639301995864e-06, "loss": 0.5866, "step": 15320 }, { "epoch": 1.3485221674876846, "grad_norm": 7.4258856773376465, "learning_rate": 6.7188343380827185e-06, "loss": 0.5823, "step": 15330 }, { "epoch": 1.3494018296973962, "grad_norm": 7.389446258544922, "learning_rate": 6.714027573412828e-06, "loss": 0.5735, "step": 15340 }, { "epoch": 1.3502814919071078, "grad_norm": 8.294984817504883, "learning_rate": 6.709219013022058e-06, "loss": 0.5083, "step": 15350 }, { "epoch": 1.3511611541168191, "grad_norm": 6.6845173835754395, "learning_rate": 6.7044086619481506e-06, "loss": 0.5356, "step": 15360 }, { "epoch": 1.3520408163265305, "grad_norm": 8.416315078735352, "learning_rate": 6.699596525230726e-06, "loss": 0.5319, "step": 15370 }, { "epoch": 1.352920478536242, "grad_norm": 6.855007171630859, "learning_rate": 6.694782607911275e-06, "loss": 0.493, "step": 15380 }, { "epoch": 1.3538001407459537, "grad_norm": 8.31822681427002, "learning_rate": 6.689966915033156e-06, "loss": 0.6029, "step": 15390 }, { "epoch": 1.354679802955665, "grad_norm": 10.87519645690918, "learning_rate": 6.685149451641581e-06, "loss": 0.4876, "step": 15400 }, { "epoch": 1.3555594651653764, "grad_norm": 8.100724220275879, "learning_rate": 6.6803302227836266e-06, "loss": 0.5616, "step": 15410 }, { "epoch": 1.356439127375088, "grad_norm": 8.104679107666016, "learning_rate": 6.675509233508209e-06, "loss": 0.5756, "step": 15420 }, { "epoch": 1.3573187895847996, "grad_norm": 12.977224349975586, "learning_rate": 6.6706864888660984e-06, "loss": 0.584, "step": 15430 }, { "epoch": 1.358198451794511, "grad_norm": 7.73675012588501, "learning_rate": 6.665861993909897e-06, "loss": 0.6113, "step": 15440 }, { "epoch": 1.3590781140042223, "grad_norm": 6.843143463134766, "learning_rate": 6.661035753694041e-06, "loss": 0.6038, "step": 15450 }, { "epoch": 1.3599577762139339, "grad_norm": 6.666586399078369, "learning_rate": 6.656207773274798e-06, "loss": 0.4764, "step": 15460 }, { "epoch": 1.3608374384236452, "grad_norm": 7.576254367828369, "learning_rate": 6.651378057710261e-06, "loss": 0.5472, "step": 15470 }, { "epoch": 1.3617171006333568, "grad_norm": 7.739799976348877, "learning_rate": 6.646546612060334e-06, "loss": 0.4677, "step": 15480 }, { "epoch": 1.3625967628430682, "grad_norm": 8.166440963745117, "learning_rate": 6.641713441386737e-06, "loss": 0.5567, "step": 15490 }, { "epoch": 1.3634764250527798, "grad_norm": 8.065286636352539, "learning_rate": 6.636878550753e-06, "loss": 0.534, "step": 15500 }, { "epoch": 1.3643560872624911, "grad_norm": 8.124488830566406, "learning_rate": 6.632041945224449e-06, "loss": 0.5204, "step": 15510 }, { "epoch": 1.3652357494722027, "grad_norm": 7.290230751037598, "learning_rate": 6.627203629868213e-06, "loss": 0.5327, "step": 15520 }, { "epoch": 1.366115411681914, "grad_norm": 7.024503231048584, "learning_rate": 6.622363609753209e-06, "loss": 0.5485, "step": 15530 }, { "epoch": 1.3669950738916257, "grad_norm": 10.082755088806152, "learning_rate": 6.617521889950137e-06, "loss": 0.5049, "step": 15540 }, { "epoch": 1.367874736101337, "grad_norm": 8.222545623779297, "learning_rate": 6.6126784755314846e-06, "loss": 0.5297, "step": 15550 }, { "epoch": 1.3687543983110486, "grad_norm": 10.354256629943848, "learning_rate": 6.607833371571511e-06, "loss": 0.5555, "step": 15560 }, { "epoch": 1.36963406052076, "grad_norm": 5.983184814453125, "learning_rate": 6.6029865831462405e-06, "loss": 0.569, "step": 15570 }, { "epoch": 1.3705137227304716, "grad_norm": 8.4411039352417, "learning_rate": 6.5981381153334725e-06, "loss": 0.5805, "step": 15580 }, { "epoch": 1.371393384940183, "grad_norm": 7.541089057922363, "learning_rate": 6.593287973212761e-06, "loss": 0.5387, "step": 15590 }, { "epoch": 1.3722730471498945, "grad_norm": 7.3517351150512695, "learning_rate": 6.58843616186541e-06, "loss": 0.5303, "step": 15600 }, { "epoch": 1.3731527093596059, "grad_norm": 8.850347518920898, "learning_rate": 6.5835826863744775e-06, "loss": 0.4936, "step": 15610 }, { "epoch": 1.3740323715693175, "grad_norm": 9.849382400512695, "learning_rate": 6.578727551824765e-06, "loss": 0.5532, "step": 15620 }, { "epoch": 1.3749120337790288, "grad_norm": 7.849625587463379, "learning_rate": 6.573870763302807e-06, "loss": 0.4551, "step": 15630 }, { "epoch": 1.3757916959887404, "grad_norm": 8.285499572753906, "learning_rate": 6.56901232589688e-06, "loss": 0.5763, "step": 15640 }, { "epoch": 1.3766713581984518, "grad_norm": 7.437036514282227, "learning_rate": 6.5641522446969766e-06, "loss": 0.556, "step": 15650 }, { "epoch": 1.3775510204081631, "grad_norm": 7.590633392333984, "learning_rate": 6.559290524794819e-06, "loss": 0.5678, "step": 15660 }, { "epoch": 1.3784306826178747, "grad_norm": 9.29285717010498, "learning_rate": 6.554427171283846e-06, "loss": 0.5275, "step": 15670 }, { "epoch": 1.3793103448275863, "grad_norm": 6.6517839431762695, "learning_rate": 6.549562189259206e-06, "loss": 0.5553, "step": 15680 }, { "epoch": 1.3801900070372977, "grad_norm": 8.90400505065918, "learning_rate": 6.5446955838177535e-06, "loss": 0.5209, "step": 15690 }, { "epoch": 1.381069669247009, "grad_norm": 6.5812225341796875, "learning_rate": 6.539827360058043e-06, "loss": 0.6113, "step": 15700 }, { "epoch": 1.3819493314567206, "grad_norm": 9.1018705368042, "learning_rate": 6.534957523080328e-06, "loss": 0.5565, "step": 15710 }, { "epoch": 1.3828289936664322, "grad_norm": 7.554117679595947, "learning_rate": 6.5300860779865475e-06, "loss": 0.5262, "step": 15720 }, { "epoch": 1.3837086558761436, "grad_norm": 6.70955228805542, "learning_rate": 6.525213029880328e-06, "loss": 0.4853, "step": 15730 }, { "epoch": 1.384588318085855, "grad_norm": 8.787446975708008, "learning_rate": 6.5203383838669756e-06, "loss": 0.5095, "step": 15740 }, { "epoch": 1.3854679802955665, "grad_norm": 7.2636399269104, "learning_rate": 6.515462145053468e-06, "loss": 0.5167, "step": 15750 }, { "epoch": 1.386347642505278, "grad_norm": 10.572460174560547, "learning_rate": 6.510584318548457e-06, "loss": 0.5562, "step": 15760 }, { "epoch": 1.3872273047149895, "grad_norm": 7.00785493850708, "learning_rate": 6.505704909462252e-06, "loss": 0.5378, "step": 15770 }, { "epoch": 1.3881069669247008, "grad_norm": 8.01778793334961, "learning_rate": 6.5008239229068214e-06, "loss": 0.4969, "step": 15780 }, { "epoch": 1.3889866291344124, "grad_norm": 11.430807113647461, "learning_rate": 6.495941363995788e-06, "loss": 0.5457, "step": 15790 }, { "epoch": 1.389866291344124, "grad_norm": 7.387159824371338, "learning_rate": 6.491057237844425e-06, "loss": 0.5096, "step": 15800 }, { "epoch": 1.3907459535538353, "grad_norm": 6.412287712097168, "learning_rate": 6.486171549569638e-06, "loss": 0.491, "step": 15810 }, { "epoch": 1.3916256157635467, "grad_norm": 8.055830001831055, "learning_rate": 6.481284304289977e-06, "loss": 0.5564, "step": 15820 }, { "epoch": 1.3925052779732583, "grad_norm": 8.161625862121582, "learning_rate": 6.4763955071256246e-06, "loss": 0.4849, "step": 15830 }, { "epoch": 1.3933849401829699, "grad_norm": 8.477890968322754, "learning_rate": 6.471505163198383e-06, "loss": 0.4744, "step": 15840 }, { "epoch": 1.3942646023926812, "grad_norm": 8.334616661071777, "learning_rate": 6.4666132776316755e-06, "loss": 0.4608, "step": 15850 }, { "epoch": 1.3951442646023926, "grad_norm": 7.861494064331055, "learning_rate": 6.461719855550548e-06, "loss": 0.5795, "step": 15860 }, { "epoch": 1.3960239268121042, "grad_norm": 6.93330192565918, "learning_rate": 6.456824902081647e-06, "loss": 0.5015, "step": 15870 }, { "epoch": 1.3969035890218156, "grad_norm": 8.304457664489746, "learning_rate": 6.451928422353229e-06, "loss": 0.5385, "step": 15880 }, { "epoch": 1.3977832512315271, "grad_norm": 9.053669929504395, "learning_rate": 6.447030421495147e-06, "loss": 0.6474, "step": 15890 }, { "epoch": 1.3986629134412385, "grad_norm": 7.950388431549072, "learning_rate": 6.442130904638849e-06, "loss": 0.4906, "step": 15900 }, { "epoch": 1.39954257565095, "grad_norm": 10.526117324829102, "learning_rate": 6.43722987691737e-06, "loss": 0.5795, "step": 15910 }, { "epoch": 1.4004222378606614, "grad_norm": 7.879498481750488, "learning_rate": 6.432327343465331e-06, "loss": 0.5561, "step": 15920 }, { "epoch": 1.401301900070373, "grad_norm": 9.403043746948242, "learning_rate": 6.427423309418925e-06, "loss": 0.4838, "step": 15930 }, { "epoch": 1.4021815622800844, "grad_norm": 7.325310230255127, "learning_rate": 6.4225177799159214e-06, "loss": 0.5183, "step": 15940 }, { "epoch": 1.403061224489796, "grad_norm": 9.076020240783691, "learning_rate": 6.417610760095656e-06, "loss": 0.4859, "step": 15950 }, { "epoch": 1.4039408866995073, "grad_norm": 8.338379859924316, "learning_rate": 6.412702255099022e-06, "loss": 0.5601, "step": 15960 }, { "epoch": 1.404820548909219, "grad_norm": 9.232609748840332, "learning_rate": 6.407792270068476e-06, "loss": 0.525, "step": 15970 }, { "epoch": 1.4057002111189303, "grad_norm": 7.710334300994873, "learning_rate": 6.402880810148019e-06, "loss": 0.503, "step": 15980 }, { "epoch": 1.4065798733286419, "grad_norm": 8.950572967529297, "learning_rate": 6.397967880483197e-06, "loss": 0.5519, "step": 15990 }, { "epoch": 1.4074595355383532, "grad_norm": 7.176523685455322, "learning_rate": 6.3930534862211e-06, "loss": 0.4406, "step": 16000 }, { "epoch": 1.4083391977480648, "grad_norm": 6.65091609954834, "learning_rate": 6.388137632510351e-06, "loss": 0.5227, "step": 16010 }, { "epoch": 1.4092188599577762, "grad_norm": 9.892199516296387, "learning_rate": 6.383220324501097e-06, "loss": 0.5836, "step": 16020 }, { "epoch": 1.4100985221674878, "grad_norm": 7.304071426391602, "learning_rate": 6.378301567345017e-06, "loss": 0.5403, "step": 16030 }, { "epoch": 1.4109781843771991, "grad_norm": 6.678507328033447, "learning_rate": 6.373381366195302e-06, "loss": 0.5512, "step": 16040 }, { "epoch": 1.4118578465869107, "grad_norm": 11.523675918579102, "learning_rate": 6.368459726206661e-06, "loss": 0.6082, "step": 16050 }, { "epoch": 1.412737508796622, "grad_norm": 6.570919036865234, "learning_rate": 6.363536652535302e-06, "loss": 0.556, "step": 16060 }, { "epoch": 1.4136171710063334, "grad_norm": 7.839238166809082, "learning_rate": 6.358612150338944e-06, "loss": 0.4809, "step": 16070 }, { "epoch": 1.414496833216045, "grad_norm": 6.346567153930664, "learning_rate": 6.353686224776797e-06, "loss": 0.552, "step": 16080 }, { "epoch": 1.4153764954257566, "grad_norm": 8.516129493713379, "learning_rate": 6.348758881009567e-06, "loss": 0.6013, "step": 16090 }, { "epoch": 1.416256157635468, "grad_norm": 6.466887950897217, "learning_rate": 6.3438301241994396e-06, "loss": 0.4927, "step": 16100 }, { "epoch": 1.4171358198451793, "grad_norm": 11.589012145996094, "learning_rate": 6.338899959510084e-06, "loss": 0.5458, "step": 16110 }, { "epoch": 1.418015482054891, "grad_norm": 8.05162239074707, "learning_rate": 6.333968392106646e-06, "loss": 0.5364, "step": 16120 }, { "epoch": 1.4188951442646025, "grad_norm": 6.909152984619141, "learning_rate": 6.329035427155743e-06, "loss": 0.5624, "step": 16130 }, { "epoch": 1.4197748064743139, "grad_norm": 8.278901100158691, "learning_rate": 6.324101069825446e-06, "loss": 0.4854, "step": 16140 }, { "epoch": 1.4206544686840252, "grad_norm": 6.438477516174316, "learning_rate": 6.319165325285295e-06, "loss": 0.4804, "step": 16150 }, { "epoch": 1.4215341308937368, "grad_norm": 7.442785263061523, "learning_rate": 6.31422819870628e-06, "loss": 0.6029, "step": 16160 }, { "epoch": 1.4224137931034484, "grad_norm": 7.484609127044678, "learning_rate": 6.309289695260841e-06, "loss": 0.4473, "step": 16170 }, { "epoch": 1.4232934553131598, "grad_norm": 7.26006555557251, "learning_rate": 6.3043498201228535e-06, "loss": 0.5605, "step": 16180 }, { "epoch": 1.4241731175228711, "grad_norm": 8.723344802856445, "learning_rate": 6.299408578467641e-06, "loss": 0.557, "step": 16190 }, { "epoch": 1.4250527797325827, "grad_norm": 7.258244037628174, "learning_rate": 6.294465975471946e-06, "loss": 0.5361, "step": 16200 }, { "epoch": 1.4259324419422943, "grad_norm": 8.91775131225586, "learning_rate": 6.28952201631395e-06, "loss": 0.4663, "step": 16210 }, { "epoch": 1.4268121041520057, "grad_norm": 7.096338748931885, "learning_rate": 6.284576706173247e-06, "loss": 0.5114, "step": 16220 }, { "epoch": 1.427691766361717, "grad_norm": 7.107122898101807, "learning_rate": 6.279630050230847e-06, "loss": 0.4741, "step": 16230 }, { "epoch": 1.4285714285714286, "grad_norm": 7.663174152374268, "learning_rate": 6.274682053669172e-06, "loss": 0.5998, "step": 16240 }, { "epoch": 1.42945109078114, "grad_norm": 12.329484939575195, "learning_rate": 6.269732721672052e-06, "loss": 0.5377, "step": 16250 }, { "epoch": 1.4303307529908516, "grad_norm": 7.1576151847839355, "learning_rate": 6.264782059424706e-06, "loss": 0.545, "step": 16260 }, { "epoch": 1.431210415200563, "grad_norm": 9.202332496643066, "learning_rate": 6.259830072113756e-06, "loss": 0.5458, "step": 16270 }, { "epoch": 1.4320900774102745, "grad_norm": 7.371771335601807, "learning_rate": 6.254876764927209e-06, "loss": 0.5119, "step": 16280 }, { "epoch": 1.4329697396199859, "grad_norm": 10.127457618713379, "learning_rate": 6.249922143054455e-06, "loss": 0.5709, "step": 16290 }, { "epoch": 1.4338494018296974, "grad_norm": 7.886415004730225, "learning_rate": 6.24496621168626e-06, "loss": 0.5891, "step": 16300 }, { "epoch": 1.4347290640394088, "grad_norm": 8.214916229248047, "learning_rate": 6.2400089760147646e-06, "loss": 0.5294, "step": 16310 }, { "epoch": 1.4356087262491204, "grad_norm": 9.256831169128418, "learning_rate": 6.235050441233471e-06, "loss": 0.5459, "step": 16320 }, { "epoch": 1.4364883884588318, "grad_norm": 7.2463507652282715, "learning_rate": 6.230090612537251e-06, "loss": 0.5026, "step": 16330 }, { "epoch": 1.4373680506685433, "grad_norm": 8.153393745422363, "learning_rate": 6.225129495122322e-06, "loss": 0.4911, "step": 16340 }, { "epoch": 1.4382477128782547, "grad_norm": 10.821930885314941, "learning_rate": 6.220167094186257e-06, "loss": 0.5414, "step": 16350 }, { "epoch": 1.4391273750879663, "grad_norm": 10.569250106811523, "learning_rate": 6.215203414927974e-06, "loss": 0.5657, "step": 16360 }, { "epoch": 1.4400070372976777, "grad_norm": 8.27813720703125, "learning_rate": 6.210238462547731e-06, "loss": 0.5206, "step": 16370 }, { "epoch": 1.4408866995073892, "grad_norm": 8.300979614257812, "learning_rate": 6.205272242247115e-06, "loss": 0.4573, "step": 16380 }, { "epoch": 1.4417663617171006, "grad_norm": 10.009787559509277, "learning_rate": 6.200304759229044e-06, "loss": 0.6076, "step": 16390 }, { "epoch": 1.4426460239268122, "grad_norm": 11.321508407592773, "learning_rate": 6.195336018697762e-06, "loss": 0.5073, "step": 16400 }, { "epoch": 1.4435256861365235, "grad_norm": 8.638775825500488, "learning_rate": 6.190366025858826e-06, "loss": 0.5682, "step": 16410 }, { "epoch": 1.4444053483462351, "grad_norm": 7.936859130859375, "learning_rate": 6.185394785919106e-06, "loss": 0.4668, "step": 16420 }, { "epoch": 1.4452850105559465, "grad_norm": 6.794249057769775, "learning_rate": 6.1804223040867825e-06, "loss": 0.5844, "step": 16430 }, { "epoch": 1.4461646727656579, "grad_norm": 7.342381954193115, "learning_rate": 6.1754485855713285e-06, "loss": 0.4751, "step": 16440 }, { "epoch": 1.4470443349753694, "grad_norm": 8.420326232910156, "learning_rate": 6.170473635583523e-06, "loss": 0.5089, "step": 16450 }, { "epoch": 1.447923997185081, "grad_norm": 7.771886348724365, "learning_rate": 6.165497459335429e-06, "loss": 0.5383, "step": 16460 }, { "epoch": 1.4488036593947924, "grad_norm": 9.369412422180176, "learning_rate": 6.1605200620403915e-06, "loss": 0.5372, "step": 16470 }, { "epoch": 1.4496833216045037, "grad_norm": 8.452661514282227, "learning_rate": 6.155541448913042e-06, "loss": 0.6137, "step": 16480 }, { "epoch": 1.4505629838142153, "grad_norm": 8.601435661315918, "learning_rate": 6.150561625169284e-06, "loss": 0.5221, "step": 16490 }, { "epoch": 1.451442646023927, "grad_norm": 6.774396896362305, "learning_rate": 6.145580596026284e-06, "loss": 0.5793, "step": 16500 }, { "epoch": 1.4523223082336383, "grad_norm": 9.425986289978027, "learning_rate": 6.140598366702475e-06, "loss": 0.6264, "step": 16510 }, { "epoch": 1.4532019704433496, "grad_norm": 6.9223856925964355, "learning_rate": 6.1356149424175495e-06, "loss": 0.5275, "step": 16520 }, { "epoch": 1.4540816326530612, "grad_norm": 8.641104698181152, "learning_rate": 6.130630328392448e-06, "loss": 0.5691, "step": 16530 }, { "epoch": 1.4549612948627728, "grad_norm": 6.45000696182251, "learning_rate": 6.125644529849363e-06, "loss": 0.5267, "step": 16540 }, { "epoch": 1.4558409570724842, "grad_norm": 6.4961090087890625, "learning_rate": 6.120657552011719e-06, "loss": 0.4554, "step": 16550 }, { "epoch": 1.4567206192821955, "grad_norm": 10.273581504821777, "learning_rate": 6.115669400104185e-06, "loss": 0.5073, "step": 16560 }, { "epoch": 1.4576002814919071, "grad_norm": 6.772701740264893, "learning_rate": 6.1106800793526554e-06, "loss": 0.5088, "step": 16570 }, { "epoch": 1.4584799437016187, "grad_norm": 9.4951171875, "learning_rate": 6.105689594984248e-06, "loss": 0.54, "step": 16580 }, { "epoch": 1.45935960591133, "grad_norm": 8.259317398071289, "learning_rate": 6.100697952227304e-06, "loss": 0.4957, "step": 16590 }, { "epoch": 1.4602392681210414, "grad_norm": 6.732875347137451, "learning_rate": 6.095705156311373e-06, "loss": 0.5051, "step": 16600 }, { "epoch": 1.461118930330753, "grad_norm": 8.474063873291016, "learning_rate": 6.09071121246722e-06, "loss": 0.5386, "step": 16610 }, { "epoch": 1.4619985925404644, "grad_norm": 8.094334602355957, "learning_rate": 6.085716125926806e-06, "loss": 0.5653, "step": 16620 }, { "epoch": 1.462878254750176, "grad_norm": 9.418039321899414, "learning_rate": 6.08071990192329e-06, "loss": 0.5306, "step": 16630 }, { "epoch": 1.4637579169598873, "grad_norm": 8.702248573303223, "learning_rate": 6.075722545691025e-06, "loss": 0.5154, "step": 16640 }, { "epoch": 1.464637579169599, "grad_norm": 8.206555366516113, "learning_rate": 6.070724062465551e-06, "loss": 0.5729, "step": 16650 }, { "epoch": 1.4655172413793103, "grad_norm": 9.759793281555176, "learning_rate": 6.065724457483585e-06, "loss": 0.6126, "step": 16660 }, { "epoch": 1.4663969035890219, "grad_norm": 7.664906978607178, "learning_rate": 6.06072373598302e-06, "loss": 0.5707, "step": 16670 }, { "epoch": 1.4672765657987332, "grad_norm": 6.687844753265381, "learning_rate": 6.055721903202924e-06, "loss": 0.4957, "step": 16680 }, { "epoch": 1.4681562280084448, "grad_norm": 8.276668548583984, "learning_rate": 6.050718964383521e-06, "loss": 0.5303, "step": 16690 }, { "epoch": 1.4690358902181562, "grad_norm": 6.277676582336426, "learning_rate": 6.045714924766203e-06, "loss": 0.4425, "step": 16700 }, { "epoch": 1.4699155524278678, "grad_norm": 11.086554527282715, "learning_rate": 6.040709789593505e-06, "loss": 0.5609, "step": 16710 }, { "epoch": 1.4707952146375791, "grad_norm": 6.570236682891846, "learning_rate": 6.035703564109115e-06, "loss": 0.5271, "step": 16720 }, { "epoch": 1.4716748768472907, "grad_norm": 8.543072700500488, "learning_rate": 6.030696253557868e-06, "loss": 0.4938, "step": 16730 }, { "epoch": 1.472554539057002, "grad_norm": 6.121361255645752, "learning_rate": 6.0256878631857275e-06, "loss": 0.5063, "step": 16740 }, { "epoch": 1.4734342012667137, "grad_norm": 7.034813404083252, "learning_rate": 6.020678398239791e-06, "loss": 0.5421, "step": 16750 }, { "epoch": 1.474313863476425, "grad_norm": 11.026559829711914, "learning_rate": 6.015667863968284e-06, "loss": 0.4825, "step": 16760 }, { "epoch": 1.4751935256861366, "grad_norm": 8.89335823059082, "learning_rate": 6.0106562656205515e-06, "loss": 0.511, "step": 16770 }, { "epoch": 1.476073187895848, "grad_norm": 10.023509979248047, "learning_rate": 6.005643608447053e-06, "loss": 0.5488, "step": 16780 }, { "epoch": 1.4769528501055595, "grad_norm": 10.164527893066406, "learning_rate": 6.0006298976993546e-06, "loss": 0.5389, "step": 16790 }, { "epoch": 1.477832512315271, "grad_norm": 7.425182342529297, "learning_rate": 5.995615138630131e-06, "loss": 0.4925, "step": 16800 }, { "epoch": 1.4787121745249823, "grad_norm": 6.27846622467041, "learning_rate": 5.990599336493151e-06, "loss": 0.5588, "step": 16810 }, { "epoch": 1.4795918367346939, "grad_norm": 11.408931732177734, "learning_rate": 5.985582496543279e-06, "loss": 0.5241, "step": 16820 }, { "epoch": 1.4804714989444054, "grad_norm": 7.41193151473999, "learning_rate": 5.980564624036467e-06, "loss": 0.5546, "step": 16830 }, { "epoch": 1.4813511611541168, "grad_norm": 7.827046871185303, "learning_rate": 5.975545724229743e-06, "loss": 0.4703, "step": 16840 }, { "epoch": 1.4822308233638282, "grad_norm": 8.132577896118164, "learning_rate": 5.970525802381218e-06, "loss": 0.5027, "step": 16850 }, { "epoch": 1.4831104855735397, "grad_norm": 8.35282039642334, "learning_rate": 5.965504863750076e-06, "loss": 0.5282, "step": 16860 }, { "epoch": 1.4839901477832513, "grad_norm": 8.020844459533691, "learning_rate": 5.960482913596553e-06, "loss": 0.5688, "step": 16870 }, { "epoch": 1.4848698099929627, "grad_norm": 8.429619789123535, "learning_rate": 5.955459957181958e-06, "loss": 0.5207, "step": 16880 }, { "epoch": 1.485749472202674, "grad_norm": 7.648517608642578, "learning_rate": 5.9504359997686515e-06, "loss": 0.5539, "step": 16890 }, { "epoch": 1.4866291344123856, "grad_norm": 6.6185150146484375, "learning_rate": 5.945411046620038e-06, "loss": 0.5526, "step": 16900 }, { "epoch": 1.4875087966220972, "grad_norm": 7.729889392852783, "learning_rate": 5.9403851030005664e-06, "loss": 0.487, "step": 16910 }, { "epoch": 1.4883884588318086, "grad_norm": 7.109167575836182, "learning_rate": 5.93535817417573e-06, "loss": 0.5436, "step": 16920 }, { "epoch": 1.48926812104152, "grad_norm": 8.837197303771973, "learning_rate": 5.930330265412043e-06, "loss": 0.548, "step": 16930 }, { "epoch": 1.4901477832512315, "grad_norm": 9.498022079467773, "learning_rate": 5.925301381977058e-06, "loss": 0.5507, "step": 16940 }, { "epoch": 1.4910274454609431, "grad_norm": 7.753468036651611, "learning_rate": 5.92027152913934e-06, "loss": 0.6168, "step": 16950 }, { "epoch": 1.4919071076706545, "grad_norm": 7.6260786056518555, "learning_rate": 5.9152407121684715e-06, "loss": 0.6069, "step": 16960 }, { "epoch": 1.4927867698803658, "grad_norm": 7.961201190948486, "learning_rate": 5.910208936335049e-06, "loss": 0.5374, "step": 16970 }, { "epoch": 1.4936664320900774, "grad_norm": 7.737741947174072, "learning_rate": 5.9051762069106715e-06, "loss": 0.5502, "step": 16980 }, { "epoch": 1.494546094299789, "grad_norm": 6.7194719314575195, "learning_rate": 5.900142529167934e-06, "loss": 0.4766, "step": 16990 }, { "epoch": 1.4954257565095004, "grad_norm": 7.220416069030762, "learning_rate": 5.895107908380427e-06, "loss": 0.4994, "step": 17000 }, { "epoch": 1.4963054187192117, "grad_norm": 9.206267356872559, "learning_rate": 5.890072349822733e-06, "loss": 0.5362, "step": 17010 }, { "epoch": 1.4971850809289233, "grad_norm": 7.861724853515625, "learning_rate": 5.88503585877041e-06, "loss": 0.6014, "step": 17020 }, { "epoch": 1.4980647431386347, "grad_norm": 9.43554973602295, "learning_rate": 5.879998440499997e-06, "loss": 0.5493, "step": 17030 }, { "epoch": 1.4989444053483463, "grad_norm": 10.385664939880371, "learning_rate": 5.8749601002890044e-06, "loss": 0.5057, "step": 17040 }, { "epoch": 1.4998240675580576, "grad_norm": 7.033921241760254, "learning_rate": 5.869920843415907e-06, "loss": 0.5963, "step": 17050 }, { "epoch": 1.5007037297677692, "grad_norm": 12.949324607849121, "learning_rate": 5.864880675160142e-06, "loss": 0.5205, "step": 17060 }, { "epoch": 1.5015833919774808, "grad_norm": 9.582341194152832, "learning_rate": 5.859839600802099e-06, "loss": 0.51, "step": 17070 }, { "epoch": 1.5024630541871922, "grad_norm": 8.265140533447266, "learning_rate": 5.854797625623118e-06, "loss": 0.6026, "step": 17080 }, { "epoch": 1.5033427163969035, "grad_norm": 8.600849151611328, "learning_rate": 5.849754754905482e-06, "loss": 0.5417, "step": 17090 }, { "epoch": 1.504222378606615, "grad_norm": 10.344887733459473, "learning_rate": 5.844710993932416e-06, "loss": 0.5758, "step": 17100 }, { "epoch": 1.5051020408163265, "grad_norm": 7.949313640594482, "learning_rate": 5.839666347988074e-06, "loss": 0.5224, "step": 17110 }, { "epoch": 1.505981703026038, "grad_norm": 10.699772834777832, "learning_rate": 5.8346208223575355e-06, "loss": 0.5139, "step": 17120 }, { "epoch": 1.5068613652357494, "grad_norm": 6.69357442855835, "learning_rate": 5.8295744223268054e-06, "loss": 0.4469, "step": 17130 }, { "epoch": 1.5077410274454608, "grad_norm": 9.69383716583252, "learning_rate": 5.824527153182804e-06, "loss": 0.5406, "step": 17140 }, { "epoch": 1.5086206896551724, "grad_norm": 10.260815620422363, "learning_rate": 5.81947902021336e-06, "loss": 0.5515, "step": 17150 }, { "epoch": 1.509500351864884, "grad_norm": 9.67973518371582, "learning_rate": 5.81443002870721e-06, "loss": 0.5202, "step": 17160 }, { "epoch": 1.5103800140745953, "grad_norm": 9.135684967041016, "learning_rate": 5.8093801839539866e-06, "loss": 0.4824, "step": 17170 }, { "epoch": 1.5112596762843067, "grad_norm": 7.52739143371582, "learning_rate": 5.804329491244221e-06, "loss": 0.5324, "step": 17180 }, { "epoch": 1.5121393384940183, "grad_norm": 8.41889476776123, "learning_rate": 5.799277955869327e-06, "loss": 0.5216, "step": 17190 }, { "epoch": 1.5130190007037299, "grad_norm": 13.002578735351562, "learning_rate": 5.794225583121604e-06, "loss": 0.5259, "step": 17200 }, { "epoch": 1.5138986629134412, "grad_norm": 11.302602767944336, "learning_rate": 5.789172378294232e-06, "loss": 0.4948, "step": 17210 }, { "epoch": 1.5147783251231526, "grad_norm": 5.729623317718506, "learning_rate": 5.78411834668126e-06, "loss": 0.488, "step": 17220 }, { "epoch": 1.5156579873328642, "grad_norm": 8.409319877624512, "learning_rate": 5.7790634935776e-06, "loss": 0.4768, "step": 17230 }, { "epoch": 1.5165376495425757, "grad_norm": 7.729800701141357, "learning_rate": 5.7740078242790284e-06, "loss": 0.5602, "step": 17240 }, { "epoch": 1.5174173117522871, "grad_norm": 8.908509254455566, "learning_rate": 5.7689513440821764e-06, "loss": 0.5675, "step": 17250 }, { "epoch": 1.5182969739619985, "grad_norm": 8.228300094604492, "learning_rate": 5.763894058284524e-06, "loss": 0.5495, "step": 17260 }, { "epoch": 1.51917663617171, "grad_norm": 10.703595161437988, "learning_rate": 5.758835972184396e-06, "loss": 0.5065, "step": 17270 }, { "epoch": 1.5200562983814216, "grad_norm": 8.195737838745117, "learning_rate": 5.753777091080953e-06, "loss": 0.5213, "step": 17280 }, { "epoch": 1.520935960591133, "grad_norm": 7.679516315460205, "learning_rate": 5.748717420274191e-06, "loss": 0.4979, "step": 17290 }, { "epoch": 1.5218156228008444, "grad_norm": 8.229510307312012, "learning_rate": 5.743656965064935e-06, "loss": 0.4937, "step": 17300 }, { "epoch": 1.522695285010556, "grad_norm": 7.581177234649658, "learning_rate": 5.738595730754829e-06, "loss": 0.5551, "step": 17310 }, { "epoch": 1.5235749472202675, "grad_norm": 9.098518371582031, "learning_rate": 5.733533722646331e-06, "loss": 0.6665, "step": 17320 }, { "epoch": 1.524454609429979, "grad_norm": 7.550580978393555, "learning_rate": 5.728470946042716e-06, "loss": 0.5553, "step": 17330 }, { "epoch": 1.5253342716396903, "grad_norm": 7.441530227661133, "learning_rate": 5.72340740624806e-06, "loss": 0.4886, "step": 17340 }, { "epoch": 1.5262139338494018, "grad_norm": 6.29999303817749, "learning_rate": 5.718343108567238e-06, "loss": 0.5261, "step": 17350 }, { "epoch": 1.5270935960591134, "grad_norm": 7.255375385284424, "learning_rate": 5.71327805830592e-06, "loss": 0.4985, "step": 17360 }, { "epoch": 1.5279732582688248, "grad_norm": 9.309633255004883, "learning_rate": 5.708212260770566e-06, "loss": 0.51, "step": 17370 }, { "epoch": 1.5288529204785362, "grad_norm": 9.599700927734375, "learning_rate": 5.703145721268417e-06, "loss": 0.4906, "step": 17380 }, { "epoch": 1.5297325826882477, "grad_norm": 7.709219932556152, "learning_rate": 5.698078445107492e-06, "loss": 0.481, "step": 17390 }, { "epoch": 1.5306122448979593, "grad_norm": 8.308862686157227, "learning_rate": 5.693010437596582e-06, "loss": 0.5377, "step": 17400 }, { "epoch": 1.5314919071076707, "grad_norm": 8.838823318481445, "learning_rate": 5.687941704045243e-06, "loss": 0.444, "step": 17410 }, { "epoch": 1.532371569317382, "grad_norm": 7.265298843383789, "learning_rate": 5.682872249763793e-06, "loss": 0.4584, "step": 17420 }, { "epoch": 1.5332512315270936, "grad_norm": 8.132843971252441, "learning_rate": 5.677802080063307e-06, "loss": 0.563, "step": 17430 }, { "epoch": 1.5341308937368052, "grad_norm": 9.502723693847656, "learning_rate": 5.672731200255604e-06, "loss": 0.5935, "step": 17440 }, { "epoch": 1.5350105559465166, "grad_norm": 7.5639262199401855, "learning_rate": 5.6676596156532484e-06, "loss": 0.4773, "step": 17450 }, { "epoch": 1.535890218156228, "grad_norm": 7.663568496704102, "learning_rate": 5.662587331569549e-06, "loss": 0.4798, "step": 17460 }, { "epoch": 1.5367698803659395, "grad_norm": 6.361850261688232, "learning_rate": 5.657514353318543e-06, "loss": 0.5243, "step": 17470 }, { "epoch": 1.537649542575651, "grad_norm": 9.186630249023438, "learning_rate": 5.65244068621499e-06, "loss": 0.5703, "step": 17480 }, { "epoch": 1.5385292047853625, "grad_norm": 9.12357234954834, "learning_rate": 5.647366335574381e-06, "loss": 0.4997, "step": 17490 }, { "epoch": 1.5394088669950738, "grad_norm": 7.706592559814453, "learning_rate": 5.642291306712917e-06, "loss": 0.4613, "step": 17500 }, { "epoch": 1.5402885292047852, "grad_norm": 6.841887950897217, "learning_rate": 5.637215604947512e-06, "loss": 0.4665, "step": 17510 }, { "epoch": 1.5411681914144968, "grad_norm": 7.635484218597412, "learning_rate": 5.63213923559578e-06, "loss": 0.4615, "step": 17520 }, { "epoch": 1.5420478536242084, "grad_norm": 9.658819198608398, "learning_rate": 5.627062203976043e-06, "loss": 0.4819, "step": 17530 }, { "epoch": 1.5429275158339197, "grad_norm": 13.485023498535156, "learning_rate": 5.621984515407307e-06, "loss": 0.5416, "step": 17540 }, { "epoch": 1.543807178043631, "grad_norm": 7.112985610961914, "learning_rate": 5.616906175209278e-06, "loss": 0.4566, "step": 17550 }, { "epoch": 1.5446868402533427, "grad_norm": 10.220747947692871, "learning_rate": 5.611827188702332e-06, "loss": 0.5902, "step": 17560 }, { "epoch": 1.5455665024630543, "grad_norm": 8.058282852172852, "learning_rate": 5.606747561207532e-06, "loss": 0.5621, "step": 17570 }, { "epoch": 1.5464461646727656, "grad_norm": 10.555813789367676, "learning_rate": 5.601667298046607e-06, "loss": 0.5862, "step": 17580 }, { "epoch": 1.547325826882477, "grad_norm": 9.125187873840332, "learning_rate": 5.596586404541954e-06, "loss": 0.5048, "step": 17590 }, { "epoch": 1.5482054890921886, "grad_norm": 7.079853534698486, "learning_rate": 5.591504886016628e-06, "loss": 0.4664, "step": 17600 }, { "epoch": 1.5490851513019002, "grad_norm": 6.755866050720215, "learning_rate": 5.586422747794344e-06, "loss": 0.4625, "step": 17610 }, { "epoch": 1.5499648135116115, "grad_norm": 8.080731391906738, "learning_rate": 5.581339995199463e-06, "loss": 0.482, "step": 17620 }, { "epoch": 1.550844475721323, "grad_norm": 8.97127628326416, "learning_rate": 5.576256633556988e-06, "loss": 0.4902, "step": 17630 }, { "epoch": 1.5517241379310345, "grad_norm": 7.59882926940918, "learning_rate": 5.5711726681925624e-06, "loss": 0.4831, "step": 17640 }, { "epoch": 1.552603800140746, "grad_norm": 8.38381576538086, "learning_rate": 5.566088104432461e-06, "loss": 0.4881, "step": 17650 }, { "epoch": 1.5534834623504574, "grad_norm": 6.882980823516846, "learning_rate": 5.561002947603587e-06, "loss": 0.5862, "step": 17660 }, { "epoch": 1.5543631245601688, "grad_norm": 10.668312072753906, "learning_rate": 5.555917203033464e-06, "loss": 0.4966, "step": 17670 }, { "epoch": 1.5552427867698804, "grad_norm": 10.220399856567383, "learning_rate": 5.550830876050232e-06, "loss": 0.4994, "step": 17680 }, { "epoch": 1.556122448979592, "grad_norm": 9.246426582336426, "learning_rate": 5.5457439719826375e-06, "loss": 0.4456, "step": 17690 }, { "epoch": 1.5570021111893033, "grad_norm": 9.388684272766113, "learning_rate": 5.5406564961600375e-06, "loss": 0.5417, "step": 17700 }, { "epoch": 1.5578817733990147, "grad_norm": 8.086506843566895, "learning_rate": 5.535568453912383e-06, "loss": 0.5352, "step": 17710 }, { "epoch": 1.5587614356087263, "grad_norm": 8.13151741027832, "learning_rate": 5.530479850570224e-06, "loss": 0.5469, "step": 17720 }, { "epoch": 1.5596410978184378, "grad_norm": 7.633078098297119, "learning_rate": 5.5253906914646915e-06, "loss": 0.4969, "step": 17730 }, { "epoch": 1.5605207600281492, "grad_norm": 7.248235702514648, "learning_rate": 5.520300981927502e-06, "loss": 0.4402, "step": 17740 }, { "epoch": 1.5614004222378606, "grad_norm": 7.560342788696289, "learning_rate": 5.51521072729095e-06, "loss": 0.5566, "step": 17750 }, { "epoch": 1.5622800844475722, "grad_norm": 7.4975714683532715, "learning_rate": 5.510119932887902e-06, "loss": 0.5741, "step": 17760 }, { "epoch": 1.5631597466572837, "grad_norm": 7.645232200622559, "learning_rate": 5.505028604051783e-06, "loss": 0.4713, "step": 17770 }, { "epoch": 1.564039408866995, "grad_norm": 7.199230194091797, "learning_rate": 5.499936746116585e-06, "loss": 0.4524, "step": 17780 }, { "epoch": 1.5649190710767065, "grad_norm": 7.615925312042236, "learning_rate": 5.4948443644168544e-06, "loss": 0.5621, "step": 17790 }, { "epoch": 1.565798733286418, "grad_norm": 8.116165161132812, "learning_rate": 5.489751464287682e-06, "loss": 0.5333, "step": 17800 }, { "epoch": 1.5666783954961296, "grad_norm": 8.509732246398926, "learning_rate": 5.484658051064702e-06, "loss": 0.4558, "step": 17810 }, { "epoch": 1.567558057705841, "grad_norm": 6.813404083251953, "learning_rate": 5.479564130084091e-06, "loss": 0.5178, "step": 17820 }, { "epoch": 1.5684377199155524, "grad_norm": 8.47960376739502, "learning_rate": 5.4744697066825526e-06, "loss": 0.59, "step": 17830 }, { "epoch": 1.569317382125264, "grad_norm": 10.621184349060059, "learning_rate": 5.469374786197319e-06, "loss": 0.4815, "step": 17840 }, { "epoch": 1.5701970443349755, "grad_norm": 8.178923606872559, "learning_rate": 5.4642793739661435e-06, "loss": 0.5836, "step": 17850 }, { "epoch": 1.571076706544687, "grad_norm": 5.925271511077881, "learning_rate": 5.459183475327291e-06, "loss": 0.5792, "step": 17860 }, { "epoch": 1.5719563687543983, "grad_norm": 8.55307674407959, "learning_rate": 5.454087095619542e-06, "loss": 0.549, "step": 17870 }, { "epoch": 1.5728360309641096, "grad_norm": 5.913169860839844, "learning_rate": 5.448990240182179e-06, "loss": 0.4831, "step": 17880 }, { "epoch": 1.5737156931738212, "grad_norm": 6.7664475440979, "learning_rate": 5.443892914354978e-06, "loss": 0.4963, "step": 17890 }, { "epoch": 1.5745953553835328, "grad_norm": 7.793246746063232, "learning_rate": 5.4387951234782125e-06, "loss": 0.5831, "step": 17900 }, { "epoch": 1.5754750175932442, "grad_norm": 9.771756172180176, "learning_rate": 5.433696872892645e-06, "loss": 0.561, "step": 17910 }, { "epoch": 1.5763546798029555, "grad_norm": 9.113490104675293, "learning_rate": 5.428598167939513e-06, "loss": 0.4851, "step": 17920 }, { "epoch": 1.577234342012667, "grad_norm": 10.064866065979004, "learning_rate": 5.423499013960536e-06, "loss": 0.4985, "step": 17930 }, { "epoch": 1.5781140042223787, "grad_norm": 8.016467094421387, "learning_rate": 5.418399416297902e-06, "loss": 0.4947, "step": 17940 }, { "epoch": 1.57899366643209, "grad_norm": 9.686164855957031, "learning_rate": 5.413299380294264e-06, "loss": 0.5257, "step": 17950 }, { "epoch": 1.5798733286418014, "grad_norm": 7.7048234939575195, "learning_rate": 5.408198911292732e-06, "loss": 0.4529, "step": 17960 }, { "epoch": 1.580752990851513, "grad_norm": 8.216329574584961, "learning_rate": 5.403098014636874e-06, "loss": 0.5192, "step": 17970 }, { "epoch": 1.5816326530612246, "grad_norm": 8.13851547241211, "learning_rate": 5.397996695670702e-06, "loss": 0.5033, "step": 17980 }, { "epoch": 1.582512315270936, "grad_norm": 8.225299835205078, "learning_rate": 5.392894959738672e-06, "loss": 0.46, "step": 17990 }, { "epoch": 1.5833919774806473, "grad_norm": 12.155467987060547, "learning_rate": 5.387792812185677e-06, "loss": 0.4584, "step": 18000 }, { "epoch": 1.584271639690359, "grad_norm": 7.115071773529053, "learning_rate": 5.382690258357043e-06, "loss": 0.6293, "step": 18010 }, { "epoch": 1.5851513019000705, "grad_norm": 6.701413631439209, "learning_rate": 5.377587303598516e-06, "loss": 0.495, "step": 18020 }, { "epoch": 1.5860309641097818, "grad_norm": 6.874399662017822, "learning_rate": 5.372483953256269e-06, "loss": 0.4463, "step": 18030 }, { "epoch": 1.5869106263194932, "grad_norm": 7.8881754875183105, "learning_rate": 5.367380212676886e-06, "loss": 0.5297, "step": 18040 }, { "epoch": 1.5877902885292048, "grad_norm": 7.8489909172058105, "learning_rate": 5.3622760872073575e-06, "loss": 0.4876, "step": 18050 }, { "epoch": 1.5886699507389164, "grad_norm": 9.326056480407715, "learning_rate": 5.3571715821950825e-06, "loss": 0.5244, "step": 18060 }, { "epoch": 1.5895496129486277, "grad_norm": 7.389490127563477, "learning_rate": 5.352066702987853e-06, "loss": 0.5046, "step": 18070 }, { "epoch": 1.590429275158339, "grad_norm": 8.911169052124023, "learning_rate": 5.346961454933856e-06, "loss": 0.5447, "step": 18080 }, { "epoch": 1.5913089373680507, "grad_norm": 8.337925910949707, "learning_rate": 5.341855843381665e-06, "loss": 0.5671, "step": 18090 }, { "epoch": 1.5921885995777623, "grad_norm": 7.003993034362793, "learning_rate": 5.33674987368023e-06, "loss": 0.4421, "step": 18100 }, { "epoch": 1.5930682617874736, "grad_norm": 8.677210807800293, "learning_rate": 5.331643551178882e-06, "loss": 0.4575, "step": 18110 }, { "epoch": 1.593947923997185, "grad_norm": 8.936156272888184, "learning_rate": 5.326536881227319e-06, "loss": 0.4964, "step": 18120 }, { "epoch": 1.5948275862068966, "grad_norm": 11.29076862335205, "learning_rate": 5.321429869175603e-06, "loss": 0.5182, "step": 18130 }, { "epoch": 1.5957072484166082, "grad_norm": 13.741730690002441, "learning_rate": 5.316322520374152e-06, "loss": 0.5402, "step": 18140 }, { "epoch": 1.5965869106263195, "grad_norm": 10.774431228637695, "learning_rate": 5.311214840173743e-06, "loss": 0.561, "step": 18150 }, { "epoch": 1.5974665728360309, "grad_norm": 8.095342636108398, "learning_rate": 5.306106833925493e-06, "loss": 0.4872, "step": 18160 }, { "epoch": 1.5983462350457425, "grad_norm": 8.26319694519043, "learning_rate": 5.300998506980867e-06, "loss": 0.5305, "step": 18170 }, { "epoch": 1.599225897255454, "grad_norm": 9.888181686401367, "learning_rate": 5.295889864691661e-06, "loss": 0.5083, "step": 18180 }, { "epoch": 1.6001055594651654, "grad_norm": 7.415805816650391, "learning_rate": 5.2907809124100025e-06, "loss": 0.479, "step": 18190 }, { "epoch": 1.6009852216748768, "grad_norm": 6.630844593048096, "learning_rate": 5.285671655488348e-06, "loss": 0.4664, "step": 18200 }, { "epoch": 1.6018648838845884, "grad_norm": 5.828280448913574, "learning_rate": 5.280562099279468e-06, "loss": 0.576, "step": 18210 }, { "epoch": 1.6027445460943, "grad_norm": 7.743124485015869, "learning_rate": 5.275452249136448e-06, "loss": 0.4998, "step": 18220 }, { "epoch": 1.6036242083040113, "grad_norm": 9.903414726257324, "learning_rate": 5.27034211041268e-06, "loss": 0.5152, "step": 18230 }, { "epoch": 1.6045038705137227, "grad_norm": 7.996463775634766, "learning_rate": 5.265231688461865e-06, "loss": 0.5416, "step": 18240 }, { "epoch": 1.605383532723434, "grad_norm": 8.224754333496094, "learning_rate": 5.260120988637993e-06, "loss": 0.5501, "step": 18250 }, { "epoch": 1.6062631949331456, "grad_norm": 10.544371604919434, "learning_rate": 5.255010016295346e-06, "loss": 0.5143, "step": 18260 }, { "epoch": 1.6071428571428572, "grad_norm": 8.417435646057129, "learning_rate": 5.249898776788499e-06, "loss": 0.4432, "step": 18270 }, { "epoch": 1.6080225193525686, "grad_norm": 7.793869495391846, "learning_rate": 5.244787275472295e-06, "loss": 0.4792, "step": 18280 }, { "epoch": 1.60890218156228, "grad_norm": 8.55099105834961, "learning_rate": 5.2396755177018625e-06, "loss": 0.4526, "step": 18290 }, { "epoch": 1.6097818437719915, "grad_norm": 8.23194408416748, "learning_rate": 5.234563508832592e-06, "loss": 0.5418, "step": 18300 }, { "epoch": 1.610661505981703, "grad_norm": 6.657679080963135, "learning_rate": 5.229451254220138e-06, "loss": 0.5491, "step": 18310 }, { "epoch": 1.6115411681914145, "grad_norm": 8.25204086303711, "learning_rate": 5.224338759220414e-06, "loss": 0.5199, "step": 18320 }, { "epoch": 1.6124208304011258, "grad_norm": 8.041470527648926, "learning_rate": 5.2192260291895865e-06, "loss": 0.4514, "step": 18330 }, { "epoch": 1.6133004926108374, "grad_norm": 7.328383922576904, "learning_rate": 5.2141130694840646e-06, "loss": 0.5225, "step": 18340 }, { "epoch": 1.614180154820549, "grad_norm": 9.406354904174805, "learning_rate": 5.2089998854604995e-06, "loss": 0.4608, "step": 18350 }, { "epoch": 1.6150598170302604, "grad_norm": 8.976187705993652, "learning_rate": 5.203886482475779e-06, "loss": 0.5398, "step": 18360 }, { "epoch": 1.6159394792399717, "grad_norm": 8.099407196044922, "learning_rate": 5.198772865887021e-06, "loss": 0.4407, "step": 18370 }, { "epoch": 1.6168191414496833, "grad_norm": 7.891857147216797, "learning_rate": 5.19365904105156e-06, "loss": 0.5331, "step": 18380 }, { "epoch": 1.617698803659395, "grad_norm": 7.603702545166016, "learning_rate": 5.188545013326959e-06, "loss": 0.4997, "step": 18390 }, { "epoch": 1.6185784658691063, "grad_norm": 8.090417861938477, "learning_rate": 5.183430788070984e-06, "loss": 0.459, "step": 18400 }, { "epoch": 1.6194581280788176, "grad_norm": 8.449166297912598, "learning_rate": 5.178316370641617e-06, "loss": 0.4849, "step": 18410 }, { "epoch": 1.6203377902885292, "grad_norm": 7.51845645904541, "learning_rate": 5.1732017663970345e-06, "loss": 0.5601, "step": 18420 }, { "epoch": 1.6212174524982408, "grad_norm": 6.144296646118164, "learning_rate": 5.16808698069561e-06, "loss": 0.5075, "step": 18430 }, { "epoch": 1.6220971147079521, "grad_norm": 8.62205696105957, "learning_rate": 5.162972018895909e-06, "loss": 0.448, "step": 18440 }, { "epoch": 1.6229767769176635, "grad_norm": 8.085602760314941, "learning_rate": 5.157856886356684e-06, "loss": 0.4667, "step": 18450 }, { "epoch": 1.623856439127375, "grad_norm": 8.808740615844727, "learning_rate": 5.152741588436856e-06, "loss": 0.4763, "step": 18460 }, { "epoch": 1.6247361013370867, "grad_norm": 9.380142211914062, "learning_rate": 5.147626130495531e-06, "loss": 0.5092, "step": 18470 }, { "epoch": 1.625615763546798, "grad_norm": 7.201907157897949, "learning_rate": 5.142510517891978e-06, "loss": 0.4952, "step": 18480 }, { "epoch": 1.6264954257565094, "grad_norm": 11.083559036254883, "learning_rate": 5.137394755985628e-06, "loss": 0.5037, "step": 18490 }, { "epoch": 1.627375087966221, "grad_norm": 8.317154884338379, "learning_rate": 5.132278850136064e-06, "loss": 0.5253, "step": 18500 }, { "epoch": 1.6282547501759326, "grad_norm": 9.41623592376709, "learning_rate": 5.12716280570303e-06, "loss": 0.5749, "step": 18510 }, { "epoch": 1.629134412385644, "grad_norm": 6.704380035400391, "learning_rate": 5.122046628046404e-06, "loss": 0.4892, "step": 18520 }, { "epoch": 1.6300140745953553, "grad_norm": 9.552490234375, "learning_rate": 5.116930322526215e-06, "loss": 0.5233, "step": 18530 }, { "epoch": 1.6308937368050669, "grad_norm": 8.047355651855469, "learning_rate": 5.111813894502614e-06, "loss": 0.4658, "step": 18540 }, { "epoch": 1.6317733990147785, "grad_norm": 7.568130970001221, "learning_rate": 5.106697349335889e-06, "loss": 0.491, "step": 18550 }, { "epoch": 1.6326530612244898, "grad_norm": 9.576286315917969, "learning_rate": 5.101580692386447e-06, "loss": 0.502, "step": 18560 }, { "epoch": 1.6335327234342012, "grad_norm": 7.872195720672607, "learning_rate": 5.096463929014816e-06, "loss": 0.5265, "step": 18570 }, { "epoch": 1.6344123856439128, "grad_norm": 8.970274925231934, "learning_rate": 5.091347064581629e-06, "loss": 0.5141, "step": 18580 }, { "epoch": 1.6352920478536244, "grad_norm": 9.810746192932129, "learning_rate": 5.086230104447628e-06, "loss": 0.5837, "step": 18590 }, { "epoch": 1.6361717100633357, "grad_norm": 7.929157257080078, "learning_rate": 5.0811130539736595e-06, "loss": 0.5464, "step": 18600 }, { "epoch": 1.637051372273047, "grad_norm": 6.38942289352417, "learning_rate": 5.0759959185206584e-06, "loss": 0.5284, "step": 18610 }, { "epoch": 1.6379310344827587, "grad_norm": 8.097877502441406, "learning_rate": 5.0708787034496495e-06, "loss": 0.5204, "step": 18620 }, { "epoch": 1.63881069669247, "grad_norm": 9.297806739807129, "learning_rate": 5.065761414121746e-06, "loss": 0.51, "step": 18630 }, { "epoch": 1.6396903589021816, "grad_norm": 6.942878246307373, "learning_rate": 5.060644055898132e-06, "loss": 0.5201, "step": 18640 }, { "epoch": 1.640570021111893, "grad_norm": 8.472870826721191, "learning_rate": 5.0555266341400696e-06, "loss": 0.5012, "step": 18650 }, { "epoch": 1.6414496833216043, "grad_norm": 10.331345558166504, "learning_rate": 5.050409154208886e-06, "loss": 0.5741, "step": 18660 }, { "epoch": 1.642329345531316, "grad_norm": 7.54359245300293, "learning_rate": 5.045291621465965e-06, "loss": 0.4997, "step": 18670 }, { "epoch": 1.6432090077410275, "grad_norm": 6.438284397125244, "learning_rate": 5.04017404127275e-06, "loss": 0.4181, "step": 18680 }, { "epoch": 1.6440886699507389, "grad_norm": 8.356420516967773, "learning_rate": 5.035056418990738e-06, "loss": 0.4279, "step": 18690 }, { "epoch": 1.6449683321604502, "grad_norm": 10.587328910827637, "learning_rate": 5.02993875998146e-06, "loss": 0.5596, "step": 18700 }, { "epoch": 1.6458479943701618, "grad_norm": 9.875277519226074, "learning_rate": 5.0248210696064915e-06, "loss": 0.6249, "step": 18710 }, { "epoch": 1.6467276565798734, "grad_norm": 9.212162971496582, "learning_rate": 5.019703353227443e-06, "loss": 0.4976, "step": 18720 }, { "epoch": 1.6476073187895848, "grad_norm": 7.16459321975708, "learning_rate": 5.014585616205947e-06, "loss": 0.5379, "step": 18730 }, { "epoch": 1.6484869809992961, "grad_norm": 8.792413711547852, "learning_rate": 5.0094678639036605e-06, "loss": 0.5052, "step": 18740 }, { "epoch": 1.6493666432090077, "grad_norm": 9.222097396850586, "learning_rate": 5.004350101682258e-06, "loss": 0.47, "step": 18750 }, { "epoch": 1.6502463054187193, "grad_norm": 9.470173835754395, "learning_rate": 4.999232334903419e-06, "loss": 0.4872, "step": 18760 }, { "epoch": 1.6511259676284307, "grad_norm": 7.915321350097656, "learning_rate": 4.994114568928836e-06, "loss": 0.453, "step": 18770 }, { "epoch": 1.652005629838142, "grad_norm": 6.046530723571777, "learning_rate": 4.988996809120191e-06, "loss": 0.4986, "step": 18780 }, { "epoch": 1.6528852920478536, "grad_norm": 7.445747375488281, "learning_rate": 4.983879060839167e-06, "loss": 0.4688, "step": 18790 }, { "epoch": 1.6537649542575652, "grad_norm": 6.9931864738464355, "learning_rate": 4.978761329447431e-06, "loss": 0.4691, "step": 18800 }, { "epoch": 1.6546446164672766, "grad_norm": 8.201911926269531, "learning_rate": 4.9736436203066365e-06, "loss": 0.4772, "step": 18810 }, { "epoch": 1.655524278676988, "grad_norm": 7.16725492477417, "learning_rate": 4.968525938778408e-06, "loss": 0.4716, "step": 18820 }, { "epoch": 1.6564039408866995, "grad_norm": 8.6710786819458, "learning_rate": 4.963408290224347e-06, "loss": 0.5141, "step": 18830 }, { "epoch": 1.657283603096411, "grad_norm": 7.003940105438232, "learning_rate": 4.958290680006014e-06, "loss": 0.479, "step": 18840 }, { "epoch": 1.6581632653061225, "grad_norm": 8.134620666503906, "learning_rate": 4.953173113484933e-06, "loss": 0.4841, "step": 18850 }, { "epoch": 1.6590429275158338, "grad_norm": 7.255366802215576, "learning_rate": 4.948055596022585e-06, "loss": 0.4896, "step": 18860 }, { "epoch": 1.6599225897255454, "grad_norm": 10.75046443939209, "learning_rate": 4.942938132980393e-06, "loss": 0.502, "step": 18870 }, { "epoch": 1.660802251935257, "grad_norm": 6.463998317718506, "learning_rate": 4.937820729719729e-06, "loss": 0.5219, "step": 18880 }, { "epoch": 1.6616819141449684, "grad_norm": 6.43196964263916, "learning_rate": 4.9327033916019e-06, "loss": 0.5324, "step": 18890 }, { "epoch": 1.6625615763546797, "grad_norm": 5.982107639312744, "learning_rate": 4.927586123988144e-06, "loss": 0.4809, "step": 18900 }, { "epoch": 1.6634412385643913, "grad_norm": 10.212921142578125, "learning_rate": 4.922468932239625e-06, "loss": 0.499, "step": 18910 }, { "epoch": 1.6643209007741029, "grad_norm": 8.896707534790039, "learning_rate": 4.917351821717428e-06, "loss": 0.5509, "step": 18920 }, { "epoch": 1.6652005629838142, "grad_norm": 9.806184768676758, "learning_rate": 4.912234797782554e-06, "loss": 0.5635, "step": 18930 }, { "epoch": 1.6660802251935256, "grad_norm": 7.429471492767334, "learning_rate": 4.907117865795915e-06, "loss": 0.5032, "step": 18940 }, { "epoch": 1.6669598874032372, "grad_norm": 10.686758041381836, "learning_rate": 4.902001031118323e-06, "loss": 0.5952, "step": 18950 }, { "epoch": 1.6678395496129488, "grad_norm": 7.804841041564941, "learning_rate": 4.896884299110485e-06, "loss": 0.4575, "step": 18960 }, { "epoch": 1.6687192118226601, "grad_norm": 7.40811824798584, "learning_rate": 4.891767675133009e-06, "loss": 0.4761, "step": 18970 }, { "epoch": 1.6695988740323715, "grad_norm": 8.52443790435791, "learning_rate": 4.886651164546384e-06, "loss": 0.4946, "step": 18980 }, { "epoch": 1.670478536242083, "grad_norm": 6.987000942230225, "learning_rate": 4.881534772710982e-06, "loss": 0.4814, "step": 18990 }, { "epoch": 1.6713581984517947, "grad_norm": 7.895609378814697, "learning_rate": 4.87641850498705e-06, "loss": 0.5, "step": 19000 }, { "epoch": 1.672237860661506, "grad_norm": 6.746887683868408, "learning_rate": 4.871302366734708e-06, "loss": 0.5291, "step": 19010 }, { "epoch": 1.6731175228712174, "grad_norm": 6.901947975158691, "learning_rate": 4.866186363313933e-06, "loss": 0.5604, "step": 19020 }, { "epoch": 1.6739971850809288, "grad_norm": 8.729240417480469, "learning_rate": 4.861070500084568e-06, "loss": 0.5156, "step": 19030 }, { "epoch": 1.6748768472906403, "grad_norm": 7.747799873352051, "learning_rate": 4.855954782406306e-06, "loss": 0.5451, "step": 19040 }, { "epoch": 1.675756509500352, "grad_norm": 9.823748588562012, "learning_rate": 4.850839215638687e-06, "loss": 0.5858, "step": 19050 }, { "epoch": 1.6766361717100633, "grad_norm": 6.713179111480713, "learning_rate": 4.845723805141097e-06, "loss": 0.4578, "step": 19060 }, { "epoch": 1.6775158339197747, "grad_norm": 7.625060558319092, "learning_rate": 4.840608556272755e-06, "loss": 0.4206, "step": 19070 }, { "epoch": 1.6783954961294862, "grad_norm": 8.353745460510254, "learning_rate": 4.835493474392705e-06, "loss": 0.5092, "step": 19080 }, { "epoch": 1.6792751583391978, "grad_norm": 5.34041690826416, "learning_rate": 4.830378564859827e-06, "loss": 0.4488, "step": 19090 }, { "epoch": 1.6801548205489092, "grad_norm": 7.969494819641113, "learning_rate": 4.825263833032815e-06, "loss": 0.5283, "step": 19100 }, { "epoch": 1.6810344827586206, "grad_norm": 7.038054943084717, "learning_rate": 4.820149284270175e-06, "loss": 0.4309, "step": 19110 }, { "epoch": 1.6819141449683321, "grad_norm": 10.184945106506348, "learning_rate": 4.815034923930225e-06, "loss": 0.616, "step": 19120 }, { "epoch": 1.6827938071780437, "grad_norm": 8.030872344970703, "learning_rate": 4.809920757371087e-06, "loss": 0.4823, "step": 19130 }, { "epoch": 1.683673469387755, "grad_norm": 7.690305233001709, "learning_rate": 4.80480678995067e-06, "loss": 0.4832, "step": 19140 }, { "epoch": 1.6845531315974664, "grad_norm": 7.110231399536133, "learning_rate": 4.799693027026688e-06, "loss": 0.4743, "step": 19150 }, { "epoch": 1.685432793807178, "grad_norm": 7.585033893585205, "learning_rate": 4.794579473956627e-06, "loss": 0.4867, "step": 19160 }, { "epoch": 1.6863124560168896, "grad_norm": 7.0182085037231445, "learning_rate": 4.789466136097767e-06, "loss": 0.4867, "step": 19170 }, { "epoch": 1.687192118226601, "grad_norm": 10.421906471252441, "learning_rate": 4.784353018807153e-06, "loss": 0.5706, "step": 19180 }, { "epoch": 1.6880717804363123, "grad_norm": 7.458887100219727, "learning_rate": 4.7792401274416035e-06, "loss": 0.525, "step": 19190 }, { "epoch": 1.688951442646024, "grad_norm": 10.1382417678833, "learning_rate": 4.774127467357694e-06, "loss": 0.4677, "step": 19200 }, { "epoch": 1.6898311048557355, "grad_norm": 7.3929667472839355, "learning_rate": 4.769015043911765e-06, "loss": 0.4678, "step": 19210 }, { "epoch": 1.6907107670654469, "grad_norm": 8.535226821899414, "learning_rate": 4.763902862459907e-06, "loss": 0.4925, "step": 19220 }, { "epoch": 1.6915904292751582, "grad_norm": 7.917834758758545, "learning_rate": 4.7587909283579555e-06, "loss": 0.4851, "step": 19230 }, { "epoch": 1.6924700914848698, "grad_norm": 10.267107963562012, "learning_rate": 4.753679246961487e-06, "loss": 0.548, "step": 19240 }, { "epoch": 1.6933497536945814, "grad_norm": 6.623278617858887, "learning_rate": 4.748567823625816e-06, "loss": 0.459, "step": 19250 }, { "epoch": 1.6942294159042928, "grad_norm": 5.702402114868164, "learning_rate": 4.743456663705981e-06, "loss": 0.5265, "step": 19260 }, { "epoch": 1.6951090781140041, "grad_norm": 8.48918628692627, "learning_rate": 4.73834577255675e-06, "loss": 0.4263, "step": 19270 }, { "epoch": 1.6959887403237157, "grad_norm": 7.109251499176025, "learning_rate": 4.733235155532608e-06, "loss": 0.5624, "step": 19280 }, { "epoch": 1.6968684025334273, "grad_norm": 6.972310543060303, "learning_rate": 4.728124817987751e-06, "loss": 0.5439, "step": 19290 }, { "epoch": 1.6977480647431387, "grad_norm": 11.689125061035156, "learning_rate": 4.723014765276084e-06, "loss": 0.5361, "step": 19300 }, { "epoch": 1.69862772695285, "grad_norm": 8.233918190002441, "learning_rate": 4.717905002751214e-06, "loss": 0.4933, "step": 19310 }, { "epoch": 1.6995073891625616, "grad_norm": 7.6206536293029785, "learning_rate": 4.712795535766442e-06, "loss": 0.481, "step": 19320 }, { "epoch": 1.7003870513722732, "grad_norm": 8.69529914855957, "learning_rate": 4.707686369674758e-06, "loss": 0.4418, "step": 19330 }, { "epoch": 1.7012667135819846, "grad_norm": 8.616668701171875, "learning_rate": 4.702577509828843e-06, "loss": 0.5218, "step": 19340 }, { "epoch": 1.702146375791696, "grad_norm": 6.761985778808594, "learning_rate": 4.697468961581053e-06, "loss": 0.4134, "step": 19350 }, { "epoch": 1.7030260380014075, "grad_norm": 7.77575159072876, "learning_rate": 4.692360730283415e-06, "loss": 0.4621, "step": 19360 }, { "epoch": 1.703905700211119, "grad_norm": 7.558733940124512, "learning_rate": 4.68725282128763e-06, "loss": 0.4848, "step": 19370 }, { "epoch": 1.7047853624208305, "grad_norm": 10.275272369384766, "learning_rate": 4.682145239945057e-06, "loss": 0.5094, "step": 19380 }, { "epoch": 1.7056650246305418, "grad_norm": 9.148167610168457, "learning_rate": 4.677037991606712e-06, "loss": 0.5615, "step": 19390 }, { "epoch": 1.7065446868402532, "grad_norm": 9.176121711730957, "learning_rate": 4.671931081623262e-06, "loss": 0.5595, "step": 19400 }, { "epoch": 1.7074243490499648, "grad_norm": 7.8880934715271, "learning_rate": 4.666824515345023e-06, "loss": 0.5472, "step": 19410 }, { "epoch": 1.7083040112596763, "grad_norm": 6.652556419372559, "learning_rate": 4.661718298121948e-06, "loss": 0.4692, "step": 19420 }, { "epoch": 1.7091836734693877, "grad_norm": 8.722001075744629, "learning_rate": 4.656612435303625e-06, "loss": 0.5437, "step": 19430 }, { "epoch": 1.710063335679099, "grad_norm": 7.561027526855469, "learning_rate": 4.651506932239269e-06, "loss": 0.4466, "step": 19440 }, { "epoch": 1.7109429978888107, "grad_norm": 7.944461822509766, "learning_rate": 4.646401794277719e-06, "loss": 0.4169, "step": 19450 }, { "epoch": 1.7118226600985222, "grad_norm": 7.313119888305664, "learning_rate": 4.641297026767434e-06, "loss": 0.4482, "step": 19460 }, { "epoch": 1.7127023223082336, "grad_norm": 8.1514310836792, "learning_rate": 4.636192635056483e-06, "loss": 0.4895, "step": 19470 }, { "epoch": 1.713581984517945, "grad_norm": 6.347132682800293, "learning_rate": 4.631088624492539e-06, "loss": 0.4358, "step": 19480 }, { "epoch": 1.7144616467276566, "grad_norm": 7.031516075134277, "learning_rate": 4.625985000422882e-06, "loss": 0.4968, "step": 19490 }, { "epoch": 1.7153413089373681, "grad_norm": 8.228424072265625, "learning_rate": 4.620881768194378e-06, "loss": 0.5898, "step": 19500 }, { "epoch": 1.7162209711470795, "grad_norm": 7.594395637512207, "learning_rate": 4.6157789331534905e-06, "loss": 0.5564, "step": 19510 }, { "epoch": 1.7171006333567909, "grad_norm": 7.284217357635498, "learning_rate": 4.610676500646263e-06, "loss": 0.4751, "step": 19520 }, { "epoch": 1.7179802955665024, "grad_norm": 7.971425533294678, "learning_rate": 4.605574476018318e-06, "loss": 0.5538, "step": 19530 }, { "epoch": 1.718859957776214, "grad_norm": 7.310335159301758, "learning_rate": 4.600472864614852e-06, "loss": 0.5694, "step": 19540 }, { "epoch": 1.7197396199859254, "grad_norm": 9.49163818359375, "learning_rate": 4.595371671780626e-06, "loss": 0.4874, "step": 19550 }, { "epoch": 1.7206192821956368, "grad_norm": 6.094480037689209, "learning_rate": 4.590270902859965e-06, "loss": 0.4633, "step": 19560 }, { "epoch": 1.7214989444053483, "grad_norm": 7.395593643188477, "learning_rate": 4.585170563196748e-06, "loss": 0.5066, "step": 19570 }, { "epoch": 1.72237860661506, "grad_norm": 6.025745391845703, "learning_rate": 4.5800706581344034e-06, "loss": 0.4935, "step": 19580 }, { "epoch": 1.7232582688247713, "grad_norm": 7.134429931640625, "learning_rate": 4.574971193015909e-06, "loss": 0.411, "step": 19590 }, { "epoch": 1.7241379310344827, "grad_norm": 7.688818454742432, "learning_rate": 4.569872173183777e-06, "loss": 0.4514, "step": 19600 }, { "epoch": 1.7250175932441942, "grad_norm": 7.880807399749756, "learning_rate": 4.564773603980058e-06, "loss": 0.4853, "step": 19610 }, { "epoch": 1.7258972554539058, "grad_norm": 11.366028785705566, "learning_rate": 4.559675490746321e-06, "loss": 0.5501, "step": 19620 }, { "epoch": 1.7267769176636172, "grad_norm": 9.006085395812988, "learning_rate": 4.554577838823667e-06, "loss": 0.4506, "step": 19630 }, { "epoch": 1.7276565798733285, "grad_norm": 10.0860595703125, "learning_rate": 4.549480653552711e-06, "loss": 0.5357, "step": 19640 }, { "epoch": 1.7285362420830401, "grad_norm": 6.807350158691406, "learning_rate": 4.544383940273577e-06, "loss": 0.5094, "step": 19650 }, { "epoch": 1.7294159042927517, "grad_norm": 8.120041847229004, "learning_rate": 4.539287704325894e-06, "loss": 0.4566, "step": 19660 }, { "epoch": 1.730295566502463, "grad_norm": 9.167254447937012, "learning_rate": 4.534191951048799e-06, "loss": 0.5618, "step": 19670 }, { "epoch": 1.7311752287121744, "grad_norm": 8.353686332702637, "learning_rate": 4.529096685780911e-06, "loss": 0.4784, "step": 19680 }, { "epoch": 1.732054890921886, "grad_norm": 9.319416999816895, "learning_rate": 4.524001913860346e-06, "loss": 0.4984, "step": 19690 }, { "epoch": 1.7329345531315976, "grad_norm": 8.060443878173828, "learning_rate": 4.518907640624699e-06, "loss": 0.571, "step": 19700 }, { "epoch": 1.733814215341309, "grad_norm": 10.539557456970215, "learning_rate": 4.513813871411047e-06, "loss": 0.4161, "step": 19710 }, { "epoch": 1.7346938775510203, "grad_norm": 11.245636940002441, "learning_rate": 4.508720611555936e-06, "loss": 0.5246, "step": 19720 }, { "epoch": 1.735573539760732, "grad_norm": 7.924689769744873, "learning_rate": 4.503627866395379e-06, "loss": 0.4812, "step": 19730 }, { "epoch": 1.7364532019704435, "grad_norm": 11.435842514038086, "learning_rate": 4.498535641264848e-06, "loss": 0.5242, "step": 19740 }, { "epoch": 1.7373328641801549, "grad_norm": 6.495911121368408, "learning_rate": 4.493443941499273e-06, "loss": 0.4995, "step": 19750 }, { "epoch": 1.7382125263898662, "grad_norm": 7.940772533416748, "learning_rate": 4.4883527724330315e-06, "loss": 0.4612, "step": 19760 }, { "epoch": 1.7390921885995778, "grad_norm": 9.52241325378418, "learning_rate": 4.483262139399947e-06, "loss": 0.5037, "step": 19770 }, { "epoch": 1.7399718508092892, "grad_norm": 5.566858291625977, "learning_rate": 4.478172047733278e-06, "loss": 0.4788, "step": 19780 }, { "epoch": 1.7408515130190008, "grad_norm": 7.550381660461426, "learning_rate": 4.4730825027657236e-06, "loss": 0.5191, "step": 19790 }, { "epoch": 1.7417311752287121, "grad_norm": 5.1750078201293945, "learning_rate": 4.467993509829399e-06, "loss": 0.3598, "step": 19800 }, { "epoch": 1.7426108374384235, "grad_norm": 7.092970848083496, "learning_rate": 4.462905074255847e-06, "loss": 0.4479, "step": 19810 }, { "epoch": 1.743490499648135, "grad_norm": 7.600279808044434, "learning_rate": 4.457817201376028e-06, "loss": 0.4817, "step": 19820 }, { "epoch": 1.7443701618578467, "grad_norm": 7.333839416503906, "learning_rate": 4.4527298965203135e-06, "loss": 0.452, "step": 19830 }, { "epoch": 1.745249824067558, "grad_norm": 10.363202095031738, "learning_rate": 4.447643165018472e-06, "loss": 0.4865, "step": 19840 }, { "epoch": 1.7461294862772694, "grad_norm": 9.491141319274902, "learning_rate": 4.442557012199683e-06, "loss": 0.4804, "step": 19850 }, { "epoch": 1.747009148486981, "grad_norm": 8.282683372497559, "learning_rate": 4.437471443392507e-06, "loss": 0.5057, "step": 19860 }, { "epoch": 1.7478888106966926, "grad_norm": 6.724813938140869, "learning_rate": 4.432386463924902e-06, "loss": 0.5242, "step": 19870 }, { "epoch": 1.748768472906404, "grad_norm": 9.298612594604492, "learning_rate": 4.4273020791242085e-06, "loss": 0.4979, "step": 19880 }, { "epoch": 1.7496481351161153, "grad_norm": 8.195066452026367, "learning_rate": 4.422218294317136e-06, "loss": 0.5524, "step": 19890 }, { "epoch": 1.7505277973258269, "grad_norm": 6.91118860244751, "learning_rate": 4.417135114829775e-06, "loss": 0.4865, "step": 19900 }, { "epoch": 1.7514074595355384, "grad_norm": 9.835691452026367, "learning_rate": 4.412052545987578e-06, "loss": 0.4881, "step": 19910 }, { "epoch": 1.7522871217452498, "grad_norm": 6.434234619140625, "learning_rate": 4.406970593115355e-06, "loss": 0.4696, "step": 19920 }, { "epoch": 1.7531667839549612, "grad_norm": 5.69167423248291, "learning_rate": 4.4018892615372705e-06, "loss": 0.4991, "step": 19930 }, { "epoch": 1.7540464461646728, "grad_norm": 6.337332725524902, "learning_rate": 4.396808556576844e-06, "loss": 0.4676, "step": 19940 }, { "epoch": 1.7549261083743843, "grad_norm": 7.536810398101807, "learning_rate": 4.391728483556933e-06, "loss": 0.4678, "step": 19950 }, { "epoch": 1.7558057705840957, "grad_norm": 8.849397659301758, "learning_rate": 4.386649047799739e-06, "loss": 0.4097, "step": 19960 }, { "epoch": 1.756685432793807, "grad_norm": 8.359367370605469, "learning_rate": 4.381570254626789e-06, "loss": 0.4672, "step": 19970 }, { "epoch": 1.7575650950035187, "grad_norm": 11.458508491516113, "learning_rate": 4.376492109358937e-06, "loss": 0.5676, "step": 19980 }, { "epoch": 1.7584447572132302, "grad_norm": 9.524081230163574, "learning_rate": 4.3714146173163644e-06, "loss": 0.4786, "step": 19990 }, { "epoch": 1.7593244194229416, "grad_norm": 7.643036842346191, "learning_rate": 4.3663377838185646e-06, "loss": 0.4964, "step": 20000 }, { "epoch": 1.760204081632653, "grad_norm": 6.020983695983887, "learning_rate": 4.361261614184339e-06, "loss": 0.4235, "step": 20010 }, { "epoch": 1.7610837438423645, "grad_norm": 6.450713634490967, "learning_rate": 4.3561861137317965e-06, "loss": 0.4479, "step": 20020 }, { "epoch": 1.7619634060520761, "grad_norm": 9.19697380065918, "learning_rate": 4.351111287778347e-06, "loss": 0.4523, "step": 20030 }, { "epoch": 1.7628430682617875, "grad_norm": 9.350509643554688, "learning_rate": 4.346037141640686e-06, "loss": 0.4524, "step": 20040 }, { "epoch": 1.7637227304714989, "grad_norm": 5.983992099761963, "learning_rate": 4.340963680634805e-06, "loss": 0.4814, "step": 20050 }, { "epoch": 1.7646023926812104, "grad_norm": 7.474555969238281, "learning_rate": 4.335890910075972e-06, "loss": 0.461, "step": 20060 }, { "epoch": 1.765482054890922, "grad_norm": 6.530849933624268, "learning_rate": 4.330818835278735e-06, "loss": 0.4814, "step": 20070 }, { "epoch": 1.7663617171006334, "grad_norm": 6.77523946762085, "learning_rate": 4.325747461556913e-06, "loss": 0.4783, "step": 20080 }, { "epoch": 1.7672413793103448, "grad_norm": 6.520811080932617, "learning_rate": 4.320676794223589e-06, "loss": 0.4373, "step": 20090 }, { "epoch": 1.7681210415200563, "grad_norm": 10.562407493591309, "learning_rate": 4.315606838591106e-06, "loss": 0.5649, "step": 20100 }, { "epoch": 1.769000703729768, "grad_norm": 10.599225044250488, "learning_rate": 4.31053759997106e-06, "loss": 0.496, "step": 20110 }, { "epoch": 1.7698803659394793, "grad_norm": 9.208768844604492, "learning_rate": 4.305469083674302e-06, "loss": 0.4866, "step": 20120 }, { "epoch": 1.7707600281491906, "grad_norm": 9.551877975463867, "learning_rate": 4.300401295010918e-06, "loss": 0.4789, "step": 20130 }, { "epoch": 1.7716396903589022, "grad_norm": 7.35713005065918, "learning_rate": 4.295334239290237e-06, "loss": 0.4896, "step": 20140 }, { "epoch": 1.7725193525686138, "grad_norm": 7.814120292663574, "learning_rate": 4.290267921820822e-06, "loss": 0.4539, "step": 20150 }, { "epoch": 1.7733990147783252, "grad_norm": 7.448631286621094, "learning_rate": 4.285202347910453e-06, "loss": 0.4881, "step": 20160 }, { "epoch": 1.7742786769880365, "grad_norm": 10.749679565429688, "learning_rate": 4.280137522866142e-06, "loss": 0.5768, "step": 20170 }, { "epoch": 1.775158339197748, "grad_norm": 10.469120979309082, "learning_rate": 4.2750734519941095e-06, "loss": 0.5339, "step": 20180 }, { "epoch": 1.7760380014074595, "grad_norm": 7.920567035675049, "learning_rate": 4.270010140599789e-06, "loss": 0.4652, "step": 20190 }, { "epoch": 1.776917663617171, "grad_norm": 7.047778129577637, "learning_rate": 4.264947593987818e-06, "loss": 0.4509, "step": 20200 }, { "epoch": 1.7777973258268824, "grad_norm": 8.916488647460938, "learning_rate": 4.259885817462033e-06, "loss": 0.521, "step": 20210 }, { "epoch": 1.7786769880365938, "grad_norm": 7.504305839538574, "learning_rate": 4.254824816325459e-06, "loss": 0.5636, "step": 20220 }, { "epoch": 1.7795566502463054, "grad_norm": 6.23654842376709, "learning_rate": 4.249764595880315e-06, "loss": 0.4635, "step": 20230 }, { "epoch": 1.780436312456017, "grad_norm": 8.079662322998047, "learning_rate": 4.244705161428002e-06, "loss": 0.4762, "step": 20240 }, { "epoch": 1.7813159746657283, "grad_norm": 8.146095275878906, "learning_rate": 4.239646518269091e-06, "loss": 0.4913, "step": 20250 }, { "epoch": 1.7821956368754397, "grad_norm": 7.490260601043701, "learning_rate": 4.234588671703331e-06, "loss": 0.4489, "step": 20260 }, { "epoch": 1.7830752990851513, "grad_norm": 7.344233512878418, "learning_rate": 4.229531627029634e-06, "loss": 0.4545, "step": 20270 }, { "epoch": 1.7839549612948629, "grad_norm": 8.956254005432129, "learning_rate": 4.224475389546069e-06, "loss": 0.4384, "step": 20280 }, { "epoch": 1.7848346235045742, "grad_norm": 9.236567497253418, "learning_rate": 4.219419964549863e-06, "loss": 0.476, "step": 20290 }, { "epoch": 1.7857142857142856, "grad_norm": 11.641026496887207, "learning_rate": 4.214365357337391e-06, "loss": 0.4904, "step": 20300 }, { "epoch": 1.7865939479239972, "grad_norm": 9.328904151916504, "learning_rate": 4.209311573204171e-06, "loss": 0.4343, "step": 20310 }, { "epoch": 1.7874736101337088, "grad_norm": 7.2645368576049805, "learning_rate": 4.20425861744486e-06, "loss": 0.4695, "step": 20320 }, { "epoch": 1.7883532723434201, "grad_norm": 6.68594217300415, "learning_rate": 4.199206495353245e-06, "loss": 0.5064, "step": 20330 }, { "epoch": 1.7892329345531315, "grad_norm": 6.861854553222656, "learning_rate": 4.194155212222237e-06, "loss": 0.4728, "step": 20340 }, { "epoch": 1.790112596762843, "grad_norm": 7.195733547210693, "learning_rate": 4.189104773343875e-06, "loss": 0.4569, "step": 20350 }, { "epoch": 1.7909922589725547, "grad_norm": 7.491269588470459, "learning_rate": 4.1840551840093076e-06, "loss": 0.5571, "step": 20360 }, { "epoch": 1.791871921182266, "grad_norm": 6.98841667175293, "learning_rate": 4.1790064495087984e-06, "loss": 0.4674, "step": 20370 }, { "epoch": 1.7927515833919774, "grad_norm": 9.678778648376465, "learning_rate": 4.173958575131711e-06, "loss": 0.5656, "step": 20380 }, { "epoch": 1.793631245601689, "grad_norm": 5.996384620666504, "learning_rate": 4.16891156616651e-06, "loss": 0.4835, "step": 20390 }, { "epoch": 1.7945109078114005, "grad_norm": 7.109838485717773, "learning_rate": 4.163865427900753e-06, "loss": 0.5251, "step": 20400 }, { "epoch": 1.795390570021112, "grad_norm": 7.697781085968018, "learning_rate": 4.158820165621085e-06, "loss": 0.492, "step": 20410 }, { "epoch": 1.7962702322308233, "grad_norm": 7.461838245391846, "learning_rate": 4.153775784613233e-06, "loss": 0.5239, "step": 20420 }, { "epoch": 1.7971498944405349, "grad_norm": 8.046464920043945, "learning_rate": 4.148732290162002e-06, "loss": 0.4621, "step": 20430 }, { "epoch": 1.7980295566502464, "grad_norm": 7.613962173461914, "learning_rate": 4.143689687551269e-06, "loss": 0.4971, "step": 20440 }, { "epoch": 1.7989092188599578, "grad_norm": 8.291379928588867, "learning_rate": 4.1386479820639724e-06, "loss": 0.4737, "step": 20450 }, { "epoch": 1.7997888810696692, "grad_norm": 8.369001388549805, "learning_rate": 4.133607178982115e-06, "loss": 0.5027, "step": 20460 }, { "epoch": 1.8006685432793808, "grad_norm": 7.539218425750732, "learning_rate": 4.128567283586752e-06, "loss": 0.4881, "step": 20470 }, { "epoch": 1.8015482054890923, "grad_norm": 8.185629844665527, "learning_rate": 4.123528301157988e-06, "loss": 0.5174, "step": 20480 }, { "epoch": 1.8024278676988037, "grad_norm": 8.919824600219727, "learning_rate": 4.118490236974974e-06, "loss": 0.5169, "step": 20490 }, { "epoch": 1.803307529908515, "grad_norm": 9.852577209472656, "learning_rate": 4.113453096315893e-06, "loss": 0.4129, "step": 20500 }, { "epoch": 1.8041871921182266, "grad_norm": 6.425495624542236, "learning_rate": 4.108416884457967e-06, "loss": 0.4236, "step": 20510 }, { "epoch": 1.8050668543279382, "grad_norm": 6.3195481300354, "learning_rate": 4.10338160667744e-06, "loss": 0.4387, "step": 20520 }, { "epoch": 1.8059465165376496, "grad_norm": 6.652390003204346, "learning_rate": 4.098347268249578e-06, "loss": 0.4592, "step": 20530 }, { "epoch": 1.806826178747361, "grad_norm": 9.438950538635254, "learning_rate": 4.093313874448666e-06, "loss": 0.3992, "step": 20540 }, { "epoch": 1.8077058409570723, "grad_norm": 8.946706771850586, "learning_rate": 4.0882814305479965e-06, "loss": 0.5049, "step": 20550 }, { "epoch": 1.808585503166784, "grad_norm": 9.852784156799316, "learning_rate": 4.0832499418198685e-06, "loss": 0.4306, "step": 20560 }, { "epoch": 1.8094651653764955, "grad_norm": 9.67101001739502, "learning_rate": 4.078219413535582e-06, "loss": 0.5976, "step": 20570 }, { "epoch": 1.8103448275862069, "grad_norm": 10.924999237060547, "learning_rate": 4.073189850965422e-06, "loss": 0.4328, "step": 20580 }, { "epoch": 1.8112244897959182, "grad_norm": 10.172835350036621, "learning_rate": 4.068161259378673e-06, "loss": 0.4919, "step": 20590 }, { "epoch": 1.8121041520056298, "grad_norm": 8.585433959960938, "learning_rate": 4.063133644043596e-06, "loss": 0.4636, "step": 20600 }, { "epoch": 1.8129838142153414, "grad_norm": 7.822171688079834, "learning_rate": 4.05810701022743e-06, "loss": 0.4481, "step": 20610 }, { "epoch": 1.8138634764250527, "grad_norm": 9.234576225280762, "learning_rate": 4.053081363196388e-06, "loss": 0.4822, "step": 20620 }, { "epoch": 1.814743138634764, "grad_norm": 11.584358215332031, "learning_rate": 4.0480567082156465e-06, "loss": 0.4614, "step": 20630 }, { "epoch": 1.8156228008444757, "grad_norm": 7.305027484893799, "learning_rate": 4.043033050549342e-06, "loss": 0.4518, "step": 20640 }, { "epoch": 1.8165024630541873, "grad_norm": 9.79561710357666, "learning_rate": 4.038010395460569e-06, "loss": 0.5148, "step": 20650 }, { "epoch": 1.8173821252638986, "grad_norm": 6.685495376586914, "learning_rate": 4.0329887482113724e-06, "loss": 0.4181, "step": 20660 }, { "epoch": 1.81826178747361, "grad_norm": 8.892301559448242, "learning_rate": 4.027968114062736e-06, "loss": 0.5229, "step": 20670 }, { "epoch": 1.8191414496833216, "grad_norm": 6.283373832702637, "learning_rate": 4.0229484982745865e-06, "loss": 0.5436, "step": 20680 }, { "epoch": 1.8200211118930332, "grad_norm": 10.707183837890625, "learning_rate": 4.017929906105785e-06, "loss": 0.4756, "step": 20690 }, { "epoch": 1.8209007741027445, "grad_norm": 9.68073844909668, "learning_rate": 4.012912342814115e-06, "loss": 0.4937, "step": 20700 }, { "epoch": 1.821780436312456, "grad_norm": 10.067551612854004, "learning_rate": 4.0078958136562855e-06, "loss": 0.4352, "step": 20710 }, { "epoch": 1.8226600985221675, "grad_norm": 7.391679286956787, "learning_rate": 4.00288032388792e-06, "loss": 0.4542, "step": 20720 }, { "epoch": 1.823539760731879, "grad_norm": 7.3979973793029785, "learning_rate": 3.997865878763559e-06, "loss": 0.4546, "step": 20730 }, { "epoch": 1.8244194229415904, "grad_norm": 9.343127250671387, "learning_rate": 3.9928524835366394e-06, "loss": 0.4393, "step": 20740 }, { "epoch": 1.8252990851513018, "grad_norm": 10.168795585632324, "learning_rate": 3.9878401434595075e-06, "loss": 0.5157, "step": 20750 }, { "epoch": 1.8261787473610134, "grad_norm": 8.134392738342285, "learning_rate": 3.982828863783395e-06, "loss": 0.4605, "step": 20760 }, { "epoch": 1.827058409570725, "grad_norm": 8.217068672180176, "learning_rate": 3.977818649758429e-06, "loss": 0.5724, "step": 20770 }, { "epoch": 1.8279380717804363, "grad_norm": 9.004444122314453, "learning_rate": 3.9728095066336195e-06, "loss": 0.4783, "step": 20780 }, { "epoch": 1.8288177339901477, "grad_norm": 7.699681758880615, "learning_rate": 3.967801439656851e-06, "loss": 0.4509, "step": 20790 }, { "epoch": 1.8296973961998593, "grad_norm": 7.873967170715332, "learning_rate": 3.962794454074886e-06, "loss": 0.5063, "step": 20800 }, { "epoch": 1.8305770584095709, "grad_norm": 7.7679362297058105, "learning_rate": 3.957788555133351e-06, "loss": 0.492, "step": 20810 }, { "epoch": 1.8314567206192822, "grad_norm": 12.936326026916504, "learning_rate": 3.9527837480767314e-06, "loss": 0.5865, "step": 20820 }, { "epoch": 1.8323363828289936, "grad_norm": 8.51690673828125, "learning_rate": 3.947780038148371e-06, "loss": 0.4733, "step": 20830 }, { "epoch": 1.8332160450387052, "grad_norm": 8.76525592803955, "learning_rate": 3.942777430590468e-06, "loss": 0.4529, "step": 20840 }, { "epoch": 1.8340957072484168, "grad_norm": 8.98110294342041, "learning_rate": 3.93777593064406e-06, "loss": 0.5061, "step": 20850 }, { "epoch": 1.8349753694581281, "grad_norm": 9.845206260681152, "learning_rate": 3.932775543549027e-06, "loss": 0.5038, "step": 20860 }, { "epoch": 1.8358550316678395, "grad_norm": 10.569867134094238, "learning_rate": 3.927776274544084e-06, "loss": 0.4638, "step": 20870 }, { "epoch": 1.836734693877551, "grad_norm": 7.695403099060059, "learning_rate": 3.92277812886677e-06, "loss": 0.526, "step": 20880 }, { "epoch": 1.8376143560872626, "grad_norm": 5.368588924407959, "learning_rate": 3.917781111753452e-06, "loss": 0.5162, "step": 20890 }, { "epoch": 1.838494018296974, "grad_norm": 7.572543144226074, "learning_rate": 3.912785228439314e-06, "loss": 0.4973, "step": 20900 }, { "epoch": 1.8393736805066854, "grad_norm": 7.881800174713135, "learning_rate": 3.907790484158349e-06, "loss": 0.5241, "step": 20910 }, { "epoch": 1.840253342716397, "grad_norm": 8.921191215515137, "learning_rate": 3.9027968841433614e-06, "loss": 0.4748, "step": 20920 }, { "epoch": 1.8411330049261085, "grad_norm": 10.672125816345215, "learning_rate": 3.897804433625954e-06, "loss": 0.5125, "step": 20930 }, { "epoch": 1.84201266713582, "grad_norm": 7.901500701904297, "learning_rate": 3.892813137836523e-06, "loss": 0.4824, "step": 20940 }, { "epoch": 1.8428923293455313, "grad_norm": 8.483839988708496, "learning_rate": 3.8878230020042585e-06, "loss": 0.4635, "step": 20950 }, { "epoch": 1.8437719915552426, "grad_norm": 9.899295806884766, "learning_rate": 3.882834031357134e-06, "loss": 0.3897, "step": 20960 }, { "epoch": 1.8446516537649542, "grad_norm": 7.893838882446289, "learning_rate": 3.877846231121903e-06, "loss": 0.4917, "step": 20970 }, { "epoch": 1.8455313159746658, "grad_norm": 8.351110458374023, "learning_rate": 3.872859606524094e-06, "loss": 0.5256, "step": 20980 }, { "epoch": 1.8464109781843772, "grad_norm": 7.848738670349121, "learning_rate": 3.8678741627880015e-06, "loss": 0.4552, "step": 20990 }, { "epoch": 1.8472906403940885, "grad_norm": 9.787022590637207, "learning_rate": 3.862889905136679e-06, "loss": 0.5575, "step": 21000 }, { "epoch": 1.8481703026038, "grad_norm": 10.828354835510254, "learning_rate": 3.8579068387919466e-06, "loss": 0.5093, "step": 21010 }, { "epoch": 1.8490499648135117, "grad_norm": 7.400455474853516, "learning_rate": 3.852924968974371e-06, "loss": 0.5272, "step": 21020 }, { "epoch": 1.849929627023223, "grad_norm": 7.514430046081543, "learning_rate": 3.847944300903264e-06, "loss": 0.5305, "step": 21030 }, { "epoch": 1.8508092892329344, "grad_norm": 6.127838611602783, "learning_rate": 3.842964839796681e-06, "loss": 0.4545, "step": 21040 }, { "epoch": 1.851688951442646, "grad_norm": 5.286289215087891, "learning_rate": 3.837986590871417e-06, "loss": 0.4681, "step": 21050 }, { "epoch": 1.8525686136523576, "grad_norm": 9.11295223236084, "learning_rate": 3.833009559342984e-06, "loss": 0.4757, "step": 21060 }, { "epoch": 1.853448275862069, "grad_norm": 9.35288143157959, "learning_rate": 3.828033750425633e-06, "loss": 0.4791, "step": 21070 }, { "epoch": 1.8543279380717803, "grad_norm": 5.951265811920166, "learning_rate": 3.8230591693323235e-06, "loss": 0.5289, "step": 21080 }, { "epoch": 1.855207600281492, "grad_norm": 8.272468566894531, "learning_rate": 3.818085821274736e-06, "loss": 0.5026, "step": 21090 }, { "epoch": 1.8560872624912035, "grad_norm": 9.891656875610352, "learning_rate": 3.813113711463258e-06, "loss": 0.4728, "step": 21100 }, { "epoch": 1.8569669247009148, "grad_norm": 6.5142927169799805, "learning_rate": 3.808142845106976e-06, "loss": 0.5835, "step": 21110 }, { "epoch": 1.8578465869106262, "grad_norm": 7.589720249176025, "learning_rate": 3.8031732274136734e-06, "loss": 0.4716, "step": 21120 }, { "epoch": 1.8587262491203378, "grad_norm": 8.615384101867676, "learning_rate": 3.7982048635898304e-06, "loss": 0.5418, "step": 21130 }, { "epoch": 1.8596059113300494, "grad_norm": 7.841357707977295, "learning_rate": 3.7932377588406115e-06, "loss": 0.486, "step": 21140 }, { "epoch": 1.8604855735397607, "grad_norm": 10.201964378356934, "learning_rate": 3.78827191836986e-06, "loss": 0.4744, "step": 21150 }, { "epoch": 1.861365235749472, "grad_norm": 11.043363571166992, "learning_rate": 3.783307347380098e-06, "loss": 0.5039, "step": 21160 }, { "epoch": 1.8622448979591837, "grad_norm": 8.670607566833496, "learning_rate": 3.7783440510725167e-06, "loss": 0.4891, "step": 21170 }, { "epoch": 1.8631245601688953, "grad_norm": 6.90752649307251, "learning_rate": 3.7733820346469667e-06, "loss": 0.4869, "step": 21180 }, { "epoch": 1.8640042223786066, "grad_norm": 7.694879055023193, "learning_rate": 3.768421303301967e-06, "loss": 0.4775, "step": 21190 }, { "epoch": 1.864883884588318, "grad_norm": 11.58683967590332, "learning_rate": 3.763461862234682e-06, "loss": 0.4556, "step": 21200 }, { "epoch": 1.8657635467980296, "grad_norm": 6.484595775604248, "learning_rate": 3.7585037166409283e-06, "loss": 0.4494, "step": 21210 }, { "epoch": 1.8666432090077412, "grad_norm": 8.819748878479004, "learning_rate": 3.753546871715167e-06, "loss": 0.4829, "step": 21220 }, { "epoch": 1.8675228712174525, "grad_norm": 10.05046272277832, "learning_rate": 3.7485913326504937e-06, "loss": 0.4537, "step": 21230 }, { "epoch": 1.868402533427164, "grad_norm": 8.786294937133789, "learning_rate": 3.743637104638633e-06, "loss": 0.5023, "step": 21240 }, { "epoch": 1.8692821956368755, "grad_norm": 9.576695442199707, "learning_rate": 3.7386841928699415e-06, "loss": 0.4725, "step": 21250 }, { "epoch": 1.870161857846587, "grad_norm": 7.024372577667236, "learning_rate": 3.7337326025333964e-06, "loss": 0.4418, "step": 21260 }, { "epoch": 1.8710415200562984, "grad_norm": 9.17593002319336, "learning_rate": 3.7287823388165854e-06, "loss": 0.4465, "step": 21270 }, { "epoch": 1.8719211822660098, "grad_norm": 10.97526741027832, "learning_rate": 3.7238334069057125e-06, "loss": 0.4876, "step": 21280 }, { "epoch": 1.8728008444757214, "grad_norm": 9.861023902893066, "learning_rate": 3.7188858119855842e-06, "loss": 0.5027, "step": 21290 }, { "epoch": 1.873680506685433, "grad_norm": 10.280965805053711, "learning_rate": 3.7139395592396025e-06, "loss": 0.466, "step": 21300 }, { "epoch": 1.8745601688951443, "grad_norm": 7.673752784729004, "learning_rate": 3.7089946538497697e-06, "loss": 0.4799, "step": 21310 }, { "epoch": 1.8754398311048557, "grad_norm": 9.165332794189453, "learning_rate": 3.7040511009966705e-06, "loss": 0.4723, "step": 21320 }, { "epoch": 1.876319493314567, "grad_norm": 7.819222450256348, "learning_rate": 3.6991089058594766e-06, "loss": 0.4668, "step": 21330 }, { "epoch": 1.8771991555242786, "grad_norm": 6.349427223205566, "learning_rate": 3.694168073615938e-06, "loss": 0.4279, "step": 21340 }, { "epoch": 1.8780788177339902, "grad_norm": 10.481040000915527, "learning_rate": 3.6892286094423724e-06, "loss": 0.4738, "step": 21350 }, { "epoch": 1.8789584799437016, "grad_norm": 8.747355461120605, "learning_rate": 3.6842905185136685e-06, "loss": 0.4532, "step": 21360 }, { "epoch": 1.879838142153413, "grad_norm": 6.542567729949951, "learning_rate": 3.6793538060032723e-06, "loss": 0.4828, "step": 21370 }, { "epoch": 1.8807178043631245, "grad_norm": 7.069324970245361, "learning_rate": 3.6744184770831896e-06, "loss": 0.4547, "step": 21380 }, { "epoch": 1.881597466572836, "grad_norm": 9.281723022460938, "learning_rate": 3.669484536923976e-06, "loss": 0.4711, "step": 21390 }, { "epoch": 1.8824771287825475, "grad_norm": 10.240946769714355, "learning_rate": 3.6645519906947285e-06, "loss": 0.459, "step": 21400 }, { "epoch": 1.8833567909922588, "grad_norm": 8.489568710327148, "learning_rate": 3.65962084356309e-06, "loss": 0.475, "step": 21410 }, { "epoch": 1.8842364532019704, "grad_norm": 10.931282997131348, "learning_rate": 3.65469110069523e-06, "loss": 0.4535, "step": 21420 }, { "epoch": 1.885116115411682, "grad_norm": 7.2224345207214355, "learning_rate": 3.6497627672558543e-06, "loss": 0.5084, "step": 21430 }, { "epoch": 1.8859957776213934, "grad_norm": 8.454198837280273, "learning_rate": 3.644835848408186e-06, "loss": 0.4782, "step": 21440 }, { "epoch": 1.8868754398311047, "grad_norm": 11.641197204589844, "learning_rate": 3.6399103493139687e-06, "loss": 0.5202, "step": 21450 }, { "epoch": 1.8877551020408163, "grad_norm": 9.077286720275879, "learning_rate": 3.634986275133462e-06, "loss": 0.5132, "step": 21460 }, { "epoch": 1.888634764250528, "grad_norm": 6.652059555053711, "learning_rate": 3.630063631025427e-06, "loss": 0.4695, "step": 21470 }, { "epoch": 1.8895144264602393, "grad_norm": 7.608617782592773, "learning_rate": 3.6251424221471294e-06, "loss": 0.5291, "step": 21480 }, { "epoch": 1.8903940886699506, "grad_norm": 10.39698600769043, "learning_rate": 3.6202226536543296e-06, "loss": 0.4257, "step": 21490 }, { "epoch": 1.8912737508796622, "grad_norm": 10.032390594482422, "learning_rate": 3.6153043307012808e-06, "loss": 0.4311, "step": 21500 }, { "epoch": 1.8921534130893738, "grad_norm": 8.064603805541992, "learning_rate": 3.610387458440723e-06, "loss": 0.5067, "step": 21510 }, { "epoch": 1.8930330752990852, "grad_norm": 8.20429515838623, "learning_rate": 3.6054720420238707e-06, "loss": 0.4774, "step": 21520 }, { "epoch": 1.8939127375087965, "grad_norm": 5.270030498504639, "learning_rate": 3.600558086600423e-06, "loss": 0.4904, "step": 21530 }, { "epoch": 1.894792399718508, "grad_norm": 8.82154655456543, "learning_rate": 3.5956455973185357e-06, "loss": 0.4792, "step": 21540 }, { "epoch": 1.8956720619282197, "grad_norm": 9.220709800720215, "learning_rate": 3.590734579324839e-06, "loss": 0.5043, "step": 21550 }, { "epoch": 1.896551724137931, "grad_norm": 8.77991771697998, "learning_rate": 3.5858250377644164e-06, "loss": 0.4733, "step": 21560 }, { "epoch": 1.8974313863476424, "grad_norm": 8.7297945022583, "learning_rate": 3.5809169777808064e-06, "loss": 0.5531, "step": 21570 }, { "epoch": 1.898311048557354, "grad_norm": 6.807669162750244, "learning_rate": 3.576010404515995e-06, "loss": 0.5056, "step": 21580 }, { "epoch": 1.8991907107670656, "grad_norm": 7.288235187530518, "learning_rate": 3.5711053231104132e-06, "loss": 0.525, "step": 21590 }, { "epoch": 1.900070372976777, "grad_norm": 6.880301475524902, "learning_rate": 3.5662017387029234e-06, "loss": 0.4795, "step": 21600 }, { "epoch": 1.9009500351864883, "grad_norm": 10.302690505981445, "learning_rate": 3.5612996564308223e-06, "loss": 0.4555, "step": 21610 }, { "epoch": 1.9018296973962, "grad_norm": 6.622171401977539, "learning_rate": 3.5563990814298344e-06, "loss": 0.4359, "step": 21620 }, { "epoch": 1.9027093596059115, "grad_norm": 7.226187705993652, "learning_rate": 3.551500018834105e-06, "loss": 0.515, "step": 21630 }, { "epoch": 1.9035890218156228, "grad_norm": 8.227275848388672, "learning_rate": 3.5466024737761926e-06, "loss": 0.4434, "step": 21640 }, { "epoch": 1.9044686840253342, "grad_norm": 7.546737194061279, "learning_rate": 3.541706451387069e-06, "loss": 0.395, "step": 21650 }, { "epoch": 1.9053483462350458, "grad_norm": 9.697392463684082, "learning_rate": 3.536811956796106e-06, "loss": 0.4958, "step": 21660 }, { "epoch": 1.9062280084447574, "grad_norm": 7.659425735473633, "learning_rate": 3.5319189951310783e-06, "loss": 0.4327, "step": 21670 }, { "epoch": 1.9071076706544687, "grad_norm": 8.2478666305542, "learning_rate": 3.5270275715181563e-06, "loss": 0.5211, "step": 21680 }, { "epoch": 1.90798733286418, "grad_norm": 10.28062629699707, "learning_rate": 3.522137691081894e-06, "loss": 0.4975, "step": 21690 }, { "epoch": 1.9088669950738915, "grad_norm": 8.133328437805176, "learning_rate": 3.5172493589452317e-06, "loss": 0.4868, "step": 21700 }, { "epoch": 1.909746657283603, "grad_norm": 9.810466766357422, "learning_rate": 3.5123625802294915e-06, "loss": 0.4616, "step": 21710 }, { "epoch": 1.9106263194933146, "grad_norm": 8.316728591918945, "learning_rate": 3.5074773600543583e-06, "loss": 0.4936, "step": 21720 }, { "epoch": 1.911505981703026, "grad_norm": 8.728649139404297, "learning_rate": 3.5025937035378908e-06, "loss": 0.4382, "step": 21730 }, { "epoch": 1.9123856439127374, "grad_norm": 7.329566955566406, "learning_rate": 3.4977116157965107e-06, "loss": 0.4534, "step": 21740 }, { "epoch": 1.913265306122449, "grad_norm": 7.418203353881836, "learning_rate": 3.492831101944993e-06, "loss": 0.4097, "step": 21750 }, { "epoch": 1.9141449683321605, "grad_norm": 8.449092864990234, "learning_rate": 3.487952167096463e-06, "loss": 0.4809, "step": 21760 }, { "epoch": 1.9150246305418719, "grad_norm": 8.673717498779297, "learning_rate": 3.4830748163623953e-06, "loss": 0.434, "step": 21770 }, { "epoch": 1.9159042927515832, "grad_norm": 7.050623416900635, "learning_rate": 3.4781990548526045e-06, "loss": 0.4653, "step": 21780 }, { "epoch": 1.9167839549612948, "grad_norm": 9.005496978759766, "learning_rate": 3.4733248876752344e-06, "loss": 0.4942, "step": 21790 }, { "epoch": 1.9176636171710064, "grad_norm": 8.318561553955078, "learning_rate": 3.468452319936767e-06, "loss": 0.6069, "step": 21800 }, { "epoch": 1.9185432793807178, "grad_norm": 8.299747467041016, "learning_rate": 3.463581356742001e-06, "loss": 0.4859, "step": 21810 }, { "epoch": 1.9194229415904291, "grad_norm": 5.997599124908447, "learning_rate": 3.458712003194059e-06, "loss": 0.4111, "step": 21820 }, { "epoch": 1.9203026038001407, "grad_norm": 9.711665153503418, "learning_rate": 3.4538442643943776e-06, "loss": 0.4227, "step": 21830 }, { "epoch": 1.9211822660098523, "grad_norm": 5.481921195983887, "learning_rate": 3.4489781454426985e-06, "loss": 0.5006, "step": 21840 }, { "epoch": 1.9220619282195637, "grad_norm": 7.140232086181641, "learning_rate": 3.4441136514370655e-06, "loss": 0.5724, "step": 21850 }, { "epoch": 1.922941590429275, "grad_norm": 7.826817512512207, "learning_rate": 3.4392507874738244e-06, "loss": 0.4348, "step": 21860 }, { "epoch": 1.9238212526389866, "grad_norm": 6.025455474853516, "learning_rate": 3.434389558647612e-06, "loss": 0.3655, "step": 21870 }, { "epoch": 1.9247009148486982, "grad_norm": 13.001155853271484, "learning_rate": 3.429529970051347e-06, "loss": 0.4683, "step": 21880 }, { "epoch": 1.9255805770584096, "grad_norm": 7.162818431854248, "learning_rate": 3.424672026776239e-06, "loss": 0.4777, "step": 21890 }, { "epoch": 1.926460239268121, "grad_norm": 7.404416561126709, "learning_rate": 3.4198157339117667e-06, "loss": 0.4652, "step": 21900 }, { "epoch": 1.9273399014778325, "grad_norm": 8.364561080932617, "learning_rate": 3.4149610965456805e-06, "loss": 0.4258, "step": 21910 }, { "epoch": 1.928219563687544, "grad_norm": 12.098787307739258, "learning_rate": 3.4101081197639984e-06, "loss": 0.4872, "step": 21920 }, { "epoch": 1.9290992258972555, "grad_norm": 9.043652534484863, "learning_rate": 3.4052568086509963e-06, "loss": 0.5316, "step": 21930 }, { "epoch": 1.9299788881069668, "grad_norm": 8.214943885803223, "learning_rate": 3.400407168289209e-06, "loss": 0.4439, "step": 21940 }, { "epoch": 1.9308585503166784, "grad_norm": 9.60054874420166, "learning_rate": 3.395559203759419e-06, "loss": 0.4788, "step": 21950 }, { "epoch": 1.93173821252639, "grad_norm": 6.401882171630859, "learning_rate": 3.3907129201406514e-06, "loss": 0.415, "step": 21960 }, { "epoch": 1.9326178747361014, "grad_norm": 6.345152378082275, "learning_rate": 3.3858683225101698e-06, "loss": 0.3915, "step": 21970 }, { "epoch": 1.9334975369458127, "grad_norm": 8.819084167480469, "learning_rate": 3.3810254159434743e-06, "loss": 0.5279, "step": 21980 }, { "epoch": 1.9343771991555243, "grad_norm": 9.415375709533691, "learning_rate": 3.3761842055142923e-06, "loss": 0.4658, "step": 21990 }, { "epoch": 1.935256861365236, "grad_norm": 10.034165382385254, "learning_rate": 3.3713446962945758e-06, "loss": 0.4315, "step": 22000 }, { "epoch": 1.9361365235749473, "grad_norm": 7.856036186218262, "learning_rate": 3.36650689335449e-06, "loss": 0.421, "step": 22010 }, { "epoch": 1.9370161857846586, "grad_norm": 6.806214809417725, "learning_rate": 3.361670801762419e-06, "loss": 0.4782, "step": 22020 }, { "epoch": 1.9378958479943702, "grad_norm": 9.04146957397461, "learning_rate": 3.3568364265849453e-06, "loss": 0.4482, "step": 22030 }, { "epoch": 1.9387755102040818, "grad_norm": 7.177223205566406, "learning_rate": 3.352003772886862e-06, "loss": 0.4347, "step": 22040 }, { "epoch": 1.9396551724137931, "grad_norm": 12.53480052947998, "learning_rate": 3.3471728457311525e-06, "loss": 0.456, "step": 22050 }, { "epoch": 1.9405348346235045, "grad_norm": 6.51157283782959, "learning_rate": 3.3423436501789953e-06, "loss": 0.4118, "step": 22060 }, { "epoch": 1.941414496833216, "grad_norm": 7.011214733123779, "learning_rate": 3.3375161912897537e-06, "loss": 0.4665, "step": 22070 }, { "epoch": 1.9422941590429277, "grad_norm": 10.869952201843262, "learning_rate": 3.3326904741209693e-06, "loss": 0.5413, "step": 22080 }, { "epoch": 1.943173821252639, "grad_norm": 6.263561248779297, "learning_rate": 3.327866503728362e-06, "loss": 0.4821, "step": 22090 }, { "epoch": 1.9440534834623504, "grad_norm": 9.718780517578125, "learning_rate": 3.3230442851658184e-06, "loss": 0.4429, "step": 22100 }, { "epoch": 1.9449331456720618, "grad_norm": 7.5414886474609375, "learning_rate": 3.318223823485393e-06, "loss": 0.4921, "step": 22110 }, { "epoch": 1.9458128078817734, "grad_norm": 6.549152374267578, "learning_rate": 3.3134051237372998e-06, "loss": 0.4566, "step": 22120 }, { "epoch": 1.946692470091485, "grad_norm": 8.928961753845215, "learning_rate": 3.3085881909699037e-06, "loss": 0.4636, "step": 22130 }, { "epoch": 1.9475721323011963, "grad_norm": 6.396554470062256, "learning_rate": 3.3037730302297224e-06, "loss": 0.4094, "step": 22140 }, { "epoch": 1.9484517945109077, "grad_norm": 7.553526878356934, "learning_rate": 3.2989596465614114e-06, "loss": 0.514, "step": 22150 }, { "epoch": 1.9493314567206192, "grad_norm": 10.013252258300781, "learning_rate": 3.294148045007771e-06, "loss": 0.4618, "step": 22160 }, { "epoch": 1.9502111189303308, "grad_norm": 4.834720611572266, "learning_rate": 3.2893382306097286e-06, "loss": 0.4235, "step": 22170 }, { "epoch": 1.9510907811400422, "grad_norm": 13.145724296569824, "learning_rate": 3.2845302084063445e-06, "loss": 0.4594, "step": 22180 }, { "epoch": 1.9519704433497536, "grad_norm": 7.119621753692627, "learning_rate": 3.2797239834347994e-06, "loss": 0.4604, "step": 22190 }, { "epoch": 1.9528501055594651, "grad_norm": 8.814141273498535, "learning_rate": 3.2749195607303897e-06, "loss": 0.5083, "step": 22200 }, { "epoch": 1.9537297677691767, "grad_norm": 7.168071269989014, "learning_rate": 3.2701169453265238e-06, "loss": 0.4271, "step": 22210 }, { "epoch": 1.954609429978888, "grad_norm": 6.59645938873291, "learning_rate": 3.2653161422547174e-06, "loss": 0.4408, "step": 22220 }, { "epoch": 1.9554890921885995, "grad_norm": 5.599145889282227, "learning_rate": 3.260517156544588e-06, "loss": 0.3885, "step": 22230 }, { "epoch": 1.956368754398311, "grad_norm": 9.329167366027832, "learning_rate": 3.2557199932238505e-06, "loss": 0.4817, "step": 22240 }, { "epoch": 1.9572484166080226, "grad_norm": 5.4341864585876465, "learning_rate": 3.2509246573183057e-06, "loss": 0.4635, "step": 22250 }, { "epoch": 1.958128078817734, "grad_norm": 9.123102188110352, "learning_rate": 3.246131153851847e-06, "loss": 0.4675, "step": 22260 }, { "epoch": 1.9590077410274453, "grad_norm": 10.623340606689453, "learning_rate": 3.2413394878464387e-06, "loss": 0.4594, "step": 22270 }, { "epoch": 1.959887403237157, "grad_norm": 7.806822776794434, "learning_rate": 3.236549664322128e-06, "loss": 0.4849, "step": 22280 }, { "epoch": 1.9607670654468685, "grad_norm": 8.784773826599121, "learning_rate": 3.2317616882970304e-06, "loss": 0.404, "step": 22290 }, { "epoch": 1.9616467276565799, "grad_norm": 6.5112786293029785, "learning_rate": 3.226975564787322e-06, "loss": 0.447, "step": 22300 }, { "epoch": 1.9625263898662912, "grad_norm": 10.035785675048828, "learning_rate": 3.2221912988072417e-06, "loss": 0.4109, "step": 22310 }, { "epoch": 1.9634060520760028, "grad_norm": 12.368217468261719, "learning_rate": 3.217408895369084e-06, "loss": 0.4706, "step": 22320 }, { "epoch": 1.9642857142857144, "grad_norm": 6.79185152053833, "learning_rate": 3.2126283594831853e-06, "loss": 0.4441, "step": 22330 }, { "epoch": 1.9651653764954258, "grad_norm": 7.761543273925781, "learning_rate": 3.2078496961579304e-06, "loss": 0.4952, "step": 22340 }, { "epoch": 1.9660450387051371, "grad_norm": 5.859261512756348, "learning_rate": 3.2030729103997416e-06, "loss": 0.4804, "step": 22350 }, { "epoch": 1.9669247009148487, "grad_norm": 9.857121467590332, "learning_rate": 3.1982980072130754e-06, "loss": 0.4434, "step": 22360 }, { "epoch": 1.9678043631245603, "grad_norm": 8.834452629089355, "learning_rate": 3.193524991600413e-06, "loss": 0.4726, "step": 22370 }, { "epoch": 1.9686840253342717, "grad_norm": 9.031597137451172, "learning_rate": 3.188753868562262e-06, "loss": 0.5307, "step": 22380 }, { "epoch": 1.969563687543983, "grad_norm": 7.209918975830078, "learning_rate": 3.1839846430971405e-06, "loss": 0.4444, "step": 22390 }, { "epoch": 1.9704433497536946, "grad_norm": 7.359462261199951, "learning_rate": 3.179217320201585e-06, "loss": 0.4636, "step": 22400 }, { "epoch": 1.9713230119634062, "grad_norm": 9.828230857849121, "learning_rate": 3.1744519048701394e-06, "loss": 0.4311, "step": 22410 }, { "epoch": 1.9722026741731176, "grad_norm": 6.607624053955078, "learning_rate": 3.1696884020953424e-06, "loss": 0.4282, "step": 22420 }, { "epoch": 1.973082336382829, "grad_norm": 9.9160737991333, "learning_rate": 3.164926816867734e-06, "loss": 0.4971, "step": 22430 }, { "epoch": 1.9739619985925405, "grad_norm": 6.689765453338623, "learning_rate": 3.1601671541758473e-06, "loss": 0.4701, "step": 22440 }, { "epoch": 1.974841660802252, "grad_norm": 8.928503036499023, "learning_rate": 3.1554094190061946e-06, "loss": 0.5236, "step": 22450 }, { "epoch": 1.9757213230119635, "grad_norm": 8.029068946838379, "learning_rate": 3.150653616343272e-06, "loss": 0.5496, "step": 22460 }, { "epoch": 1.9766009852216748, "grad_norm": 9.19001579284668, "learning_rate": 3.1458997511695534e-06, "loss": 0.5123, "step": 22470 }, { "epoch": 1.9774806474313862, "grad_norm": 10.199000358581543, "learning_rate": 3.1411478284654808e-06, "loss": 0.4204, "step": 22480 }, { "epoch": 1.9783603096410978, "grad_norm": 7.525503158569336, "learning_rate": 3.1363978532094597e-06, "loss": 0.4727, "step": 22490 }, { "epoch": 1.9792399718508094, "grad_norm": 7.905032157897949, "learning_rate": 3.1316498303778596e-06, "loss": 0.4498, "step": 22500 }, { "epoch": 1.9801196340605207, "grad_norm": 8.442888259887695, "learning_rate": 3.1269037649449976e-06, "loss": 0.4753, "step": 22510 }, { "epoch": 1.980999296270232, "grad_norm": 7.907448768615723, "learning_rate": 3.1221596618831467e-06, "loss": 0.4542, "step": 22520 }, { "epoch": 1.9818789584799437, "grad_norm": 8.959737777709961, "learning_rate": 3.1174175261625226e-06, "loss": 0.447, "step": 22530 }, { "epoch": 1.9827586206896552, "grad_norm": 7.421688556671143, "learning_rate": 3.1126773627512764e-06, "loss": 0.4879, "step": 22540 }, { "epoch": 1.9836382828993666, "grad_norm": 7.318508148193359, "learning_rate": 3.1079391766154966e-06, "loss": 0.5089, "step": 22550 }, { "epoch": 1.984517945109078, "grad_norm": 7.359505653381348, "learning_rate": 3.1032029727192004e-06, "loss": 0.4636, "step": 22560 }, { "epoch": 1.9853976073187896, "grad_norm": 10.435942649841309, "learning_rate": 3.0984687560243236e-06, "loss": 0.4625, "step": 22570 }, { "epoch": 1.9862772695285011, "grad_norm": 6.894886493682861, "learning_rate": 3.093736531490724e-06, "loss": 0.4109, "step": 22580 }, { "epoch": 1.9871569317382125, "grad_norm": 6.819297790527344, "learning_rate": 3.0890063040761732e-06, "loss": 0.4975, "step": 22590 }, { "epoch": 1.9880365939479239, "grad_norm": 8.497161865234375, "learning_rate": 3.0842780787363475e-06, "loss": 0.4884, "step": 22600 }, { "epoch": 1.9889162561576355, "grad_norm": 8.0750732421875, "learning_rate": 3.0795518604248285e-06, "loss": 0.4761, "step": 22610 }, { "epoch": 1.989795918367347, "grad_norm": 10.0103759765625, "learning_rate": 3.074827654093094e-06, "loss": 0.4965, "step": 22620 }, { "epoch": 1.9906755805770584, "grad_norm": 7.0565643310546875, "learning_rate": 3.0701054646905097e-06, "loss": 0.429, "step": 22630 }, { "epoch": 1.9915552427867698, "grad_norm": 7.27754545211792, "learning_rate": 3.065385297164335e-06, "loss": 0.445, "step": 22640 }, { "epoch": 1.9924349049964813, "grad_norm": 7.982407093048096, "learning_rate": 3.060667156459709e-06, "loss": 0.4617, "step": 22650 }, { "epoch": 1.993314567206193, "grad_norm": 9.092256546020508, "learning_rate": 3.0559510475196454e-06, "loss": 0.5045, "step": 22660 }, { "epoch": 1.9941942294159043, "grad_norm": 10.842965126037598, "learning_rate": 3.0512369752850292e-06, "loss": 0.4282, "step": 22670 }, { "epoch": 1.9950738916256157, "grad_norm": 7.927082061767578, "learning_rate": 3.0465249446946164e-06, "loss": 0.4758, "step": 22680 }, { "epoch": 1.9959535538353272, "grad_norm": 9.614212036132812, "learning_rate": 3.0418149606850175e-06, "loss": 0.5078, "step": 22690 }, { "epoch": 1.9968332160450388, "grad_norm": 8.667665481567383, "learning_rate": 3.0371070281907033e-06, "loss": 0.4872, "step": 22700 }, { "epoch": 1.9977128782547502, "grad_norm": 8.9832124710083, "learning_rate": 3.032401152143992e-06, "loss": 0.4479, "step": 22710 }, { "epoch": 1.9985925404644616, "grad_norm": 8.365753173828125, "learning_rate": 3.027697337475051e-06, "loss": 0.4545, "step": 22720 }, { "epoch": 1.9994722026741731, "grad_norm": 7.365147590637207, "learning_rate": 3.0229955891118872e-06, "loss": 0.4606, "step": 22730 }, { "epoch": 2.0003518648838847, "grad_norm": 7.106514930725098, "learning_rate": 3.018295911980341e-06, "loss": 0.3175, "step": 22740 }, { "epoch": 2.001231527093596, "grad_norm": 4.516010284423828, "learning_rate": 3.0135983110040824e-06, "loss": 0.2605, "step": 22750 }, { "epoch": 2.0021111893033074, "grad_norm": 6.479176044464111, "learning_rate": 3.0089027911046083e-06, "loss": 0.2463, "step": 22760 }, { "epoch": 2.002990851513019, "grad_norm": 8.218488693237305, "learning_rate": 3.0042093572012355e-06, "loss": 0.2919, "step": 22770 }, { "epoch": 2.0038705137227306, "grad_norm": 7.479281902313232, "learning_rate": 2.9995180142110935e-06, "loss": 0.3222, "step": 22780 }, { "epoch": 2.004750175932442, "grad_norm": 7.556055068969727, "learning_rate": 2.9948287670491215e-06, "loss": 0.2651, "step": 22790 }, { "epoch": 2.0056298381421533, "grad_norm": 10.39393138885498, "learning_rate": 2.9901416206280675e-06, "loss": 0.2673, "step": 22800 }, { "epoch": 2.0065095003518647, "grad_norm": 8.206273078918457, "learning_rate": 2.9854565798584706e-06, "loss": 0.3015, "step": 22810 }, { "epoch": 2.0073891625615765, "grad_norm": 6.799476623535156, "learning_rate": 2.9807736496486705e-06, "loss": 0.2484, "step": 22820 }, { "epoch": 2.008268824771288, "grad_norm": 8.810019493103027, "learning_rate": 2.976092834904791e-06, "loss": 0.2374, "step": 22830 }, { "epoch": 2.0091484869809992, "grad_norm": 11.148653984069824, "learning_rate": 2.9714141405307446e-06, "loss": 0.2777, "step": 22840 }, { "epoch": 2.0100281491907106, "grad_norm": 7.4814252853393555, "learning_rate": 2.9667375714282187e-06, "loss": 0.285, "step": 22850 }, { "epoch": 2.0109078114004224, "grad_norm": 7.2498555183410645, "learning_rate": 2.9620631324966777e-06, "loss": 0.2761, "step": 22860 }, { "epoch": 2.0117874736101338, "grad_norm": 8.662611961364746, "learning_rate": 2.9573908286333465e-06, "loss": 0.2773, "step": 22870 }, { "epoch": 2.012667135819845, "grad_norm": 7.2998223304748535, "learning_rate": 2.9527206647332206e-06, "loss": 0.2555, "step": 22880 }, { "epoch": 2.0135467980295565, "grad_norm": 13.686690330505371, "learning_rate": 2.948052645689055e-06, "loss": 0.3084, "step": 22890 }, { "epoch": 2.0144264602392683, "grad_norm": 9.149946212768555, "learning_rate": 2.9433867763913482e-06, "loss": 0.2891, "step": 22900 }, { "epoch": 2.0153061224489797, "grad_norm": 12.652191162109375, "learning_rate": 2.9387230617283565e-06, "loss": 0.3005, "step": 22910 }, { "epoch": 2.016185784658691, "grad_norm": 8.328264236450195, "learning_rate": 2.934061506586075e-06, "loss": 0.3149, "step": 22920 }, { "epoch": 2.0170654468684024, "grad_norm": 9.62153148651123, "learning_rate": 2.9294021158482328e-06, "loss": 0.2892, "step": 22930 }, { "epoch": 2.017945109078114, "grad_norm": 9.19791316986084, "learning_rate": 2.9247448943962976e-06, "loss": 0.2969, "step": 22940 }, { "epoch": 2.0188247712878256, "grad_norm": 10.743212699890137, "learning_rate": 2.920089847109459e-06, "loss": 0.2522, "step": 22950 }, { "epoch": 2.019704433497537, "grad_norm": 10.358012199401855, "learning_rate": 2.915436978864633e-06, "loss": 0.2674, "step": 22960 }, { "epoch": 2.0205840957072483, "grad_norm": 8.332043647766113, "learning_rate": 2.910786294536452e-06, "loss": 0.2656, "step": 22970 }, { "epoch": 2.02146375791696, "grad_norm": 9.20153522491455, "learning_rate": 2.9061377989972573e-06, "loss": 0.2548, "step": 22980 }, { "epoch": 2.0223434201266715, "grad_norm": 10.976323127746582, "learning_rate": 2.9014914971171002e-06, "loss": 0.3081, "step": 22990 }, { "epoch": 2.023223082336383, "grad_norm": 10.06468391418457, "learning_rate": 2.896847393763733e-06, "loss": 0.3202, "step": 23000 }, { "epoch": 2.024102744546094, "grad_norm": 11.850008010864258, "learning_rate": 2.8922054938026013e-06, "loss": 0.2965, "step": 23010 }, { "epoch": 2.024982406755806, "grad_norm": 11.361839294433594, "learning_rate": 2.887565802096851e-06, "loss": 0.2868, "step": 23020 }, { "epoch": 2.0258620689655173, "grad_norm": 9.624772071838379, "learning_rate": 2.8829283235073057e-06, "loss": 0.314, "step": 23030 }, { "epoch": 2.0267417311752287, "grad_norm": 9.01999282836914, "learning_rate": 2.878293062892475e-06, "loss": 0.2893, "step": 23040 }, { "epoch": 2.02762139338494, "grad_norm": 9.939691543579102, "learning_rate": 2.8736600251085425e-06, "loss": 0.2516, "step": 23050 }, { "epoch": 2.0285010555946514, "grad_norm": 10.584128379821777, "learning_rate": 2.869029215009361e-06, "loss": 0.2325, "step": 23060 }, { "epoch": 2.0293807178043632, "grad_norm": 8.346166610717773, "learning_rate": 2.8644006374464593e-06, "loss": 0.2855, "step": 23070 }, { "epoch": 2.0302603800140746, "grad_norm": 9.745835304260254, "learning_rate": 2.8597742972690155e-06, "loss": 0.2922, "step": 23080 }, { "epoch": 2.031140042223786, "grad_norm": 8.26791000366211, "learning_rate": 2.855150199323868e-06, "loss": 0.2367, "step": 23090 }, { "epoch": 2.0320197044334973, "grad_norm": 9.242250442504883, "learning_rate": 2.8505283484555136e-06, "loss": 0.2208, "step": 23100 }, { "epoch": 2.032899366643209, "grad_norm": 8.537992477416992, "learning_rate": 2.8459087495060776e-06, "loss": 0.2981, "step": 23110 }, { "epoch": 2.0337790288529205, "grad_norm": 11.087981224060059, "learning_rate": 2.8412914073153445e-06, "loss": 0.2777, "step": 23120 }, { "epoch": 2.034658691062632, "grad_norm": 12.111225128173828, "learning_rate": 2.8366763267207236e-06, "loss": 0.2844, "step": 23130 }, { "epoch": 2.0355383532723432, "grad_norm": 7.723420143127441, "learning_rate": 2.832063512557256e-06, "loss": 0.2933, "step": 23140 }, { "epoch": 2.036418015482055, "grad_norm": 10.198753356933594, "learning_rate": 2.827452969657613e-06, "loss": 0.2812, "step": 23150 }, { "epoch": 2.0372976776917664, "grad_norm": 8.783875465393066, "learning_rate": 2.822844702852082e-06, "loss": 0.3312, "step": 23160 }, { "epoch": 2.0381773399014778, "grad_norm": 9.435942649841309, "learning_rate": 2.818238716968569e-06, "loss": 0.2664, "step": 23170 }, { "epoch": 2.039057002111189, "grad_norm": 7.183585166931152, "learning_rate": 2.8136350168325855e-06, "loss": 0.2776, "step": 23180 }, { "epoch": 2.039936664320901, "grad_norm": 9.266807556152344, "learning_rate": 2.809033607267251e-06, "loss": 0.2797, "step": 23190 }, { "epoch": 2.0408163265306123, "grad_norm": 9.85545825958252, "learning_rate": 2.804434493093289e-06, "loss": 0.2624, "step": 23200 }, { "epoch": 2.0416959887403237, "grad_norm": 6.317489147186279, "learning_rate": 2.799837679129014e-06, "loss": 0.2864, "step": 23210 }, { "epoch": 2.042575650950035, "grad_norm": 8.597164154052734, "learning_rate": 2.7952431701903305e-06, "loss": 0.3105, "step": 23220 }, { "epoch": 2.043455313159747, "grad_norm": 9.518728256225586, "learning_rate": 2.7906509710907295e-06, "loss": 0.3146, "step": 23230 }, { "epoch": 2.044334975369458, "grad_norm": 7.326346397399902, "learning_rate": 2.7860610866412814e-06, "loss": 0.285, "step": 23240 }, { "epoch": 2.0452146375791695, "grad_norm": 7.007955551147461, "learning_rate": 2.7814735216506305e-06, "loss": 0.3229, "step": 23250 }, { "epoch": 2.046094299788881, "grad_norm": 9.701662063598633, "learning_rate": 2.7768882809249954e-06, "loss": 0.2917, "step": 23260 }, { "epoch": 2.0469739619985927, "grad_norm": 6.329052448272705, "learning_rate": 2.772305369268154e-06, "loss": 0.2669, "step": 23270 }, { "epoch": 2.047853624208304, "grad_norm": 11.239156723022461, "learning_rate": 2.767724791481452e-06, "loss": 0.2516, "step": 23280 }, { "epoch": 2.0487332864180154, "grad_norm": 8.81770133972168, "learning_rate": 2.7631465523637796e-06, "loss": 0.2539, "step": 23290 }, { "epoch": 2.049612948627727, "grad_norm": 8.03406810760498, "learning_rate": 2.758570656711582e-06, "loss": 0.2786, "step": 23300 }, { "epoch": 2.0504926108374386, "grad_norm": 7.396091461181641, "learning_rate": 2.7539971093188522e-06, "loss": 0.2931, "step": 23310 }, { "epoch": 2.05137227304715, "grad_norm": 6.503420352935791, "learning_rate": 2.7494259149771195e-06, "loss": 0.2764, "step": 23320 }, { "epoch": 2.0522519352568613, "grad_norm": 5.45847225189209, "learning_rate": 2.744857078475447e-06, "loss": 0.2856, "step": 23330 }, { "epoch": 2.0531315974665727, "grad_norm": 7.953460693359375, "learning_rate": 2.740290604600434e-06, "loss": 0.2784, "step": 23340 }, { "epoch": 2.0540112596762845, "grad_norm": 7.949288845062256, "learning_rate": 2.7357264981361933e-06, "loss": 0.2674, "step": 23350 }, { "epoch": 2.054890921885996, "grad_norm": 7.756248950958252, "learning_rate": 2.731164763864369e-06, "loss": 0.279, "step": 23360 }, { "epoch": 2.0557705840957072, "grad_norm": 8.029561042785645, "learning_rate": 2.7266054065641127e-06, "loss": 0.3035, "step": 23370 }, { "epoch": 2.0566502463054186, "grad_norm": 11.594472885131836, "learning_rate": 2.722048431012087e-06, "loss": 0.283, "step": 23380 }, { "epoch": 2.0575299085151304, "grad_norm": 6.9647369384765625, "learning_rate": 2.7174938419824636e-06, "loss": 0.2582, "step": 23390 }, { "epoch": 2.0584095707248418, "grad_norm": 12.0010347366333, "learning_rate": 2.7129416442469086e-06, "loss": 0.3105, "step": 23400 }, { "epoch": 2.059289232934553, "grad_norm": 6.821567058563232, "learning_rate": 2.708391842574585e-06, "loss": 0.3061, "step": 23410 }, { "epoch": 2.0601688951442645, "grad_norm": 10.807978630065918, "learning_rate": 2.7038444417321452e-06, "loss": 0.2991, "step": 23420 }, { "epoch": 2.061048557353976, "grad_norm": 6.289615154266357, "learning_rate": 2.6992994464837245e-06, "loss": 0.2831, "step": 23430 }, { "epoch": 2.0619282195636877, "grad_norm": 9.707883834838867, "learning_rate": 2.6947568615909437e-06, "loss": 0.2577, "step": 23440 }, { "epoch": 2.062807881773399, "grad_norm": 7.893085956573486, "learning_rate": 2.690216691812893e-06, "loss": 0.2963, "step": 23450 }, { "epoch": 2.0636875439831104, "grad_norm": 6.922300815582275, "learning_rate": 2.6856789419061335e-06, "loss": 0.3084, "step": 23460 }, { "epoch": 2.0645672061928217, "grad_norm": 8.302508354187012, "learning_rate": 2.681143616624692e-06, "loss": 0.2563, "step": 23470 }, { "epoch": 2.0654468684025336, "grad_norm": 8.314599990844727, "learning_rate": 2.6766107207200532e-06, "loss": 0.2794, "step": 23480 }, { "epoch": 2.066326530612245, "grad_norm": 8.756461143493652, "learning_rate": 2.6720802589411603e-06, "loss": 0.2635, "step": 23490 }, { "epoch": 2.0672061928219563, "grad_norm": 8.954370498657227, "learning_rate": 2.6675522360344036e-06, "loss": 0.2873, "step": 23500 }, { "epoch": 2.0680858550316676, "grad_norm": 7.34266996383667, "learning_rate": 2.663026656743617e-06, "loss": 0.2707, "step": 23510 }, { "epoch": 2.0689655172413794, "grad_norm": 8.247711181640625, "learning_rate": 2.658503525810081e-06, "loss": 0.3202, "step": 23520 }, { "epoch": 2.069845179451091, "grad_norm": 7.770339012145996, "learning_rate": 2.653982847972501e-06, "loss": 0.3054, "step": 23530 }, { "epoch": 2.070724841660802, "grad_norm": 8.377229690551758, "learning_rate": 2.6494646279670176e-06, "loss": 0.2664, "step": 23540 }, { "epoch": 2.0716045038705135, "grad_norm": 10.644085884094238, "learning_rate": 2.644948870527201e-06, "loss": 0.2607, "step": 23550 }, { "epoch": 2.0724841660802253, "grad_norm": 7.874335289001465, "learning_rate": 2.640435580384033e-06, "loss": 0.2885, "step": 23560 }, { "epoch": 2.0733638282899367, "grad_norm": 9.307093620300293, "learning_rate": 2.6359247622659144e-06, "loss": 0.2587, "step": 23570 }, { "epoch": 2.074243490499648, "grad_norm": 10.079574584960938, "learning_rate": 2.6314164208986615e-06, "loss": 0.3382, "step": 23580 }, { "epoch": 2.0751231527093594, "grad_norm": 8.833085060119629, "learning_rate": 2.626910561005482e-06, "loss": 0.3166, "step": 23590 }, { "epoch": 2.0760028149190712, "grad_norm": 7.242616653442383, "learning_rate": 2.622407187306999e-06, "loss": 0.3017, "step": 23600 }, { "epoch": 2.0768824771287826, "grad_norm": 6.994810581207275, "learning_rate": 2.6179063045212213e-06, "loss": 0.264, "step": 23610 }, { "epoch": 2.077762139338494, "grad_norm": 6.752554893493652, "learning_rate": 2.6134079173635496e-06, "loss": 0.2719, "step": 23620 }, { "epoch": 2.0786418015482053, "grad_norm": 9.769316673278809, "learning_rate": 2.6089120305467762e-06, "loss": 0.2694, "step": 23630 }, { "epoch": 2.079521463757917, "grad_norm": 7.727301597595215, "learning_rate": 2.604418648781065e-06, "loss": 0.2572, "step": 23640 }, { "epoch": 2.0804011259676285, "grad_norm": 8.475481033325195, "learning_rate": 2.5999277767739622e-06, "loss": 0.2663, "step": 23650 }, { "epoch": 2.08128078817734, "grad_norm": 7.169694900512695, "learning_rate": 2.59543941923038e-06, "loss": 0.2816, "step": 23660 }, { "epoch": 2.082160450387051, "grad_norm": 9.25267505645752, "learning_rate": 2.5909535808525975e-06, "loss": 0.2827, "step": 23670 }, { "epoch": 2.083040112596763, "grad_norm": 19.286649703979492, "learning_rate": 2.586470266340258e-06, "loss": 0.3251, "step": 23680 }, { "epoch": 2.0839197748064744, "grad_norm": 7.2203240394592285, "learning_rate": 2.581989480390357e-06, "loss": 0.2538, "step": 23690 }, { "epoch": 2.0847994370161858, "grad_norm": 7.511997699737549, "learning_rate": 2.5775112276972416e-06, "loss": 0.2872, "step": 23700 }, { "epoch": 2.085679099225897, "grad_norm": 7.28903865814209, "learning_rate": 2.5730355129526053e-06, "loss": 0.2726, "step": 23710 }, { "epoch": 2.086558761435609, "grad_norm": 9.883954048156738, "learning_rate": 2.568562340845481e-06, "loss": 0.2985, "step": 23720 }, { "epoch": 2.0874384236453203, "grad_norm": 11.482284545898438, "learning_rate": 2.5640917160622427e-06, "loss": 0.2713, "step": 23730 }, { "epoch": 2.0883180858550316, "grad_norm": 9.780261993408203, "learning_rate": 2.55962364328659e-06, "loss": 0.2442, "step": 23740 }, { "epoch": 2.089197748064743, "grad_norm": 11.667009353637695, "learning_rate": 2.5551581271995503e-06, "loss": 0.2689, "step": 23750 }, { "epoch": 2.090077410274455, "grad_norm": 7.371754169464111, "learning_rate": 2.550695172479479e-06, "loss": 0.2501, "step": 23760 }, { "epoch": 2.090957072484166, "grad_norm": 6.567035675048828, "learning_rate": 2.546234783802034e-06, "loss": 0.2428, "step": 23770 }, { "epoch": 2.0918367346938775, "grad_norm": 6.177751541137695, "learning_rate": 2.5417769658401992e-06, "loss": 0.2552, "step": 23780 }, { "epoch": 2.092716396903589, "grad_norm": 7.389760494232178, "learning_rate": 2.5373217232642577e-06, "loss": 0.2214, "step": 23790 }, { "epoch": 2.0935960591133007, "grad_norm": 9.659595489501953, "learning_rate": 2.532869060741795e-06, "loss": 0.2801, "step": 23800 }, { "epoch": 2.094475721323012, "grad_norm": 6.131315231323242, "learning_rate": 2.5284189829376966e-06, "loss": 0.2465, "step": 23810 }, { "epoch": 2.0953553835327234, "grad_norm": 7.834970474243164, "learning_rate": 2.5239714945141397e-06, "loss": 0.2742, "step": 23820 }, { "epoch": 2.096235045742435, "grad_norm": 9.14561939239502, "learning_rate": 2.519526600130581e-06, "loss": 0.294, "step": 23830 }, { "epoch": 2.097114707952146, "grad_norm": 11.989640235900879, "learning_rate": 2.5150843044437712e-06, "loss": 0.309, "step": 23840 }, { "epoch": 2.097994370161858, "grad_norm": 10.88481616973877, "learning_rate": 2.510644612107731e-06, "loss": 0.267, "step": 23850 }, { "epoch": 2.0988740323715693, "grad_norm": 8.774112701416016, "learning_rate": 2.506207527773754e-06, "loss": 0.2719, "step": 23860 }, { "epoch": 2.0997536945812807, "grad_norm": 12.508010864257812, "learning_rate": 2.5017730560904064e-06, "loss": 0.2671, "step": 23870 }, { "epoch": 2.100633356790992, "grad_norm": 8.722457885742188, "learning_rate": 2.4973412017035126e-06, "loss": 0.2852, "step": 23880 }, { "epoch": 2.101513019000704, "grad_norm": 11.050003051757812, "learning_rate": 2.4929119692561567e-06, "loss": 0.2991, "step": 23890 }, { "epoch": 2.1023926812104152, "grad_norm": 8.361954689025879, "learning_rate": 2.488485363388674e-06, "loss": 0.3042, "step": 23900 }, { "epoch": 2.1032723434201266, "grad_norm": 8.860512733459473, "learning_rate": 2.484061388738649e-06, "loss": 0.2854, "step": 23910 }, { "epoch": 2.104152005629838, "grad_norm": 9.22645378112793, "learning_rate": 2.4796400499409133e-06, "loss": 0.2734, "step": 23920 }, { "epoch": 2.1050316678395498, "grad_norm": 9.289389610290527, "learning_rate": 2.475221351627532e-06, "loss": 0.2463, "step": 23930 }, { "epoch": 2.105911330049261, "grad_norm": 9.103738784790039, "learning_rate": 2.470805298427806e-06, "loss": 0.2985, "step": 23940 }, { "epoch": 2.1067909922589725, "grad_norm": 8.031414985656738, "learning_rate": 2.4663918949682635e-06, "loss": 0.274, "step": 23950 }, { "epoch": 2.107670654468684, "grad_norm": 8.7466402053833, "learning_rate": 2.461981145872658e-06, "loss": 0.2827, "step": 23960 }, { "epoch": 2.1085503166783957, "grad_norm": 13.09333610534668, "learning_rate": 2.457573055761963e-06, "loss": 0.2524, "step": 23970 }, { "epoch": 2.109429978888107, "grad_norm": 9.19884967803955, "learning_rate": 2.453167629254366e-06, "loss": 0.2829, "step": 23980 }, { "epoch": 2.1103096410978184, "grad_norm": 10.191561698913574, "learning_rate": 2.4487648709652605e-06, "loss": 0.271, "step": 23990 }, { "epoch": 2.1111893033075297, "grad_norm": 9.056473731994629, "learning_rate": 2.4443647855072523e-06, "loss": 0.3154, "step": 24000 }, { "epoch": 2.1120689655172415, "grad_norm": 8.315712928771973, "learning_rate": 2.439967377490135e-06, "loss": 0.2562, "step": 24010 }, { "epoch": 2.112948627726953, "grad_norm": 12.459883689880371, "learning_rate": 2.43557265152091e-06, "loss": 0.2547, "step": 24020 }, { "epoch": 2.1138282899366643, "grad_norm": 8.85256576538086, "learning_rate": 2.4311806122037602e-06, "loss": 0.2861, "step": 24030 }, { "epoch": 2.1147079521463756, "grad_norm": 7.136203765869141, "learning_rate": 2.426791264140056e-06, "loss": 0.2684, "step": 24040 }, { "epoch": 2.1155876143560874, "grad_norm": 9.055746078491211, "learning_rate": 2.4224046119283513e-06, "loss": 0.2679, "step": 24050 }, { "epoch": 2.116467276565799, "grad_norm": 8.55331039428711, "learning_rate": 2.4180206601643718e-06, "loss": 0.2976, "step": 24060 }, { "epoch": 2.11734693877551, "grad_norm": 10.027508735656738, "learning_rate": 2.4136394134410164e-06, "loss": 0.2367, "step": 24070 }, { "epoch": 2.1182266009852215, "grad_norm": 11.154163360595703, "learning_rate": 2.4092608763483477e-06, "loss": 0.2859, "step": 24080 }, { "epoch": 2.1191062631949333, "grad_norm": 7.798529148101807, "learning_rate": 2.4048850534735896e-06, "loss": 0.308, "step": 24090 }, { "epoch": 2.1199859254046447, "grad_norm": 19.477235794067383, "learning_rate": 2.4005119494011275e-06, "loss": 0.2708, "step": 24100 }, { "epoch": 2.120865587614356, "grad_norm": 8.197837829589844, "learning_rate": 2.396141568712493e-06, "loss": 0.2752, "step": 24110 }, { "epoch": 2.1217452498240674, "grad_norm": 9.07505989074707, "learning_rate": 2.3917739159863657e-06, "loss": 0.3276, "step": 24120 }, { "epoch": 2.1226249120337792, "grad_norm": 6.780468940734863, "learning_rate": 2.3874089957985673e-06, "loss": 0.284, "step": 24130 }, { "epoch": 2.1235045742434906, "grad_norm": 9.675938606262207, "learning_rate": 2.3830468127220586e-06, "loss": 0.2451, "step": 24140 }, { "epoch": 2.124384236453202, "grad_norm": 11.502592086791992, "learning_rate": 2.3786873713269274e-06, "loss": 0.2905, "step": 24150 }, { "epoch": 2.1252638986629133, "grad_norm": 9.033289909362793, "learning_rate": 2.3743306761803987e-06, "loss": 0.2853, "step": 24160 }, { "epoch": 2.1261435608726247, "grad_norm": 11.858999252319336, "learning_rate": 2.369976731846812e-06, "loss": 0.252, "step": 24170 }, { "epoch": 2.1270232230823365, "grad_norm": 8.561975479125977, "learning_rate": 2.365625542887628e-06, "loss": 0.2502, "step": 24180 }, { "epoch": 2.127902885292048, "grad_norm": 8.005743026733398, "learning_rate": 2.3612771138614194e-06, "loss": 0.253, "step": 24190 }, { "epoch": 2.128782547501759, "grad_norm": 10.196906089782715, "learning_rate": 2.356931449323867e-06, "loss": 0.2253, "step": 24200 }, { "epoch": 2.129662209711471, "grad_norm": 9.156463623046875, "learning_rate": 2.352588553827761e-06, "loss": 0.2793, "step": 24210 }, { "epoch": 2.1305418719211824, "grad_norm": 7.945369720458984, "learning_rate": 2.3482484319229837e-06, "loss": 0.2862, "step": 24220 }, { "epoch": 2.1314215341308937, "grad_norm": 6.354840278625488, "learning_rate": 2.3439110881565123e-06, "loss": 0.2581, "step": 24230 }, { "epoch": 2.132301196340605, "grad_norm": 8.48112678527832, "learning_rate": 2.339576527072422e-06, "loss": 0.2525, "step": 24240 }, { "epoch": 2.1331808585503165, "grad_norm": 8.551837921142578, "learning_rate": 2.335244753211857e-06, "loss": 0.2972, "step": 24250 }, { "epoch": 2.1340605207600283, "grad_norm": 11.311365127563477, "learning_rate": 2.330915771113057e-06, "loss": 0.3305, "step": 24260 }, { "epoch": 2.1349401829697396, "grad_norm": 10.357368469238281, "learning_rate": 2.3265895853113283e-06, "loss": 0.2757, "step": 24270 }, { "epoch": 2.135819845179451, "grad_norm": 11.475586891174316, "learning_rate": 2.322266200339048e-06, "loss": 0.253, "step": 24280 }, { "epoch": 2.1366995073891624, "grad_norm": 6.869146347045898, "learning_rate": 2.3179456207256635e-06, "loss": 0.269, "step": 24290 }, { "epoch": 2.137579169598874, "grad_norm": 11.192413330078125, "learning_rate": 2.3136278509976796e-06, "loss": 0.2544, "step": 24300 }, { "epoch": 2.1384588318085855, "grad_norm": 14.168062210083008, "learning_rate": 2.3093128956786563e-06, "loss": 0.2721, "step": 24310 }, { "epoch": 2.139338494018297, "grad_norm": 12.085538864135742, "learning_rate": 2.3050007592892082e-06, "loss": 0.2714, "step": 24320 }, { "epoch": 2.1402181562280083, "grad_norm": 9.23324203491211, "learning_rate": 2.300691446346992e-06, "loss": 0.258, "step": 24330 }, { "epoch": 2.14109781843772, "grad_norm": 9.512421607971191, "learning_rate": 2.2963849613667137e-06, "loss": 0.2999, "step": 24340 }, { "epoch": 2.1419774806474314, "grad_norm": 9.64366626739502, "learning_rate": 2.2920813088601103e-06, "loss": 0.2703, "step": 24350 }, { "epoch": 2.142857142857143, "grad_norm": 7.408082485198975, "learning_rate": 2.2877804933359534e-06, "loss": 0.2645, "step": 24360 }, { "epoch": 2.143736805066854, "grad_norm": 10.160873413085938, "learning_rate": 2.283482519300042e-06, "loss": 0.3055, "step": 24370 }, { "epoch": 2.144616467276566, "grad_norm": 9.730076789855957, "learning_rate": 2.2791873912551964e-06, "loss": 0.331, "step": 24380 }, { "epoch": 2.1454961294862773, "grad_norm": 8.358415603637695, "learning_rate": 2.274895113701262e-06, "loss": 0.2833, "step": 24390 }, { "epoch": 2.1463757916959887, "grad_norm": 7.4113311767578125, "learning_rate": 2.270605691135091e-06, "loss": 0.2668, "step": 24400 }, { "epoch": 2.1472554539057, "grad_norm": 9.8328857421875, "learning_rate": 2.2663191280505447e-06, "loss": 0.2638, "step": 24410 }, { "epoch": 2.148135116115412, "grad_norm": 11.318716049194336, "learning_rate": 2.262035428938496e-06, "loss": 0.2557, "step": 24420 }, { "epoch": 2.149014778325123, "grad_norm": 8.59965705871582, "learning_rate": 2.2577545982868067e-06, "loss": 0.2531, "step": 24430 }, { "epoch": 2.1498944405348346, "grad_norm": 11.908195495605469, "learning_rate": 2.2534766405803383e-06, "loss": 0.2315, "step": 24440 }, { "epoch": 2.150774102744546, "grad_norm": 8.72808837890625, "learning_rate": 2.2492015603009476e-06, "loss": 0.2828, "step": 24450 }, { "epoch": 2.1516537649542578, "grad_norm": 7.5967512130737305, "learning_rate": 2.2449293619274693e-06, "loss": 0.294, "step": 24460 }, { "epoch": 2.152533427163969, "grad_norm": 9.702754974365234, "learning_rate": 2.24066004993572e-06, "loss": 0.2713, "step": 24470 }, { "epoch": 2.1534130893736805, "grad_norm": 7.408155918121338, "learning_rate": 2.2363936287984993e-06, "loss": 0.2735, "step": 24480 }, { "epoch": 2.154292751583392, "grad_norm": 11.151867866516113, "learning_rate": 2.232130102985566e-06, "loss": 0.2955, "step": 24490 }, { "epoch": 2.1551724137931036, "grad_norm": 7.4963459968566895, "learning_rate": 2.2278694769636572e-06, "loss": 0.2477, "step": 24500 }, { "epoch": 2.156052076002815, "grad_norm": 10.030394554138184, "learning_rate": 2.223611755196466e-06, "loss": 0.2974, "step": 24510 }, { "epoch": 2.1569317382125264, "grad_norm": 9.771892547607422, "learning_rate": 2.219356942144642e-06, "loss": 0.2587, "step": 24520 }, { "epoch": 2.1578114004222377, "grad_norm": 8.877381324768066, "learning_rate": 2.215105042265793e-06, "loss": 0.2719, "step": 24530 }, { "epoch": 2.1586910626319495, "grad_norm": 8.882919311523438, "learning_rate": 2.2108560600144696e-06, "loss": 0.2522, "step": 24540 }, { "epoch": 2.159570724841661, "grad_norm": 7.848208427429199, "learning_rate": 2.206609999842167e-06, "loss": 0.2576, "step": 24550 }, { "epoch": 2.1604503870513723, "grad_norm": 8.158295631408691, "learning_rate": 2.2023668661973203e-06, "loss": 0.2724, "step": 24560 }, { "epoch": 2.1613300492610836, "grad_norm": 7.04145622253418, "learning_rate": 2.1981266635252947e-06, "loss": 0.2394, "step": 24570 }, { "epoch": 2.162209711470795, "grad_norm": 9.586469650268555, "learning_rate": 2.193889396268392e-06, "loss": 0.2669, "step": 24580 }, { "epoch": 2.163089373680507, "grad_norm": 9.93383502960205, "learning_rate": 2.189655068865833e-06, "loss": 0.2816, "step": 24590 }, { "epoch": 2.163969035890218, "grad_norm": 7.975595474243164, "learning_rate": 2.185423685753759e-06, "loss": 0.2641, "step": 24600 }, { "epoch": 2.1648486980999295, "grad_norm": 12.061174392700195, "learning_rate": 2.181195251365229e-06, "loss": 0.2525, "step": 24610 }, { "epoch": 2.1657283603096413, "grad_norm": 8.783780097961426, "learning_rate": 2.1769697701302083e-06, "loss": 0.2348, "step": 24620 }, { "epoch": 2.1666080225193527, "grad_norm": 7.475301742553711, "learning_rate": 2.172747246475575e-06, "loss": 0.2683, "step": 24630 }, { "epoch": 2.167487684729064, "grad_norm": 9.196951866149902, "learning_rate": 2.1685276848251042e-06, "loss": 0.2615, "step": 24640 }, { "epoch": 2.1683673469387754, "grad_norm": 10.035334587097168, "learning_rate": 2.1643110895994656e-06, "loss": 0.2464, "step": 24650 }, { "epoch": 2.169247009148487, "grad_norm": 8.014044761657715, "learning_rate": 2.160097465216231e-06, "loss": 0.2427, "step": 24660 }, { "epoch": 2.1701266713581986, "grad_norm": 8.547564506530762, "learning_rate": 2.1558868160898444e-06, "loss": 0.2931, "step": 24670 }, { "epoch": 2.17100633356791, "grad_norm": 9.997305870056152, "learning_rate": 2.1516791466316465e-06, "loss": 0.249, "step": 24680 }, { "epoch": 2.1718859957776213, "grad_norm": 9.387547492980957, "learning_rate": 2.147474461249851e-06, "loss": 0.2668, "step": 24690 }, { "epoch": 2.1727656579873327, "grad_norm": 8.17619514465332, "learning_rate": 2.1432727643495415e-06, "loss": 0.299, "step": 24700 }, { "epoch": 2.1736453201970445, "grad_norm": 10.718908309936523, "learning_rate": 2.1390740603326793e-06, "loss": 0.2617, "step": 24710 }, { "epoch": 2.174524982406756, "grad_norm": 7.888399124145508, "learning_rate": 2.1348783535980855e-06, "loss": 0.2639, "step": 24720 }, { "epoch": 2.175404644616467, "grad_norm": 11.741738319396973, "learning_rate": 2.1306856485414347e-06, "loss": 0.2944, "step": 24730 }, { "epoch": 2.1762843068261786, "grad_norm": 8.544672012329102, "learning_rate": 2.126495949555269e-06, "loss": 0.2722, "step": 24740 }, { "epoch": 2.1771639690358904, "grad_norm": 8.630536079406738, "learning_rate": 2.122309261028973e-06, "loss": 0.3102, "step": 24750 }, { "epoch": 2.1780436312456017, "grad_norm": 8.346260070800781, "learning_rate": 2.1181255873487784e-06, "loss": 0.3119, "step": 24760 }, { "epoch": 2.178923293455313, "grad_norm": 11.546196937561035, "learning_rate": 2.113944932897762e-06, "loss": 0.2859, "step": 24770 }, { "epoch": 2.1798029556650245, "grad_norm": 12.704532623291016, "learning_rate": 2.109767302055834e-06, "loss": 0.2992, "step": 24780 }, { "epoch": 2.1806826178747363, "grad_norm": 7.557281017303467, "learning_rate": 2.105592699199738e-06, "loss": 0.2909, "step": 24790 }, { "epoch": 2.1815622800844476, "grad_norm": 7.876852512359619, "learning_rate": 2.1014211287030444e-06, "loss": 0.2343, "step": 24800 }, { "epoch": 2.182441942294159, "grad_norm": 12.797781944274902, "learning_rate": 2.0972525949361467e-06, "loss": 0.3002, "step": 24810 }, { "epoch": 2.1833216045038704, "grad_norm": 7.645691394805908, "learning_rate": 2.09308710226626e-06, "loss": 0.3134, "step": 24820 }, { "epoch": 2.184201266713582, "grad_norm": 12.188246726989746, "learning_rate": 2.0889246550574104e-06, "loss": 0.2962, "step": 24830 }, { "epoch": 2.1850809289232935, "grad_norm": 9.720561981201172, "learning_rate": 2.084765257670433e-06, "loss": 0.2533, "step": 24840 }, { "epoch": 2.185960591133005, "grad_norm": 7.319816589355469, "learning_rate": 2.080608914462969e-06, "loss": 0.2521, "step": 24850 }, { "epoch": 2.1868402533427163, "grad_norm": 9.590618133544922, "learning_rate": 2.076455629789459e-06, "loss": 0.2987, "step": 24860 }, { "epoch": 2.187719915552428, "grad_norm": 9.063633918762207, "learning_rate": 2.072305408001142e-06, "loss": 0.2715, "step": 24870 }, { "epoch": 2.1885995777621394, "grad_norm": 11.494123458862305, "learning_rate": 2.0681582534460446e-06, "loss": 0.2548, "step": 24880 }, { "epoch": 2.189479239971851, "grad_norm": 9.030837059020996, "learning_rate": 2.0640141704689797e-06, "loss": 0.2728, "step": 24890 }, { "epoch": 2.190358902181562, "grad_norm": 7.172940254211426, "learning_rate": 2.0598731634115497e-06, "loss": 0.2477, "step": 24900 }, { "epoch": 2.191238564391274, "grad_norm": 10.435382843017578, "learning_rate": 2.0557352366121215e-06, "loss": 0.2466, "step": 24910 }, { "epoch": 2.1921182266009853, "grad_norm": 6.28445291519165, "learning_rate": 2.0516003944058464e-06, "loss": 0.2774, "step": 24920 }, { "epoch": 2.1929978888106967, "grad_norm": 10.842371940612793, "learning_rate": 2.0474686411246396e-06, "loss": 0.2772, "step": 24930 }, { "epoch": 2.193877551020408, "grad_norm": 8.209763526916504, "learning_rate": 2.0433399810971776e-06, "loss": 0.2548, "step": 24940 }, { "epoch": 2.19475721323012, "grad_norm": 8.642714500427246, "learning_rate": 2.0392144186489034e-06, "loss": 0.2734, "step": 24950 }, { "epoch": 2.195636875439831, "grad_norm": 6.706401824951172, "learning_rate": 2.035091958102009e-06, "loss": 0.2419, "step": 24960 }, { "epoch": 2.1965165376495426, "grad_norm": 8.55677318572998, "learning_rate": 2.0309726037754336e-06, "loss": 0.2891, "step": 24970 }, { "epoch": 2.197396199859254, "grad_norm": 9.083685874938965, "learning_rate": 2.026856359984872e-06, "loss": 0.2575, "step": 24980 }, { "epoch": 2.1982758620689653, "grad_norm": 5.911430358886719, "learning_rate": 2.0227432310427514e-06, "loss": 0.2807, "step": 24990 }, { "epoch": 2.199155524278677, "grad_norm": 10.426762580871582, "learning_rate": 2.0186332212582414e-06, "loss": 0.2339, "step": 25000 }, { "epoch": 2.2000351864883885, "grad_norm": 8.951666831970215, "learning_rate": 2.014526334937241e-06, "loss": 0.2465, "step": 25010 }, { "epoch": 2.2009148486981, "grad_norm": 5.4352922439575195, "learning_rate": 2.0104225763823778e-06, "loss": 0.3067, "step": 25020 }, { "epoch": 2.201794510907811, "grad_norm": 8.297005653381348, "learning_rate": 2.006321949893e-06, "loss": 0.2308, "step": 25030 }, { "epoch": 2.202674173117523, "grad_norm": 10.175971984863281, "learning_rate": 2.002224459765178e-06, "loss": 0.2378, "step": 25040 }, { "epoch": 2.2035538353272344, "grad_norm": 9.614944458007812, "learning_rate": 1.998130110291693e-06, "loss": 0.2925, "step": 25050 }, { "epoch": 2.2044334975369457, "grad_norm": 9.27309513092041, "learning_rate": 1.9940389057620407e-06, "loss": 0.2725, "step": 25060 }, { "epoch": 2.205313159746657, "grad_norm": 9.730428695678711, "learning_rate": 1.9899508504624184e-06, "loss": 0.2856, "step": 25070 }, { "epoch": 2.206192821956369, "grad_norm": 12.87455940246582, "learning_rate": 1.985865948675723e-06, "loss": 0.3136, "step": 25080 }, { "epoch": 2.2070724841660803, "grad_norm": 12.054197311401367, "learning_rate": 1.9817842046815503e-06, "loss": 0.265, "step": 25090 }, { "epoch": 2.2079521463757916, "grad_norm": 6.3737640380859375, "learning_rate": 1.9777056227561853e-06, "loss": 0.223, "step": 25100 }, { "epoch": 2.208831808585503, "grad_norm": 11.32607364654541, "learning_rate": 1.973630207172605e-06, "loss": 0.2917, "step": 25110 }, { "epoch": 2.209711470795215, "grad_norm": 8.698062896728516, "learning_rate": 1.9695579622004634e-06, "loss": 0.2958, "step": 25120 }, { "epoch": 2.210591133004926, "grad_norm": 8.592835426330566, "learning_rate": 1.9654888921060943e-06, "loss": 0.2408, "step": 25130 }, { "epoch": 2.2114707952146375, "grad_norm": 8.759222030639648, "learning_rate": 1.961423001152511e-06, "loss": 0.2036, "step": 25140 }, { "epoch": 2.212350457424349, "grad_norm": 11.373425483703613, "learning_rate": 1.9573602935993846e-06, "loss": 0.2935, "step": 25150 }, { "epoch": 2.2132301196340607, "grad_norm": 10.215764045715332, "learning_rate": 1.953300773703063e-06, "loss": 0.2622, "step": 25160 }, { "epoch": 2.214109781843772, "grad_norm": 6.571837902069092, "learning_rate": 1.949244445716547e-06, "loss": 0.2526, "step": 25170 }, { "epoch": 2.2149894440534834, "grad_norm": 10.701519966125488, "learning_rate": 1.9451913138894945e-06, "loss": 0.2423, "step": 25180 }, { "epoch": 2.2158691062631948, "grad_norm": 8.554136276245117, "learning_rate": 1.9411413824682202e-06, "loss": 0.2712, "step": 25190 }, { "epoch": 2.2167487684729066, "grad_norm": 10.421695709228516, "learning_rate": 1.9370946556956786e-06, "loss": 0.2471, "step": 25200 }, { "epoch": 2.217628430682618, "grad_norm": 9.845836639404297, "learning_rate": 1.9330511378114714e-06, "loss": 0.28, "step": 25210 }, { "epoch": 2.2185080928923293, "grad_norm": 10.108556747436523, "learning_rate": 1.9290108330518366e-06, "loss": 0.2668, "step": 25220 }, { "epoch": 2.2193877551020407, "grad_norm": 10.790846824645996, "learning_rate": 1.9249737456496435e-06, "loss": 0.2922, "step": 25230 }, { "epoch": 2.2202674173117525, "grad_norm": 7.328277587890625, "learning_rate": 1.9209398798343986e-06, "loss": 0.2362, "step": 25240 }, { "epoch": 2.221147079521464, "grad_norm": 9.052085876464844, "learning_rate": 1.916909239832226e-06, "loss": 0.2465, "step": 25250 }, { "epoch": 2.222026741731175, "grad_norm": 8.186562538146973, "learning_rate": 1.9128818298658718e-06, "loss": 0.2526, "step": 25260 }, { "epoch": 2.2229064039408866, "grad_norm": 8.880982398986816, "learning_rate": 1.9088576541547e-06, "loss": 0.2954, "step": 25270 }, { "epoch": 2.2237860661505984, "grad_norm": 10.264419555664062, "learning_rate": 1.9048367169146837e-06, "loss": 0.2616, "step": 25280 }, { "epoch": 2.2246657283603097, "grad_norm": 8.556580543518066, "learning_rate": 1.9008190223584038e-06, "loss": 0.2845, "step": 25290 }, { "epoch": 2.225545390570021, "grad_norm": 9.676003456115723, "learning_rate": 1.8968045746950476e-06, "loss": 0.2462, "step": 25300 }, { "epoch": 2.2264250527797325, "grad_norm": 9.458199501037598, "learning_rate": 1.8927933781303942e-06, "loss": 0.2711, "step": 25310 }, { "epoch": 2.227304714989444, "grad_norm": 12.513518333435059, "learning_rate": 1.8887854368668246e-06, "loss": 0.3451, "step": 25320 }, { "epoch": 2.2281843771991556, "grad_norm": 11.326349258422852, "learning_rate": 1.8847807551033036e-06, "loss": 0.2695, "step": 25330 }, { "epoch": 2.229064039408867, "grad_norm": 11.38241195678711, "learning_rate": 1.8807793370353782e-06, "loss": 0.2368, "step": 25340 }, { "epoch": 2.2299437016185784, "grad_norm": 8.36948299407959, "learning_rate": 1.8767811868551855e-06, "loss": 0.2989, "step": 25350 }, { "epoch": 2.23082336382829, "grad_norm": 11.120427131652832, "learning_rate": 1.8727863087514326e-06, "loss": 0.2267, "step": 25360 }, { "epoch": 2.2317030260380015, "grad_norm": 7.78348970413208, "learning_rate": 1.868794706909397e-06, "loss": 0.255, "step": 25370 }, { "epoch": 2.232582688247713, "grad_norm": 9.99831771850586, "learning_rate": 1.8648063855109316e-06, "loss": 0.2699, "step": 25380 }, { "epoch": 2.2334623504574243, "grad_norm": 10.319661140441895, "learning_rate": 1.860821348734445e-06, "loss": 0.2574, "step": 25390 }, { "epoch": 2.2343420126671356, "grad_norm": 7.2734055519104, "learning_rate": 1.8568396007549071e-06, "loss": 0.2524, "step": 25400 }, { "epoch": 2.2352216748768474, "grad_norm": 11.37409496307373, "learning_rate": 1.8528611457438434e-06, "loss": 0.2642, "step": 25410 }, { "epoch": 2.236101337086559, "grad_norm": 8.699897766113281, "learning_rate": 1.848885987869326e-06, "loss": 0.2777, "step": 25420 }, { "epoch": 2.23698099929627, "grad_norm": 9.67802906036377, "learning_rate": 1.8449141312959783e-06, "loss": 0.2576, "step": 25430 }, { "epoch": 2.2378606615059815, "grad_norm": 9.244704246520996, "learning_rate": 1.8409455801849606e-06, "loss": 0.2737, "step": 25440 }, { "epoch": 2.2387403237156933, "grad_norm": 6.939197063446045, "learning_rate": 1.8369803386939722e-06, "loss": 0.3368, "step": 25450 }, { "epoch": 2.2396199859254047, "grad_norm": 11.897838592529297, "learning_rate": 1.8330184109772437e-06, "loss": 0.3281, "step": 25460 }, { "epoch": 2.240499648135116, "grad_norm": 12.16904067993164, "learning_rate": 1.8290598011855326e-06, "loss": 0.3087, "step": 25470 }, { "epoch": 2.2413793103448274, "grad_norm": 8.123565673828125, "learning_rate": 1.8251045134661261e-06, "loss": 0.222, "step": 25480 }, { "epoch": 2.242258972554539, "grad_norm": 10.649580001831055, "learning_rate": 1.821152551962826e-06, "loss": 0.3244, "step": 25490 }, { "epoch": 2.2431386347642506, "grad_norm": 7.529005527496338, "learning_rate": 1.817203920815948e-06, "loss": 0.2413, "step": 25500 }, { "epoch": 2.244018296973962, "grad_norm": 9.62614917755127, "learning_rate": 1.8132586241623278e-06, "loss": 0.247, "step": 25510 }, { "epoch": 2.2448979591836733, "grad_norm": 10.561853408813477, "learning_rate": 1.8093166661352935e-06, "loss": 0.3013, "step": 25520 }, { "epoch": 2.245777621393385, "grad_norm": 8.187665939331055, "learning_rate": 1.8053780508646884e-06, "loss": 0.3095, "step": 25530 }, { "epoch": 2.2466572836030965, "grad_norm": 7.029410362243652, "learning_rate": 1.8014427824768465e-06, "loss": 0.2598, "step": 25540 }, { "epoch": 2.247536945812808, "grad_norm": 7.572699069976807, "learning_rate": 1.7975108650945965e-06, "loss": 0.3015, "step": 25550 }, { "epoch": 2.248416608022519, "grad_norm": 7.102493762969971, "learning_rate": 1.7935823028372606e-06, "loss": 0.2857, "step": 25560 }, { "epoch": 2.249296270232231, "grad_norm": 6.326128005981445, "learning_rate": 1.7896570998206418e-06, "loss": 0.2215, "step": 25570 }, { "epoch": 2.2501759324419424, "grad_norm": 10.867376327514648, "learning_rate": 1.78573526015702e-06, "loss": 0.3012, "step": 25580 }, { "epoch": 2.2510555946516537, "grad_norm": 9.584199905395508, "learning_rate": 1.7818167879551612e-06, "loss": 0.2816, "step": 25590 }, { "epoch": 2.251935256861365, "grad_norm": 8.12747859954834, "learning_rate": 1.7779016873202947e-06, "loss": 0.2963, "step": 25600 }, { "epoch": 2.252814919071077, "grad_norm": 8.461793899536133, "learning_rate": 1.773989962354125e-06, "loss": 0.2321, "step": 25610 }, { "epoch": 2.2536945812807883, "grad_norm": 9.827314376831055, "learning_rate": 1.770081617154814e-06, "loss": 0.2599, "step": 25620 }, { "epoch": 2.2545742434904996, "grad_norm": 9.168169975280762, "learning_rate": 1.7661766558169851e-06, "loss": 0.2272, "step": 25630 }, { "epoch": 2.255453905700211, "grad_norm": 11.145028114318848, "learning_rate": 1.7622750824317165e-06, "loss": 0.238, "step": 25640 }, { "epoch": 2.2563335679099223, "grad_norm": 8.006464004516602, "learning_rate": 1.758376901086536e-06, "loss": 0.2414, "step": 25650 }, { "epoch": 2.257213230119634, "grad_norm": 9.525107383728027, "learning_rate": 1.754482115865418e-06, "loss": 0.2586, "step": 25660 }, { "epoch": 2.2580928923293455, "grad_norm": 8.052286148071289, "learning_rate": 1.750590730848782e-06, "loss": 0.2861, "step": 25670 }, { "epoch": 2.258972554539057, "grad_norm": 10.190641403198242, "learning_rate": 1.7467027501134814e-06, "loss": 0.2413, "step": 25680 }, { "epoch": 2.2598522167487687, "grad_norm": 8.249744415283203, "learning_rate": 1.742818177732804e-06, "loss": 0.2719, "step": 25690 }, { "epoch": 2.26073187895848, "grad_norm": 7.228156089782715, "learning_rate": 1.7389370177764676e-06, "loss": 0.2491, "step": 25700 }, { "epoch": 2.2616115411681914, "grad_norm": 11.648438453674316, "learning_rate": 1.7350592743106127e-06, "loss": 0.2932, "step": 25710 }, { "epoch": 2.2624912033779028, "grad_norm": 14.691826820373535, "learning_rate": 1.7311849513978052e-06, "loss": 0.2846, "step": 25720 }, { "epoch": 2.263370865587614, "grad_norm": 12.390837669372559, "learning_rate": 1.7273140530970234e-06, "loss": 0.2678, "step": 25730 }, { "epoch": 2.264250527797326, "grad_norm": 11.541531562805176, "learning_rate": 1.7234465834636565e-06, "loss": 0.2435, "step": 25740 }, { "epoch": 2.2651301900070373, "grad_norm": 9.189976692199707, "learning_rate": 1.7195825465495093e-06, "loss": 0.2815, "step": 25750 }, { "epoch": 2.2660098522167487, "grad_norm": 8.070886611938477, "learning_rate": 1.7157219464027785e-06, "loss": 0.2818, "step": 25760 }, { "epoch": 2.2668895144264605, "grad_norm": 7.365234851837158, "learning_rate": 1.711864787068071e-06, "loss": 0.2251, "step": 25770 }, { "epoch": 2.267769176636172, "grad_norm": 10.297174453735352, "learning_rate": 1.7080110725863835e-06, "loss": 0.2281, "step": 25780 }, { "epoch": 2.268648838845883, "grad_norm": 10.293027877807617, "learning_rate": 1.7041608069951026e-06, "loss": 0.2834, "step": 25790 }, { "epoch": 2.2695285010555946, "grad_norm": 9.092068672180176, "learning_rate": 1.7003139943280074e-06, "loss": 0.2606, "step": 25800 }, { "epoch": 2.270408163265306, "grad_norm": 10.666790962219238, "learning_rate": 1.696470638615254e-06, "loss": 0.2543, "step": 25810 }, { "epoch": 2.2712878254750177, "grad_norm": 11.535818099975586, "learning_rate": 1.6926307438833788e-06, "loss": 0.2554, "step": 25820 }, { "epoch": 2.272167487684729, "grad_norm": 8.592547416687012, "learning_rate": 1.688794314155292e-06, "loss": 0.2327, "step": 25830 }, { "epoch": 2.2730471498944405, "grad_norm": 10.168164253234863, "learning_rate": 1.6849613534502718e-06, "loss": 0.2382, "step": 25840 }, { "epoch": 2.273926812104152, "grad_norm": 13.695806503295898, "learning_rate": 1.6811318657839675e-06, "loss": 0.3062, "step": 25850 }, { "epoch": 2.2748064743138636, "grad_norm": 8.301909446716309, "learning_rate": 1.6773058551683846e-06, "loss": 0.2809, "step": 25860 }, { "epoch": 2.275686136523575, "grad_norm": 11.111045837402344, "learning_rate": 1.6734833256118883e-06, "loss": 0.2499, "step": 25870 }, { "epoch": 2.2765657987332863, "grad_norm": 13.464539527893066, "learning_rate": 1.6696642811191943e-06, "loss": 0.2637, "step": 25880 }, { "epoch": 2.2774454609429977, "grad_norm": 7.243203163146973, "learning_rate": 1.6658487256913701e-06, "loss": 0.215, "step": 25890 }, { "epoch": 2.2783251231527095, "grad_norm": 9.862035751342773, "learning_rate": 1.6620366633258257e-06, "loss": 0.2553, "step": 25900 }, { "epoch": 2.279204785362421, "grad_norm": 8.234103202819824, "learning_rate": 1.658228098016314e-06, "loss": 0.2898, "step": 25910 }, { "epoch": 2.2800844475721322, "grad_norm": 7.461636543273926, "learning_rate": 1.6544230337529204e-06, "loss": 0.2811, "step": 25920 }, { "epoch": 2.2809641097818436, "grad_norm": 9.076791763305664, "learning_rate": 1.6506214745220695e-06, "loss": 0.3118, "step": 25930 }, { "epoch": 2.2818437719915554, "grad_norm": 9.090426445007324, "learning_rate": 1.646823424306504e-06, "loss": 0.2788, "step": 25940 }, { "epoch": 2.2827234342012668, "grad_norm": 10.752902030944824, "learning_rate": 1.643028887085295e-06, "loss": 0.2467, "step": 25950 }, { "epoch": 2.283603096410978, "grad_norm": 7.987455368041992, "learning_rate": 1.639237866833837e-06, "loss": 0.2778, "step": 25960 }, { "epoch": 2.2844827586206895, "grad_norm": 9.911569595336914, "learning_rate": 1.6354503675238354e-06, "loss": 0.3059, "step": 25970 }, { "epoch": 2.2853624208304013, "grad_norm": 9.100192070007324, "learning_rate": 1.6316663931233046e-06, "loss": 0.2713, "step": 25980 }, { "epoch": 2.2862420830401127, "grad_norm": 6.690523147583008, "learning_rate": 1.6278859475965753e-06, "loss": 0.3179, "step": 25990 }, { "epoch": 2.287121745249824, "grad_norm": 8.268904685974121, "learning_rate": 1.6241090349042686e-06, "loss": 0.257, "step": 26000 }, { "epoch": 2.2880014074595354, "grad_norm": 12.295182228088379, "learning_rate": 1.6203356590033153e-06, "loss": 0.3004, "step": 26010 }, { "epoch": 2.288881069669247, "grad_norm": 7.850646018981934, "learning_rate": 1.616565823846935e-06, "loss": 0.2489, "step": 26020 }, { "epoch": 2.2897607318789586, "grad_norm": 9.743672370910645, "learning_rate": 1.612799533384638e-06, "loss": 0.3178, "step": 26030 }, { "epoch": 2.29064039408867, "grad_norm": 10.37140941619873, "learning_rate": 1.6090367915622257e-06, "loss": 0.2454, "step": 26040 }, { "epoch": 2.2915200562983813, "grad_norm": 8.72269058227539, "learning_rate": 1.6052776023217753e-06, "loss": 0.2371, "step": 26050 }, { "epoch": 2.2923997185080927, "grad_norm": 9.418316841125488, "learning_rate": 1.6015219696016464e-06, "loss": 0.2593, "step": 26060 }, { "epoch": 2.2932793807178045, "grad_norm": 8.558599472045898, "learning_rate": 1.5977698973364702e-06, "loss": 0.2175, "step": 26070 }, { "epoch": 2.294159042927516, "grad_norm": 16.42795181274414, "learning_rate": 1.5940213894571482e-06, "loss": 0.3013, "step": 26080 }, { "epoch": 2.295038705137227, "grad_norm": 7.830699443817139, "learning_rate": 1.5902764498908507e-06, "loss": 0.2679, "step": 26090 }, { "epoch": 2.295918367346939, "grad_norm": 11.048858642578125, "learning_rate": 1.5865350825610053e-06, "loss": 0.2714, "step": 26100 }, { "epoch": 2.2967980295566504, "grad_norm": 10.988842964172363, "learning_rate": 1.5827972913872992e-06, "loss": 0.2549, "step": 26110 }, { "epoch": 2.2976776917663617, "grad_norm": 9.873026847839355, "learning_rate": 1.5790630802856733e-06, "loss": 0.2818, "step": 26120 }, { "epoch": 2.298557353976073, "grad_norm": 8.541927337646484, "learning_rate": 1.5753324531683144e-06, "loss": 0.2592, "step": 26130 }, { "epoch": 2.2994370161857844, "grad_norm": 9.763505935668945, "learning_rate": 1.5716054139436616e-06, "loss": 0.2651, "step": 26140 }, { "epoch": 2.3003166783954963, "grad_norm": 8.084816932678223, "learning_rate": 1.5678819665163886e-06, "loss": 0.2572, "step": 26150 }, { "epoch": 2.3011963406052076, "grad_norm": 8.611684799194336, "learning_rate": 1.564162114787407e-06, "loss": 0.2571, "step": 26160 }, { "epoch": 2.302076002814919, "grad_norm": 9.426928520202637, "learning_rate": 1.5604458626538671e-06, "loss": 0.277, "step": 26170 }, { "epoch": 2.302955665024631, "grad_norm": 10.874741554260254, "learning_rate": 1.5567332140091391e-06, "loss": 0.2405, "step": 26180 }, { "epoch": 2.303835327234342, "grad_norm": 9.62654972076416, "learning_rate": 1.5530241727428219e-06, "loss": 0.2741, "step": 26190 }, { "epoch": 2.3047149894440535, "grad_norm": 7.789315223693848, "learning_rate": 1.5493187427407398e-06, "loss": 0.2393, "step": 26200 }, { "epoch": 2.305594651653765, "grad_norm": 8.233599662780762, "learning_rate": 1.545616927884927e-06, "loss": 0.2261, "step": 26210 }, { "epoch": 2.3064743138634762, "grad_norm": 8.685219764709473, "learning_rate": 1.5419187320536316e-06, "loss": 0.2593, "step": 26220 }, { "epoch": 2.307353976073188, "grad_norm": 8.256808280944824, "learning_rate": 1.5382241591213165e-06, "loss": 0.2729, "step": 26230 }, { "epoch": 2.3082336382828994, "grad_norm": 7.994416236877441, "learning_rate": 1.534533212958637e-06, "loss": 0.2575, "step": 26240 }, { "epoch": 2.3091133004926108, "grad_norm": 11.396039962768555, "learning_rate": 1.530845897432461e-06, "loss": 0.2625, "step": 26250 }, { "epoch": 2.309992962702322, "grad_norm": 14.439395904541016, "learning_rate": 1.5271622164058454e-06, "loss": 0.309, "step": 26260 }, { "epoch": 2.310872624912034, "grad_norm": 7.588683605194092, "learning_rate": 1.5234821737380394e-06, "loss": 0.2351, "step": 26270 }, { "epoch": 2.3117522871217453, "grad_norm": 10.379295349121094, "learning_rate": 1.5198057732844867e-06, "loss": 0.2979, "step": 26280 }, { "epoch": 2.3126319493314567, "grad_norm": 11.829018592834473, "learning_rate": 1.516133018896808e-06, "loss": 0.2739, "step": 26290 }, { "epoch": 2.313511611541168, "grad_norm": 7.89164924621582, "learning_rate": 1.5124639144228082e-06, "loss": 0.2729, "step": 26300 }, { "epoch": 2.31439127375088, "grad_norm": 9.64254379272461, "learning_rate": 1.5087984637064667e-06, "loss": 0.2214, "step": 26310 }, { "epoch": 2.315270935960591, "grad_norm": 9.370820999145508, "learning_rate": 1.5051366705879345e-06, "loss": 0.2705, "step": 26320 }, { "epoch": 2.3161505981703026, "grad_norm": 9.355693817138672, "learning_rate": 1.5014785389035337e-06, "loss": 0.2325, "step": 26330 }, { "epoch": 2.317030260380014, "grad_norm": 9.876879692077637, "learning_rate": 1.4978240724857473e-06, "loss": 0.3023, "step": 26340 }, { "epoch": 2.3179099225897257, "grad_norm": 9.587382316589355, "learning_rate": 1.4941732751632198e-06, "loss": 0.2859, "step": 26350 }, { "epoch": 2.318789584799437, "grad_norm": 8.14909553527832, "learning_rate": 1.490526150760751e-06, "loss": 0.2428, "step": 26360 }, { "epoch": 2.3196692470091484, "grad_norm": 10.967558860778809, "learning_rate": 1.4868827030992917e-06, "loss": 0.2725, "step": 26370 }, { "epoch": 2.32054890921886, "grad_norm": 10.781310081481934, "learning_rate": 1.4832429359959454e-06, "loss": 0.2579, "step": 26380 }, { "epoch": 2.3214285714285716, "grad_norm": 14.68195629119873, "learning_rate": 1.4796068532639545e-06, "loss": 0.2852, "step": 26390 }, { "epoch": 2.322308233638283, "grad_norm": 11.17782974243164, "learning_rate": 1.4759744587127018e-06, "loss": 0.2965, "step": 26400 }, { "epoch": 2.3231878958479943, "grad_norm": 9.737462997436523, "learning_rate": 1.4723457561477128e-06, "loss": 0.246, "step": 26410 }, { "epoch": 2.3240675580577057, "grad_norm": 17.042381286621094, "learning_rate": 1.4687207493706328e-06, "loss": 0.2707, "step": 26420 }, { "epoch": 2.3249472202674175, "grad_norm": 18.329111099243164, "learning_rate": 1.4650994421792475e-06, "loss": 0.3059, "step": 26430 }, { "epoch": 2.325826882477129, "grad_norm": 9.13410472869873, "learning_rate": 1.4614818383674595e-06, "loss": 0.242, "step": 26440 }, { "epoch": 2.3267065446868402, "grad_norm": 7.889363765716553, "learning_rate": 1.4578679417252918e-06, "loss": 0.2394, "step": 26450 }, { "epoch": 2.3275862068965516, "grad_norm": 7.83784818649292, "learning_rate": 1.4542577560388888e-06, "loss": 0.2943, "step": 26460 }, { "epoch": 2.328465869106263, "grad_norm": 8.70726490020752, "learning_rate": 1.450651285090503e-06, "loss": 0.2547, "step": 26470 }, { "epoch": 2.3293455313159748, "grad_norm": 10.528024673461914, "learning_rate": 1.447048532658491e-06, "loss": 0.2355, "step": 26480 }, { "epoch": 2.330225193525686, "grad_norm": 8.658928871154785, "learning_rate": 1.4434495025173228e-06, "loss": 0.2858, "step": 26490 }, { "epoch": 2.3311048557353975, "grad_norm": 7.9178996086120605, "learning_rate": 1.4398541984375625e-06, "loss": 0.2671, "step": 26500 }, { "epoch": 2.3319845179451093, "grad_norm": 11.213088035583496, "learning_rate": 1.4362626241858706e-06, "loss": 0.2831, "step": 26510 }, { "epoch": 2.3328641801548207, "grad_norm": 9.709138870239258, "learning_rate": 1.4326747835250054e-06, "loss": 0.2276, "step": 26520 }, { "epoch": 2.333743842364532, "grad_norm": 10.198075294494629, "learning_rate": 1.429090680213807e-06, "loss": 0.2455, "step": 26530 }, { "epoch": 2.3346235045742434, "grad_norm": 9.615987777709961, "learning_rate": 1.425510318007204e-06, "loss": 0.2542, "step": 26540 }, { "epoch": 2.3355031667839548, "grad_norm": 8.472567558288574, "learning_rate": 1.4219337006562034e-06, "loss": 0.2244, "step": 26550 }, { "epoch": 2.3363828289936666, "grad_norm": 11.021154403686523, "learning_rate": 1.418360831907889e-06, "loss": 0.2731, "step": 26560 }, { "epoch": 2.337262491203378, "grad_norm": 10.616844177246094, "learning_rate": 1.4147917155054209e-06, "loss": 0.2345, "step": 26570 }, { "epoch": 2.3381421534130893, "grad_norm": 10.869376182556152, "learning_rate": 1.4112263551880239e-06, "loss": 0.2776, "step": 26580 }, { "epoch": 2.339021815622801, "grad_norm": 10.243927001953125, "learning_rate": 1.4076647546909895e-06, "loss": 0.2791, "step": 26590 }, { "epoch": 2.3399014778325125, "grad_norm": 9.995868682861328, "learning_rate": 1.404106917745669e-06, "loss": 0.2682, "step": 26600 }, { "epoch": 2.340781140042224, "grad_norm": 11.622546195983887, "learning_rate": 1.4005528480794712e-06, "loss": 0.2615, "step": 26610 }, { "epoch": 2.341660802251935, "grad_norm": 11.289910316467285, "learning_rate": 1.3970025494158607e-06, "loss": 0.2644, "step": 26620 }, { "epoch": 2.3425404644616465, "grad_norm": 10.029290199279785, "learning_rate": 1.3934560254743473e-06, "loss": 0.2884, "step": 26630 }, { "epoch": 2.3434201266713584, "grad_norm": 10.605550765991211, "learning_rate": 1.3899132799704867e-06, "loss": 0.2733, "step": 26640 }, { "epoch": 2.3442997888810697, "grad_norm": 8.734492301940918, "learning_rate": 1.3863743166158827e-06, "loss": 0.2615, "step": 26650 }, { "epoch": 2.345179451090781, "grad_norm": 9.449271202087402, "learning_rate": 1.3828391391181644e-06, "loss": 0.257, "step": 26660 }, { "epoch": 2.3460591133004924, "grad_norm": 8.63390827178955, "learning_rate": 1.3793077511810066e-06, "loss": 0.2453, "step": 26670 }, { "epoch": 2.3469387755102042, "grad_norm": 7.2091064453125, "learning_rate": 1.3757801565041062e-06, "loss": 0.2391, "step": 26680 }, { "epoch": 2.3478184377199156, "grad_norm": 11.213451385498047, "learning_rate": 1.372256358783189e-06, "loss": 0.2674, "step": 26690 }, { "epoch": 2.348698099929627, "grad_norm": 9.489259719848633, "learning_rate": 1.3687363617100042e-06, "loss": 0.2995, "step": 26700 }, { "epoch": 2.3495777621393383, "grad_norm": 17.132892608642578, "learning_rate": 1.3652201689723165e-06, "loss": 0.2797, "step": 26710 }, { "epoch": 2.35045742434905, "grad_norm": 13.258963584899902, "learning_rate": 1.361707784253906e-06, "loss": 0.2757, "step": 26720 }, { "epoch": 2.3513370865587615, "grad_norm": 11.199814796447754, "learning_rate": 1.3581992112345633e-06, "loss": 0.2782, "step": 26730 }, { "epoch": 2.352216748768473, "grad_norm": 9.25526237487793, "learning_rate": 1.3546944535900847e-06, "loss": 0.245, "step": 26740 }, { "epoch": 2.3530964109781842, "grad_norm": 14.189632415771484, "learning_rate": 1.3511935149922723e-06, "loss": 0.2693, "step": 26750 }, { "epoch": 2.353976073187896, "grad_norm": 8.003217697143555, "learning_rate": 1.3476963991089242e-06, "loss": 0.2317, "step": 26760 }, { "epoch": 2.3548557353976074, "grad_norm": 13.045838356018066, "learning_rate": 1.3442031096038345e-06, "loss": 0.2587, "step": 26770 }, { "epoch": 2.3557353976073188, "grad_norm": 11.49007511138916, "learning_rate": 1.3407136501367879e-06, "loss": 0.2636, "step": 26780 }, { "epoch": 2.35661505981703, "grad_norm": 8.74366569519043, "learning_rate": 1.3372280243635572e-06, "loss": 0.2882, "step": 26790 }, { "epoch": 2.3574947220267415, "grad_norm": 8.75794792175293, "learning_rate": 1.3337462359358977e-06, "loss": 0.2662, "step": 26800 }, { "epoch": 2.3583743842364533, "grad_norm": 13.087101936340332, "learning_rate": 1.3302682885015483e-06, "loss": 0.2763, "step": 26810 }, { "epoch": 2.3592540464461647, "grad_norm": 7.669143199920654, "learning_rate": 1.3267941857042193e-06, "loss": 0.2872, "step": 26820 }, { "epoch": 2.360133708655876, "grad_norm": 10.942428588867188, "learning_rate": 1.323323931183595e-06, "loss": 0.2358, "step": 26830 }, { "epoch": 2.361013370865588, "grad_norm": 6.718374252319336, "learning_rate": 1.3198575285753278e-06, "loss": 0.2447, "step": 26840 }, { "epoch": 2.361893033075299, "grad_norm": 7.675749778747559, "learning_rate": 1.3163949815110333e-06, "loss": 0.1962, "step": 26850 }, { "epoch": 2.3627726952850105, "grad_norm": 6.947536468505859, "learning_rate": 1.3129362936182915e-06, "loss": 0.1947, "step": 26860 }, { "epoch": 2.363652357494722, "grad_norm": 8.137070655822754, "learning_rate": 1.3094814685206358e-06, "loss": 0.2522, "step": 26870 }, { "epoch": 2.3645320197044333, "grad_norm": 9.658519744873047, "learning_rate": 1.3060305098375526e-06, "loss": 0.2595, "step": 26880 }, { "epoch": 2.365411681914145, "grad_norm": 13.012948989868164, "learning_rate": 1.3025834211844829e-06, "loss": 0.259, "step": 26890 }, { "epoch": 2.3662913441238564, "grad_norm": 8.589855194091797, "learning_rate": 1.2991402061728031e-06, "loss": 0.2354, "step": 26900 }, { "epoch": 2.367171006333568, "grad_norm": 8.98943042755127, "learning_rate": 1.2957008684098427e-06, "loss": 0.2871, "step": 26910 }, { "epoch": 2.3680506685432796, "grad_norm": 9.736959457397461, "learning_rate": 1.2922654114988608e-06, "loss": 0.2816, "step": 26920 }, { "epoch": 2.368930330752991, "grad_norm": 9.839082717895508, "learning_rate": 1.2888338390390536e-06, "loss": 0.3144, "step": 26930 }, { "epoch": 2.3698099929627023, "grad_norm": 8.738235473632812, "learning_rate": 1.2854061546255502e-06, "loss": 0.2617, "step": 26940 }, { "epoch": 2.3706896551724137, "grad_norm": 8.251740455627441, "learning_rate": 1.2819823618494026e-06, "loss": 0.2566, "step": 26950 }, { "epoch": 2.371569317382125, "grad_norm": 12.495291709899902, "learning_rate": 1.278562464297587e-06, "loss": 0.2823, "step": 26960 }, { "epoch": 2.372448979591837, "grad_norm": 9.69678020477295, "learning_rate": 1.2751464655529988e-06, "loss": 0.2504, "step": 26970 }, { "epoch": 2.3733286418015482, "grad_norm": 9.68349838256836, "learning_rate": 1.2717343691944467e-06, "loss": 0.2931, "step": 26980 }, { "epoch": 2.3742083040112596, "grad_norm": 12.286625862121582, "learning_rate": 1.2683261787966572e-06, "loss": 0.272, "step": 26990 }, { "epoch": 2.375087966220971, "grad_norm": 7.7557525634765625, "learning_rate": 1.264921897930258e-06, "loss": 0.2319, "step": 27000 }, { "epoch": 2.3759676284306828, "grad_norm": 9.652934074401855, "learning_rate": 1.261521530161784e-06, "loss": 0.2906, "step": 27010 }, { "epoch": 2.376847290640394, "grad_norm": 10.093395233154297, "learning_rate": 1.2581250790536698e-06, "loss": 0.2763, "step": 27020 }, { "epoch": 2.3777269528501055, "grad_norm": 8.045316696166992, "learning_rate": 1.2547325481642453e-06, "loss": 0.2382, "step": 27030 }, { "epoch": 2.378606615059817, "grad_norm": 7.900249004364014, "learning_rate": 1.251343941047738e-06, "loss": 0.2225, "step": 27040 }, { "epoch": 2.3794862772695287, "grad_norm": 9.270853996276855, "learning_rate": 1.2479592612542597e-06, "loss": 0.285, "step": 27050 }, { "epoch": 2.38036593947924, "grad_norm": 7.158077239990234, "learning_rate": 1.2445785123298083e-06, "loss": 0.2499, "step": 27060 }, { "epoch": 2.3812456016889514, "grad_norm": 10.732743263244629, "learning_rate": 1.2412016978162683e-06, "loss": 0.2677, "step": 27070 }, { "epoch": 2.3821252638986627, "grad_norm": 8.210497856140137, "learning_rate": 1.2378288212513956e-06, "loss": 0.2349, "step": 27080 }, { "epoch": 2.3830049261083746, "grad_norm": 8.981377601623535, "learning_rate": 1.2344598861688223e-06, "loss": 0.2456, "step": 27090 }, { "epoch": 2.383884588318086, "grad_norm": 11.74933910369873, "learning_rate": 1.2310948960980556e-06, "loss": 0.2456, "step": 27100 }, { "epoch": 2.3847642505277973, "grad_norm": 9.89930248260498, "learning_rate": 1.2277338545644652e-06, "loss": 0.2895, "step": 27110 }, { "epoch": 2.3856439127375086, "grad_norm": 8.634401321411133, "learning_rate": 1.224376765089284e-06, "loss": 0.2298, "step": 27120 }, { "epoch": 2.3865235749472204, "grad_norm": 9.827590942382812, "learning_rate": 1.221023631189609e-06, "loss": 0.2456, "step": 27130 }, { "epoch": 2.387403237156932, "grad_norm": 8.49179458618164, "learning_rate": 1.2176744563783854e-06, "loss": 0.2923, "step": 27140 }, { "epoch": 2.388282899366643, "grad_norm": 10.686359405517578, "learning_rate": 1.2143292441644184e-06, "loss": 0.2499, "step": 27150 }, { "epoch": 2.3891625615763545, "grad_norm": 10.084891319274902, "learning_rate": 1.2109879980523577e-06, "loss": 0.2567, "step": 27160 }, { "epoch": 2.3900422237860663, "grad_norm": 15.05826473236084, "learning_rate": 1.207650721542697e-06, "loss": 0.2697, "step": 27170 }, { "epoch": 2.3909218859957777, "grad_norm": 10.184721946716309, "learning_rate": 1.2043174181317745e-06, "loss": 0.2264, "step": 27180 }, { "epoch": 2.391801548205489, "grad_norm": 10.383023262023926, "learning_rate": 1.2009880913117644e-06, "loss": 0.2457, "step": 27190 }, { "epoch": 2.3926812104152004, "grad_norm": 9.578553199768066, "learning_rate": 1.197662744570674e-06, "loss": 0.2481, "step": 27200 }, { "epoch": 2.393560872624912, "grad_norm": 11.263495445251465, "learning_rate": 1.194341381392341e-06, "loss": 0.283, "step": 27210 }, { "epoch": 2.3944405348346236, "grad_norm": 11.350603103637695, "learning_rate": 1.1910240052564293e-06, "loss": 0.251, "step": 27220 }, { "epoch": 2.395320197044335, "grad_norm": 6.262871742248535, "learning_rate": 1.1877106196384286e-06, "loss": 0.2795, "step": 27230 }, { "epoch": 2.3961998592540463, "grad_norm": 10.339143753051758, "learning_rate": 1.184401228009645e-06, "loss": 0.2591, "step": 27240 }, { "epoch": 2.397079521463758, "grad_norm": 9.577591896057129, "learning_rate": 1.1810958338372008e-06, "loss": 0.2546, "step": 27250 }, { "epoch": 2.3979591836734695, "grad_norm": 6.2865986824035645, "learning_rate": 1.1777944405840302e-06, "loss": 0.202, "step": 27260 }, { "epoch": 2.398838845883181, "grad_norm": 10.101096153259277, "learning_rate": 1.174497051708876e-06, "loss": 0.3029, "step": 27270 }, { "epoch": 2.399718508092892, "grad_norm": 11.812389373779297, "learning_rate": 1.1712036706662872e-06, "loss": 0.2353, "step": 27280 }, { "epoch": 2.4005981703026036, "grad_norm": 12.473541259765625, "learning_rate": 1.1679143009066125e-06, "loss": 0.282, "step": 27290 }, { "epoch": 2.4014778325123154, "grad_norm": 8.223649024963379, "learning_rate": 1.1646289458759963e-06, "loss": 0.3287, "step": 27300 }, { "epoch": 2.4023574947220268, "grad_norm": 12.341087341308594, "learning_rate": 1.1613476090163827e-06, "loss": 0.2821, "step": 27310 }, { "epoch": 2.403237156931738, "grad_norm": 10.481534004211426, "learning_rate": 1.1580702937654974e-06, "loss": 0.3017, "step": 27320 }, { "epoch": 2.40411681914145, "grad_norm": 8.189126968383789, "learning_rate": 1.1547970035568623e-06, "loss": 0.2717, "step": 27330 }, { "epoch": 2.4049964813511613, "grad_norm": 7.664244651794434, "learning_rate": 1.1515277418197757e-06, "loss": 0.2348, "step": 27340 }, { "epoch": 2.4058761435608726, "grad_norm": 8.523743629455566, "learning_rate": 1.1482625119793168e-06, "loss": 0.2472, "step": 27350 }, { "epoch": 2.406755805770584, "grad_norm": 10.541024208068848, "learning_rate": 1.1450013174563435e-06, "loss": 0.257, "step": 27360 }, { "epoch": 2.4076354679802954, "grad_norm": 8.914420127868652, "learning_rate": 1.141744161667485e-06, "loss": 0.2352, "step": 27370 }, { "epoch": 2.408515130190007, "grad_norm": 10.424946784973145, "learning_rate": 1.1384910480251337e-06, "loss": 0.2693, "step": 27380 }, { "epoch": 2.4093947923997185, "grad_norm": 6.457013130187988, "learning_rate": 1.1352419799374565e-06, "loss": 0.2353, "step": 27390 }, { "epoch": 2.41027445460943, "grad_norm": 8.183097839355469, "learning_rate": 1.1319969608083747e-06, "loss": 0.2097, "step": 27400 }, { "epoch": 2.4111541168191413, "grad_norm": 9.504168510437012, "learning_rate": 1.1287559940375703e-06, "loss": 0.2376, "step": 27410 }, { "epoch": 2.412033779028853, "grad_norm": 8.288468360900879, "learning_rate": 1.125519083020482e-06, "loss": 0.254, "step": 27420 }, { "epoch": 2.4129134412385644, "grad_norm": 9.22205924987793, "learning_rate": 1.1222862311482957e-06, "loss": 0.2511, "step": 27430 }, { "epoch": 2.413793103448276, "grad_norm": 7.545994281768799, "learning_rate": 1.1190574418079465e-06, "loss": 0.2507, "step": 27440 }, { "epoch": 2.414672765657987, "grad_norm": 14.260639190673828, "learning_rate": 1.115832718382112e-06, "loss": 0.2724, "step": 27450 }, { "epoch": 2.415552427867699, "grad_norm": 7.344399452209473, "learning_rate": 1.1126120642492112e-06, "loss": 0.2484, "step": 27460 }, { "epoch": 2.4164320900774103, "grad_norm": 8.52161693572998, "learning_rate": 1.109395482783402e-06, "loss": 0.2952, "step": 27470 }, { "epoch": 2.4173117522871217, "grad_norm": 9.30480670928955, "learning_rate": 1.1061829773545712e-06, "loss": 0.23, "step": 27480 }, { "epoch": 2.418191414496833, "grad_norm": 10.797283172607422, "learning_rate": 1.1029745513283385e-06, "loss": 0.2275, "step": 27490 }, { "epoch": 2.419071076706545, "grad_norm": 7.31487512588501, "learning_rate": 1.0997702080660478e-06, "loss": 0.2402, "step": 27500 }, { "epoch": 2.4199507389162562, "grad_norm": 9.701096534729004, "learning_rate": 1.0965699509247656e-06, "loss": 0.3119, "step": 27510 }, { "epoch": 2.4208304011259676, "grad_norm": 9.185564994812012, "learning_rate": 1.093373783257281e-06, "loss": 0.2648, "step": 27520 }, { "epoch": 2.421710063335679, "grad_norm": 8.990410804748535, "learning_rate": 1.0901817084120948e-06, "loss": 0.2633, "step": 27530 }, { "epoch": 2.4225897255453908, "grad_norm": 6.469135761260986, "learning_rate": 1.0869937297334204e-06, "loss": 0.2608, "step": 27540 }, { "epoch": 2.423469387755102, "grad_norm": 8.281134605407715, "learning_rate": 1.0838098505611837e-06, "loss": 0.2856, "step": 27550 }, { "epoch": 2.4243490499648135, "grad_norm": 8.440711975097656, "learning_rate": 1.0806300742310082e-06, "loss": 0.2776, "step": 27560 }, { "epoch": 2.425228712174525, "grad_norm": 10.243587493896484, "learning_rate": 1.0774544040742274e-06, "loss": 0.2568, "step": 27570 }, { "epoch": 2.4261083743842367, "grad_norm": 9.226295471191406, "learning_rate": 1.0742828434178671e-06, "loss": 0.218, "step": 27580 }, { "epoch": 2.426988036593948, "grad_norm": 8.262555122375488, "learning_rate": 1.0711153955846482e-06, "loss": 0.2356, "step": 27590 }, { "epoch": 2.4278676988036594, "grad_norm": 11.031623840332031, "learning_rate": 1.067952063892988e-06, "loss": 0.2863, "step": 27600 }, { "epoch": 2.4287473610133707, "grad_norm": 7.551013946533203, "learning_rate": 1.0647928516569867e-06, "loss": 0.2719, "step": 27610 }, { "epoch": 2.429627023223082, "grad_norm": 10.287662506103516, "learning_rate": 1.061637762186426e-06, "loss": 0.2992, "step": 27620 }, { "epoch": 2.430506685432794, "grad_norm": 12.450438499450684, "learning_rate": 1.058486798786776e-06, "loss": 0.2924, "step": 27630 }, { "epoch": 2.4313863476425053, "grad_norm": 8.260564804077148, "learning_rate": 1.0553399647591766e-06, "loss": 0.258, "step": 27640 }, { "epoch": 2.4322660098522166, "grad_norm": 5.172186851501465, "learning_rate": 1.052197263400449e-06, "loss": 0.2731, "step": 27650 }, { "epoch": 2.4331456720619284, "grad_norm": 10.71817398071289, "learning_rate": 1.0490586980030787e-06, "loss": 0.2435, "step": 27660 }, { "epoch": 2.43402533427164, "grad_norm": 9.274666786193848, "learning_rate": 1.0459242718552204e-06, "loss": 0.2306, "step": 27670 }, { "epoch": 2.434904996481351, "grad_norm": 11.079204559326172, "learning_rate": 1.0427939882406918e-06, "loss": 0.2205, "step": 27680 }, { "epoch": 2.4357846586910625, "grad_norm": 7.465479850769043, "learning_rate": 1.0396678504389706e-06, "loss": 0.2385, "step": 27690 }, { "epoch": 2.436664320900774, "grad_norm": 11.998015403747559, "learning_rate": 1.036545861725191e-06, "loss": 0.2743, "step": 27700 }, { "epoch": 2.4375439831104857, "grad_norm": 8.199888229370117, "learning_rate": 1.0334280253701422e-06, "loss": 0.2911, "step": 27710 }, { "epoch": 2.438423645320197, "grad_norm": 8.317693710327148, "learning_rate": 1.0303143446402615e-06, "loss": 0.2357, "step": 27720 }, { "epoch": 2.4393033075299084, "grad_norm": 8.699806213378906, "learning_rate": 1.0272048227976317e-06, "loss": 0.2573, "step": 27730 }, { "epoch": 2.4401829697396202, "grad_norm": 10.864421844482422, "learning_rate": 1.0240994630999795e-06, "loss": 0.2356, "step": 27740 }, { "epoch": 2.4410626319493316, "grad_norm": 11.127573013305664, "learning_rate": 1.0209982688006703e-06, "loss": 0.2861, "step": 27750 }, { "epoch": 2.441942294159043, "grad_norm": 10.3131685256958, "learning_rate": 1.0179012431487085e-06, "loss": 0.2683, "step": 27760 }, { "epoch": 2.4428219563687543, "grad_norm": 8.478229522705078, "learning_rate": 1.0148083893887278e-06, "loss": 0.2499, "step": 27770 }, { "epoch": 2.4437016185784657, "grad_norm": 11.45812702178955, "learning_rate": 1.011719710760991e-06, "loss": 0.2484, "step": 27780 }, { "epoch": 2.4445812807881775, "grad_norm": 9.396849632263184, "learning_rate": 1.0086352105013918e-06, "loss": 0.2249, "step": 27790 }, { "epoch": 2.445460942997889, "grad_norm": 7.5963826179504395, "learning_rate": 1.0055548918414382e-06, "loss": 0.2743, "step": 27800 }, { "epoch": 2.4463406052076, "grad_norm": 8.563611030578613, "learning_rate": 1.002478758008265e-06, "loss": 0.2459, "step": 27810 }, { "epoch": 2.4472202674173116, "grad_norm": 16.793766021728516, "learning_rate": 9.994068122246186e-07, "loss": 0.2733, "step": 27820 }, { "epoch": 2.4480999296270234, "grad_norm": 9.164410591125488, "learning_rate": 9.96339057708857e-07, "loss": 0.2532, "step": 27830 }, { "epoch": 2.4489795918367347, "grad_norm": 6.555493354797363, "learning_rate": 9.932754976749515e-07, "loss": 0.2793, "step": 27840 }, { "epoch": 2.449859254046446, "grad_norm": 7.357025623321533, "learning_rate": 9.902161353324757e-07, "loss": 0.2492, "step": 27850 }, { "epoch": 2.4507389162561575, "grad_norm": 12.972249984741211, "learning_rate": 9.87160973886605e-07, "loss": 0.26, "step": 27860 }, { "epoch": 2.4516185784658693, "grad_norm": 9.126791000366211, "learning_rate": 9.84110016538115e-07, "loss": 0.2208, "step": 27870 }, { "epoch": 2.4524982406755806, "grad_norm": 8.780950546264648, "learning_rate": 9.810632664833763e-07, "loss": 0.2135, "step": 27880 }, { "epoch": 2.453377902885292, "grad_norm": 7.557904243469238, "learning_rate": 9.780207269143532e-07, "loss": 0.2515, "step": 27890 }, { "epoch": 2.4542575650950034, "grad_norm": 12.459617614746094, "learning_rate": 9.74982401018597e-07, "loss": 0.2401, "step": 27900 }, { "epoch": 2.455137227304715, "grad_norm": 8.953314781188965, "learning_rate": 9.71948291979245e-07, "loss": 0.2807, "step": 27910 }, { "epoch": 2.4560168895144265, "grad_norm": 8.327310562133789, "learning_rate": 9.68918402975017e-07, "loss": 0.2597, "step": 27920 }, { "epoch": 2.456896551724138, "grad_norm": 8.107495307922363, "learning_rate": 9.658927371802107e-07, "loss": 0.2837, "step": 27930 }, { "epoch": 2.4577762139338493, "grad_norm": 11.619678497314453, "learning_rate": 9.628712977646997e-07, "loss": 0.2677, "step": 27940 }, { "epoch": 2.4586558761435606, "grad_norm": 5.659707546234131, "learning_rate": 9.598540878939316e-07, "loss": 0.2153, "step": 27950 }, { "epoch": 2.4595355383532724, "grad_norm": 9.474629402160645, "learning_rate": 9.568411107289194e-07, "loss": 0.2126, "step": 27960 }, { "epoch": 2.460415200562984, "grad_norm": 7.9623003005981445, "learning_rate": 9.53832369426247e-07, "loss": 0.2466, "step": 27970 }, { "epoch": 2.461294862772695, "grad_norm": 5.29686164855957, "learning_rate": 9.508278671380538e-07, "loss": 0.256, "step": 27980 }, { "epoch": 2.462174524982407, "grad_norm": 8.345632553100586, "learning_rate": 9.478276070120412e-07, "loss": 0.2553, "step": 27990 }, { "epoch": 2.4630541871921183, "grad_norm": 8.057117462158203, "learning_rate": 9.448315921914691e-07, "loss": 0.2391, "step": 28000 }, { "epoch": 2.4639338494018297, "grad_norm": 7.946046829223633, "learning_rate": 9.418398258151456e-07, "loss": 0.2398, "step": 28010 }, { "epoch": 2.464813511611541, "grad_norm": 9.716998100280762, "learning_rate": 9.388523110174291e-07, "loss": 0.2631, "step": 28020 }, { "epoch": 2.4656931738212524, "grad_norm": 9.582300186157227, "learning_rate": 9.358690509282276e-07, "loss": 0.263, "step": 28030 }, { "epoch": 2.466572836030964, "grad_norm": 8.397933006286621, "learning_rate": 9.328900486729825e-07, "loss": 0.275, "step": 28040 }, { "epoch": 2.4674524982406756, "grad_norm": 5.541325092315674, "learning_rate": 9.299153073726847e-07, "loss": 0.2434, "step": 28050 }, { "epoch": 2.468332160450387, "grad_norm": 8.477067947387695, "learning_rate": 9.26944830143855e-07, "loss": 0.2453, "step": 28060 }, { "epoch": 2.4692118226600988, "grad_norm": 9.539761543273926, "learning_rate": 9.239786200985463e-07, "loss": 0.2369, "step": 28070 }, { "epoch": 2.47009148486981, "grad_norm": 15.647865295410156, "learning_rate": 9.210166803443465e-07, "loss": 0.2372, "step": 28080 }, { "epoch": 2.4709711470795215, "grad_norm": 13.014402389526367, "learning_rate": 9.180590139843642e-07, "loss": 0.2411, "step": 28090 }, { "epoch": 2.471850809289233, "grad_norm": 9.817190170288086, "learning_rate": 9.151056241172335e-07, "loss": 0.2935, "step": 28100 }, { "epoch": 2.472730471498944, "grad_norm": 9.296449661254883, "learning_rate": 9.121565138371069e-07, "loss": 0.1887, "step": 28110 }, { "epoch": 2.473610133708656, "grad_norm": 8.540068626403809, "learning_rate": 9.09211686233653e-07, "loss": 0.2278, "step": 28120 }, { "epoch": 2.4744897959183674, "grad_norm": 8.165756225585938, "learning_rate": 9.06271144392058e-07, "loss": 0.2305, "step": 28130 }, { "epoch": 2.4753694581280787, "grad_norm": 8.633156776428223, "learning_rate": 9.033348913930118e-07, "loss": 0.2934, "step": 28140 }, { "epoch": 2.47624912033779, "grad_norm": 12.355216026306152, "learning_rate": 9.004029303127154e-07, "loss": 0.2579, "step": 28150 }, { "epoch": 2.477128782547502, "grad_norm": 6.931093215942383, "learning_rate": 8.974752642228712e-07, "loss": 0.2248, "step": 28160 }, { "epoch": 2.4780084447572133, "grad_norm": 9.568988800048828, "learning_rate": 8.945518961906813e-07, "loss": 0.2809, "step": 28170 }, { "epoch": 2.4788881069669246, "grad_norm": 10.220922470092773, "learning_rate": 8.916328292788484e-07, "loss": 0.264, "step": 28180 }, { "epoch": 2.479767769176636, "grad_norm": 8.122461318969727, "learning_rate": 8.887180665455664e-07, "loss": 0.2936, "step": 28190 }, { "epoch": 2.480647431386348, "grad_norm": 11.014387130737305, "learning_rate": 8.858076110445185e-07, "loss": 0.2599, "step": 28200 }, { "epoch": 2.481527093596059, "grad_norm": 6.665904998779297, "learning_rate": 8.829014658248808e-07, "loss": 0.2946, "step": 28210 }, { "epoch": 2.4824067558057705, "grad_norm": 9.427952766418457, "learning_rate": 8.799996339313066e-07, "loss": 0.2913, "step": 28220 }, { "epoch": 2.483286418015482, "grad_norm": 9.20001220703125, "learning_rate": 8.771021184039336e-07, "loss": 0.2207, "step": 28230 }, { "epoch": 2.4841660802251937, "grad_norm": 11.420575141906738, "learning_rate": 8.742089222783807e-07, "loss": 0.2436, "step": 28240 }, { "epoch": 2.485045742434905, "grad_norm": 7.512379169464111, "learning_rate": 8.713200485857348e-07, "loss": 0.2346, "step": 28250 }, { "epoch": 2.4859254046446164, "grad_norm": 9.574664115905762, "learning_rate": 8.684355003525608e-07, "loss": 0.2889, "step": 28260 }, { "epoch": 2.486805066854328, "grad_norm": 13.454678535461426, "learning_rate": 8.655552806008898e-07, "loss": 0.2368, "step": 28270 }, { "epoch": 2.4876847290640396, "grad_norm": 12.177894592285156, "learning_rate": 8.626793923482129e-07, "loss": 0.2672, "step": 28280 }, { "epoch": 2.488564391273751, "grad_norm": 9.464627265930176, "learning_rate": 8.598078386074926e-07, "loss": 0.2584, "step": 28290 }, { "epoch": 2.4894440534834623, "grad_norm": 11.612042427062988, "learning_rate": 8.56940622387143e-07, "loss": 0.337, "step": 28300 }, { "epoch": 2.4903237156931737, "grad_norm": 8.471236228942871, "learning_rate": 8.54077746691036e-07, "loss": 0.2192, "step": 28310 }, { "epoch": 2.4912033779028855, "grad_norm": 12.686564445495605, "learning_rate": 8.512192145184983e-07, "loss": 0.2381, "step": 28320 }, { "epoch": 2.492083040112597, "grad_norm": 7.785597801208496, "learning_rate": 8.483650288643041e-07, "loss": 0.2441, "step": 28330 }, { "epoch": 2.492962702322308, "grad_norm": 12.599031448364258, "learning_rate": 8.455151927186733e-07, "loss": 0.2631, "step": 28340 }, { "epoch": 2.4938423645320196, "grad_norm": 10.343585968017578, "learning_rate": 8.426697090672703e-07, "loss": 0.2787, "step": 28350 }, { "epoch": 2.494722026741731, "grad_norm": 8.94437026977539, "learning_rate": 8.398285808911977e-07, "loss": 0.272, "step": 28360 }, { "epoch": 2.4956016889514427, "grad_norm": 6.7695536613464355, "learning_rate": 8.369918111669989e-07, "loss": 0.2111, "step": 28370 }, { "epoch": 2.496481351161154, "grad_norm": 9.179426193237305, "learning_rate": 8.341594028666472e-07, "loss": 0.2303, "step": 28380 }, { "epoch": 2.4973610133708655, "grad_norm": 13.072867393493652, "learning_rate": 8.313313589575483e-07, "loss": 0.2944, "step": 28390 }, { "epoch": 2.4982406755805773, "grad_norm": 10.81110668182373, "learning_rate": 8.28507682402535e-07, "loss": 0.2378, "step": 28400 }, { "epoch": 2.4991203377902886, "grad_norm": 10.513447761535645, "learning_rate": 8.256883761598633e-07, "loss": 0.3036, "step": 28410 }, { "epoch": 2.5, "grad_norm": 10.146556854248047, "learning_rate": 8.228734431832152e-07, "loss": 0.2812, "step": 28420 }, { "epoch": 2.5008796622097114, "grad_norm": 12.3344144821167, "learning_rate": 8.200628864216869e-07, "loss": 0.2515, "step": 28430 }, { "epoch": 2.5017593244194227, "grad_norm": 9.551947593688965, "learning_rate": 8.172567088197886e-07, "loss": 0.2728, "step": 28440 }, { "epoch": 2.5026389866291345, "grad_norm": 16.022441864013672, "learning_rate": 8.14454913317449e-07, "loss": 0.2903, "step": 28450 }, { "epoch": 2.503518648838846, "grad_norm": 8.986305236816406, "learning_rate": 8.116575028499968e-07, "loss": 0.3194, "step": 28460 }, { "epoch": 2.5043983110485573, "grad_norm": 11.070241928100586, "learning_rate": 8.088644803481754e-07, "loss": 0.2302, "step": 28470 }, { "epoch": 2.505277973258269, "grad_norm": 7.617795467376709, "learning_rate": 8.06075848738126e-07, "loss": 0.2664, "step": 28480 }, { "epoch": 2.5061576354679804, "grad_norm": 6.656663417816162, "learning_rate": 8.032916109413891e-07, "loss": 0.2382, "step": 28490 }, { "epoch": 2.507037297677692, "grad_norm": 10.24718189239502, "learning_rate": 8.005117698749065e-07, "loss": 0.2567, "step": 28500 }, { "epoch": 2.507916959887403, "grad_norm": 10.335458755493164, "learning_rate": 7.977363284510109e-07, "loss": 0.2607, "step": 28510 }, { "epoch": 2.5087966220971145, "grad_norm": 9.105319023132324, "learning_rate": 7.949652895774229e-07, "loss": 0.2341, "step": 28520 }, { "epoch": 2.5096762843068263, "grad_norm": 8.481842994689941, "learning_rate": 7.921986561572559e-07, "loss": 0.2061, "step": 28530 }, { "epoch": 2.5105559465165377, "grad_norm": 9.304788589477539, "learning_rate": 7.894364310890051e-07, "loss": 0.256, "step": 28540 }, { "epoch": 2.511435608726249, "grad_norm": 7.772065162658691, "learning_rate": 7.866786172665458e-07, "loss": 0.2657, "step": 28550 }, { "epoch": 2.512315270935961, "grad_norm": 7.45796012878418, "learning_rate": 7.839252175791368e-07, "loss": 0.2321, "step": 28560 }, { "epoch": 2.513194933145672, "grad_norm": 9.586442947387695, "learning_rate": 7.811762349114077e-07, "loss": 0.2603, "step": 28570 }, { "epoch": 2.5140745953553836, "grad_norm": 7.778568744659424, "learning_rate": 7.784316721433622e-07, "loss": 0.2504, "step": 28580 }, { "epoch": 2.514954257565095, "grad_norm": 8.546323776245117, "learning_rate": 7.756915321503733e-07, "loss": 0.2353, "step": 28590 }, { "epoch": 2.5158339197748063, "grad_norm": 9.199655532836914, "learning_rate": 7.729558178031787e-07, "loss": 0.2203, "step": 28600 }, { "epoch": 2.516713581984518, "grad_norm": 6.505067825317383, "learning_rate": 7.702245319678847e-07, "loss": 0.2306, "step": 28610 }, { "epoch": 2.5175932441942295, "grad_norm": 9.309685707092285, "learning_rate": 7.674976775059528e-07, "loss": 0.2594, "step": 28620 }, { "epoch": 2.518472906403941, "grad_norm": 13.2066068649292, "learning_rate": 7.647752572742039e-07, "loss": 0.3056, "step": 28630 }, { "epoch": 2.519352568613652, "grad_norm": 11.38178539276123, "learning_rate": 7.620572741248134e-07, "loss": 0.2294, "step": 28640 }, { "epoch": 2.520232230823364, "grad_norm": 9.357268333435059, "learning_rate": 7.593437309053058e-07, "loss": 0.2449, "step": 28650 }, { "epoch": 2.5211118930330754, "grad_norm": 10.859067916870117, "learning_rate": 7.566346304585597e-07, "loss": 0.2622, "step": 28660 }, { "epoch": 2.5219915552427867, "grad_norm": 9.365452766418457, "learning_rate": 7.539299756227936e-07, "loss": 0.2638, "step": 28670 }, { "epoch": 2.522871217452498, "grad_norm": 7.861043930053711, "learning_rate": 7.512297692315696e-07, "loss": 0.2712, "step": 28680 }, { "epoch": 2.5237508796622095, "grad_norm": 9.84830379486084, "learning_rate": 7.485340141137936e-07, "loss": 0.2637, "step": 28690 }, { "epoch": 2.5246305418719213, "grad_norm": 13.482549667358398, "learning_rate": 7.458427130937008e-07, "loss": 0.2829, "step": 28700 }, { "epoch": 2.5255102040816326, "grad_norm": 10.275890350341797, "learning_rate": 7.431558689908664e-07, "loss": 0.2152, "step": 28710 }, { "epoch": 2.526389866291344, "grad_norm": 11.31167984008789, "learning_rate": 7.404734846201933e-07, "loss": 0.2481, "step": 28720 }, { "epoch": 2.527269528501056, "grad_norm": 9.893057823181152, "learning_rate": 7.377955627919115e-07, "loss": 0.2353, "step": 28730 }, { "epoch": 2.528149190710767, "grad_norm": 7.619636058807373, "learning_rate": 7.351221063115787e-07, "loss": 0.2437, "step": 28740 }, { "epoch": 2.5290288529204785, "grad_norm": 11.600910186767578, "learning_rate": 7.32453117980072e-07, "loss": 0.274, "step": 28750 }, { "epoch": 2.52990851513019, "grad_norm": 8.112164497375488, "learning_rate": 7.297886005935878e-07, "loss": 0.2461, "step": 28760 }, { "epoch": 2.5307881773399012, "grad_norm": 10.742247581481934, "learning_rate": 7.271285569436387e-07, "loss": 0.2677, "step": 28770 }, { "epoch": 2.531667839549613, "grad_norm": 11.3738374710083, "learning_rate": 7.244729898170488e-07, "loss": 0.3065, "step": 28780 }, { "epoch": 2.5325475017593244, "grad_norm": 10.802400588989258, "learning_rate": 7.218219019959565e-07, "loss": 0.2782, "step": 28790 }, { "epoch": 2.533427163969036, "grad_norm": 7.719118595123291, "learning_rate": 7.191752962578041e-07, "loss": 0.2609, "step": 28800 }, { "epoch": 2.5343068261787476, "grad_norm": 8.852863311767578, "learning_rate": 7.165331753753373e-07, "loss": 0.2252, "step": 28810 }, { "epoch": 2.535186488388459, "grad_norm": 8.276715278625488, "learning_rate": 7.138955421166066e-07, "loss": 0.2433, "step": 28820 }, { "epoch": 2.5360661505981703, "grad_norm": 6.071931838989258, "learning_rate": 7.112623992449574e-07, "loss": 0.3027, "step": 28830 }, { "epoch": 2.5369458128078817, "grad_norm": 7.0992536544799805, "learning_rate": 7.08633749519032e-07, "loss": 0.263, "step": 28840 }, { "epoch": 2.537825475017593, "grad_norm": 9.908455848693848, "learning_rate": 7.060095956927682e-07, "loss": 0.2178, "step": 28850 }, { "epoch": 2.538705137227305, "grad_norm": 9.501165390014648, "learning_rate": 7.033899405153898e-07, "loss": 0.2602, "step": 28860 }, { "epoch": 2.539584799437016, "grad_norm": 9.930216789245605, "learning_rate": 7.007747867314085e-07, "loss": 0.257, "step": 28870 }, { "epoch": 2.5404644616467276, "grad_norm": 11.7543363571167, "learning_rate": 6.98164137080623e-07, "loss": 0.2923, "step": 28880 }, { "epoch": 2.5413441238564394, "grad_norm": 9.649060249328613, "learning_rate": 6.955579942981061e-07, "loss": 0.2191, "step": 28890 }, { "epoch": 2.5422237860661507, "grad_norm": 7.6634907722473145, "learning_rate": 6.929563611142181e-07, "loss": 0.2701, "step": 28900 }, { "epoch": 2.543103448275862, "grad_norm": 11.78989315032959, "learning_rate": 6.903592402545878e-07, "loss": 0.2764, "step": 28910 }, { "epoch": 2.5439831104855735, "grad_norm": 7.3339033126831055, "learning_rate": 6.877666344401185e-07, "loss": 0.2261, "step": 28920 }, { "epoch": 2.544862772695285, "grad_norm": 10.470422744750977, "learning_rate": 6.851785463869864e-07, "loss": 0.251, "step": 28930 }, { "epoch": 2.5457424349049966, "grad_norm": 9.41976547241211, "learning_rate": 6.825949788066299e-07, "loss": 0.2554, "step": 28940 }, { "epoch": 2.546622097114708, "grad_norm": 10.245515823364258, "learning_rate": 6.800159344057538e-07, "loss": 0.2253, "step": 28950 }, { "epoch": 2.5475017593244194, "grad_norm": 4.7021403312683105, "learning_rate": 6.774414158863246e-07, "loss": 0.2095, "step": 28960 }, { "epoch": 2.548381421534131, "grad_norm": 10.688718795776367, "learning_rate": 6.748714259455652e-07, "loss": 0.2557, "step": 28970 }, { "epoch": 2.5492610837438425, "grad_norm": 11.149628639221191, "learning_rate": 6.723059672759568e-07, "loss": 0.2979, "step": 28980 }, { "epoch": 2.550140745953554, "grad_norm": 10.744662284851074, "learning_rate": 6.697450425652319e-07, "loss": 0.2596, "step": 28990 }, { "epoch": 2.5510204081632653, "grad_norm": 13.430843353271484, "learning_rate": 6.671886544963724e-07, "loss": 0.2822, "step": 29000 }, { "epoch": 2.5519000703729766, "grad_norm": 9.236974716186523, "learning_rate": 6.646368057476083e-07, "loss": 0.2409, "step": 29010 }, { "epoch": 2.552779732582688, "grad_norm": 9.959867477416992, "learning_rate": 6.620894989924114e-07, "loss": 0.2643, "step": 29020 }, { "epoch": 2.5536593947924, "grad_norm": 9.248339653015137, "learning_rate": 6.595467368995013e-07, "loss": 0.296, "step": 29030 }, { "epoch": 2.554539057002111, "grad_norm": 9.445277214050293, "learning_rate": 6.570085221328293e-07, "loss": 0.2679, "step": 29040 }, { "epoch": 2.5554187192118225, "grad_norm": 11.224138259887695, "learning_rate": 6.544748573515853e-07, "loss": 0.2351, "step": 29050 }, { "epoch": 2.5562983814215343, "grad_norm": 11.50367546081543, "learning_rate": 6.519457452101958e-07, "loss": 0.2792, "step": 29060 }, { "epoch": 2.5571780436312457, "grad_norm": 9.113892555236816, "learning_rate": 6.494211883583096e-07, "loss": 0.2487, "step": 29070 }, { "epoch": 2.558057705840957, "grad_norm": 13.122761726379395, "learning_rate": 6.469011894408122e-07, "loss": 0.2732, "step": 29080 }, { "epoch": 2.5589373680506684, "grad_norm": 14.287981033325195, "learning_rate": 6.44385751097808e-07, "loss": 0.2671, "step": 29090 }, { "epoch": 2.5598170302603798, "grad_norm": 13.54216194152832, "learning_rate": 6.418748759646248e-07, "loss": 0.2247, "step": 29100 }, { "epoch": 2.5606966924700916, "grad_norm": 9.157291412353516, "learning_rate": 6.393685666718113e-07, "loss": 0.2916, "step": 29110 }, { "epoch": 2.561576354679803, "grad_norm": 9.197953224182129, "learning_rate": 6.368668258451321e-07, "loss": 0.2883, "step": 29120 }, { "epoch": 2.5624560168895143, "grad_norm": 9.482345581054688, "learning_rate": 6.343696561055612e-07, "loss": 0.256, "step": 29130 }, { "epoch": 2.563335679099226, "grad_norm": 11.047036170959473, "learning_rate": 6.318770600692914e-07, "loss": 0.2493, "step": 29140 }, { "epoch": 2.5642153413089375, "grad_norm": 8.177952766418457, "learning_rate": 6.293890403477188e-07, "loss": 0.269, "step": 29150 }, { "epoch": 2.565095003518649, "grad_norm": 9.924796104431152, "learning_rate": 6.269055995474443e-07, "loss": 0.2483, "step": 29160 }, { "epoch": 2.56597466572836, "grad_norm": 10.93701457977295, "learning_rate": 6.244267402702764e-07, "loss": 0.2862, "step": 29170 }, { "epoch": 2.5668543279380716, "grad_norm": 13.050559043884277, "learning_rate": 6.219524651132192e-07, "loss": 0.2153, "step": 29180 }, { "epoch": 2.5677339901477834, "grad_norm": 9.773306846618652, "learning_rate": 6.194827766684758e-07, "loss": 0.2209, "step": 29190 }, { "epoch": 2.5686136523574947, "grad_norm": 10.559741973876953, "learning_rate": 6.170176775234443e-07, "loss": 0.2457, "step": 29200 }, { "epoch": 2.569493314567206, "grad_norm": 12.057326316833496, "learning_rate": 6.145571702607134e-07, "loss": 0.2656, "step": 29210 }, { "epoch": 2.570372976776918, "grad_norm": 11.184866905212402, "learning_rate": 6.121012574580637e-07, "loss": 0.2533, "step": 29220 }, { "epoch": 2.5712526389866293, "grad_norm": 6.724252700805664, "learning_rate": 6.096499416884605e-07, "loss": 0.2624, "step": 29230 }, { "epoch": 2.5721323011963406, "grad_norm": 6.695914268493652, "learning_rate": 6.072032255200522e-07, "loss": 0.2597, "step": 29240 }, { "epoch": 2.573011963406052, "grad_norm": 12.834527015686035, "learning_rate": 6.047611115161705e-07, "loss": 0.2319, "step": 29250 }, { "epoch": 2.5738916256157633, "grad_norm": 8.50806713104248, "learning_rate": 6.023236022353224e-07, "loss": 0.2518, "step": 29260 }, { "epoch": 2.574771287825475, "grad_norm": 12.279712677001953, "learning_rate": 5.998907002311954e-07, "loss": 0.3143, "step": 29270 }, { "epoch": 2.5756509500351865, "grad_norm": 9.768590927124023, "learning_rate": 5.97462408052647e-07, "loss": 0.3216, "step": 29280 }, { "epoch": 2.576530612244898, "grad_norm": 8.362082481384277, "learning_rate": 5.950387282437037e-07, "loss": 0.2512, "step": 29290 }, { "epoch": 2.5774102744546097, "grad_norm": 10.737588882446289, "learning_rate": 5.926196633435649e-07, "loss": 0.2633, "step": 29300 }, { "epoch": 2.578289936664321, "grad_norm": 7.448565483093262, "learning_rate": 5.902052158865884e-07, "loss": 0.2386, "step": 29310 }, { "epoch": 2.5791695988740324, "grad_norm": 12.47265338897705, "learning_rate": 5.877953884023014e-07, "loss": 0.3008, "step": 29320 }, { "epoch": 2.5800492610837438, "grad_norm": 13.067049980163574, "learning_rate": 5.853901834153863e-07, "loss": 0.2503, "step": 29330 }, { "epoch": 2.580928923293455, "grad_norm": 13.239156723022461, "learning_rate": 5.829896034456827e-07, "loss": 0.276, "step": 29340 }, { "epoch": 2.581808585503167, "grad_norm": 12.369124412536621, "learning_rate": 5.805936510081883e-07, "loss": 0.2416, "step": 29350 }, { "epoch": 2.5826882477128783, "grad_norm": 8.579302787780762, "learning_rate": 5.782023286130495e-07, "loss": 0.2614, "step": 29360 }, { "epoch": 2.5835679099225897, "grad_norm": 11.622750282287598, "learning_rate": 5.758156387655633e-07, "loss": 0.2604, "step": 29370 }, { "epoch": 2.5844475721323015, "grad_norm": 9.268298149108887, "learning_rate": 5.734335839661725e-07, "loss": 0.2426, "step": 29380 }, { "epoch": 2.585327234342013, "grad_norm": 5.829324722290039, "learning_rate": 5.710561667104636e-07, "loss": 0.2893, "step": 29390 }, { "epoch": 2.586206896551724, "grad_norm": 8.291449546813965, "learning_rate": 5.686833894891675e-07, "loss": 0.2567, "step": 29400 }, { "epoch": 2.5870865587614356, "grad_norm": 8.7439546585083, "learning_rate": 5.663152547881507e-07, "loss": 0.2354, "step": 29410 }, { "epoch": 2.587966220971147, "grad_norm": 15.997117042541504, "learning_rate": 5.639517650884169e-07, "loss": 0.219, "step": 29420 }, { "epoch": 2.5888458831808583, "grad_norm": 10.617009162902832, "learning_rate": 5.615929228661038e-07, "loss": 0.2647, "step": 29430 }, { "epoch": 2.58972554539057, "grad_norm": 11.163925170898438, "learning_rate": 5.592387305924796e-07, "loss": 0.2281, "step": 29440 }, { "epoch": 2.5906052076002815, "grad_norm": 5.28806734085083, "learning_rate": 5.568891907339402e-07, "loss": 0.2354, "step": 29450 }, { "epoch": 2.591484869809993, "grad_norm": 9.781082153320312, "learning_rate": 5.545443057520106e-07, "loss": 0.252, "step": 29460 }, { "epoch": 2.5923645320197046, "grad_norm": 9.936671257019043, "learning_rate": 5.522040781033355e-07, "loss": 0.2449, "step": 29470 }, { "epoch": 2.593244194229416, "grad_norm": 13.169912338256836, "learning_rate": 5.498685102396817e-07, "loss": 0.2396, "step": 29480 }, { "epoch": 2.5941238564391274, "grad_norm": 9.308562278747559, "learning_rate": 5.475376046079334e-07, "loss": 0.3295, "step": 29490 }, { "epoch": 2.5950035186488387, "grad_norm": 12.258657455444336, "learning_rate": 5.452113636500911e-07, "loss": 0.271, "step": 29500 }, { "epoch": 2.59588318085855, "grad_norm": 11.40805435180664, "learning_rate": 5.428897898032687e-07, "loss": 0.1999, "step": 29510 }, { "epoch": 2.596762843068262, "grad_norm": 9.036211967468262, "learning_rate": 5.4057288549969e-07, "loss": 0.2646, "step": 29520 }, { "epoch": 2.5976425052779732, "grad_norm": 10.487746238708496, "learning_rate": 5.382606531666856e-07, "loss": 0.2668, "step": 29530 }, { "epoch": 2.5985221674876846, "grad_norm": 4.850767612457275, "learning_rate": 5.359530952266951e-07, "loss": 0.2422, "step": 29540 }, { "epoch": 2.5994018296973964, "grad_norm": 10.026910781860352, "learning_rate": 5.336502140972549e-07, "loss": 0.2097, "step": 29550 }, { "epoch": 2.600281491907108, "grad_norm": 11.725647926330566, "learning_rate": 5.313520121910076e-07, "loss": 0.2526, "step": 29560 }, { "epoch": 2.601161154116819, "grad_norm": 11.315289497375488, "learning_rate": 5.290584919156905e-07, "loss": 0.2341, "step": 29570 }, { "epoch": 2.6020408163265305, "grad_norm": 9.29858112335205, "learning_rate": 5.267696556741353e-07, "loss": 0.3217, "step": 29580 }, { "epoch": 2.602920478536242, "grad_norm": 9.461380958557129, "learning_rate": 5.2448550586427e-07, "loss": 0.2597, "step": 29590 }, { "epoch": 2.6038001407459537, "grad_norm": 10.011266708374023, "learning_rate": 5.222060448791106e-07, "loss": 0.2556, "step": 29600 }, { "epoch": 2.604679802955665, "grad_norm": 8.006189346313477, "learning_rate": 5.199312751067592e-07, "loss": 0.25, "step": 29610 }, { "epoch": 2.6055594651653764, "grad_norm": 8.115425109863281, "learning_rate": 5.176611989304059e-07, "loss": 0.2797, "step": 29620 }, { "epoch": 2.606439127375088, "grad_norm": 9.661474227905273, "learning_rate": 5.153958187283214e-07, "loss": 0.3046, "step": 29630 }, { "epoch": 2.6073187895847996, "grad_norm": 7.257543563842773, "learning_rate": 5.131351368738591e-07, "loss": 0.2266, "step": 29640 }, { "epoch": 2.608198451794511, "grad_norm": 10.379621505737305, "learning_rate": 5.108791557354475e-07, "loss": 0.2381, "step": 29650 }, { "epoch": 2.6090781140042223, "grad_norm": 10.329465866088867, "learning_rate": 5.086278776765918e-07, "loss": 0.245, "step": 29660 }, { "epoch": 2.6099577762139337, "grad_norm": 11.906383514404297, "learning_rate": 5.063813050558691e-07, "loss": 0.2775, "step": 29670 }, { "epoch": 2.6108374384236455, "grad_norm": 11.198293685913086, "learning_rate": 5.041394402269273e-07, "loss": 0.245, "step": 29680 }, { "epoch": 2.611717100633357, "grad_norm": 9.710712432861328, "learning_rate": 5.019022855384825e-07, "loss": 0.2523, "step": 29690 }, { "epoch": 2.612596762843068, "grad_norm": 6.617435455322266, "learning_rate": 4.996698433343166e-07, "loss": 0.2776, "step": 29700 }, { "epoch": 2.61347642505278, "grad_norm": 8.630655288696289, "learning_rate": 4.974421159532711e-07, "loss": 0.2407, "step": 29710 }, { "epoch": 2.6143560872624914, "grad_norm": 11.976882934570312, "learning_rate": 4.952191057292544e-07, "loss": 0.2768, "step": 29720 }, { "epoch": 2.6152357494722027, "grad_norm": 8.438273429870605, "learning_rate": 4.930008149912252e-07, "loss": 0.2689, "step": 29730 }, { "epoch": 2.616115411681914, "grad_norm": 8.814942359924316, "learning_rate": 4.907872460632018e-07, "loss": 0.2501, "step": 29740 }, { "epoch": 2.6169950738916254, "grad_norm": 11.576513290405273, "learning_rate": 4.88578401264258e-07, "loss": 0.235, "step": 29750 }, { "epoch": 2.6178747361013373, "grad_norm": 9.901166915893555, "learning_rate": 4.863742829085139e-07, "loss": 0.2384, "step": 29760 }, { "epoch": 2.6187543983110486, "grad_norm": 10.31019115447998, "learning_rate": 4.841748933051393e-07, "loss": 0.2812, "step": 29770 }, { "epoch": 2.61963406052076, "grad_norm": 11.119100570678711, "learning_rate": 4.819802347583524e-07, "loss": 0.2407, "step": 29780 }, { "epoch": 2.6205137227304713, "grad_norm": 11.94837760925293, "learning_rate": 4.797903095674094e-07, "loss": 0.2436, "step": 29790 }, { "epoch": 2.621393384940183, "grad_norm": 7.963234901428223, "learning_rate": 4.776051200266141e-07, "loss": 0.2707, "step": 29800 }, { "epoch": 2.6222730471498945, "grad_norm": 6.797237396240234, "learning_rate": 4.7542466842530387e-07, "loss": 0.2713, "step": 29810 }, { "epoch": 2.623152709359606, "grad_norm": 5.605098247528076, "learning_rate": 4.7324895704785377e-07, "loss": 0.2127, "step": 29820 }, { "epoch": 2.6240323715693172, "grad_norm": 8.654659271240234, "learning_rate": 4.710779881736743e-07, "loss": 0.2152, "step": 29830 }, { "epoch": 2.6249120337790286, "grad_norm": 6.476244926452637, "learning_rate": 4.689117640772062e-07, "loss": 0.2833, "step": 29840 }, { "epoch": 2.6257916959887404, "grad_norm": 10.241493225097656, "learning_rate": 4.6675028702791837e-07, "loss": 0.2787, "step": 29850 }, { "epoch": 2.6266713581984518, "grad_norm": 9.300375938415527, "learning_rate": 4.645935592903078e-07, "loss": 0.2614, "step": 29860 }, { "epoch": 2.627551020408163, "grad_norm": 8.985767364501953, "learning_rate": 4.6244158312389485e-07, "loss": 0.2316, "step": 29870 }, { "epoch": 2.628430682617875, "grad_norm": 7.495253562927246, "learning_rate": 4.602943607832233e-07, "loss": 0.1996, "step": 29880 }, { "epoch": 2.6293103448275863, "grad_norm": 11.15822982788086, "learning_rate": 4.5815189451785535e-07, "loss": 0.2552, "step": 29890 }, { "epoch": 2.6301900070372977, "grad_norm": 11.256115913391113, "learning_rate": 4.5601418657237e-07, "loss": 0.2826, "step": 29900 }, { "epoch": 2.631069669247009, "grad_norm": 18.873960494995117, "learning_rate": 4.5388123918636226e-07, "loss": 0.2793, "step": 29910 }, { "epoch": 2.6319493314567204, "grad_norm": 10.295198440551758, "learning_rate": 4.517530545944382e-07, "loss": 0.2492, "step": 29920 }, { "epoch": 2.632828993666432, "grad_norm": 15.341087341308594, "learning_rate": 4.496296350262169e-07, "loss": 0.2525, "step": 29930 }, { "epoch": 2.6337086558761436, "grad_norm": 8.729537010192871, "learning_rate": 4.4751098270632233e-07, "loss": 0.2657, "step": 29940 }, { "epoch": 2.634588318085855, "grad_norm": 8.666203498840332, "learning_rate": 4.4539709985438494e-07, "loss": 0.2475, "step": 29950 }, { "epoch": 2.6354679802955667, "grad_norm": 12.03293228149414, "learning_rate": 4.432879886850411e-07, "loss": 0.2585, "step": 29960 }, { "epoch": 2.636347642505278, "grad_norm": 11.469677925109863, "learning_rate": 4.4118365140792206e-07, "loss": 0.2419, "step": 29970 }, { "epoch": 2.6372273047149895, "grad_norm": 8.695686340332031, "learning_rate": 4.3908409022766276e-07, "loss": 0.2607, "step": 29980 }, { "epoch": 2.638106966924701, "grad_norm": 6.671716690063477, "learning_rate": 4.36989307343893e-07, "loss": 0.2433, "step": 29990 }, { "epoch": 2.638986629134412, "grad_norm": 15.60685920715332, "learning_rate": 4.3489930495123565e-07, "loss": 0.2613, "step": 30000 }, { "epoch": 2.639866291344124, "grad_norm": 11.905458450317383, "learning_rate": 4.328140852393076e-07, "loss": 0.2278, "step": 30010 }, { "epoch": 2.6407459535538353, "grad_norm": 9.998126029968262, "learning_rate": 4.3073365039271343e-07, "loss": 0.2829, "step": 30020 }, { "epoch": 2.6416256157635467, "grad_norm": 9.992156982421875, "learning_rate": 4.286580025910414e-07, "loss": 0.2805, "step": 30030 }, { "epoch": 2.6425052779732585, "grad_norm": 7.362611293792725, "learning_rate": 4.26587144008872e-07, "loss": 0.2784, "step": 30040 }, { "epoch": 2.64338494018297, "grad_norm": 9.366451263427734, "learning_rate": 4.245210768157626e-07, "loss": 0.2654, "step": 30050 }, { "epoch": 2.6442646023926812, "grad_norm": 10.350317001342773, "learning_rate": 4.224598031762522e-07, "loss": 0.2343, "step": 30060 }, { "epoch": 2.6451442646023926, "grad_norm": 10.349838256835938, "learning_rate": 4.2040332524985907e-07, "loss": 0.2456, "step": 30070 }, { "epoch": 2.646023926812104, "grad_norm": 7.704195022583008, "learning_rate": 4.183516451910752e-07, "loss": 0.2013, "step": 30080 }, { "epoch": 2.6469035890218158, "grad_norm": 6.986741542816162, "learning_rate": 4.163047651493679e-07, "loss": 0.2706, "step": 30090 }, { "epoch": 2.647783251231527, "grad_norm": 8.904753684997559, "learning_rate": 4.1426268726917406e-07, "loss": 0.2441, "step": 30100 }, { "epoch": 2.6486629134412385, "grad_norm": 12.50906753540039, "learning_rate": 4.1222541368989956e-07, "loss": 0.2253, "step": 30110 }, { "epoch": 2.6495425756509503, "grad_norm": 7.583887100219727, "learning_rate": 4.1019294654591976e-07, "loss": 0.2385, "step": 30120 }, { "epoch": 2.6504222378606617, "grad_norm": 11.569705963134766, "learning_rate": 4.081652879665715e-07, "loss": 0.2436, "step": 30130 }, { "epoch": 2.651301900070373, "grad_norm": 10.791657447814941, "learning_rate": 4.061424400761543e-07, "loss": 0.241, "step": 30140 }, { "epoch": 2.6521815622800844, "grad_norm": 7.721018314361572, "learning_rate": 4.0412440499392914e-07, "loss": 0.2568, "step": 30150 }, { "epoch": 2.6530612244897958, "grad_norm": 9.3014497756958, "learning_rate": 4.0211118483411246e-07, "loss": 0.2421, "step": 30160 }, { "epoch": 2.653940886699507, "grad_norm": 13.579079627990723, "learning_rate": 4.001027817058789e-07, "loss": 0.307, "step": 30170 }, { "epoch": 2.654820548909219, "grad_norm": 9.12967300415039, "learning_rate": 3.980991977133558e-07, "loss": 0.2854, "step": 30180 }, { "epoch": 2.6557002111189303, "grad_norm": 10.351334571838379, "learning_rate": 3.961004349556191e-07, "loss": 0.2681, "step": 30190 }, { "epoch": 2.6565798733286416, "grad_norm": 8.293131828308105, "learning_rate": 3.9410649552669934e-07, "loss": 0.2271, "step": 30200 }, { "epoch": 2.6574595355383535, "grad_norm": 12.458370208740234, "learning_rate": 3.92117381515566e-07, "loss": 0.2185, "step": 30210 }, { "epoch": 2.658339197748065, "grad_norm": 10.22793197631836, "learning_rate": 3.901330950061405e-07, "loss": 0.2792, "step": 30220 }, { "epoch": 2.659218859957776, "grad_norm": 10.916854858398438, "learning_rate": 3.8815363807728266e-07, "loss": 0.2429, "step": 30230 }, { "epoch": 2.6600985221674875, "grad_norm": 13.661593437194824, "learning_rate": 3.8617901280279257e-07, "loss": 0.2454, "step": 30240 }, { "epoch": 2.660978184377199, "grad_norm": 8.371736526489258, "learning_rate": 3.8420922125141145e-07, "loss": 0.2445, "step": 30250 }, { "epoch": 2.6618578465869107, "grad_norm": 10.706241607666016, "learning_rate": 3.8224426548681416e-07, "loss": 0.287, "step": 30260 }, { "epoch": 2.662737508796622, "grad_norm": 10.886584281921387, "learning_rate": 3.802841475676061e-07, "loss": 0.2771, "step": 30270 }, { "epoch": 2.6636171710063334, "grad_norm": 9.801243782043457, "learning_rate": 3.783288695473314e-07, "loss": 0.2347, "step": 30280 }, { "epoch": 2.6644968332160452, "grad_norm": 11.469341278076172, "learning_rate": 3.763784334744569e-07, "loss": 0.2807, "step": 30290 }, { "epoch": 2.6653764954257566, "grad_norm": 9.64080810546875, "learning_rate": 3.7443284139238143e-07, "loss": 0.2372, "step": 30300 }, { "epoch": 2.666256157635468, "grad_norm": 10.17080020904541, "learning_rate": 3.7249209533942733e-07, "loss": 0.2259, "step": 30310 }, { "epoch": 2.6671358198451793, "grad_norm": 10.55148696899414, "learning_rate": 3.7055619734883863e-07, "loss": 0.2697, "step": 30320 }, { "epoch": 2.6680154820548907, "grad_norm": 12.678217887878418, "learning_rate": 3.6862514944878137e-07, "loss": 0.2177, "step": 30330 }, { "epoch": 2.6688951442646025, "grad_norm": 10.502551078796387, "learning_rate": 3.666989536623411e-07, "loss": 0.2615, "step": 30340 }, { "epoch": 2.669774806474314, "grad_norm": 6.5010905265808105, "learning_rate": 3.6477761200751803e-07, "loss": 0.2949, "step": 30350 }, { "epoch": 2.6706544686840252, "grad_norm": 10.247170448303223, "learning_rate": 3.6286112649722927e-07, "loss": 0.2368, "step": 30360 }, { "epoch": 2.671534130893737, "grad_norm": 9.152579307556152, "learning_rate": 3.60949499139302e-07, "loss": 0.2363, "step": 30370 }, { "epoch": 2.6724137931034484, "grad_norm": 9.234353065490723, "learning_rate": 3.590427319364764e-07, "loss": 0.314, "step": 30380 }, { "epoch": 2.6732934553131598, "grad_norm": 9.695586204528809, "learning_rate": 3.571408268863974e-07, "loss": 0.2236, "step": 30390 }, { "epoch": 2.674173117522871, "grad_norm": 8.309393882751465, "learning_rate": 3.5524378598161824e-07, "loss": 0.2915, "step": 30400 }, { "epoch": 2.6750527797325825, "grad_norm": 10.544567108154297, "learning_rate": 3.533516112095969e-07, "loss": 0.2272, "step": 30410 }, { "epoch": 2.6759324419422943, "grad_norm": 10.63736629486084, "learning_rate": 3.514643045526922e-07, "loss": 0.2455, "step": 30420 }, { "epoch": 2.6768121041520057, "grad_norm": 5.737158298492432, "learning_rate": 3.4958186798816127e-07, "loss": 0.2517, "step": 30430 }, { "epoch": 2.677691766361717, "grad_norm": 9.33073616027832, "learning_rate": 3.477043034881633e-07, "loss": 0.2683, "step": 30440 }, { "epoch": 2.678571428571429, "grad_norm": 10.010102272033691, "learning_rate": 3.45831613019747e-07, "loss": 0.2476, "step": 30450 }, { "epoch": 2.67945109078114, "grad_norm": 6.060555458068848, "learning_rate": 3.439637985448613e-07, "loss": 0.222, "step": 30460 }, { "epoch": 2.6803307529908516, "grad_norm": 9.652078628540039, "learning_rate": 3.421008620203431e-07, "loss": 0.2172, "step": 30470 }, { "epoch": 2.681210415200563, "grad_norm": 6.347632884979248, "learning_rate": 3.4024280539791733e-07, "loss": 0.1935, "step": 30480 }, { "epoch": 2.6820900774102743, "grad_norm": 10.874980926513672, "learning_rate": 3.3838963062420127e-07, "loss": 0.2543, "step": 30490 }, { "epoch": 2.682969739619986, "grad_norm": 11.331039428710938, "learning_rate": 3.3654133964069414e-07, "loss": 0.266, "step": 30500 }, { "epoch": 2.6838494018296974, "grad_norm": 8.562450408935547, "learning_rate": 3.3469793438377916e-07, "loss": 0.3137, "step": 30510 }, { "epoch": 2.684729064039409, "grad_norm": 13.904664993286133, "learning_rate": 3.328594167847216e-07, "loss": 0.2328, "step": 30520 }, { "epoch": 2.6856087262491206, "grad_norm": 6.605637550354004, "learning_rate": 3.310257887696644e-07, "loss": 0.2641, "step": 30530 }, { "epoch": 2.686488388458832, "grad_norm": 15.213129043579102, "learning_rate": 3.2919705225963107e-07, "loss": 0.2303, "step": 30540 }, { "epoch": 2.6873680506685433, "grad_norm": 9.022674560546875, "learning_rate": 3.2737320917051784e-07, "loss": 0.234, "step": 30550 }, { "epoch": 2.6882477128782547, "grad_norm": 6.027808666229248, "learning_rate": 3.255542614130952e-07, "loss": 0.2147, "step": 30560 }, { "epoch": 2.689127375087966, "grad_norm": 8.481608390808105, "learning_rate": 3.237402108930049e-07, "loss": 0.269, "step": 30570 }, { "epoch": 2.6900070372976774, "grad_norm": 9.910133361816406, "learning_rate": 3.2193105951075775e-07, "loss": 0.2725, "step": 30580 }, { "epoch": 2.6908866995073892, "grad_norm": 12.060616493225098, "learning_rate": 3.2012680916173153e-07, "loss": 0.2688, "step": 30590 }, { "epoch": 2.6917663617171006, "grad_norm": 7.669306755065918, "learning_rate": 3.18327461736172e-07, "loss": 0.2747, "step": 30600 }, { "epoch": 2.692646023926812, "grad_norm": 10.74693489074707, "learning_rate": 3.1653301911918456e-07, "loss": 0.2509, "step": 30610 }, { "epoch": 2.6935256861365238, "grad_norm": 11.376936912536621, "learning_rate": 3.1474348319074034e-07, "loss": 0.2207, "step": 30620 }, { "epoch": 2.694405348346235, "grad_norm": 7.429731369018555, "learning_rate": 3.129588558256641e-07, "loss": 0.2681, "step": 30630 }, { "epoch": 2.6952850105559465, "grad_norm": 8.826720237731934, "learning_rate": 3.111791388936425e-07, "loss": 0.2433, "step": 30640 }, { "epoch": 2.696164672765658, "grad_norm": 5.929775238037109, "learning_rate": 3.0940433425921845e-07, "loss": 0.2489, "step": 30650 }, { "epoch": 2.697044334975369, "grad_norm": 12.571870803833008, "learning_rate": 3.0763444378178466e-07, "loss": 0.2283, "step": 30660 }, { "epoch": 2.697923997185081, "grad_norm": 12.321381568908691, "learning_rate": 3.058694693155873e-07, "loss": 0.2853, "step": 30670 }, { "epoch": 2.6988036593947924, "grad_norm": 12.769957542419434, "learning_rate": 3.0410941270972496e-07, "loss": 0.2407, "step": 30680 }, { "epoch": 2.6996833216045037, "grad_norm": 6.758404731750488, "learning_rate": 3.023542758081377e-07, "loss": 0.2632, "step": 30690 }, { "epoch": 2.7005629838142156, "grad_norm": 6.644548416137695, "learning_rate": 3.00604060449618e-07, "loss": 0.2542, "step": 30700 }, { "epoch": 2.701442646023927, "grad_norm": 9.4510498046875, "learning_rate": 2.988587684677979e-07, "loss": 0.2853, "step": 30710 }, { "epoch": 2.7023223082336383, "grad_norm": 9.338167190551758, "learning_rate": 2.971184016911527e-07, "loss": 0.2483, "step": 30720 }, { "epoch": 2.7032019704433496, "grad_norm": 9.217671394348145, "learning_rate": 2.9538296194299875e-07, "loss": 0.2442, "step": 30730 }, { "epoch": 2.704081632653061, "grad_norm": 9.51320743560791, "learning_rate": 2.93652451041489e-07, "loss": 0.2363, "step": 30740 }, { "epoch": 2.704961294862773, "grad_norm": 9.254216194152832, "learning_rate": 2.919268707996131e-07, "loss": 0.2171, "step": 30750 }, { "epoch": 2.705840957072484, "grad_norm": 12.215865135192871, "learning_rate": 2.9020622302519617e-07, "loss": 0.2404, "step": 30760 }, { "epoch": 2.7067206192821955, "grad_norm": 9.41160774230957, "learning_rate": 2.884905095208934e-07, "loss": 0.2619, "step": 30770 }, { "epoch": 2.7076002814919073, "grad_norm": 8.954604148864746, "learning_rate": 2.867797320841931e-07, "loss": 0.2667, "step": 30780 }, { "epoch": 2.7084799437016187, "grad_norm": 10.385095596313477, "learning_rate": 2.850738925074109e-07, "loss": 0.2119, "step": 30790 }, { "epoch": 2.70935960591133, "grad_norm": 8.871313095092773, "learning_rate": 2.833729925776896e-07, "loss": 0.2613, "step": 30800 }, { "epoch": 2.7102392681210414, "grad_norm": 9.25278377532959, "learning_rate": 2.816770340769959e-07, "loss": 0.2565, "step": 30810 }, { "epoch": 2.711118930330753, "grad_norm": 10.715754508972168, "learning_rate": 2.7998601878212075e-07, "loss": 0.2603, "step": 30820 }, { "epoch": 2.7119985925404646, "grad_norm": 10.745888710021973, "learning_rate": 2.7829994846467647e-07, "loss": 0.1961, "step": 30830 }, { "epoch": 2.712878254750176, "grad_norm": 11.878287315368652, "learning_rate": 2.7661882489109393e-07, "loss": 0.2241, "step": 30840 }, { "epoch": 2.7137579169598873, "grad_norm": 7.89145040512085, "learning_rate": 2.7494264982262095e-07, "loss": 0.2055, "step": 30850 }, { "epoch": 2.714637579169599, "grad_norm": 10.061182975769043, "learning_rate": 2.73271425015324e-07, "loss": 0.3057, "step": 30860 }, { "epoch": 2.7155172413793105, "grad_norm": 7.961201190948486, "learning_rate": 2.716051522200791e-07, "loss": 0.246, "step": 30870 }, { "epoch": 2.716396903589022, "grad_norm": 10.886479377746582, "learning_rate": 2.69943833182576e-07, "loss": 0.2843, "step": 30880 }, { "epoch": 2.7172765657987332, "grad_norm": 10.008533477783203, "learning_rate": 2.6828746964331687e-07, "loss": 0.2542, "step": 30890 }, { "epoch": 2.7181562280084446, "grad_norm": 7.706098556518555, "learning_rate": 2.666360633376086e-07, "loss": 0.2551, "step": 30900 }, { "epoch": 2.7190358902181564, "grad_norm": 10.033446311950684, "learning_rate": 2.649896159955662e-07, "loss": 0.2506, "step": 30910 }, { "epoch": 2.7199155524278678, "grad_norm": 9.040377616882324, "learning_rate": 2.6334812934211086e-07, "loss": 0.2441, "step": 30920 }, { "epoch": 2.720795214637579, "grad_norm": 8.872335433959961, "learning_rate": 2.617116050969626e-07, "loss": 0.2353, "step": 30930 }, { "epoch": 2.7216748768472905, "grad_norm": 10.381696701049805, "learning_rate": 2.6008004497464766e-07, "loss": 0.2202, "step": 30940 }, { "epoch": 2.7225545390570023, "grad_norm": 11.664424896240234, "learning_rate": 2.584534506844871e-07, "loss": 0.288, "step": 30950 }, { "epoch": 2.7234342012667137, "grad_norm": 8.893899917602539, "learning_rate": 2.568318239306017e-07, "loss": 0.2285, "step": 30960 }, { "epoch": 2.724313863476425, "grad_norm": 7.423923492431641, "learning_rate": 2.552151664119085e-07, "loss": 0.2804, "step": 30970 }, { "epoch": 2.7251935256861364, "grad_norm": 13.50676441192627, "learning_rate": 2.5360347982211706e-07, "loss": 0.2349, "step": 30980 }, { "epoch": 2.7260731878958477, "grad_norm": 9.75160026550293, "learning_rate": 2.519967658497291e-07, "loss": 0.2934, "step": 30990 }, { "epoch": 2.7269528501055595, "grad_norm": 7.950038433074951, "learning_rate": 2.5039502617803746e-07, "loss": 0.2931, "step": 31000 }, { "epoch": 2.727832512315271, "grad_norm": 9.227513313293457, "learning_rate": 2.487982624851221e-07, "loss": 0.2318, "step": 31010 }, { "epoch": 2.7287121745249823, "grad_norm": 9.0122652053833, "learning_rate": 2.472064764438531e-07, "loss": 0.2569, "step": 31020 }, { "epoch": 2.729591836734694, "grad_norm": 9.692352294921875, "learning_rate": 2.456196697218827e-07, "loss": 0.2894, "step": 31030 }, { "epoch": 2.7304714989444054, "grad_norm": 8.579285621643066, "learning_rate": 2.440378439816471e-07, "loss": 0.2516, "step": 31040 }, { "epoch": 2.731351161154117, "grad_norm": 10.335159301757812, "learning_rate": 2.4246100088036517e-07, "loss": 0.2942, "step": 31050 }, { "epoch": 2.732230823363828, "grad_norm": 7.997519016265869, "learning_rate": 2.408891420700332e-07, "loss": 0.2262, "step": 31060 }, { "epoch": 2.7331104855735395, "grad_norm": 4.82148551940918, "learning_rate": 2.3932226919742895e-07, "loss": 0.2293, "step": 31070 }, { "epoch": 2.7339901477832513, "grad_norm": 11.64458179473877, "learning_rate": 2.3776038390410483e-07, "loss": 0.2441, "step": 31080 }, { "epoch": 2.7348698099929627, "grad_norm": 11.714370727539062, "learning_rate": 2.3620348782638692e-07, "loss": 0.2176, "step": 31090 }, { "epoch": 2.735749472202674, "grad_norm": 9.969170570373535, "learning_rate": 2.3465158259537768e-07, "loss": 0.2832, "step": 31100 }, { "epoch": 2.736629134412386, "grad_norm": 10.090313911437988, "learning_rate": 2.3310466983694546e-07, "loss": 0.2752, "step": 31110 }, { "epoch": 2.7375087966220972, "grad_norm": 8.728675842285156, "learning_rate": 2.3156275117173444e-07, "loss": 0.2284, "step": 31120 }, { "epoch": 2.7383884588318086, "grad_norm": 11.923070907592773, "learning_rate": 2.3002582821515186e-07, "loss": 0.1824, "step": 31130 }, { "epoch": 2.73926812104152, "grad_norm": 12.911874771118164, "learning_rate": 2.2849390257737302e-07, "loss": 0.2858, "step": 31140 }, { "epoch": 2.7401477832512313, "grad_norm": 13.065872192382812, "learning_rate": 2.2696697586333805e-07, "loss": 0.2673, "step": 31150 }, { "epoch": 2.741027445460943, "grad_norm": 8.055092811584473, "learning_rate": 2.254450496727506e-07, "loss": 0.2227, "step": 31160 }, { "epoch": 2.7419071076706545, "grad_norm": 12.166391372680664, "learning_rate": 2.2392812560007138e-07, "loss": 0.2356, "step": 31170 }, { "epoch": 2.742786769880366, "grad_norm": 9.8870849609375, "learning_rate": 2.224162052345258e-07, "loss": 0.2391, "step": 31180 }, { "epoch": 2.7436664320900777, "grad_norm": 10.968052864074707, "learning_rate": 2.20909290160094e-07, "loss": 0.2548, "step": 31190 }, { "epoch": 2.744546094299789, "grad_norm": 12.11378288269043, "learning_rate": 2.194073819555126e-07, "loss": 0.2975, "step": 31200 }, { "epoch": 2.7454257565095004, "grad_norm": 11.91383171081543, "learning_rate": 2.1791048219427458e-07, "loss": 0.2571, "step": 31210 }, { "epoch": 2.7463054187192117, "grad_norm": 8.78177261352539, "learning_rate": 2.1641859244462327e-07, "loss": 0.2485, "step": 31220 }, { "epoch": 2.747185080928923, "grad_norm": 7.976553440093994, "learning_rate": 2.149317142695545e-07, "loss": 0.2302, "step": 31230 }, { "epoch": 2.748064743138635, "grad_norm": 10.409425735473633, "learning_rate": 2.1344984922681332e-07, "loss": 0.2833, "step": 31240 }, { "epoch": 2.7489444053483463, "grad_norm": 11.509135246276855, "learning_rate": 2.1197299886889233e-07, "loss": 0.2718, "step": 31250 }, { "epoch": 2.7498240675580576, "grad_norm": 7.150649547576904, "learning_rate": 2.1050116474303273e-07, "loss": 0.2189, "step": 31260 }, { "epoch": 2.7507037297677694, "grad_norm": 9.12989616394043, "learning_rate": 2.0903434839121716e-07, "loss": 0.2668, "step": 31270 }, { "epoch": 2.751583391977481, "grad_norm": 12.38235092163086, "learning_rate": 2.0757255135017307e-07, "loss": 0.2564, "step": 31280 }, { "epoch": 2.752463054187192, "grad_norm": 13.090129852294922, "learning_rate": 2.0611577515136926e-07, "loss": 0.2613, "step": 31290 }, { "epoch": 2.7533427163969035, "grad_norm": 8.765003204345703, "learning_rate": 2.0466402132101327e-07, "loss": 0.2826, "step": 31300 }, { "epoch": 2.754222378606615, "grad_norm": 11.810928344726562, "learning_rate": 2.0321729138005342e-07, "loss": 0.2631, "step": 31310 }, { "epoch": 2.7551020408163263, "grad_norm": 8.77116584777832, "learning_rate": 2.017755868441723e-07, "loss": 0.2524, "step": 31320 }, { "epoch": 2.755981703026038, "grad_norm": 8.768543243408203, "learning_rate": 2.0033890922378784e-07, "loss": 0.2703, "step": 31330 }, { "epoch": 2.7568613652357494, "grad_norm": 9.403644561767578, "learning_rate": 1.9890726002405437e-07, "loss": 0.2711, "step": 31340 }, { "epoch": 2.757741027445461, "grad_norm": 7.875265598297119, "learning_rate": 1.9748064074485262e-07, "loss": 0.2197, "step": 31350 }, { "epoch": 2.7586206896551726, "grad_norm": 11.309364318847656, "learning_rate": 1.9605905288079875e-07, "loss": 0.2502, "step": 31360 }, { "epoch": 2.759500351864884, "grad_norm": 12.45068645477295, "learning_rate": 1.946424979212358e-07, "loss": 0.2429, "step": 31370 }, { "epoch": 2.7603800140745953, "grad_norm": 9.211343765258789, "learning_rate": 1.9323097735023223e-07, "loss": 0.2439, "step": 31380 }, { "epoch": 2.7612596762843067, "grad_norm": 4.872377872467041, "learning_rate": 1.918244926465862e-07, "loss": 0.2249, "step": 31390 }, { "epoch": 2.762139338494018, "grad_norm": 10.694042205810547, "learning_rate": 1.9042304528381738e-07, "loss": 0.2895, "step": 31400 }, { "epoch": 2.76301900070373, "grad_norm": 8.408943176269531, "learning_rate": 1.8902663673016685e-07, "loss": 0.2198, "step": 31410 }, { "epoch": 2.763898662913441, "grad_norm": 11.878582954406738, "learning_rate": 1.8763526844859992e-07, "loss": 0.2303, "step": 31420 }, { "epoch": 2.7647783251231526, "grad_norm": 10.579988479614258, "learning_rate": 1.8624894189679832e-07, "loss": 0.2877, "step": 31430 }, { "epoch": 2.7656579873328644, "grad_norm": 11.452542304992676, "learning_rate": 1.8486765852716415e-07, "loss": 0.2905, "step": 31440 }, { "epoch": 2.7665376495425757, "grad_norm": 5.378048896789551, "learning_rate": 1.8349141978681483e-07, "loss": 0.2223, "step": 31450 }, { "epoch": 2.767417311752287, "grad_norm": 8.79360294342041, "learning_rate": 1.821202271175826e-07, "loss": 0.2347, "step": 31460 }, { "epoch": 2.7682969739619985, "grad_norm": 8.293211936950684, "learning_rate": 1.807540819560133e-07, "loss": 0.2343, "step": 31470 }, { "epoch": 2.76917663617171, "grad_norm": 8.516851425170898, "learning_rate": 1.7939298573336483e-07, "loss": 0.2622, "step": 31480 }, { "epoch": 2.7700562983814216, "grad_norm": 7.7600998878479, "learning_rate": 1.7803693987560544e-07, "loss": 0.2784, "step": 31490 }, { "epoch": 2.770935960591133, "grad_norm": 9.701373100280762, "learning_rate": 1.766859458034126e-07, "loss": 0.2307, "step": 31500 }, { "epoch": 2.7718156228008444, "grad_norm": 10.182894706726074, "learning_rate": 1.7534000493217074e-07, "loss": 0.2351, "step": 31510 }, { "epoch": 2.772695285010556, "grad_norm": 10.15060043334961, "learning_rate": 1.7399911867197027e-07, "loss": 0.2835, "step": 31520 }, { "epoch": 2.7735749472202675, "grad_norm": 11.90304183959961, "learning_rate": 1.7266328842760637e-07, "loss": 0.2609, "step": 31530 }, { "epoch": 2.774454609429979, "grad_norm": 6.080796241760254, "learning_rate": 1.7133251559857677e-07, "loss": 0.2249, "step": 31540 }, { "epoch": 2.7753342716396903, "grad_norm": 11.542882919311523, "learning_rate": 1.7000680157908235e-07, "loss": 0.2441, "step": 31550 }, { "epoch": 2.7762139338494016, "grad_norm": 7.0940141677856445, "learning_rate": 1.6868614775802094e-07, "loss": 0.2357, "step": 31560 }, { "epoch": 2.7770935960591134, "grad_norm": 10.108989715576172, "learning_rate": 1.6737055551899196e-07, "loss": 0.3039, "step": 31570 }, { "epoch": 2.777973258268825, "grad_norm": 10.384716987609863, "learning_rate": 1.6606002624029171e-07, "loss": 0.2906, "step": 31580 }, { "epoch": 2.778852920478536, "grad_norm": 8.562464714050293, "learning_rate": 1.6475456129490862e-07, "loss": 0.2784, "step": 31590 }, { "epoch": 2.779732582688248, "grad_norm": 8.403987884521484, "learning_rate": 1.6345416205053145e-07, "loss": 0.2375, "step": 31600 }, { "epoch": 2.7806122448979593, "grad_norm": 6.4532904624938965, "learning_rate": 1.6215882986953647e-07, "loss": 0.2039, "step": 31610 }, { "epoch": 2.7814919071076707, "grad_norm": 10.178884506225586, "learning_rate": 1.608685661089937e-07, "loss": 0.2486, "step": 31620 }, { "epoch": 2.782371569317382, "grad_norm": 10.942785263061523, "learning_rate": 1.5958337212066356e-07, "loss": 0.2675, "step": 31630 }, { "epoch": 2.7832512315270934, "grad_norm": 12.221735954284668, "learning_rate": 1.5830324925099449e-07, "loss": 0.2366, "step": 31640 }, { "epoch": 2.7841308937368052, "grad_norm": 10.451655387878418, "learning_rate": 1.5702819884112153e-07, "loss": 0.2553, "step": 31650 }, { "epoch": 2.7850105559465166, "grad_norm": 8.708036422729492, "learning_rate": 1.5575822222686665e-07, "loss": 0.2168, "step": 31660 }, { "epoch": 2.785890218156228, "grad_norm": 9.220015525817871, "learning_rate": 1.5449332073873447e-07, "loss": 0.2191, "step": 31670 }, { "epoch": 2.7867698803659398, "grad_norm": 8.38720989227295, "learning_rate": 1.5323349570191492e-07, "loss": 0.2337, "step": 31680 }, { "epoch": 2.787649542575651, "grad_norm": 7.742721080780029, "learning_rate": 1.519787484362778e-07, "loss": 0.254, "step": 31690 }, { "epoch": 2.7885292047853625, "grad_norm": 9.67593765258789, "learning_rate": 1.5072908025637435e-07, "loss": 0.2974, "step": 31700 }, { "epoch": 2.789408866995074, "grad_norm": 6.76805305480957, "learning_rate": 1.494844924714328e-07, "loss": 0.2546, "step": 31710 }, { "epoch": 2.790288529204785, "grad_norm": 10.028313636779785, "learning_rate": 1.4824498638536022e-07, "loss": 0.2791, "step": 31720 }, { "epoch": 2.7911681914144966, "grad_norm": 9.72877025604248, "learning_rate": 1.4701056329674002e-07, "loss": 0.2586, "step": 31730 }, { "epoch": 2.7920478536242084, "grad_norm": 7.8559489250183105, "learning_rate": 1.4578122449882992e-07, "loss": 0.2252, "step": 31740 }, { "epoch": 2.7929275158339197, "grad_norm": 9.875581741333008, "learning_rate": 1.4455697127956025e-07, "loss": 0.2783, "step": 31750 }, { "epoch": 2.793807178043631, "grad_norm": 13.349963188171387, "learning_rate": 1.433378049215356e-07, "loss": 0.2651, "step": 31760 }, { "epoch": 2.794686840253343, "grad_norm": 6.104625701904297, "learning_rate": 1.4212372670202813e-07, "loss": 0.2206, "step": 31770 }, { "epoch": 2.7955665024630543, "grad_norm": 9.057249069213867, "learning_rate": 1.4091473789298092e-07, "loss": 0.2312, "step": 31780 }, { "epoch": 2.7964461646727656, "grad_norm": 6.602721691131592, "learning_rate": 1.3971083976100696e-07, "loss": 0.24, "step": 31790 }, { "epoch": 2.797325826882477, "grad_norm": 9.063281059265137, "learning_rate": 1.385120335673823e-07, "loss": 0.2691, "step": 31800 }, { "epoch": 2.7982054890921884, "grad_norm": 10.263291358947754, "learning_rate": 1.373183205680506e-07, "loss": 0.316, "step": 31810 }, { "epoch": 2.7990851513019, "grad_norm": 9.298303604125977, "learning_rate": 1.3612970201362042e-07, "loss": 0.2926, "step": 31820 }, { "epoch": 2.7999648135116115, "grad_norm": 10.175630569458008, "learning_rate": 1.3494617914935947e-07, "loss": 0.2491, "step": 31830 }, { "epoch": 2.800844475721323, "grad_norm": 10.62816047668457, "learning_rate": 1.3376775321520096e-07, "loss": 0.2641, "step": 31840 }, { "epoch": 2.8017241379310347, "grad_norm": 9.729991912841797, "learning_rate": 1.325944254457362e-07, "loss": 0.2102, "step": 31850 }, { "epoch": 2.802603800140746, "grad_norm": 10.691816329956055, "learning_rate": 1.314261970702152e-07, "loss": 0.2631, "step": 31860 }, { "epoch": 2.8034834623504574, "grad_norm": 10.146700859069824, "learning_rate": 1.3026306931254618e-07, "loss": 0.2405, "step": 31870 }, { "epoch": 2.804363124560169, "grad_norm": 8.159823417663574, "learning_rate": 1.2910504339129382e-07, "loss": 0.2411, "step": 31880 }, { "epoch": 2.80524278676988, "grad_norm": 9.023846626281738, "learning_rate": 1.279521205196771e-07, "loss": 0.2634, "step": 31890 }, { "epoch": 2.806122448979592, "grad_norm": 11.689916610717773, "learning_rate": 1.2680430190556926e-07, "loss": 0.2175, "step": 31900 }, { "epoch": 2.8070021111893033, "grad_norm": 9.623178482055664, "learning_rate": 1.2566158875149448e-07, "loss": 0.2417, "step": 31910 }, { "epoch": 2.8078817733990147, "grad_norm": 9.253336906433105, "learning_rate": 1.2452398225463126e-07, "loss": 0.2559, "step": 31920 }, { "epoch": 2.8087614356087265, "grad_norm": 8.470133781433105, "learning_rate": 1.2339148360680564e-07, "loss": 0.2569, "step": 31930 }, { "epoch": 2.809641097818438, "grad_norm": 8.434446334838867, "learning_rate": 1.2226409399449303e-07, "loss": 0.2414, "step": 31940 }, { "epoch": 2.810520760028149, "grad_norm": 12.16212272644043, "learning_rate": 1.211418145988158e-07, "loss": 0.2541, "step": 31950 }, { "epoch": 2.8114004222378606, "grad_norm": 10.510778427124023, "learning_rate": 1.2002464659554236e-07, "loss": 0.2258, "step": 31960 }, { "epoch": 2.812280084447572, "grad_norm": 9.424552917480469, "learning_rate": 1.1891259115508869e-07, "loss": 0.2437, "step": 31970 }, { "epoch": 2.8131597466572837, "grad_norm": 8.200555801391602, "learning_rate": 1.1780564944251172e-07, "loss": 0.2467, "step": 31980 }, { "epoch": 2.814039408866995, "grad_norm": 10.773746490478516, "learning_rate": 1.1670382261751046e-07, "loss": 0.2214, "step": 31990 }, { "epoch": 2.8149190710767065, "grad_norm": 8.540154457092285, "learning_rate": 1.156071118344293e-07, "loss": 0.2234, "step": 32000 }, { "epoch": 2.8157987332864183, "grad_norm": 9.762429237365723, "learning_rate": 1.1451551824224694e-07, "loss": 0.2864, "step": 32010 }, { "epoch": 2.8166783954961296, "grad_norm": 11.231390953063965, "learning_rate": 1.1342904298458634e-07, "loss": 0.257, "step": 32020 }, { "epoch": 2.817558057705841, "grad_norm": 14.313315391540527, "learning_rate": 1.123476871997048e-07, "loss": 0.28, "step": 32030 }, { "epoch": 2.8184377199155524, "grad_norm": 8.229467391967773, "learning_rate": 1.1127145202049717e-07, "loss": 0.2672, "step": 32040 }, { "epoch": 2.8193173821252637, "grad_norm": 8.322956085205078, "learning_rate": 1.1020033857449486e-07, "loss": 0.2303, "step": 32050 }, { "epoch": 2.8201970443349755, "grad_norm": 11.256406784057617, "learning_rate": 1.0913434798386191e-07, "loss": 0.2744, "step": 32060 }, { "epoch": 2.821076706544687, "grad_norm": 10.719897270202637, "learning_rate": 1.0807348136539497e-07, "loss": 0.2057, "step": 32070 }, { "epoch": 2.8219563687543983, "grad_norm": 13.208366394042969, "learning_rate": 1.0701773983052333e-07, "loss": 0.2593, "step": 32080 }, { "epoch": 2.8228360309641096, "grad_norm": 9.660809516906738, "learning_rate": 1.0596712448530778e-07, "loss": 0.2084, "step": 32090 }, { "epoch": 2.8237156931738214, "grad_norm": 10.168910026550293, "learning_rate": 1.0492163643043674e-07, "loss": 0.2508, "step": 32100 }, { "epoch": 2.824595355383533, "grad_norm": 9.190484046936035, "learning_rate": 1.038812767612285e-07, "loss": 0.253, "step": 32110 }, { "epoch": 2.825475017593244, "grad_norm": 9.227503776550293, "learning_rate": 1.0284604656762787e-07, "loss": 0.2433, "step": 32120 }, { "epoch": 2.8263546798029555, "grad_norm": 10.307750701904297, "learning_rate": 1.018159469342056e-07, "loss": 0.2189, "step": 32130 }, { "epoch": 2.827234342012667, "grad_norm": 7.137749671936035, "learning_rate": 1.0079097894015733e-07, "loss": 0.2586, "step": 32140 }, { "epoch": 2.8281140042223787, "grad_norm": 7.411183834075928, "learning_rate": 9.977114365930296e-08, "loss": 0.2162, "step": 32150 }, { "epoch": 2.82899366643209, "grad_norm": 8.655733108520508, "learning_rate": 9.875644216008506e-08, "loss": 0.2083, "step": 32160 }, { "epoch": 2.8298733286418014, "grad_norm": 7.573199272155762, "learning_rate": 9.774687550556716e-08, "loss": 0.2941, "step": 32170 }, { "epoch": 2.830752990851513, "grad_norm": 10.667156219482422, "learning_rate": 9.67424447534332e-08, "loss": 0.2368, "step": 32180 }, { "epoch": 2.8316326530612246, "grad_norm": 10.697809219360352, "learning_rate": 9.574315095598697e-08, "loss": 0.281, "step": 32190 }, { "epoch": 2.832512315270936, "grad_norm": 10.230324745178223, "learning_rate": 9.47489951601499e-08, "loss": 0.2634, "step": 32200 }, { "epoch": 2.8333919774806473, "grad_norm": 7.479730129241943, "learning_rate": 9.37599784074611e-08, "loss": 0.2483, "step": 32210 }, { "epoch": 2.8342716396903587, "grad_norm": 10.717842102050781, "learning_rate": 9.277610173407615e-08, "loss": 0.3039, "step": 32220 }, { "epoch": 2.8351513019000705, "grad_norm": 12.183694839477539, "learning_rate": 9.179736617076329e-08, "loss": 0.297, "step": 32230 }, { "epoch": 2.836030964109782, "grad_norm": 8.75050163269043, "learning_rate": 9.082377274290788e-08, "loss": 0.2994, "step": 32240 }, { "epoch": 2.836910626319493, "grad_norm": 8.652719497680664, "learning_rate": 8.98553224705051e-08, "loss": 0.2448, "step": 32250 }, { "epoch": 2.837790288529205, "grad_norm": 10.901712417602539, "learning_rate": 8.889201636816391e-08, "loss": 0.2401, "step": 32260 }, { "epoch": 2.8386699507389164, "grad_norm": 9.630731582641602, "learning_rate": 8.793385544510314e-08, "loss": 0.254, "step": 32270 }, { "epoch": 2.8395496129486277, "grad_norm": 11.411863327026367, "learning_rate": 8.698084070515034e-08, "loss": 0.2029, "step": 32280 }, { "epoch": 2.840429275158339, "grad_norm": 9.607818603515625, "learning_rate": 8.603297314674297e-08, "loss": 0.273, "step": 32290 }, { "epoch": 2.8413089373680505, "grad_norm": 9.135359764099121, "learning_rate": 8.509025376292612e-08, "loss": 0.268, "step": 32300 }, { "epoch": 2.8421885995777623, "grad_norm": 12.208234786987305, "learning_rate": 8.415268354134976e-08, "loss": 0.2183, "step": 32310 }, { "epoch": 2.8430682617874736, "grad_norm": 6.660031318664551, "learning_rate": 8.322026346427037e-08, "loss": 0.2644, "step": 32320 }, { "epoch": 2.843947923997185, "grad_norm": 11.252057075500488, "learning_rate": 8.229299450854877e-08, "loss": 0.2168, "step": 32330 }, { "epoch": 2.844827586206897, "grad_norm": 8.26291561126709, "learning_rate": 8.137087764564955e-08, "loss": 0.2827, "step": 32340 }, { "epoch": 2.845707248416608, "grad_norm": 8.656943321228027, "learning_rate": 8.04539138416388e-08, "loss": 0.2685, "step": 32350 }, { "epoch": 2.8465869106263195, "grad_norm": 11.703451156616211, "learning_rate": 7.954210405718365e-08, "loss": 0.2493, "step": 32360 }, { "epoch": 2.847466572836031, "grad_norm": 13.312135696411133, "learning_rate": 7.863544924755328e-08, "loss": 0.2502, "step": 32370 }, { "epoch": 2.8483462350457422, "grad_norm": 9.25203800201416, "learning_rate": 7.773395036261456e-08, "loss": 0.2707, "step": 32380 }, { "epoch": 2.849225897255454, "grad_norm": 7.441296577453613, "learning_rate": 7.683760834683307e-08, "loss": 0.2488, "step": 32390 }, { "epoch": 2.8501055594651654, "grad_norm": 8.636082649230957, "learning_rate": 7.594642413927211e-08, "loss": 0.2497, "step": 32400 }, { "epoch": 2.850985221674877, "grad_norm": 14.062500953674316, "learning_rate": 7.506039867359149e-08, "loss": 0.2637, "step": 32410 }, { "epoch": 2.8518648838845886, "grad_norm": 7.8036932945251465, "learning_rate": 7.417953287804535e-08, "loss": 0.2318, "step": 32420 }, { "epoch": 2.8527445460943, "grad_norm": 8.875176429748535, "learning_rate": 7.330382767548438e-08, "loss": 0.2495, "step": 32430 }, { "epoch": 2.8536242083040113, "grad_norm": 10.92945671081543, "learning_rate": 7.243328398334914e-08, "loss": 0.2391, "step": 32440 }, { "epoch": 2.8545038705137227, "grad_norm": 6.371419906616211, "learning_rate": 7.156790271367675e-08, "loss": 0.2322, "step": 32450 }, { "epoch": 2.855383532723434, "grad_norm": 12.08604621887207, "learning_rate": 7.070768477309309e-08, "loss": 0.2729, "step": 32460 }, { "epoch": 2.8562631949331454, "grad_norm": 9.874917030334473, "learning_rate": 6.985263106281559e-08, "loss": 0.265, "step": 32470 }, { "epoch": 2.857142857142857, "grad_norm": 13.764368057250977, "learning_rate": 6.900274247865102e-08, "loss": 0.2598, "step": 32480 }, { "epoch": 2.8580225193525686, "grad_norm": 10.636714935302734, "learning_rate": 6.815801991099546e-08, "loss": 0.2245, "step": 32490 }, { "epoch": 2.85890218156228, "grad_norm": 8.018701553344727, "learning_rate": 6.731846424483213e-08, "loss": 0.2354, "step": 32500 }, { "epoch": 2.8597818437719917, "grad_norm": 10.222970008850098, "learning_rate": 6.648407635973075e-08, "loss": 0.3089, "step": 32510 }, { "epoch": 2.860661505981703, "grad_norm": 9.386000633239746, "learning_rate": 6.56548571298482e-08, "loss": 0.258, "step": 32520 }, { "epoch": 2.8615411681914145, "grad_norm": 10.807276725769043, "learning_rate": 6.483080742392511e-08, "loss": 0.2599, "step": 32530 }, { "epoch": 2.862420830401126, "grad_norm": 9.54965591430664, "learning_rate": 6.401192810528755e-08, "loss": 0.235, "step": 32540 }, { "epoch": 2.863300492610837, "grad_norm": 10.52071475982666, "learning_rate": 6.319822003184261e-08, "loss": 0.2189, "step": 32550 }, { "epoch": 2.864180154820549, "grad_norm": 10.82470989227295, "learning_rate": 6.238968405608226e-08, "loss": 0.2666, "step": 32560 }, { "epoch": 2.8650598170302604, "grad_norm": 9.819168090820312, "learning_rate": 6.158632102507778e-08, "loss": 0.242, "step": 32570 }, { "epoch": 2.8659394792399717, "grad_norm": 8.183506965637207, "learning_rate": 6.078813178048205e-08, "loss": 0.2564, "step": 32580 }, { "epoch": 2.8668191414496835, "grad_norm": 9.023285865783691, "learning_rate": 5.999511715852779e-08, "loss": 0.2143, "step": 32590 }, { "epoch": 2.867698803659395, "grad_norm": 10.497591972351074, "learning_rate": 5.92072779900249e-08, "loss": 0.2448, "step": 32600 }, { "epoch": 2.8685784658691063, "grad_norm": 8.36562442779541, "learning_rate": 5.842461510036312e-08, "loss": 0.2527, "step": 32610 }, { "epoch": 2.8694581280788176, "grad_norm": 10.060345649719238, "learning_rate": 5.764712930950822e-08, "loss": 0.2392, "step": 32620 }, { "epoch": 2.870337790288529, "grad_norm": 8.57156753540039, "learning_rate": 5.6874821432001405e-08, "loss": 0.2427, "step": 32630 }, { "epoch": 2.871217452498241, "grad_norm": 11.771944046020508, "learning_rate": 5.6107692276961e-08, "loss": 0.2389, "step": 32640 }, { "epoch": 2.872097114707952, "grad_norm": 13.209012985229492, "learning_rate": 5.534574264807802e-08, "loss": 0.2043, "step": 32650 }, { "epoch": 2.8729767769176635, "grad_norm": 9.269519805908203, "learning_rate": 5.4588973343618345e-08, "loss": 0.2266, "step": 32660 }, { "epoch": 2.8738564391273753, "grad_norm": 8.806354522705078, "learning_rate": 5.383738515642056e-08, "loss": 0.2285, "step": 32670 }, { "epoch": 2.8747361013370867, "grad_norm": 13.099539756774902, "learning_rate": 5.309097887389314e-08, "loss": 0.2619, "step": 32680 }, { "epoch": 2.875615763546798, "grad_norm": 9.137969017028809, "learning_rate": 5.2349755278018885e-08, "loss": 0.2736, "step": 32690 }, { "epoch": 2.8764954257565094, "grad_norm": 6.8905792236328125, "learning_rate": 5.1613715145348855e-08, "loss": 0.2282, "step": 32700 }, { "epoch": 2.8773750879662208, "grad_norm": 8.732641220092773, "learning_rate": 5.088285924700342e-08, "loss": 0.2204, "step": 32710 }, { "epoch": 2.8782547501759326, "grad_norm": 8.645782470703125, "learning_rate": 5.015718834867289e-08, "loss": 0.2496, "step": 32720 }, { "epoch": 2.879134412385644, "grad_norm": 7.95278787612915, "learning_rate": 4.9436703210614665e-08, "loss": 0.2556, "step": 32730 }, { "epoch": 2.8800140745953553, "grad_norm": 12.770002365112305, "learning_rate": 4.872140458765329e-08, "loss": 0.2303, "step": 32740 }, { "epoch": 2.880893736805067, "grad_norm": 7.619011402130127, "learning_rate": 4.8011293229179855e-08, "loss": 0.2951, "step": 32750 }, { "epoch": 2.8817733990147785, "grad_norm": 7.902576923370361, "learning_rate": 4.730636987915038e-08, "loss": 0.2739, "step": 32760 }, { "epoch": 2.88265306122449, "grad_norm": 9.821297645568848, "learning_rate": 4.6606635276086334e-08, "loss": 0.2351, "step": 32770 }, { "epoch": 2.883532723434201, "grad_norm": 10.085095405578613, "learning_rate": 4.591209015307241e-08, "loss": 0.2536, "step": 32780 }, { "epoch": 2.8844123856439126, "grad_norm": 8.42567253112793, "learning_rate": 4.522273523775767e-08, "loss": 0.2326, "step": 32790 }, { "epoch": 2.8852920478536244, "grad_norm": 5.844264984130859, "learning_rate": 4.453857125235217e-08, "loss": 0.2616, "step": 32800 }, { "epoch": 2.8861717100633357, "grad_norm": 9.220449447631836, "learning_rate": 4.385959891362812e-08, "loss": 0.2219, "step": 32810 }, { "epoch": 2.887051372273047, "grad_norm": 10.559328079223633, "learning_rate": 4.3185818932918735e-08, "loss": 0.2529, "step": 32820 }, { "epoch": 2.887931034482759, "grad_norm": 7.533463001251221, "learning_rate": 4.251723201611824e-08, "loss": 0.2563, "step": 32830 }, { "epoch": 2.8888106966924703, "grad_norm": 8.21745491027832, "learning_rate": 4.1853838863678554e-08, "loss": 0.1838, "step": 32840 }, { "epoch": 2.8896903589021816, "grad_norm": 8.622432708740234, "learning_rate": 4.119564017061206e-08, "loss": 0.2473, "step": 32850 }, { "epoch": 2.890570021111893, "grad_norm": 8.779812812805176, "learning_rate": 4.0542636626487165e-08, "loss": 0.2071, "step": 32860 }, { "epoch": 2.8914496833216043, "grad_norm": 8.579705238342285, "learning_rate": 3.9894828915432174e-08, "loss": 0.2196, "step": 32870 }, { "epoch": 2.8923293455313157, "grad_norm": 10.293035507202148, "learning_rate": 3.925221771612864e-08, "loss": 0.2738, "step": 32880 }, { "epoch": 2.8932090077410275, "grad_norm": 7.439376354217529, "learning_rate": 3.861480370181636e-08, "loss": 0.2491, "step": 32890 }, { "epoch": 2.894088669950739, "grad_norm": 8.080939292907715, "learning_rate": 3.798258754029005e-08, "loss": 0.2248, "step": 32900 }, { "epoch": 2.8949683321604502, "grad_norm": 9.89918327331543, "learning_rate": 3.735556989389766e-08, "loss": 0.272, "step": 32910 }, { "epoch": 2.895847994370162, "grad_norm": 11.671601295471191, "learning_rate": 3.6733751419541494e-08, "loss": 0.2482, "step": 32920 }, { "epoch": 2.8967276565798734, "grad_norm": 7.178308010101318, "learning_rate": 3.6117132768677676e-08, "loss": 0.2121, "step": 32930 }, { "epoch": 2.8976073187895848, "grad_norm": 11.013496398925781, "learning_rate": 3.5505714587312775e-08, "loss": 0.2606, "step": 32940 }, { "epoch": 2.898486980999296, "grad_norm": 7.146206378936768, "learning_rate": 3.489949751600663e-08, "loss": 0.2689, "step": 32950 }, { "epoch": 2.8993666432090075, "grad_norm": 10.130054473876953, "learning_rate": 3.429848218987009e-08, "loss": 0.2431, "step": 32960 }, { "epoch": 2.9002463054187193, "grad_norm": 11.860734939575195, "learning_rate": 3.3702669238562804e-08, "loss": 0.2499, "step": 32970 }, { "epoch": 2.9011259676284307, "grad_norm": 9.694138526916504, "learning_rate": 3.3112059286296575e-08, "loss": 0.2252, "step": 32980 }, { "epoch": 2.902005629838142, "grad_norm": 8.458369255065918, "learning_rate": 3.252665295182922e-08, "loss": 0.2388, "step": 32990 }, { "epoch": 2.902885292047854, "grad_norm": 8.014810562133789, "learning_rate": 3.194645084846904e-08, "loss": 0.2686, "step": 33000 }, { "epoch": 2.903764954257565, "grad_norm": 10.555482864379883, "learning_rate": 3.137145358407201e-08, "loss": 0.2388, "step": 33010 }, { "epoch": 2.9046446164672766, "grad_norm": 7.554744720458984, "learning_rate": 3.080166176104016e-08, "loss": 0.25, "step": 33020 }, { "epoch": 2.905524278676988, "grad_norm": 7.614463806152344, "learning_rate": 3.0237075976322636e-08, "loss": 0.289, "step": 33030 }, { "epoch": 2.9064039408866993, "grad_norm": 10.582052230834961, "learning_rate": 2.9677696821414636e-08, "loss": 0.2475, "step": 33040 }, { "epoch": 2.907283603096411, "grad_norm": 8.471565246582031, "learning_rate": 2.9123524882355704e-08, "loss": 0.2753, "step": 33050 }, { "epoch": 2.9081632653061225, "grad_norm": 7.295182228088379, "learning_rate": 2.8574560739730305e-08, "loss": 0.2557, "step": 33060 }, { "epoch": 2.909042927515834, "grad_norm": 12.938552856445312, "learning_rate": 2.8030804968667836e-08, "loss": 0.261, "step": 33070 }, { "epoch": 2.9099225897255456, "grad_norm": 7.929080963134766, "learning_rate": 2.7492258138839822e-08, "loss": 0.2305, "step": 33080 }, { "epoch": 2.910802251935257, "grad_norm": 9.294986724853516, "learning_rate": 2.6958920814462162e-08, "loss": 0.2287, "step": 33090 }, { "epoch": 2.9116819141449684, "grad_norm": 8.087462425231934, "learning_rate": 2.6430793554290122e-08, "loss": 0.252, "step": 33100 }, { "epoch": 2.9125615763546797, "grad_norm": 12.515022277832031, "learning_rate": 2.5907876911623332e-08, "loss": 0.277, "step": 33110 }, { "epoch": 2.913441238564391, "grad_norm": 8.738909721374512, "learning_rate": 2.5390171434301336e-08, "loss": 0.2469, "step": 33120 }, { "epoch": 2.914320900774103, "grad_norm": 10.20217514038086, "learning_rate": 2.487767766470417e-08, "loss": 0.3039, "step": 33130 }, { "epoch": 2.9152005629838142, "grad_norm": 12.114776611328125, "learning_rate": 2.437039613975234e-08, "loss": 0.2673, "step": 33140 }, { "epoch": 2.9160802251935256, "grad_norm": 8.783181190490723, "learning_rate": 2.3868327390905167e-08, "loss": 0.2574, "step": 33150 }, { "epoch": 2.9169598874032374, "grad_norm": 6.961308002471924, "learning_rate": 2.337147194416023e-08, "loss": 0.2557, "step": 33160 }, { "epoch": 2.917839549612949, "grad_norm": 11.260627746582031, "learning_rate": 2.2879830320054475e-08, "loss": 0.2662, "step": 33170 }, { "epoch": 2.91871921182266, "grad_norm": 13.388823509216309, "learning_rate": 2.2393403033661444e-08, "loss": 0.2712, "step": 33180 }, { "epoch": 2.9195988740323715, "grad_norm": 15.514480590820312, "learning_rate": 2.191219059459293e-08, "loss": 0.3126, "step": 33190 }, { "epoch": 2.920478536242083, "grad_norm": 8.064638137817383, "learning_rate": 2.1436193506996216e-08, "loss": 0.2415, "step": 33200 }, { "epoch": 2.9213581984517947, "grad_norm": 7.884963512420654, "learning_rate": 2.0965412269555174e-08, "loss": 0.2429, "step": 33210 }, { "epoch": 2.922237860661506, "grad_norm": 8.071057319641113, "learning_rate": 2.049984737548971e-08, "loss": 0.2547, "step": 33220 }, { "epoch": 2.9231175228712174, "grad_norm": 9.139983177185059, "learning_rate": 2.003949931255411e-08, "loss": 0.2602, "step": 33230 }, { "epoch": 2.9239971850809288, "grad_norm": 8.26064682006836, "learning_rate": 1.9584368563036472e-08, "loss": 0.2487, "step": 33240 }, { "epoch": 2.9248768472906406, "grad_norm": 6.971067905426025, "learning_rate": 1.9134455603761482e-08, "loss": 0.2495, "step": 33250 }, { "epoch": 2.925756509500352, "grad_norm": 8.537172317504883, "learning_rate": 1.868976090608432e-08, "loss": 0.2545, "step": 33260 }, { "epoch": 2.9266361717100633, "grad_norm": 8.919508934020996, "learning_rate": 1.825028493589509e-08, "loss": 0.2385, "step": 33270 }, { "epoch": 2.9275158339197747, "grad_norm": 6.409901142120361, "learning_rate": 1.781602815361605e-08, "loss": 0.2422, "step": 33280 }, { "epoch": 2.928395496129486, "grad_norm": 9.761019706726074, "learning_rate": 1.738699101420105e-08, "loss": 0.2816, "step": 33290 }, { "epoch": 2.929275158339198, "grad_norm": 9.501956939697266, "learning_rate": 1.696317396713554e-08, "loss": 0.2862, "step": 33300 }, { "epoch": 2.930154820548909, "grad_norm": 11.43960952758789, "learning_rate": 1.6544577456437115e-08, "loss": 0.2733, "step": 33310 }, { "epoch": 2.9310344827586206, "grad_norm": 10.957393646240234, "learning_rate": 1.613120192065276e-08, "loss": 0.2721, "step": 33320 }, { "epoch": 2.9319141449683324, "grad_norm": 9.899510383605957, "learning_rate": 1.572304779286049e-08, "loss": 0.2494, "step": 33330 }, { "epoch": 2.9327938071780437, "grad_norm": 9.6947660446167, "learning_rate": 1.5320115500667144e-08, "loss": 0.2463, "step": 33340 }, { "epoch": 2.933673469387755, "grad_norm": 5.297719955444336, "learning_rate": 1.4922405466210044e-08, "loss": 0.2298, "step": 33350 }, { "epoch": 2.9345531315974664, "grad_norm": 10.799773216247559, "learning_rate": 1.4529918106154228e-08, "loss": 0.2489, "step": 33360 }, { "epoch": 2.935432793807178, "grad_norm": 8.792510986328125, "learning_rate": 1.4142653831694109e-08, "loss": 0.2306, "step": 33370 }, { "epoch": 2.9363124560168896, "grad_norm": 12.777946472167969, "learning_rate": 1.3760613048551253e-08, "loss": 0.2587, "step": 33380 }, { "epoch": 2.937192118226601, "grad_norm": 9.354962348937988, "learning_rate": 1.3383796156974937e-08, "loss": 0.2407, "step": 33390 }, { "epoch": 2.9380717804363123, "grad_norm": 7.616337776184082, "learning_rate": 1.3012203551742708e-08, "loss": 0.259, "step": 33400 }, { "epoch": 2.938951442646024, "grad_norm": 8.383606910705566, "learning_rate": 1.2645835622156488e-08, "loss": 0.2254, "step": 33410 }, { "epoch": 2.9398311048557355, "grad_norm": 8.561638832092285, "learning_rate": 1.2284692752047023e-08, "loss": 0.2444, "step": 33420 }, { "epoch": 2.940710767065447, "grad_norm": 7.782486915588379, "learning_rate": 1.1928775319768882e-08, "loss": 0.193, "step": 33430 }, { "epoch": 2.9415904292751582, "grad_norm": 10.560375213623047, "learning_rate": 1.1578083698204346e-08, "loss": 0.2607, "step": 33440 }, { "epoch": 2.9424700914848696, "grad_norm": 8.411771774291992, "learning_rate": 1.123261825475841e-08, "loss": 0.2825, "step": 33450 }, { "epoch": 2.9433497536945814, "grad_norm": 6.3587212562561035, "learning_rate": 1.089237935136267e-08, "loss": 0.235, "step": 33460 }, { "epoch": 2.9442294159042928, "grad_norm": 11.614686012268066, "learning_rate": 1.0557367344471436e-08, "loss": 0.2535, "step": 33470 }, { "epoch": 2.945109078114004, "grad_norm": 8.587145805358887, "learning_rate": 1.0227582585065065e-08, "loss": 0.2799, "step": 33480 }, { "epoch": 2.945988740323716, "grad_norm": 9.496132850646973, "learning_rate": 9.903025418646073e-09, "loss": 0.2595, "step": 33490 }, { "epoch": 2.9468684025334273, "grad_norm": 10.519966125488281, "learning_rate": 9.583696185239687e-09, "loss": 0.2431, "step": 33500 }, { "epoch": 2.9477480647431387, "grad_norm": 11.90830135345459, "learning_rate": 9.269595219396076e-09, "loss": 0.2301, "step": 33510 }, { "epoch": 2.94862772695285, "grad_norm": 7.631098747253418, "learning_rate": 8.960722850185899e-09, "loss": 0.2765, "step": 33520 }, { "epoch": 2.9495073891625614, "grad_norm": 8.635032653808594, "learning_rate": 8.657079401203639e-09, "loss": 0.2016, "step": 33530 }, { "epoch": 2.950387051372273, "grad_norm": 8.623146057128906, "learning_rate": 8.358665190564274e-09, "loss": 0.2187, "step": 33540 }, { "epoch": 2.9512667135819846, "grad_norm": 9.282051086425781, "learning_rate": 8.065480530904946e-09, "loss": 0.2267, "step": 33550 }, { "epoch": 2.952146375791696, "grad_norm": 7.176925182342529, "learning_rate": 7.777525729384395e-09, "loss": 0.2067, "step": 33560 }, { "epoch": 2.9530260380014077, "grad_norm": 10.370579719543457, "learning_rate": 7.494801087681303e-09, "loss": 0.2539, "step": 33570 }, { "epoch": 2.953905700211119, "grad_norm": 15.978102684020996, "learning_rate": 7.2173069019954024e-09, "loss": 0.2564, "step": 33580 }, { "epoch": 2.9547853624208305, "grad_norm": 10.790876388549805, "learning_rate": 6.945043463047474e-09, "loss": 0.2279, "step": 33590 }, { "epoch": 2.955665024630542, "grad_norm": 10.836160659790039, "learning_rate": 6.678011056076572e-09, "loss": 0.212, "step": 33600 }, { "epoch": 2.956544686840253, "grad_norm": 7.041357517242432, "learning_rate": 6.416209960842801e-09, "loss": 0.2301, "step": 33610 }, { "epoch": 2.9574243490499645, "grad_norm": 9.003617286682129, "learning_rate": 6.159640451625093e-09, "loss": 0.2595, "step": 33620 }, { "epoch": 2.9583040112596763, "grad_norm": 9.65697956085205, "learning_rate": 5.9083027972212104e-09, "loss": 0.2149, "step": 33630 }, { "epoch": 2.9591836734693877, "grad_norm": 8.786408424377441, "learning_rate": 5.662197260948299e-09, "loss": 0.261, "step": 33640 }, { "epoch": 2.960063335679099, "grad_norm": 10.742688179016113, "learning_rate": 5.421324100641778e-09, "loss": 0.2633, "step": 33650 }, { "epoch": 2.960942997888811, "grad_norm": 10.487140655517578, "learning_rate": 5.1856835686542315e-09, "loss": 0.2961, "step": 33660 }, { "epoch": 2.9618226600985222, "grad_norm": 10.437482833862305, "learning_rate": 4.9552759118581815e-09, "loss": 0.2543, "step": 33670 }, { "epoch": 2.9627023223082336, "grad_norm": 6.9348225593566895, "learning_rate": 4.730101371642759e-09, "loss": 0.3103, "step": 33680 }, { "epoch": 2.963581984517945, "grad_norm": 8.705235481262207, "learning_rate": 4.510160183914258e-09, "loss": 0.248, "step": 33690 }, { "epoch": 2.9644616467276563, "grad_norm": 8.136268615722656, "learning_rate": 4.295452579097248e-09, "loss": 0.2304, "step": 33700 }, { "epoch": 2.965341308937368, "grad_norm": 8.082895278930664, "learning_rate": 4.0859787821323495e-09, "loss": 0.2293, "step": 33710 }, { "epoch": 2.9662209711470795, "grad_norm": 8.789935111999512, "learning_rate": 3.881739012476793e-09, "loss": 0.208, "step": 33720 }, { "epoch": 2.967100633356791, "grad_norm": 8.411272048950195, "learning_rate": 3.6827334841049722e-09, "loss": 0.2029, "step": 33730 }, { "epoch": 2.9679802955665027, "grad_norm": 9.793478965759277, "learning_rate": 3.4889624055078895e-09, "loss": 0.2387, "step": 33740 }, { "epoch": 2.968859957776214, "grad_norm": 13.224653244018555, "learning_rate": 3.3004259796914904e-09, "loss": 0.2941, "step": 33750 }, { "epoch": 2.9697396199859254, "grad_norm": 9.50903606414795, "learning_rate": 3.1171244041777735e-09, "loss": 0.2608, "step": 33760 }, { "epoch": 2.9706192821956368, "grad_norm": 7.975781440734863, "learning_rate": 2.9390578710059015e-09, "loss": 0.2306, "step": 33770 }, { "epoch": 2.971498944405348, "grad_norm": 8.43726634979248, "learning_rate": 2.766226566728869e-09, "loss": 0.2447, "step": 33780 }, { "epoch": 2.97237860661506, "grad_norm": 8.042189598083496, "learning_rate": 2.5986306724146147e-09, "loss": 0.2435, "step": 33790 }, { "epoch": 2.9732582688247713, "grad_norm": 11.750335693359375, "learning_rate": 2.436270363648241e-09, "loss": 0.2311, "step": 33800 }, { "epoch": 2.9741379310344827, "grad_norm": 11.66080379486084, "learning_rate": 2.279145810527572e-09, "loss": 0.2714, "step": 33810 }, { "epoch": 2.9750175932441945, "grad_norm": 8.352204322814941, "learning_rate": 2.127257177665931e-09, "loss": 0.2422, "step": 33820 }, { "epoch": 2.975897255453906, "grad_norm": 11.924675941467285, "learning_rate": 1.980604624191029e-09, "loss": 0.209, "step": 33830 }, { "epoch": 2.976776917663617, "grad_norm": 11.479485511779785, "learning_rate": 1.8391883037460756e-09, "loss": 0.2728, "step": 33840 }, { "epoch": 2.9776565798733285, "grad_norm": 9.403388977050781, "learning_rate": 1.7030083644870022e-09, "loss": 0.2237, "step": 33850 }, { "epoch": 2.97853624208304, "grad_norm": 11.5449800491333, "learning_rate": 1.5720649490841288e-09, "loss": 0.2947, "step": 33860 }, { "epoch": 2.9794159042927517, "grad_norm": 8.40696907043457, "learning_rate": 1.4463581947216087e-09, "loss": 0.2607, "step": 33870 }, { "epoch": 2.980295566502463, "grad_norm": 8.978914260864258, "learning_rate": 1.3258882330979827e-09, "loss": 0.2552, "step": 33880 }, { "epoch": 2.9811752287121744, "grad_norm": 11.220206260681152, "learning_rate": 1.2106551904256247e-09, "loss": 0.2733, "step": 33890 }, { "epoch": 2.9820548909218862, "grad_norm": 9.741687774658203, "learning_rate": 1.100659187428521e-09, "loss": 0.2486, "step": 33900 }, { "epoch": 2.9829345531315976, "grad_norm": 11.845671653747559, "learning_rate": 9.959003393456012e-10, "loss": 0.2345, "step": 33910 }, { "epoch": 2.983814215341309, "grad_norm": 16.310115814208984, "learning_rate": 8.963787559296278e-10, "loss": 0.2478, "step": 33920 }, { "epoch": 2.9846938775510203, "grad_norm": 7.665103912353516, "learning_rate": 8.020945414444203e-10, "loss": 0.2619, "step": 33930 }, { "epoch": 2.9855735397607317, "grad_norm": 8.259710311889648, "learning_rate": 7.130477946681869e-10, "loss": 0.2251, "step": 33940 }, { "epoch": 2.9864532019704435, "grad_norm": 9.41935920715332, "learning_rate": 6.292386088924129e-10, "loss": 0.246, "step": 33950 }, { "epoch": 2.987332864180155, "grad_norm": 9.716100692749023, "learning_rate": 5.506670719196417e-10, "loss": 0.253, "step": 33960 }, { "epoch": 2.9882125263898662, "grad_norm": 7.610142230987549, "learning_rate": 4.77333266067359e-10, "loss": 0.2285, "step": 33970 }, { "epoch": 2.989092188599578, "grad_norm": 9.878477096557617, "learning_rate": 4.092372681641088e-10, "loss": 0.2765, "step": 33980 }, { "epoch": 2.9899718508092894, "grad_norm": 10.409518241882324, "learning_rate": 3.4637914955171214e-10, "loss": 0.2517, "step": 33990 }, { "epoch": 2.9908515130190008, "grad_norm": 8.166799545288086, "learning_rate": 2.887589760836029e-10, "loss": 0.2267, "step": 34000 }, { "epoch": 2.991731175228712, "grad_norm": 9.701534271240234, "learning_rate": 2.363768081270479e-10, "loss": 0.2497, "step": 34010 }, { "epoch": 2.9926108374384235, "grad_norm": 9.702855110168457, "learning_rate": 1.8923270055981602e-10, "loss": 0.2717, "step": 34020 }, { "epoch": 2.993490499648135, "grad_norm": 9.330683708190918, "learning_rate": 1.4732670277406437e-10, "loss": 0.2544, "step": 34030 }, { "epoch": 2.9943701618578467, "grad_norm": 10.684103965759277, "learning_rate": 1.1065885867189707e-10, "loss": 0.2212, "step": 34040 }, { "epoch": 2.995249824067558, "grad_norm": 10.803227424621582, "learning_rate": 7.922920666980637e-11, "loss": 0.2559, "step": 34050 }, { "epoch": 2.9961294862772694, "grad_norm": 9.291400909423828, "learning_rate": 5.303777969534185e-11, "loss": 0.2402, "step": 34060 }, { "epoch": 2.997009148486981, "grad_norm": 9.152578353881836, "learning_rate": 3.208460518766554e-11, "loss": 0.2484, "step": 34070 }, { "epoch": 2.9978888106966926, "grad_norm": 10.296873092651367, "learning_rate": 1.6369705099217314e-11, "loss": 0.2375, "step": 34080 }, { "epoch": 2.998768472906404, "grad_norm": 7.6489458084106445, "learning_rate": 5.893095893494405e-12, "loss": 0.2329, "step": 34090 }, { "epoch": 2.9996481351161153, "grad_norm": 13.089485168457031, "learning_rate": 6.547885467167447e-13, "loss": 0.2758, "step": 34100 }, { "epoch": 3.0, "step": 34104, "total_flos": 4.166743018080043e+18, "train_loss": 0.6109988800263757, "train_runtime": 28639.0765, "train_samples_per_second": 19.053, "train_steps_per_second": 1.191 } ], "logging_steps": 10, "max_steps": 34104, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.166743018080043e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }