{ "best_global_step": 5852, "best_metric": 0.848216712474823, "best_model_checkpoint": "./arthur-ft/checkpoint-5852", "epoch": 1.9999006600148612, "eval_steps": 500, "global_step": 11704, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017086477443862974, "grad_norm": 81.49402618408203, "learning_rate": 1.278409090909091e-07, "loss": 1.5968, "step": 10 }, { "epoch": 0.0034172954887725948, "grad_norm": 27.926727294921875, "learning_rate": 2.6988636363636366e-07, "loss": 1.2647, "step": 20 }, { "epoch": 0.005125943233158892, "grad_norm": 26.150636672973633, "learning_rate": 4.119318181818182e-07, "loss": 1.1425, "step": 30 }, { "epoch": 0.0068345909775451895, "grad_norm": 32.89878463745117, "learning_rate": 5.539772727272728e-07, "loss": 1.1125, "step": 40 }, { "epoch": 0.008543238721931487, "grad_norm": 31.276884078979492, "learning_rate": 6.960227272727273e-07, "loss": 1.1157, "step": 50 }, { "epoch": 0.010251886466317785, "grad_norm": 22.583457946777344, "learning_rate": 8.380681818181818e-07, "loss": 1.1315, "step": 60 }, { "epoch": 0.011960534210704083, "grad_norm": 26.30866813659668, "learning_rate": 9.801136363636364e-07, "loss": 1.107, "step": 70 }, { "epoch": 0.013669181955090379, "grad_norm": 26.83280372619629, "learning_rate": 1.1221590909090909e-06, "loss": 1.1115, "step": 80 }, { "epoch": 0.015377829699476677, "grad_norm": 25.484037399291992, "learning_rate": 1.2642045454545456e-06, "loss": 1.1214, "step": 90 }, { "epoch": 0.017086477443862973, "grad_norm": 21.573806762695312, "learning_rate": 1.40625e-06, "loss": 1.0899, "step": 100 }, { "epoch": 0.018795125188249273, "grad_norm": 31.41425323486328, "learning_rate": 1.5482954545454546e-06, "loss": 1.1174, "step": 110 }, { "epoch": 0.02050377293263557, "grad_norm": 31.211849212646484, "learning_rate": 1.6903409090909093e-06, "loss": 1.1159, "step": 120 }, { "epoch": 0.022212420677021866, "grad_norm": 21.36687660217285, "learning_rate": 1.8323863636363638e-06, "loss": 1.0619, "step": 130 }, { "epoch": 0.023921068421408165, "grad_norm": 21.135522842407227, "learning_rate": 1.9744318181818183e-06, "loss": 1.0649, "step": 140 }, { "epoch": 0.025629716165794462, "grad_norm": 15.732582092285156, "learning_rate": 2.1164772727272728e-06, "loss": 1.0778, "step": 150 }, { "epoch": 0.027338363910180758, "grad_norm": 29.327178955078125, "learning_rate": 2.2585227272727277e-06, "loss": 1.0911, "step": 160 }, { "epoch": 0.029047011654567058, "grad_norm": 18.275588989257812, "learning_rate": 2.4005681818181818e-06, "loss": 1.0608, "step": 170 }, { "epoch": 0.030755659398953354, "grad_norm": 23.922975540161133, "learning_rate": 2.5426136363636367e-06, "loss": 1.0478, "step": 180 }, { "epoch": 0.03246430714333965, "grad_norm": 20.124839782714844, "learning_rate": 2.684659090909091e-06, "loss": 1.0195, "step": 190 }, { "epoch": 0.03417295488772595, "grad_norm": 17.850400924682617, "learning_rate": 2.8267045454545457e-06, "loss": 1.0701, "step": 200 }, { "epoch": 0.03588160263211224, "grad_norm": 21.76569175720215, "learning_rate": 2.96875e-06, "loss": 1.0621, "step": 210 }, { "epoch": 0.037590250376498546, "grad_norm": 18.49285125732422, "learning_rate": 3.110795454545455e-06, "loss": 1.0733, "step": 220 }, { "epoch": 0.03929889812088484, "grad_norm": 20.746557235717773, "learning_rate": 3.252840909090909e-06, "loss": 1.0711, "step": 230 }, { "epoch": 0.04100754586527114, "grad_norm": 22.905445098876953, "learning_rate": 3.3948863636363636e-06, "loss": 1.0938, "step": 240 }, { "epoch": 0.042716193609657435, "grad_norm": 21.94207763671875, "learning_rate": 3.5369318181818186e-06, "loss": 1.0307, "step": 250 }, { "epoch": 0.04442484135404373, "grad_norm": 19.50674819946289, "learning_rate": 3.678977272727273e-06, "loss": 1.0386, "step": 260 }, { "epoch": 0.04613348909843003, "grad_norm": 33.76179504394531, "learning_rate": 3.821022727272727e-06, "loss": 1.024, "step": 270 }, { "epoch": 0.04784213684281633, "grad_norm": 23.91062355041504, "learning_rate": 3.963068181818182e-06, "loss": 1.0586, "step": 280 }, { "epoch": 0.04955078458720263, "grad_norm": 20.59576988220215, "learning_rate": 4.105113636363637e-06, "loss": 1.0404, "step": 290 }, { "epoch": 0.051259432331588924, "grad_norm": 20.871490478515625, "learning_rate": 4.247159090909092e-06, "loss": 1.0237, "step": 300 }, { "epoch": 0.05296808007597522, "grad_norm": 20.965486526489258, "learning_rate": 4.389204545454546e-06, "loss": 0.9917, "step": 310 }, { "epoch": 0.054676727820361516, "grad_norm": 25.50008201599121, "learning_rate": 4.53125e-06, "loss": 1.0394, "step": 320 }, { "epoch": 0.05638537556474781, "grad_norm": 19.419902801513672, "learning_rate": 4.673295454545455e-06, "loss": 0.9999, "step": 330 }, { "epoch": 0.058094023309134116, "grad_norm": 22.62456512451172, "learning_rate": 4.815340909090909e-06, "loss": 1.0649, "step": 340 }, { "epoch": 0.05980267105352041, "grad_norm": 29.71078109741211, "learning_rate": 4.957386363636364e-06, "loss": 1.0064, "step": 350 }, { "epoch": 0.06151131879790671, "grad_norm": 22.22879409790039, "learning_rate": 4.9969168428470764e-06, "loss": 1.06, "step": 360 }, { "epoch": 0.063219966542293, "grad_norm": 21.7359619140625, "learning_rate": 4.992512332628612e-06, "loss": 1.043, "step": 370 }, { "epoch": 0.0649286142866793, "grad_norm": 18.66118812561035, "learning_rate": 4.988107822410148e-06, "loss": 0.9956, "step": 380 }, { "epoch": 0.0666372620310656, "grad_norm": 18.667190551757812, "learning_rate": 4.983703312191685e-06, "loss": 1.0341, "step": 390 }, { "epoch": 0.0683459097754519, "grad_norm": 21.578454971313477, "learning_rate": 4.979298801973221e-06, "loss": 1.0265, "step": 400 }, { "epoch": 0.07005455751983819, "grad_norm": 22.787809371948242, "learning_rate": 4.974894291754757e-06, "loss": 1.0062, "step": 410 }, { "epoch": 0.07176320526422449, "grad_norm": 15.971848487854004, "learning_rate": 4.970489781536293e-06, "loss": 0.9964, "step": 420 }, { "epoch": 0.0734718530086108, "grad_norm": 19.03502082824707, "learning_rate": 4.96608527131783e-06, "loss": 1.0245, "step": 430 }, { "epoch": 0.07518050075299709, "grad_norm": 19.77265167236328, "learning_rate": 4.9616807610993666e-06, "loss": 1.0321, "step": 440 }, { "epoch": 0.07688914849738339, "grad_norm": 26.002628326416016, "learning_rate": 4.957276250880902e-06, "loss": 1.024, "step": 450 }, { "epoch": 0.07859779624176969, "grad_norm": 20.502666473388672, "learning_rate": 4.952871740662439e-06, "loss": 1.02, "step": 460 }, { "epoch": 0.08030644398615598, "grad_norm": 25.582834243774414, "learning_rate": 4.948467230443975e-06, "loss": 0.9959, "step": 470 }, { "epoch": 0.08201509173054228, "grad_norm": 23.496889114379883, "learning_rate": 4.944062720225512e-06, "loss": 1.0149, "step": 480 }, { "epoch": 0.08372373947492857, "grad_norm": 19.036056518554688, "learning_rate": 4.9396582100070475e-06, "loss": 1.0096, "step": 490 }, { "epoch": 0.08543238721931487, "grad_norm": 17.741846084594727, "learning_rate": 4.935253699788584e-06, "loss": 1.0365, "step": 500 }, { "epoch": 0.08714103496370117, "grad_norm": 17.736528396606445, "learning_rate": 4.93084918957012e-06, "loss": 0.9791, "step": 510 }, { "epoch": 0.08884968270808746, "grad_norm": 15.833001136779785, "learning_rate": 4.926444679351657e-06, "loss": 1.0195, "step": 520 }, { "epoch": 0.09055833045247376, "grad_norm": 21.06598663330078, "learning_rate": 4.9220401691331925e-06, "loss": 1.0199, "step": 530 }, { "epoch": 0.09226697819686006, "grad_norm": 20.22286605834961, "learning_rate": 4.917635658914729e-06, "loss": 1.0549, "step": 540 }, { "epoch": 0.09397562594124637, "grad_norm": 18.53827476501465, "learning_rate": 4.913231148696265e-06, "loss": 1.0115, "step": 550 }, { "epoch": 0.09568427368563266, "grad_norm": 17.737276077270508, "learning_rate": 4.908826638477802e-06, "loss": 0.9855, "step": 560 }, { "epoch": 0.09739292143001896, "grad_norm": 16.152812957763672, "learning_rate": 4.9044221282593376e-06, "loss": 1.0029, "step": 570 }, { "epoch": 0.09910156917440525, "grad_norm": 22.577655792236328, "learning_rate": 4.900017618040874e-06, "loss": 0.9897, "step": 580 }, { "epoch": 0.10081021691879155, "grad_norm": 19.24541664123535, "learning_rate": 4.895613107822411e-06, "loss": 0.9428, "step": 590 }, { "epoch": 0.10251886466317785, "grad_norm": 14.999211311340332, "learning_rate": 4.891208597603947e-06, "loss": 1.0375, "step": 600 }, { "epoch": 0.10422751240756414, "grad_norm": 17.597455978393555, "learning_rate": 4.8868040873854835e-06, "loss": 0.9238, "step": 610 }, { "epoch": 0.10593616015195044, "grad_norm": 14.606751441955566, "learning_rate": 4.882399577167019e-06, "loss": 1.0094, "step": 620 }, { "epoch": 0.10764480789633674, "grad_norm": 19.107078552246094, "learning_rate": 4.877995066948556e-06, "loss": 1.0012, "step": 630 }, { "epoch": 0.10935345564072303, "grad_norm": 22.561248779296875, "learning_rate": 4.873590556730092e-06, "loss": 0.9683, "step": 640 }, { "epoch": 0.11106210338510933, "grad_norm": 20.76687240600586, "learning_rate": 4.869186046511628e-06, "loss": 0.9516, "step": 650 }, { "epoch": 0.11277075112949562, "grad_norm": 18.26988410949707, "learning_rate": 4.864781536293164e-06, "loss": 1.003, "step": 660 }, { "epoch": 0.11447939887388194, "grad_norm": 24.70865821838379, "learning_rate": 4.860377026074701e-06, "loss": 0.9629, "step": 670 }, { "epoch": 0.11618804661826823, "grad_norm": 15.833657264709473, "learning_rate": 4.855972515856237e-06, "loss": 0.9842, "step": 680 }, { "epoch": 0.11789669436265453, "grad_norm": 23.024721145629883, "learning_rate": 4.851568005637774e-06, "loss": 0.9945, "step": 690 }, { "epoch": 0.11960534210704082, "grad_norm": 21.521650314331055, "learning_rate": 4.8471634954193094e-06, "loss": 0.9154, "step": 700 }, { "epoch": 0.12131398985142712, "grad_norm": 16.77184295654297, "learning_rate": 4.842758985200846e-06, "loss": 0.9333, "step": 710 }, { "epoch": 0.12302263759581342, "grad_norm": 18.226619720458984, "learning_rate": 4.838354474982383e-06, "loss": 1.003, "step": 720 }, { "epoch": 0.12473128534019971, "grad_norm": 18.95140266418457, "learning_rate": 4.833949964763919e-06, "loss": 0.9384, "step": 730 }, { "epoch": 0.126439933084586, "grad_norm": 21.0819149017334, "learning_rate": 4.829545454545455e-06, "loss": 1.0182, "step": 740 }, { "epoch": 0.12814858082897232, "grad_norm": 20.32185935974121, "learning_rate": 4.825140944326991e-06, "loss": 0.9399, "step": 750 }, { "epoch": 0.1298572285733586, "grad_norm": 21.246639251708984, "learning_rate": 4.820736434108528e-06, "loss": 0.9671, "step": 760 }, { "epoch": 0.1315658763177449, "grad_norm": 17.076871871948242, "learning_rate": 4.816331923890064e-06, "loss": 0.9338, "step": 770 }, { "epoch": 0.1332745240621312, "grad_norm": 17.45732879638672, "learning_rate": 4.8119274136715996e-06, "loss": 0.9071, "step": 780 }, { "epoch": 0.1349831718065175, "grad_norm": 15.503561973571777, "learning_rate": 4.807522903453136e-06, "loss": 1.0028, "step": 790 }, { "epoch": 0.1366918195509038, "grad_norm": 20.221580505371094, "learning_rate": 4.803118393234673e-06, "loss": 0.9703, "step": 800 }, { "epoch": 0.1384004672952901, "grad_norm": 16.32524299621582, "learning_rate": 4.798713883016209e-06, "loss": 0.9856, "step": 810 }, { "epoch": 0.14010911503967638, "grad_norm": 19.584348678588867, "learning_rate": 4.7943093727977455e-06, "loss": 0.9418, "step": 820 }, { "epoch": 0.1418177627840627, "grad_norm": 23.859182357788086, "learning_rate": 4.789904862579281e-06, "loss": 0.9651, "step": 830 }, { "epoch": 0.14352641052844897, "grad_norm": 20.757596969604492, "learning_rate": 4.785500352360818e-06, "loss": 0.9272, "step": 840 }, { "epoch": 0.14523505827283528, "grad_norm": 19.72559928894043, "learning_rate": 4.781095842142354e-06, "loss": 0.9644, "step": 850 }, { "epoch": 0.1469437060172216, "grad_norm": 20.055456161499023, "learning_rate": 4.7766913319238905e-06, "loss": 0.9819, "step": 860 }, { "epoch": 0.14865235376160787, "grad_norm": 18.991012573242188, "learning_rate": 4.772286821705427e-06, "loss": 0.9234, "step": 870 }, { "epoch": 0.15036100150599419, "grad_norm": 17.789796829223633, "learning_rate": 4.767882311486963e-06, "loss": 1.0245, "step": 880 }, { "epoch": 0.15206964925038047, "grad_norm": 19.364513397216797, "learning_rate": 4.7634778012685e-06, "loss": 0.9382, "step": 890 }, { "epoch": 0.15377829699476678, "grad_norm": 18.194772720336914, "learning_rate": 4.759073291050036e-06, "loss": 0.9533, "step": 900 }, { "epoch": 0.15548694473915306, "grad_norm": 15.367209434509277, "learning_rate": 4.7546687808315714e-06, "loss": 0.9901, "step": 910 }, { "epoch": 0.15719559248353937, "grad_norm": 26.23330307006836, "learning_rate": 4.750264270613108e-06, "loss": 0.8894, "step": 920 }, { "epoch": 0.15890424022792565, "grad_norm": 20.43960189819336, "learning_rate": 4.745859760394644e-06, "loss": 0.9469, "step": 930 }, { "epoch": 0.16061288797231196, "grad_norm": 17.476476669311523, "learning_rate": 4.741455250176181e-06, "loss": 0.9681, "step": 940 }, { "epoch": 0.16232153571669825, "grad_norm": 18.390302658081055, "learning_rate": 4.737050739957717e-06, "loss": 0.9627, "step": 950 }, { "epoch": 0.16403018346108456, "grad_norm": 26.21846580505371, "learning_rate": 4.732646229739253e-06, "loss": 0.9453, "step": 960 }, { "epoch": 0.16573883120547084, "grad_norm": 17.23887062072754, "learning_rate": 4.72824171952079e-06, "loss": 0.9315, "step": 970 }, { "epoch": 0.16744747894985715, "grad_norm": 15.847450256347656, "learning_rate": 4.723837209302326e-06, "loss": 0.9448, "step": 980 }, { "epoch": 0.16915612669424346, "grad_norm": 20.83458709716797, "learning_rate": 4.719432699083862e-06, "loss": 0.9788, "step": 990 }, { "epoch": 0.17086477443862974, "grad_norm": 28.041086196899414, "learning_rate": 4.715028188865399e-06, "loss": 0.9217, "step": 1000 }, { "epoch": 0.17257342218301605, "grad_norm": 21.50284767150879, "learning_rate": 4.710623678646935e-06, "loss": 0.8928, "step": 1010 }, { "epoch": 0.17428206992740233, "grad_norm": 21.39044761657715, "learning_rate": 4.706219168428472e-06, "loss": 0.9415, "step": 1020 }, { "epoch": 0.17599071767178864, "grad_norm": 21.087949752807617, "learning_rate": 4.7018146582100075e-06, "loss": 0.9399, "step": 1030 }, { "epoch": 0.17769936541617493, "grad_norm": 16.453859329223633, "learning_rate": 4.697410147991543e-06, "loss": 0.948, "step": 1040 }, { "epoch": 0.17940801316056124, "grad_norm": 18.201675415039062, "learning_rate": 4.69300563777308e-06, "loss": 0.9403, "step": 1050 }, { "epoch": 0.18111666090494752, "grad_norm": 18.971012115478516, "learning_rate": 4.688601127554616e-06, "loss": 0.9294, "step": 1060 }, { "epoch": 0.18282530864933383, "grad_norm": 18.481828689575195, "learning_rate": 4.6841966173361525e-06, "loss": 0.8809, "step": 1070 }, { "epoch": 0.1845339563937201, "grad_norm": 17.92839813232422, "learning_rate": 4.679792107117689e-06, "loss": 0.9608, "step": 1080 }, { "epoch": 0.18624260413810642, "grad_norm": 21.6907958984375, "learning_rate": 4.675387596899225e-06, "loss": 0.9896, "step": 1090 }, { "epoch": 0.18795125188249273, "grad_norm": 19.17830467224121, "learning_rate": 4.670983086680762e-06, "loss": 0.9319, "step": 1100 }, { "epoch": 0.189659899626879, "grad_norm": 19.919885635375977, "learning_rate": 4.666578576462298e-06, "loss": 0.9509, "step": 1110 }, { "epoch": 0.19136854737126532, "grad_norm": 13.461675643920898, "learning_rate": 4.662174066243834e-06, "loss": 0.9315, "step": 1120 }, { "epoch": 0.1930771951156516, "grad_norm": 25.66329574584961, "learning_rate": 4.65776955602537e-06, "loss": 0.9203, "step": 1130 }, { "epoch": 0.19478584286003792, "grad_norm": 20.48524284362793, "learning_rate": 4.653365045806907e-06, "loss": 0.8759, "step": 1140 }, { "epoch": 0.1964944906044242, "grad_norm": 18.57932472229004, "learning_rate": 4.6489605355884435e-06, "loss": 0.9367, "step": 1150 }, { "epoch": 0.1982031383488105, "grad_norm": 24.531593322753906, "learning_rate": 4.644556025369979e-06, "loss": 0.924, "step": 1160 }, { "epoch": 0.1999117860931968, "grad_norm": 19.594648361206055, "learning_rate": 4.640151515151515e-06, "loss": 0.9189, "step": 1170 }, { "epoch": 0.2016204338375831, "grad_norm": 18.946157455444336, "learning_rate": 4.635747004933052e-06, "loss": 0.9476, "step": 1180 }, { "epoch": 0.20332908158196938, "grad_norm": 18.381322860717773, "learning_rate": 4.631342494714588e-06, "loss": 0.961, "step": 1190 }, { "epoch": 0.2050377293263557, "grad_norm": 18.244287490844727, "learning_rate": 4.626937984496124e-06, "loss": 0.9345, "step": 1200 }, { "epoch": 0.20674637707074198, "grad_norm": 21.273303985595703, "learning_rate": 4.62253347427766e-06, "loss": 0.8893, "step": 1210 }, { "epoch": 0.2084550248151283, "grad_norm": 21.534873962402344, "learning_rate": 4.618128964059197e-06, "loss": 0.8948, "step": 1220 }, { "epoch": 0.2101636725595146, "grad_norm": 20.035734176635742, "learning_rate": 4.613724453840734e-06, "loss": 0.9211, "step": 1230 }, { "epoch": 0.21187232030390088, "grad_norm": 19.587982177734375, "learning_rate": 4.6093199436222695e-06, "loss": 0.9074, "step": 1240 }, { "epoch": 0.2135809680482872, "grad_norm": 20.059412002563477, "learning_rate": 4.604915433403806e-06, "loss": 0.9389, "step": 1250 }, { "epoch": 0.21528961579267347, "grad_norm": 23.202457427978516, "learning_rate": 4.600510923185342e-06, "loss": 0.9303, "step": 1260 }, { "epoch": 0.21699826353705978, "grad_norm": 22.944717407226562, "learning_rate": 4.596106412966879e-06, "loss": 0.9118, "step": 1270 }, { "epoch": 0.21870691128144606, "grad_norm": 19.934560775756836, "learning_rate": 4.591701902748415e-06, "loss": 0.9638, "step": 1280 }, { "epoch": 0.22041555902583237, "grad_norm": 19.087709426879883, "learning_rate": 4.587297392529951e-06, "loss": 0.9519, "step": 1290 }, { "epoch": 0.22212420677021866, "grad_norm": 17.25513458251953, "learning_rate": 4.582892882311487e-06, "loss": 0.8735, "step": 1300 }, { "epoch": 0.22383285451460497, "grad_norm": 23.020050048828125, "learning_rate": 4.578488372093024e-06, "loss": 0.9319, "step": 1310 }, { "epoch": 0.22554150225899125, "grad_norm": 18.893648147583008, "learning_rate": 4.57408386187456e-06, "loss": 0.9329, "step": 1320 }, { "epoch": 0.22725015000337756, "grad_norm": 20.73868179321289, "learning_rate": 4.569679351656096e-06, "loss": 0.8715, "step": 1330 }, { "epoch": 0.22895879774776387, "grad_norm": 25.549577713012695, "learning_rate": 4.565274841437632e-06, "loss": 0.9145, "step": 1340 }, { "epoch": 0.23066744549215015, "grad_norm": 18.99001693725586, "learning_rate": 4.560870331219169e-06, "loss": 0.9251, "step": 1350 }, { "epoch": 0.23237609323653646, "grad_norm": 19.704002380371094, "learning_rate": 4.5564658210007055e-06, "loss": 0.9342, "step": 1360 }, { "epoch": 0.23408474098092275, "grad_norm": 20.581199645996094, "learning_rate": 4.552061310782241e-06, "loss": 0.9107, "step": 1370 }, { "epoch": 0.23579338872530906, "grad_norm": 18.79061508178711, "learning_rate": 4.547656800563778e-06, "loss": 0.962, "step": 1380 }, { "epoch": 0.23750203646969534, "grad_norm": 17.29990577697754, "learning_rate": 4.543252290345314e-06, "loss": 0.8641, "step": 1390 }, { "epoch": 0.23921068421408165, "grad_norm": 16.5628719329834, "learning_rate": 4.5388477801268506e-06, "loss": 0.9237, "step": 1400 }, { "epoch": 0.24091933195846793, "grad_norm": 17.68106460571289, "learning_rate": 4.534443269908386e-06, "loss": 0.9167, "step": 1410 }, { "epoch": 0.24262797970285424, "grad_norm": 20.802289962768555, "learning_rate": 4.530038759689923e-06, "loss": 0.9236, "step": 1420 }, { "epoch": 0.24433662744724052, "grad_norm": 15.484850883483887, "learning_rate": 4.525634249471459e-06, "loss": 0.9322, "step": 1430 }, { "epoch": 0.24604527519162683, "grad_norm": 21.147815704345703, "learning_rate": 4.521229739252996e-06, "loss": 0.9034, "step": 1440 }, { "epoch": 0.24775392293601312, "grad_norm": 20.891565322875977, "learning_rate": 4.5168252290345315e-06, "loss": 0.9118, "step": 1450 }, { "epoch": 0.24946257068039943, "grad_norm": 20.994525909423828, "learning_rate": 4.512420718816068e-06, "loss": 0.8692, "step": 1460 }, { "epoch": 0.25117121842478574, "grad_norm": 15.313887596130371, "learning_rate": 4.508016208597604e-06, "loss": 0.9845, "step": 1470 }, { "epoch": 0.252879866169172, "grad_norm": 20.045129776000977, "learning_rate": 4.503611698379141e-06, "loss": 0.9341, "step": 1480 }, { "epoch": 0.2545885139135583, "grad_norm": 20.646169662475586, "learning_rate": 4.4992071881606765e-06, "loss": 0.9434, "step": 1490 }, { "epoch": 0.25629716165794464, "grad_norm": 21.185823440551758, "learning_rate": 4.494802677942213e-06, "loss": 0.9123, "step": 1500 }, { "epoch": 0.2580058094023309, "grad_norm": 15.910945892333984, "learning_rate": 4.49039816772375e-06, "loss": 0.9393, "step": 1510 }, { "epoch": 0.2597144571467172, "grad_norm": 19.833402633666992, "learning_rate": 4.485993657505286e-06, "loss": 0.9744, "step": 1520 }, { "epoch": 0.2614231048911035, "grad_norm": 18.990707397460938, "learning_rate": 4.481589147286822e-06, "loss": 0.9286, "step": 1530 }, { "epoch": 0.2631317526354898, "grad_norm": 21.067312240600586, "learning_rate": 4.477184637068358e-06, "loss": 0.9652, "step": 1540 }, { "epoch": 0.2648404003798761, "grad_norm": 20.689836502075195, "learning_rate": 4.472780126849895e-06, "loss": 0.9158, "step": 1550 }, { "epoch": 0.2665490481242624, "grad_norm": 17.695697784423828, "learning_rate": 4.468375616631431e-06, "loss": 0.8859, "step": 1560 }, { "epoch": 0.26825769586864867, "grad_norm": 20.22654914855957, "learning_rate": 4.463971106412967e-06, "loss": 0.8948, "step": 1570 }, { "epoch": 0.269966343613035, "grad_norm": 15.549092292785645, "learning_rate": 4.459566596194503e-06, "loss": 0.8615, "step": 1580 }, { "epoch": 0.2716749913574213, "grad_norm": 18.86482810974121, "learning_rate": 4.45516208597604e-06, "loss": 0.9054, "step": 1590 }, { "epoch": 0.2733836391018076, "grad_norm": 18.071102142333984, "learning_rate": 4.450757575757576e-06, "loss": 0.939, "step": 1600 }, { "epoch": 0.2750922868461939, "grad_norm": 22.560697555541992, "learning_rate": 4.4463530655391125e-06, "loss": 0.8798, "step": 1610 }, { "epoch": 0.2768009345905802, "grad_norm": 19.61587905883789, "learning_rate": 4.441948555320648e-06, "loss": 0.8923, "step": 1620 }, { "epoch": 0.2785095823349665, "grad_norm": 13.9995698928833, "learning_rate": 4.437544045102185e-06, "loss": 0.9556, "step": 1630 }, { "epoch": 0.28021823007935276, "grad_norm": 23.62803077697754, "learning_rate": 4.433139534883722e-06, "loss": 0.8756, "step": 1640 }, { "epoch": 0.2819268778237391, "grad_norm": 19.477319717407227, "learning_rate": 4.428735024665258e-06, "loss": 0.8588, "step": 1650 }, { "epoch": 0.2836355255681254, "grad_norm": 17.02006721496582, "learning_rate": 4.424330514446794e-06, "loss": 0.934, "step": 1660 }, { "epoch": 0.28534417331251166, "grad_norm": 18.509023666381836, "learning_rate": 4.41992600422833e-06, "loss": 0.9693, "step": 1670 }, { "epoch": 0.28705282105689794, "grad_norm": 16.825519561767578, "learning_rate": 4.415521494009867e-06, "loss": 0.8973, "step": 1680 }, { "epoch": 0.2887614688012843, "grad_norm": 18.926586151123047, "learning_rate": 4.411116983791403e-06, "loss": 0.8644, "step": 1690 }, { "epoch": 0.29047011654567056, "grad_norm": 20.28687286376953, "learning_rate": 4.4067124735729385e-06, "loss": 0.9245, "step": 1700 }, { "epoch": 0.29217876429005685, "grad_norm": 23.774314880371094, "learning_rate": 4.402307963354475e-06, "loss": 0.8688, "step": 1710 }, { "epoch": 0.2938874120344432, "grad_norm": 18.38115692138672, "learning_rate": 4.397903453136012e-06, "loss": 0.8836, "step": 1720 }, { "epoch": 0.29559605977882947, "grad_norm": 17.962003707885742, "learning_rate": 4.393498942917548e-06, "loss": 0.8547, "step": 1730 }, { "epoch": 0.29730470752321575, "grad_norm": 17.536418914794922, "learning_rate": 4.389094432699084e-06, "loss": 0.8899, "step": 1740 }, { "epoch": 0.29901335526760203, "grad_norm": 24.884021759033203, "learning_rate": 4.38468992248062e-06, "loss": 0.8861, "step": 1750 }, { "epoch": 0.30072200301198837, "grad_norm": 21.32032012939453, "learning_rate": 4.380285412262157e-06, "loss": 0.8905, "step": 1760 }, { "epoch": 0.30243065075637465, "grad_norm": 17.606523513793945, "learning_rate": 4.375880902043693e-06, "loss": 0.8898, "step": 1770 }, { "epoch": 0.30413929850076094, "grad_norm": 18.825279235839844, "learning_rate": 4.3714763918252295e-06, "loss": 0.8806, "step": 1780 }, { "epoch": 0.3058479462451472, "grad_norm": 18.960371017456055, "learning_rate": 4.367071881606766e-06, "loss": 0.897, "step": 1790 }, { "epoch": 0.30755659398953356, "grad_norm": 22.261259078979492, "learning_rate": 4.362667371388302e-06, "loss": 0.8931, "step": 1800 }, { "epoch": 0.30926524173391984, "grad_norm": 23.404190063476562, "learning_rate": 4.358262861169839e-06, "loss": 0.8802, "step": 1810 }, { "epoch": 0.3109738894783061, "grad_norm": 14.648833274841309, "learning_rate": 4.3538583509513745e-06, "loss": 0.9234, "step": 1820 }, { "epoch": 0.31268253722269246, "grad_norm": 18.37412452697754, "learning_rate": 4.34945384073291e-06, "loss": 0.8852, "step": 1830 }, { "epoch": 0.31439118496707874, "grad_norm": 24.400611877441406, "learning_rate": 4.345049330514447e-06, "loss": 0.8791, "step": 1840 }, { "epoch": 0.316099832711465, "grad_norm": 17.905906677246094, "learning_rate": 4.340644820295983e-06, "loss": 0.874, "step": 1850 }, { "epoch": 0.3178084804558513, "grad_norm": 16.834829330444336, "learning_rate": 4.33624031007752e-06, "loss": 0.907, "step": 1860 }, { "epoch": 0.31951712820023764, "grad_norm": 18.529735565185547, "learning_rate": 4.331835799859056e-06, "loss": 0.8914, "step": 1870 }, { "epoch": 0.3212257759446239, "grad_norm": 18.155649185180664, "learning_rate": 4.327431289640592e-06, "loss": 0.8678, "step": 1880 }, { "epoch": 0.3229344236890102, "grad_norm": 15.488029479980469, "learning_rate": 4.323026779422129e-06, "loss": 0.8673, "step": 1890 }, { "epoch": 0.3246430714333965, "grad_norm": 22.161739349365234, "learning_rate": 4.318622269203665e-06, "loss": 0.8082, "step": 1900 }, { "epoch": 0.32635171917778283, "grad_norm": 21.01485252380371, "learning_rate": 4.314217758985201e-06, "loss": 0.8848, "step": 1910 }, { "epoch": 0.3280603669221691, "grad_norm": 17.303821563720703, "learning_rate": 4.309813248766738e-06, "loss": 0.8917, "step": 1920 }, { "epoch": 0.3297690146665554, "grad_norm": 27.273990631103516, "learning_rate": 4.305408738548274e-06, "loss": 0.8299, "step": 1930 }, { "epoch": 0.3314776624109417, "grad_norm": 20.609886169433594, "learning_rate": 4.3010042283298106e-06, "loss": 0.8975, "step": 1940 }, { "epoch": 0.333186310155328, "grad_norm": 21.860870361328125, "learning_rate": 4.296599718111346e-06, "loss": 0.8811, "step": 1950 }, { "epoch": 0.3348949578997143, "grad_norm": 21.051359176635742, "learning_rate": 4.292195207892882e-06, "loss": 0.9229, "step": 1960 }, { "epoch": 0.3366036056441006, "grad_norm": 22.4477596282959, "learning_rate": 4.287790697674419e-06, "loss": 0.865, "step": 1970 }, { "epoch": 0.3383122533884869, "grad_norm": 20.99222755432129, "learning_rate": 4.283386187455955e-06, "loss": 0.8377, "step": 1980 }, { "epoch": 0.3400209011328732, "grad_norm": 23.59244155883789, "learning_rate": 4.2789816772374915e-06, "loss": 0.8419, "step": 1990 }, { "epoch": 0.3417295488772595, "grad_norm": 17.199111938476562, "learning_rate": 4.274577167019028e-06, "loss": 0.9104, "step": 2000 }, { "epoch": 0.34343819662164576, "grad_norm": 23.190162658691406, "learning_rate": 4.270172656800564e-06, "loss": 0.859, "step": 2010 }, { "epoch": 0.3451468443660321, "grad_norm": 22.3214168548584, "learning_rate": 4.265768146582101e-06, "loss": 0.898, "step": 2020 }, { "epoch": 0.3468554921104184, "grad_norm": 17.06951141357422, "learning_rate": 4.2613636363636365e-06, "loss": 0.9043, "step": 2030 }, { "epoch": 0.34856413985480467, "grad_norm": 22.465560913085938, "learning_rate": 4.256959126145173e-06, "loss": 0.8559, "step": 2040 }, { "epoch": 0.35027278759919095, "grad_norm": 20.7056884765625, "learning_rate": 4.25255461592671e-06, "loss": 0.8545, "step": 2050 }, { "epoch": 0.3519814353435773, "grad_norm": 18.856229782104492, "learning_rate": 4.248150105708246e-06, "loss": 0.8404, "step": 2060 }, { "epoch": 0.35369008308796357, "grad_norm": 19.156654357910156, "learning_rate": 4.2437455954897824e-06, "loss": 0.9017, "step": 2070 }, { "epoch": 0.35539873083234985, "grad_norm": 19.859079360961914, "learning_rate": 4.239341085271318e-06, "loss": 0.9067, "step": 2080 }, { "epoch": 0.3571073785767362, "grad_norm": 20.216876983642578, "learning_rate": 4.234936575052854e-06, "loss": 0.8961, "step": 2090 }, { "epoch": 0.35881602632112247, "grad_norm": 21.373823165893555, "learning_rate": 4.230532064834391e-06, "loss": 0.8803, "step": 2100 }, { "epoch": 0.36052467406550875, "grad_norm": 21.679407119750977, "learning_rate": 4.226127554615927e-06, "loss": 0.857, "step": 2110 }, { "epoch": 0.36223332180989504, "grad_norm": 21.290212631225586, "learning_rate": 4.221723044397463e-06, "loss": 0.9099, "step": 2120 }, { "epoch": 0.3639419695542814, "grad_norm": 23.025487899780273, "learning_rate": 4.217318534179e-06, "loss": 0.8501, "step": 2130 }, { "epoch": 0.36565061729866766, "grad_norm": 24.255035400390625, "learning_rate": 4.212914023960536e-06, "loss": 0.8415, "step": 2140 }, { "epoch": 0.36735926504305394, "grad_norm": 19.89132308959961, "learning_rate": 4.2085095137420726e-06, "loss": 0.8498, "step": 2150 }, { "epoch": 0.3690679127874402, "grad_norm": 19.75184440612793, "learning_rate": 4.204105003523608e-06, "loss": 0.8162, "step": 2160 }, { "epoch": 0.37077656053182656, "grad_norm": 19.339553833007812, "learning_rate": 4.199700493305145e-06, "loss": 0.8784, "step": 2170 }, { "epoch": 0.37248520827621284, "grad_norm": 15.743782997131348, "learning_rate": 4.195295983086681e-06, "loss": 0.8739, "step": 2180 }, { "epoch": 0.3741938560205991, "grad_norm": 20.931917190551758, "learning_rate": 4.190891472868218e-06, "loss": 0.8697, "step": 2190 }, { "epoch": 0.37590250376498546, "grad_norm": 21.439781188964844, "learning_rate": 4.186486962649754e-06, "loss": 0.9417, "step": 2200 }, { "epoch": 0.37761115150937175, "grad_norm": 19.33049964904785, "learning_rate": 4.18208245243129e-06, "loss": 0.8648, "step": 2210 }, { "epoch": 0.379319799253758, "grad_norm": 20.86115074157715, "learning_rate": 4.177677942212826e-06, "loss": 0.9008, "step": 2220 }, { "epoch": 0.3810284469981443, "grad_norm": 21.383541107177734, "learning_rate": 4.173273431994363e-06, "loss": 0.8436, "step": 2230 }, { "epoch": 0.38273709474253065, "grad_norm": 20.323444366455078, "learning_rate": 4.1688689217758985e-06, "loss": 0.8607, "step": 2240 }, { "epoch": 0.38444574248691693, "grad_norm": 20.108402252197266, "learning_rate": 4.164464411557435e-06, "loss": 0.8718, "step": 2250 }, { "epoch": 0.3861543902313032, "grad_norm": 27.39733123779297, "learning_rate": 4.160059901338971e-06, "loss": 0.801, "step": 2260 }, { "epoch": 0.3878630379756895, "grad_norm": 19.76158332824707, "learning_rate": 4.155655391120508e-06, "loss": 0.8525, "step": 2270 }, { "epoch": 0.38957168572007583, "grad_norm": 20.22632598876953, "learning_rate": 4.1512508809020444e-06, "loss": 0.8277, "step": 2280 }, { "epoch": 0.3912803334644621, "grad_norm": 20.0892333984375, "learning_rate": 4.14684637068358e-06, "loss": 0.8352, "step": 2290 }, { "epoch": 0.3929889812088484, "grad_norm": 18.96234893798828, "learning_rate": 4.142441860465117e-06, "loss": 0.8461, "step": 2300 }, { "epoch": 0.39469762895323474, "grad_norm": 24.289127349853516, "learning_rate": 4.138037350246653e-06, "loss": 0.8953, "step": 2310 }, { "epoch": 0.396406276697621, "grad_norm": 22.399789810180664, "learning_rate": 4.1336328400281895e-06, "loss": 0.8364, "step": 2320 }, { "epoch": 0.3981149244420073, "grad_norm": 24.583871841430664, "learning_rate": 4.129228329809726e-06, "loss": 0.8271, "step": 2330 }, { "epoch": 0.3998235721863936, "grad_norm": 25.536149978637695, "learning_rate": 4.124823819591261e-06, "loss": 0.8332, "step": 2340 }, { "epoch": 0.4015322199307799, "grad_norm": 25.381229400634766, "learning_rate": 4.120419309372798e-06, "loss": 0.8155, "step": 2350 }, { "epoch": 0.4032408676751662, "grad_norm": 20.306066513061523, "learning_rate": 4.1160147991543346e-06, "loss": 0.8213, "step": 2360 }, { "epoch": 0.4049495154195525, "grad_norm": 22.400867462158203, "learning_rate": 4.11161028893587e-06, "loss": 0.8163, "step": 2370 }, { "epoch": 0.40665816316393877, "grad_norm": 16.857330322265625, "learning_rate": 4.107205778717407e-06, "loss": 0.8498, "step": 2380 }, { "epoch": 0.4083668109083251, "grad_norm": 23.580421447753906, "learning_rate": 4.102801268498943e-06, "loss": 0.8312, "step": 2390 }, { "epoch": 0.4100754586527114, "grad_norm": 19.323286056518555, "learning_rate": 4.09839675828048e-06, "loss": 0.8104, "step": 2400 }, { "epoch": 0.41178410639709767, "grad_norm": 20.80855941772461, "learning_rate": 4.093992248062016e-06, "loss": 0.8135, "step": 2410 }, { "epoch": 0.41349275414148395, "grad_norm": 19.280595779418945, "learning_rate": 4.089587737843552e-06, "loss": 0.844, "step": 2420 }, { "epoch": 0.4152014018858703, "grad_norm": 24.815204620361328, "learning_rate": 4.085183227625089e-06, "loss": 0.8324, "step": 2430 }, { "epoch": 0.4169100496302566, "grad_norm": 19.941333770751953, "learning_rate": 4.080778717406625e-06, "loss": 0.8529, "step": 2440 }, { "epoch": 0.41861869737464286, "grad_norm": 18.017372131347656, "learning_rate": 4.076374207188161e-06, "loss": 0.8462, "step": 2450 }, { "epoch": 0.4203273451190292, "grad_norm": 15.000432014465332, "learning_rate": 4.071969696969697e-06, "loss": 0.8409, "step": 2460 }, { "epoch": 0.4220359928634155, "grad_norm": 26.720317840576172, "learning_rate": 4.067565186751233e-06, "loss": 0.8698, "step": 2470 }, { "epoch": 0.42374464060780176, "grad_norm": 21.395301818847656, "learning_rate": 4.06316067653277e-06, "loss": 0.7904, "step": 2480 }, { "epoch": 0.42545328835218804, "grad_norm": 22.071170806884766, "learning_rate": 4.058756166314306e-06, "loss": 0.8307, "step": 2490 }, { "epoch": 0.4271619360965744, "grad_norm": 18.912866592407227, "learning_rate": 4.054351656095842e-06, "loss": 0.8372, "step": 2500 }, { "epoch": 0.42887058384096066, "grad_norm": 23.51670265197754, "learning_rate": 4.049947145877379e-06, "loss": 0.8141, "step": 2510 }, { "epoch": 0.43057923158534694, "grad_norm": 17.042999267578125, "learning_rate": 4.045542635658915e-06, "loss": 0.8862, "step": 2520 }, { "epoch": 0.4322878793297332, "grad_norm": 21.787776947021484, "learning_rate": 4.0411381254404515e-06, "loss": 0.8552, "step": 2530 }, { "epoch": 0.43399652707411956, "grad_norm": 20.526792526245117, "learning_rate": 4.036733615221987e-06, "loss": 0.8179, "step": 2540 }, { "epoch": 0.43570517481850585, "grad_norm": 25.407398223876953, "learning_rate": 4.032329105003524e-06, "loss": 0.8514, "step": 2550 }, { "epoch": 0.43741382256289213, "grad_norm": 16.01190948486328, "learning_rate": 4.027924594785061e-06, "loss": 0.8364, "step": 2560 }, { "epoch": 0.43912247030727847, "grad_norm": 20.050710678100586, "learning_rate": 4.0235200845665965e-06, "loss": 0.8362, "step": 2570 }, { "epoch": 0.44083111805166475, "grad_norm": 20.279884338378906, "learning_rate": 4.019115574348133e-06, "loss": 0.8034, "step": 2580 }, { "epoch": 0.44253976579605103, "grad_norm": 18.78345489501953, "learning_rate": 4.014711064129669e-06, "loss": 0.8336, "step": 2590 }, { "epoch": 0.4442484135404373, "grad_norm": 24.339946746826172, "learning_rate": 4.010306553911205e-06, "loss": 0.8588, "step": 2600 }, { "epoch": 0.44595706128482365, "grad_norm": 19.264131546020508, "learning_rate": 4.005902043692742e-06, "loss": 0.8536, "step": 2610 }, { "epoch": 0.44766570902920994, "grad_norm": 18.921791076660156, "learning_rate": 4.0014975334742774e-06, "loss": 0.819, "step": 2620 }, { "epoch": 0.4493743567735962, "grad_norm": 18.068126678466797, "learning_rate": 3.997093023255814e-06, "loss": 0.8061, "step": 2630 }, { "epoch": 0.4510830045179825, "grad_norm": 17.197391510009766, "learning_rate": 3.992688513037351e-06, "loss": 0.7977, "step": 2640 }, { "epoch": 0.45279165226236884, "grad_norm": 17.76527976989746, "learning_rate": 3.988284002818887e-06, "loss": 0.9012, "step": 2650 }, { "epoch": 0.4545003000067551, "grad_norm": 19.648696899414062, "learning_rate": 3.983879492600423e-06, "loss": 0.8426, "step": 2660 }, { "epoch": 0.4562089477511414, "grad_norm": 23.721616744995117, "learning_rate": 3.979474982381959e-06, "loss": 0.8055, "step": 2670 }, { "epoch": 0.45791759549552774, "grad_norm": 19.17746353149414, "learning_rate": 3.975070472163496e-06, "loss": 0.8192, "step": 2680 }, { "epoch": 0.459626243239914, "grad_norm": 19.428604125976562, "learning_rate": 3.9706659619450326e-06, "loss": 0.8168, "step": 2690 }, { "epoch": 0.4613348909843003, "grad_norm": 20.59436798095703, "learning_rate": 3.966261451726568e-06, "loss": 0.873, "step": 2700 }, { "epoch": 0.4630435387286866, "grad_norm": 22.71458625793457, "learning_rate": 3.961856941508105e-06, "loss": 0.8439, "step": 2710 }, { "epoch": 0.4647521864730729, "grad_norm": 30.239309310913086, "learning_rate": 3.957452431289641e-06, "loss": 0.7587, "step": 2720 }, { "epoch": 0.4664608342174592, "grad_norm": 18.89266014099121, "learning_rate": 3.953047921071177e-06, "loss": 0.8704, "step": 2730 }, { "epoch": 0.4681694819618455, "grad_norm": 18.356983184814453, "learning_rate": 3.9486434108527135e-06, "loss": 0.8343, "step": 2740 }, { "epoch": 0.4698781297062318, "grad_norm": 20.14874267578125, "learning_rate": 3.944238900634249e-06, "loss": 0.8119, "step": 2750 }, { "epoch": 0.4715867774506181, "grad_norm": 28.85474967956543, "learning_rate": 3.939834390415786e-06, "loss": 0.8767, "step": 2760 }, { "epoch": 0.4732954251950044, "grad_norm": 22.18447494506836, "learning_rate": 3.935429880197323e-06, "loss": 0.8175, "step": 2770 }, { "epoch": 0.4750040729393907, "grad_norm": 22.46308135986328, "learning_rate": 3.9310253699788585e-06, "loss": 0.8281, "step": 2780 }, { "epoch": 0.47671272068377696, "grad_norm": 18.005477905273438, "learning_rate": 3.926620859760395e-06, "loss": 0.7928, "step": 2790 }, { "epoch": 0.4784213684281633, "grad_norm": 18.78510093688965, "learning_rate": 3.922216349541931e-06, "loss": 0.8042, "step": 2800 }, { "epoch": 0.4801300161725496, "grad_norm": 22.980220794677734, "learning_rate": 3.917811839323468e-06, "loss": 0.8096, "step": 2810 }, { "epoch": 0.48183866391693586, "grad_norm": 18.548603057861328, "learning_rate": 3.913407329105004e-06, "loss": 0.8696, "step": 2820 }, { "epoch": 0.4835473116613222, "grad_norm": 23.86473846435547, "learning_rate": 3.90900281888654e-06, "loss": 0.8154, "step": 2830 }, { "epoch": 0.4852559594057085, "grad_norm": 24.574298858642578, "learning_rate": 3.904598308668077e-06, "loss": 0.8047, "step": 2840 }, { "epoch": 0.48696460715009476, "grad_norm": 21.888259887695312, "learning_rate": 3.900193798449613e-06, "loss": 0.8212, "step": 2850 }, { "epoch": 0.48867325489448105, "grad_norm": 18.29496955871582, "learning_rate": 3.895789288231149e-06, "loss": 0.8578, "step": 2860 }, { "epoch": 0.4903819026388674, "grad_norm": 19.80244255065918, "learning_rate": 3.891384778012685e-06, "loss": 0.8449, "step": 2870 }, { "epoch": 0.49209055038325367, "grad_norm": 22.403602600097656, "learning_rate": 3.886980267794221e-06, "loss": 0.8207, "step": 2880 }, { "epoch": 0.49379919812763995, "grad_norm": 25.105716705322266, "learning_rate": 3.882575757575758e-06, "loss": 0.8606, "step": 2890 }, { "epoch": 0.49550784587202623, "grad_norm": 19.511430740356445, "learning_rate": 3.878171247357294e-06, "loss": 0.8417, "step": 2900 }, { "epoch": 0.49721649361641257, "grad_norm": 20.566545486450195, "learning_rate": 3.87376673713883e-06, "loss": 0.7735, "step": 2910 }, { "epoch": 0.49892514136079885, "grad_norm": 19.69638442993164, "learning_rate": 3.869362226920367e-06, "loss": 0.8195, "step": 2920 }, { "epoch": 0.5006337891051852, "grad_norm": 19.20965576171875, "learning_rate": 3.864957716701903e-06, "loss": 0.8407, "step": 2930 }, { "epoch": 0.5023424368495715, "grad_norm": 18.67803955078125, "learning_rate": 3.86055320648344e-06, "loss": 0.8401, "step": 2940 }, { "epoch": 0.5040510845939578, "grad_norm": 19.72920036315918, "learning_rate": 3.8561486962649755e-06, "loss": 0.8176, "step": 2950 }, { "epoch": 0.505759732338344, "grad_norm": 27.366355895996094, "learning_rate": 3.851744186046512e-06, "loss": 0.8252, "step": 2960 }, { "epoch": 0.5074683800827303, "grad_norm": 24.130985260009766, "learning_rate": 3.847339675828049e-06, "loss": 0.8809, "step": 2970 }, { "epoch": 0.5091770278271166, "grad_norm": 17.88861846923828, "learning_rate": 3.842935165609585e-06, "loss": 0.7728, "step": 2980 }, { "epoch": 0.5108856755715029, "grad_norm": 22.26430892944336, "learning_rate": 3.8385306553911205e-06, "loss": 0.8516, "step": 2990 }, { "epoch": 0.5125943233158893, "grad_norm": 20.059843063354492, "learning_rate": 3.834126145172657e-06, "loss": 0.8084, "step": 3000 }, { "epoch": 0.5143029710602756, "grad_norm": 22.344680786132812, "learning_rate": 3.829721634954193e-06, "loss": 0.7728, "step": 3010 }, { "epoch": 0.5160116188046618, "grad_norm": 18.06348991394043, "learning_rate": 3.82531712473573e-06, "loss": 0.8463, "step": 3020 }, { "epoch": 0.5177202665490481, "grad_norm": 20.816757202148438, "learning_rate": 3.820912614517266e-06, "loss": 0.7904, "step": 3030 }, { "epoch": 0.5194289142934344, "grad_norm": 24.29160499572754, "learning_rate": 3.816508104298802e-06, "loss": 0.8059, "step": 3040 }, { "epoch": 0.5211375620378207, "grad_norm": 21.73212242126465, "learning_rate": 3.8121035940803385e-06, "loss": 0.7817, "step": 3050 }, { "epoch": 0.522846209782207, "grad_norm": 20.20355224609375, "learning_rate": 3.807699083861875e-06, "loss": 0.7836, "step": 3060 }, { "epoch": 0.5245548575265934, "grad_norm": 15.811525344848633, "learning_rate": 3.803294573643411e-06, "loss": 0.84, "step": 3070 }, { "epoch": 0.5262635052709796, "grad_norm": 23.239578247070312, "learning_rate": 3.7988900634249478e-06, "loss": 0.795, "step": 3080 }, { "epoch": 0.5279721530153659, "grad_norm": 18.50345802307129, "learning_rate": 3.794485553206484e-06, "loss": 0.8536, "step": 3090 }, { "epoch": 0.5296808007597522, "grad_norm": 21.647409439086914, "learning_rate": 3.7900810429880203e-06, "loss": 0.8141, "step": 3100 }, { "epoch": 0.5313894485041385, "grad_norm": 22.411800384521484, "learning_rate": 3.7856765327695566e-06, "loss": 0.7931, "step": 3110 }, { "epoch": 0.5330980962485248, "grad_norm": 23.15050506591797, "learning_rate": 3.7812720225510924e-06, "loss": 0.7902, "step": 3120 }, { "epoch": 0.5348067439929111, "grad_norm": 26.446077346801758, "learning_rate": 3.7768675123326287e-06, "loss": 0.8206, "step": 3130 }, { "epoch": 0.5365153917372973, "grad_norm": 18.19157600402832, "learning_rate": 3.772463002114165e-06, "loss": 0.796, "step": 3140 }, { "epoch": 0.5382240394816837, "grad_norm": 28.09468650817871, "learning_rate": 3.768058491895701e-06, "loss": 0.7592, "step": 3150 }, { "epoch": 0.53993268722607, "grad_norm": 19.753379821777344, "learning_rate": 3.763653981677238e-06, "loss": 0.7917, "step": 3160 }, { "epoch": 0.5416413349704563, "grad_norm": 22.52701759338379, "learning_rate": 3.759249471458774e-06, "loss": 0.8036, "step": 3170 }, { "epoch": 0.5433499827148426, "grad_norm": 24.160633087158203, "learning_rate": 3.7548449612403104e-06, "loss": 0.8111, "step": 3180 }, { "epoch": 0.5450586304592289, "grad_norm": 20.43000030517578, "learning_rate": 3.7504404510218467e-06, "loss": 0.8181, "step": 3190 }, { "epoch": 0.5467672782036151, "grad_norm": 20.047271728515625, "learning_rate": 3.746035940803383e-06, "loss": 0.8229, "step": 3200 }, { "epoch": 0.5484759259480014, "grad_norm": 20.642215728759766, "learning_rate": 3.741631430584919e-06, "loss": 0.8322, "step": 3210 }, { "epoch": 0.5501845736923878, "grad_norm": 19.686071395874023, "learning_rate": 3.737226920366456e-06, "loss": 0.775, "step": 3220 }, { "epoch": 0.5518932214367741, "grad_norm": 17.0440616607666, "learning_rate": 3.732822410147992e-06, "loss": 0.8128, "step": 3230 }, { "epoch": 0.5536018691811604, "grad_norm": 20.75046730041504, "learning_rate": 3.7284178999295284e-06, "loss": 0.8061, "step": 3240 }, { "epoch": 0.5553105169255467, "grad_norm": 23.867816925048828, "learning_rate": 3.7240133897110643e-06, "loss": 0.7951, "step": 3250 }, { "epoch": 0.557019164669933, "grad_norm": 26.70461082458496, "learning_rate": 3.7196088794926005e-06, "loss": 0.8343, "step": 3260 }, { "epoch": 0.5587278124143192, "grad_norm": 17.15665054321289, "learning_rate": 3.715204369274137e-06, "loss": 0.8434, "step": 3270 }, { "epoch": 0.5604364601587055, "grad_norm": 23.122482299804688, "learning_rate": 3.710799859055673e-06, "loss": 0.799, "step": 3280 }, { "epoch": 0.5621451079030919, "grad_norm": 18.05946922302246, "learning_rate": 3.7063953488372093e-06, "loss": 0.8326, "step": 3290 }, { "epoch": 0.5638537556474782, "grad_norm": 21.532657623291016, "learning_rate": 3.701990838618746e-06, "loss": 0.7646, "step": 3300 }, { "epoch": 0.5655624033918645, "grad_norm": 21.229511260986328, "learning_rate": 3.6975863284002823e-06, "loss": 0.7565, "step": 3310 }, { "epoch": 0.5672710511362508, "grad_norm": 18.513898849487305, "learning_rate": 3.6931818181818186e-06, "loss": 0.8021, "step": 3320 }, { "epoch": 0.568979698880637, "grad_norm": 20.819110870361328, "learning_rate": 3.688777307963355e-06, "loss": 0.8215, "step": 3330 }, { "epoch": 0.5706883466250233, "grad_norm": 27.454303741455078, "learning_rate": 3.684372797744891e-06, "loss": 0.7404, "step": 3340 }, { "epoch": 0.5723969943694096, "grad_norm": 20.618860244750977, "learning_rate": 3.6799682875264273e-06, "loss": 0.7855, "step": 3350 }, { "epoch": 0.5741056421137959, "grad_norm": 21.150808334350586, "learning_rate": 3.675563777307964e-06, "loss": 0.7411, "step": 3360 }, { "epoch": 0.5758142898581823, "grad_norm": 23.632627487182617, "learning_rate": 3.6711592670895003e-06, "loss": 0.7503, "step": 3370 }, { "epoch": 0.5775229376025686, "grad_norm": 19.350055694580078, "learning_rate": 3.666754756871036e-06, "loss": 0.8135, "step": 3380 }, { "epoch": 0.5792315853469548, "grad_norm": 19.341176986694336, "learning_rate": 3.6623502466525724e-06, "loss": 0.8338, "step": 3390 }, { "epoch": 0.5809402330913411, "grad_norm": 24.91313362121582, "learning_rate": 3.6579457364341087e-06, "loss": 0.7763, "step": 3400 }, { "epoch": 0.5826488808357274, "grad_norm": 23.72249412536621, "learning_rate": 3.653541226215645e-06, "loss": 0.7926, "step": 3410 }, { "epoch": 0.5843575285801137, "grad_norm": 22.838260650634766, "learning_rate": 3.649136715997181e-06, "loss": 0.8036, "step": 3420 }, { "epoch": 0.5860661763245, "grad_norm": 19.691679000854492, "learning_rate": 3.6447322057787175e-06, "loss": 0.8427, "step": 3430 }, { "epoch": 0.5877748240688864, "grad_norm": 21.973587036132812, "learning_rate": 3.640327695560254e-06, "loss": 0.8159, "step": 3440 }, { "epoch": 0.5894834718132727, "grad_norm": 21.701208114624023, "learning_rate": 3.6359231853417904e-06, "loss": 0.8004, "step": 3450 }, { "epoch": 0.5911921195576589, "grad_norm": 21.209928512573242, "learning_rate": 3.6315186751233267e-06, "loss": 0.8038, "step": 3460 }, { "epoch": 0.5929007673020452, "grad_norm": 19.597747802734375, "learning_rate": 3.627114164904863e-06, "loss": 0.7868, "step": 3470 }, { "epoch": 0.5946094150464315, "grad_norm": 18.882831573486328, "learning_rate": 3.6227096546863992e-06, "loss": 0.7861, "step": 3480 }, { "epoch": 0.5963180627908178, "grad_norm": 18.91342544555664, "learning_rate": 3.6183051444679355e-06, "loss": 0.8082, "step": 3490 }, { "epoch": 0.5980267105352041, "grad_norm": 23.127704620361328, "learning_rate": 3.613900634249472e-06, "loss": 0.8093, "step": 3500 }, { "epoch": 0.5997353582795903, "grad_norm": 23.314237594604492, "learning_rate": 3.6094961240310076e-06, "loss": 0.8148, "step": 3510 }, { "epoch": 0.6014440060239767, "grad_norm": 19.88514518737793, "learning_rate": 3.6050916138125443e-06, "loss": 0.7951, "step": 3520 }, { "epoch": 0.603152653768363, "grad_norm": 23.107532501220703, "learning_rate": 3.6006871035940805e-06, "loss": 0.7852, "step": 3530 }, { "epoch": 0.6048613015127493, "grad_norm": 26.108352661132812, "learning_rate": 3.596282593375617e-06, "loss": 0.7846, "step": 3540 }, { "epoch": 0.6065699492571356, "grad_norm": 21.61062240600586, "learning_rate": 3.591878083157153e-06, "loss": 0.8116, "step": 3550 }, { "epoch": 0.6082785970015219, "grad_norm": 22.475379943847656, "learning_rate": 3.5874735729386893e-06, "loss": 0.8082, "step": 3560 }, { "epoch": 0.6099872447459082, "grad_norm": 20.961181640625, "learning_rate": 3.5830690627202256e-06, "loss": 0.7747, "step": 3570 }, { "epoch": 0.6116958924902944, "grad_norm": 23.609365463256836, "learning_rate": 3.5786645525017623e-06, "loss": 0.828, "step": 3580 }, { "epoch": 0.6134045402346808, "grad_norm": 17.144989013671875, "learning_rate": 3.5742600422832986e-06, "loss": 0.8089, "step": 3590 }, { "epoch": 0.6151131879790671, "grad_norm": 24.28973388671875, "learning_rate": 3.569855532064835e-06, "loss": 0.782, "step": 3600 }, { "epoch": 0.6168218357234534, "grad_norm": 21.782333374023438, "learning_rate": 3.565451021846371e-06, "loss": 0.8252, "step": 3610 }, { "epoch": 0.6185304834678397, "grad_norm": 18.921234130859375, "learning_rate": 3.5610465116279074e-06, "loss": 0.7856, "step": 3620 }, { "epoch": 0.620239131212226, "grad_norm": 27.037317276000977, "learning_rate": 3.5566420014094436e-06, "loss": 0.7732, "step": 3630 }, { "epoch": 0.6219477789566122, "grad_norm": 20.37610626220703, "learning_rate": 3.5522374911909795e-06, "loss": 0.8081, "step": 3640 }, { "epoch": 0.6236564267009985, "grad_norm": 20.596923828125, "learning_rate": 3.547832980972516e-06, "loss": 0.8218, "step": 3650 }, { "epoch": 0.6253650744453849, "grad_norm": 19.31607437133789, "learning_rate": 3.5434284707540524e-06, "loss": 0.8212, "step": 3660 }, { "epoch": 0.6270737221897712, "grad_norm": 25.045026779174805, "learning_rate": 3.5390239605355887e-06, "loss": 0.8197, "step": 3670 }, { "epoch": 0.6287823699341575, "grad_norm": 26.2932071685791, "learning_rate": 3.534619450317125e-06, "loss": 0.8084, "step": 3680 }, { "epoch": 0.6304910176785438, "grad_norm": 22.81402587890625, "learning_rate": 3.530214940098661e-06, "loss": 0.7689, "step": 3690 }, { "epoch": 0.63219966542293, "grad_norm": 19.472158432006836, "learning_rate": 3.5258104298801975e-06, "loss": 0.7875, "step": 3700 }, { "epoch": 0.6339083131673163, "grad_norm": 18.043285369873047, "learning_rate": 3.5214059196617337e-06, "loss": 0.8188, "step": 3710 }, { "epoch": 0.6356169609117026, "grad_norm": 29.622112274169922, "learning_rate": 3.5170014094432704e-06, "loss": 0.7512, "step": 3720 }, { "epoch": 0.6373256086560889, "grad_norm": 20.153039932250977, "learning_rate": 3.5125968992248067e-06, "loss": 0.7823, "step": 3730 }, { "epoch": 0.6390342564004753, "grad_norm": 23.100482940673828, "learning_rate": 3.508192389006343e-06, "loss": 0.8137, "step": 3740 }, { "epoch": 0.6407429041448616, "grad_norm": 23.236019134521484, "learning_rate": 3.5037878787878792e-06, "loss": 0.7014, "step": 3750 }, { "epoch": 0.6424515518892479, "grad_norm": 22.595932006835938, "learning_rate": 3.4993833685694155e-06, "loss": 0.7896, "step": 3760 }, { "epoch": 0.6441601996336341, "grad_norm": 24.64199447631836, "learning_rate": 3.4949788583509513e-06, "loss": 0.7879, "step": 3770 }, { "epoch": 0.6458688473780204, "grad_norm": 17.925630569458008, "learning_rate": 3.4905743481324876e-06, "loss": 0.8212, "step": 3780 }, { "epoch": 0.6475774951224067, "grad_norm": 27.082433700561523, "learning_rate": 3.4861698379140243e-06, "loss": 0.8151, "step": 3790 }, { "epoch": 0.649286142866793, "grad_norm": 19.66040802001953, "learning_rate": 3.4817653276955606e-06, "loss": 0.7707, "step": 3800 }, { "epoch": 0.6509947906111794, "grad_norm": 22.2485408782959, "learning_rate": 3.477360817477097e-06, "loss": 0.742, "step": 3810 }, { "epoch": 0.6527034383555657, "grad_norm": 20.138118743896484, "learning_rate": 3.472956307258633e-06, "loss": 0.8006, "step": 3820 }, { "epoch": 0.6544120860999519, "grad_norm": 21.199825286865234, "learning_rate": 3.4685517970401693e-06, "loss": 0.7662, "step": 3830 }, { "epoch": 0.6561207338443382, "grad_norm": 24.360260009765625, "learning_rate": 3.4641472868217056e-06, "loss": 0.8112, "step": 3840 }, { "epoch": 0.6578293815887245, "grad_norm": 16.09538459777832, "learning_rate": 3.4597427766032423e-06, "loss": 0.7945, "step": 3850 }, { "epoch": 0.6595380293331108, "grad_norm": 22.721424102783203, "learning_rate": 3.4553382663847786e-06, "loss": 0.8033, "step": 3860 }, { "epoch": 0.6612466770774971, "grad_norm": 24.86945343017578, "learning_rate": 3.450933756166315e-06, "loss": 0.7616, "step": 3870 }, { "epoch": 0.6629553248218834, "grad_norm": 23.66960906982422, "learning_rate": 3.446529245947851e-06, "loss": 0.739, "step": 3880 }, { "epoch": 0.6646639725662697, "grad_norm": 26.404010772705078, "learning_rate": 3.4421247357293874e-06, "loss": 0.8027, "step": 3890 }, { "epoch": 0.666372620310656, "grad_norm": 17.85309410095215, "learning_rate": 3.437720225510923e-06, "loss": 0.8071, "step": 3900 }, { "epoch": 0.6680812680550423, "grad_norm": 21.82198143005371, "learning_rate": 3.4333157152924595e-06, "loss": 0.8042, "step": 3910 }, { "epoch": 0.6697899157994286, "grad_norm": 23.275218963623047, "learning_rate": 3.4289112050739957e-06, "loss": 0.7481, "step": 3920 }, { "epoch": 0.6714985635438149, "grad_norm": 21.871013641357422, "learning_rate": 3.4245066948555324e-06, "loss": 0.7892, "step": 3930 }, { "epoch": 0.6732072112882012, "grad_norm": 21.8370418548584, "learning_rate": 3.4201021846370687e-06, "loss": 0.7582, "step": 3940 }, { "epoch": 0.6749158590325874, "grad_norm": 23.338394165039062, "learning_rate": 3.415697674418605e-06, "loss": 0.7742, "step": 3950 }, { "epoch": 0.6766245067769738, "grad_norm": 22.160715103149414, "learning_rate": 3.4112931642001412e-06, "loss": 0.7382, "step": 3960 }, { "epoch": 0.6783331545213601, "grad_norm": 20.671384811401367, "learning_rate": 3.4068886539816775e-06, "loss": 0.7889, "step": 3970 }, { "epoch": 0.6800418022657464, "grad_norm": 25.99142837524414, "learning_rate": 3.4024841437632137e-06, "loss": 0.7906, "step": 3980 }, { "epoch": 0.6817504500101327, "grad_norm": 21.951120376586914, "learning_rate": 3.3980796335447504e-06, "loss": 0.7836, "step": 3990 }, { "epoch": 0.683459097754519, "grad_norm": 19.033308029174805, "learning_rate": 3.3936751233262867e-06, "loss": 0.7602, "step": 4000 }, { "epoch": 0.6851677454989052, "grad_norm": 23.86874008178711, "learning_rate": 3.389270613107823e-06, "loss": 0.7759, "step": 4010 }, { "epoch": 0.6868763932432915, "grad_norm": 19.606098175048828, "learning_rate": 3.3848661028893592e-06, "loss": 0.7874, "step": 4020 }, { "epoch": 0.6885850409876779, "grad_norm": 20.22423553466797, "learning_rate": 3.380461592670895e-06, "loss": 0.825, "step": 4030 }, { "epoch": 0.6902936887320642, "grad_norm": 27.637001037597656, "learning_rate": 3.3760570824524313e-06, "loss": 0.771, "step": 4040 }, { "epoch": 0.6920023364764505, "grad_norm": 18.97125244140625, "learning_rate": 3.3716525722339676e-06, "loss": 0.7249, "step": 4050 }, { "epoch": 0.6937109842208368, "grad_norm": 22.724328994750977, "learning_rate": 3.367248062015504e-06, "loss": 0.746, "step": 4060 }, { "epoch": 0.695419631965223, "grad_norm": 21.274978637695312, "learning_rate": 3.3628435517970406e-06, "loss": 0.7504, "step": 4070 }, { "epoch": 0.6971282797096093, "grad_norm": 23.363569259643555, "learning_rate": 3.358439041578577e-06, "loss": 0.6809, "step": 4080 }, { "epoch": 0.6988369274539956, "grad_norm": 27.47598648071289, "learning_rate": 3.354034531360113e-06, "loss": 0.755, "step": 4090 }, { "epoch": 0.7005455751983819, "grad_norm": 23.85652732849121, "learning_rate": 3.3496300211416494e-06, "loss": 0.7601, "step": 4100 }, { "epoch": 0.7022542229427683, "grad_norm": 18.246395111083984, "learning_rate": 3.3452255109231856e-06, "loss": 0.7201, "step": 4110 }, { "epoch": 0.7039628706871546, "grad_norm": 22.6968936920166, "learning_rate": 3.340821000704722e-06, "loss": 0.772, "step": 4120 }, { "epoch": 0.7056715184315409, "grad_norm": 16.688634872436523, "learning_rate": 3.3364164904862586e-06, "loss": 0.7743, "step": 4130 }, { "epoch": 0.7073801661759271, "grad_norm": 22.384685516357422, "learning_rate": 3.332011980267795e-06, "loss": 0.7562, "step": 4140 }, { "epoch": 0.7090888139203134, "grad_norm": 25.848621368408203, "learning_rate": 3.327607470049331e-06, "loss": 0.778, "step": 4150 }, { "epoch": 0.7107974616646997, "grad_norm": 20.71343231201172, "learning_rate": 3.323202959830867e-06, "loss": 0.7714, "step": 4160 }, { "epoch": 0.712506109409086, "grad_norm": 25.288433074951172, "learning_rate": 3.318798449612403e-06, "loss": 0.7812, "step": 4170 }, { "epoch": 0.7142147571534724, "grad_norm": 25.958364486694336, "learning_rate": 3.3143939393939395e-06, "loss": 0.8008, "step": 4180 }, { "epoch": 0.7159234048978587, "grad_norm": 23.568279266357422, "learning_rate": 3.3099894291754757e-06, "loss": 0.7468, "step": 4190 }, { "epoch": 0.7176320526422449, "grad_norm": 25.222332000732422, "learning_rate": 3.305584918957012e-06, "loss": 0.7379, "step": 4200 }, { "epoch": 0.7193407003866312, "grad_norm": 23.69734764099121, "learning_rate": 3.3011804087385487e-06, "loss": 0.7478, "step": 4210 }, { "epoch": 0.7210493481310175, "grad_norm": 18.56196403503418, "learning_rate": 3.296775898520085e-06, "loss": 0.7341, "step": 4220 }, { "epoch": 0.7227579958754038, "grad_norm": 28.462255477905273, "learning_rate": 3.2923713883016212e-06, "loss": 0.7084, "step": 4230 }, { "epoch": 0.7244666436197901, "grad_norm": 23.669126510620117, "learning_rate": 3.2879668780831575e-06, "loss": 0.741, "step": 4240 }, { "epoch": 0.7261752913641765, "grad_norm": 22.7609920501709, "learning_rate": 3.2835623678646938e-06, "loss": 0.7507, "step": 4250 }, { "epoch": 0.7278839391085627, "grad_norm": 21.962385177612305, "learning_rate": 3.27915785764623e-06, "loss": 0.7521, "step": 4260 }, { "epoch": 0.729592586852949, "grad_norm": 23.406116485595703, "learning_rate": 3.2747533474277667e-06, "loss": 0.7374, "step": 4270 }, { "epoch": 0.7313012345973353, "grad_norm": 25.467397689819336, "learning_rate": 3.270348837209303e-06, "loss": 0.7894, "step": 4280 }, { "epoch": 0.7330098823417216, "grad_norm": 21.29004669189453, "learning_rate": 3.265944326990839e-06, "loss": 0.7763, "step": 4290 }, { "epoch": 0.7347185300861079, "grad_norm": 26.53734588623047, "learning_rate": 3.261539816772375e-06, "loss": 0.7704, "step": 4300 }, { "epoch": 0.7364271778304942, "grad_norm": 26.881288528442383, "learning_rate": 3.2571353065539113e-06, "loss": 0.7655, "step": 4310 }, { "epoch": 0.7381358255748804, "grad_norm": 21.281936645507812, "learning_rate": 3.2527307963354476e-06, "loss": 0.7732, "step": 4320 }, { "epoch": 0.7398444733192668, "grad_norm": 22.189983367919922, "learning_rate": 3.248326286116984e-06, "loss": 0.766, "step": 4330 }, { "epoch": 0.7415531210636531, "grad_norm": 20.438308715820312, "learning_rate": 3.24392177589852e-06, "loss": 0.7765, "step": 4340 }, { "epoch": 0.7432617688080394, "grad_norm": 23.522388458251953, "learning_rate": 3.239517265680057e-06, "loss": 0.7617, "step": 4350 }, { "epoch": 0.7449704165524257, "grad_norm": 27.77216148376465, "learning_rate": 3.235112755461593e-06, "loss": 0.7321, "step": 4360 }, { "epoch": 0.746679064296812, "grad_norm": 25.899330139160156, "learning_rate": 3.2307082452431294e-06, "loss": 0.705, "step": 4370 }, { "epoch": 0.7483877120411982, "grad_norm": 24.98331069946289, "learning_rate": 3.2263037350246656e-06, "loss": 0.7356, "step": 4380 }, { "epoch": 0.7500963597855845, "grad_norm": 22.49882698059082, "learning_rate": 3.221899224806202e-06, "loss": 0.751, "step": 4390 }, { "epoch": 0.7518050075299709, "grad_norm": 21.93841552734375, "learning_rate": 3.217494714587738e-06, "loss": 0.7846, "step": 4400 }, { "epoch": 0.7535136552743572, "grad_norm": 16.464521408081055, "learning_rate": 3.213090204369275e-06, "loss": 0.7572, "step": 4410 }, { "epoch": 0.7552223030187435, "grad_norm": 22.928815841674805, "learning_rate": 3.2086856941508103e-06, "loss": 0.7004, "step": 4420 }, { "epoch": 0.7569309507631298, "grad_norm": 28.229320526123047, "learning_rate": 3.204281183932347e-06, "loss": 0.7712, "step": 4430 }, { "epoch": 0.758639598507516, "grad_norm": 17.726673126220703, "learning_rate": 3.1998766737138832e-06, "loss": 0.7637, "step": 4440 }, { "epoch": 0.7603482462519023, "grad_norm": 23.8514404296875, "learning_rate": 3.1954721634954195e-06, "loss": 0.7547, "step": 4450 }, { "epoch": 0.7620568939962886, "grad_norm": 24.728208541870117, "learning_rate": 3.1910676532769557e-06, "loss": 0.7626, "step": 4460 }, { "epoch": 0.7637655417406749, "grad_norm": 20.695667266845703, "learning_rate": 3.186663143058492e-06, "loss": 0.7339, "step": 4470 }, { "epoch": 0.7654741894850613, "grad_norm": 21.278423309326172, "learning_rate": 3.1822586328400283e-06, "loss": 0.7922, "step": 4480 }, { "epoch": 0.7671828372294476, "grad_norm": 23.316635131835938, "learning_rate": 3.177854122621565e-06, "loss": 0.7576, "step": 4490 }, { "epoch": 0.7688914849738339, "grad_norm": 24.75078010559082, "learning_rate": 3.1734496124031012e-06, "loss": 0.7455, "step": 4500 }, { "epoch": 0.7706001327182201, "grad_norm": 23.322919845581055, "learning_rate": 3.1690451021846375e-06, "loss": 0.7637, "step": 4510 }, { "epoch": 0.7723087804626064, "grad_norm": 26.70413589477539, "learning_rate": 3.1646405919661738e-06, "loss": 0.6983, "step": 4520 }, { "epoch": 0.7740174282069927, "grad_norm": 21.821128845214844, "learning_rate": 3.16023608174771e-06, "loss": 0.7044, "step": 4530 }, { "epoch": 0.775726075951379, "grad_norm": 19.717451095581055, "learning_rate": 3.1558315715292463e-06, "loss": 0.7294, "step": 4540 }, { "epoch": 0.7774347236957654, "grad_norm": 21.586071014404297, "learning_rate": 3.151427061310782e-06, "loss": 0.77, "step": 4550 }, { "epoch": 0.7791433714401517, "grad_norm": 25.673486709594727, "learning_rate": 3.1470225510923184e-06, "loss": 0.7258, "step": 4560 }, { "epoch": 0.780852019184538, "grad_norm": 27.769350051879883, "learning_rate": 3.142618040873855e-06, "loss": 0.797, "step": 4570 }, { "epoch": 0.7825606669289242, "grad_norm": 20.539966583251953, "learning_rate": 3.1382135306553914e-06, "loss": 0.7611, "step": 4580 }, { "epoch": 0.7842693146733105, "grad_norm": 21.524412155151367, "learning_rate": 3.1338090204369276e-06, "loss": 0.7666, "step": 4590 }, { "epoch": 0.7859779624176968, "grad_norm": 22.3591365814209, "learning_rate": 3.129404510218464e-06, "loss": 0.7882, "step": 4600 }, { "epoch": 0.7876866101620831, "grad_norm": 23.00992202758789, "learning_rate": 3.125e-06, "loss": 0.7842, "step": 4610 }, { "epoch": 0.7893952579064695, "grad_norm": 16.515499114990234, "learning_rate": 3.1205954897815364e-06, "loss": 0.7369, "step": 4620 }, { "epoch": 0.7911039056508558, "grad_norm": 21.174406051635742, "learning_rate": 3.116190979563073e-06, "loss": 0.738, "step": 4630 }, { "epoch": 0.792812553395242, "grad_norm": 23.586978912353516, "learning_rate": 3.1117864693446094e-06, "loss": 0.6997, "step": 4640 }, { "epoch": 0.7945212011396283, "grad_norm": 32.21963882446289, "learning_rate": 3.1073819591261456e-06, "loss": 0.7019, "step": 4650 }, { "epoch": 0.7962298488840146, "grad_norm": 25.9871883392334, "learning_rate": 3.102977448907682e-06, "loss": 0.7083, "step": 4660 }, { "epoch": 0.7979384966284009, "grad_norm": 24.332395553588867, "learning_rate": 3.098572938689218e-06, "loss": 0.7863, "step": 4670 }, { "epoch": 0.7996471443727872, "grad_norm": 20.869014739990234, "learning_rate": 3.094168428470754e-06, "loss": 0.7582, "step": 4680 }, { "epoch": 0.8013557921171734, "grad_norm": 22.537940979003906, "learning_rate": 3.0897639182522903e-06, "loss": 0.806, "step": 4690 }, { "epoch": 0.8030644398615598, "grad_norm": 26.17819595336914, "learning_rate": 3.0853594080338265e-06, "loss": 0.743, "step": 4700 }, { "epoch": 0.8047730876059461, "grad_norm": 23.158397674560547, "learning_rate": 3.0809548978153632e-06, "loss": 0.7075, "step": 4710 }, { "epoch": 0.8064817353503324, "grad_norm": 18.984607696533203, "learning_rate": 3.0765503875968995e-06, "loss": 0.7483, "step": 4720 }, { "epoch": 0.8081903830947187, "grad_norm": 22.03697967529297, "learning_rate": 3.0721458773784358e-06, "loss": 0.7295, "step": 4730 }, { "epoch": 0.809899030839105, "grad_norm": 19.310800552368164, "learning_rate": 3.067741367159972e-06, "loss": 0.7566, "step": 4740 }, { "epoch": 0.8116076785834913, "grad_norm": 27.38188934326172, "learning_rate": 3.0633368569415083e-06, "loss": 0.7487, "step": 4750 }, { "epoch": 0.8133163263278775, "grad_norm": 30.696491241455078, "learning_rate": 3.0589323467230446e-06, "loss": 0.7468, "step": 4760 }, { "epoch": 0.8150249740722639, "grad_norm": 25.93939208984375, "learning_rate": 3.0545278365045812e-06, "loss": 0.7608, "step": 4770 }, { "epoch": 0.8167336218166502, "grad_norm": 24.9782772064209, "learning_rate": 3.0501233262861175e-06, "loss": 0.7327, "step": 4780 }, { "epoch": 0.8184422695610365, "grad_norm": 19.76726531982422, "learning_rate": 3.0457188160676538e-06, "loss": 0.7585, "step": 4790 }, { "epoch": 0.8201509173054228, "grad_norm": 24.16695785522461, "learning_rate": 3.04131430584919e-06, "loss": 0.7812, "step": 4800 }, { "epoch": 0.8218595650498091, "grad_norm": 25.34935188293457, "learning_rate": 3.036909795630726e-06, "loss": 0.718, "step": 4810 }, { "epoch": 0.8235682127941953, "grad_norm": 21.88555335998535, "learning_rate": 3.032505285412262e-06, "loss": 0.7264, "step": 4820 }, { "epoch": 0.8252768605385816, "grad_norm": 28.31941795349121, "learning_rate": 3.0281007751937984e-06, "loss": 0.7012, "step": 4830 }, { "epoch": 0.8269855082829679, "grad_norm": 24.483379364013672, "learning_rate": 3.0236962649753347e-06, "loss": 0.6828, "step": 4840 }, { "epoch": 0.8286941560273543, "grad_norm": 22.461471557617188, "learning_rate": 3.0192917547568714e-06, "loss": 0.7266, "step": 4850 }, { "epoch": 0.8304028037717406, "grad_norm": 19.53203773498535, "learning_rate": 3.0148872445384076e-06, "loss": 0.707, "step": 4860 }, { "epoch": 0.8321114515161269, "grad_norm": 26.503684997558594, "learning_rate": 3.010482734319944e-06, "loss": 0.7399, "step": 4870 }, { "epoch": 0.8338200992605131, "grad_norm": 25.25548553466797, "learning_rate": 3.00607822410148e-06, "loss": 0.7094, "step": 4880 }, { "epoch": 0.8355287470048994, "grad_norm": 27.260940551757812, "learning_rate": 3.0016737138830164e-06, "loss": 0.7311, "step": 4890 }, { "epoch": 0.8372373947492857, "grad_norm": 22.992063522338867, "learning_rate": 2.9972692036645527e-06, "loss": 0.7389, "step": 4900 }, { "epoch": 0.838946042493672, "grad_norm": 24.592796325683594, "learning_rate": 2.9928646934460894e-06, "loss": 0.753, "step": 4910 }, { "epoch": 0.8406546902380584, "grad_norm": 21.972124099731445, "learning_rate": 2.9884601832276256e-06, "loss": 0.7347, "step": 4920 }, { "epoch": 0.8423633379824447, "grad_norm": 21.52046775817871, "learning_rate": 2.984055673009162e-06, "loss": 0.6925, "step": 4930 }, { "epoch": 0.844071985726831, "grad_norm": 26.47010040283203, "learning_rate": 2.9796511627906977e-06, "loss": 0.6887, "step": 4940 }, { "epoch": 0.8457806334712172, "grad_norm": 31.673635482788086, "learning_rate": 2.975246652572234e-06, "loss": 0.7223, "step": 4950 }, { "epoch": 0.8474892812156035, "grad_norm": 24.043643951416016, "learning_rate": 2.9708421423537703e-06, "loss": 0.7438, "step": 4960 }, { "epoch": 0.8491979289599898, "grad_norm": 21.57198715209961, "learning_rate": 2.9664376321353065e-06, "loss": 0.7187, "step": 4970 }, { "epoch": 0.8509065767043761, "grad_norm": 27.323469161987305, "learning_rate": 2.962033121916843e-06, "loss": 0.7423, "step": 4980 }, { "epoch": 0.8526152244487625, "grad_norm": 29.45259666442871, "learning_rate": 2.9576286116983795e-06, "loss": 0.7215, "step": 4990 }, { "epoch": 0.8543238721931488, "grad_norm": 25.042516708374023, "learning_rate": 2.9532241014799158e-06, "loss": 0.7226, "step": 5000 }, { "epoch": 0.856032519937535, "grad_norm": 20.377517700195312, "learning_rate": 2.948819591261452e-06, "loss": 0.7774, "step": 5010 }, { "epoch": 0.8577411676819213, "grad_norm": 19.54035758972168, "learning_rate": 2.9444150810429883e-06, "loss": 0.7994, "step": 5020 }, { "epoch": 0.8594498154263076, "grad_norm": 23.07032012939453, "learning_rate": 2.9400105708245246e-06, "loss": 0.7022, "step": 5030 }, { "epoch": 0.8611584631706939, "grad_norm": 23.482563018798828, "learning_rate": 2.935606060606061e-06, "loss": 0.7228, "step": 5040 }, { "epoch": 0.8628671109150802, "grad_norm": 20.6116886138916, "learning_rate": 2.9312015503875975e-06, "loss": 0.6769, "step": 5050 }, { "epoch": 0.8645757586594665, "grad_norm": 23.133941650390625, "learning_rate": 2.9267970401691338e-06, "loss": 0.7216, "step": 5060 }, { "epoch": 0.8662844064038528, "grad_norm": 19.501455307006836, "learning_rate": 2.9223925299506696e-06, "loss": 0.7417, "step": 5070 }, { "epoch": 0.8679930541482391, "grad_norm": 20.669921875, "learning_rate": 2.917988019732206e-06, "loss": 0.7187, "step": 5080 }, { "epoch": 0.8697017018926254, "grad_norm": 24.454565048217773, "learning_rate": 2.913583509513742e-06, "loss": 0.6937, "step": 5090 }, { "epoch": 0.8714103496370117, "grad_norm": 26.569597244262695, "learning_rate": 2.9091789992952784e-06, "loss": 0.7769, "step": 5100 }, { "epoch": 0.873118997381398, "grad_norm": 23.066076278686523, "learning_rate": 2.9047744890768147e-06, "loss": 0.7607, "step": 5110 }, { "epoch": 0.8748276451257843, "grad_norm": 25.31006622314453, "learning_rate": 2.900369978858351e-06, "loss": 0.692, "step": 5120 }, { "epoch": 0.8765362928701705, "grad_norm": 24.027446746826172, "learning_rate": 2.8959654686398876e-06, "loss": 0.6777, "step": 5130 }, { "epoch": 0.8782449406145569, "grad_norm": 21.320232391357422, "learning_rate": 2.891560958421424e-06, "loss": 0.7671, "step": 5140 }, { "epoch": 0.8799535883589432, "grad_norm": 22.37028694152832, "learning_rate": 2.88715644820296e-06, "loss": 0.7451, "step": 5150 }, { "epoch": 0.8816622361033295, "grad_norm": 17.270870208740234, "learning_rate": 2.8827519379844964e-06, "loss": 0.7202, "step": 5160 }, { "epoch": 0.8833708838477158, "grad_norm": 32.7978401184082, "learning_rate": 2.8783474277660327e-06, "loss": 0.6955, "step": 5170 }, { "epoch": 0.8850795315921021, "grad_norm": 30.631633758544922, "learning_rate": 2.873942917547569e-06, "loss": 0.7657, "step": 5180 }, { "epoch": 0.8867881793364883, "grad_norm": 21.487262725830078, "learning_rate": 2.8695384073291056e-06, "loss": 0.723, "step": 5190 }, { "epoch": 0.8884968270808746, "grad_norm": 25.697208404541016, "learning_rate": 2.865133897110641e-06, "loss": 0.7678, "step": 5200 }, { "epoch": 0.8902054748252609, "grad_norm": 17.121862411499023, "learning_rate": 2.8607293868921778e-06, "loss": 0.736, "step": 5210 }, { "epoch": 0.8919141225696473, "grad_norm": 22.777664184570312, "learning_rate": 2.856324876673714e-06, "loss": 0.6902, "step": 5220 }, { "epoch": 0.8936227703140336, "grad_norm": 21.997692108154297, "learning_rate": 2.8519203664552503e-06, "loss": 0.7177, "step": 5230 }, { "epoch": 0.8953314180584199, "grad_norm": 27.831954956054688, "learning_rate": 2.8475158562367866e-06, "loss": 0.7061, "step": 5240 }, { "epoch": 0.8970400658028062, "grad_norm": 20.20841407775879, "learning_rate": 2.843111346018323e-06, "loss": 0.7103, "step": 5250 }, { "epoch": 0.8987487135471924, "grad_norm": 22.670791625976562, "learning_rate": 2.838706835799859e-06, "loss": 0.6827, "step": 5260 }, { "epoch": 0.9004573612915787, "grad_norm": 19.863046646118164, "learning_rate": 2.8343023255813958e-06, "loss": 0.7659, "step": 5270 }, { "epoch": 0.902166009035965, "grad_norm": 21.82082176208496, "learning_rate": 2.829897815362932e-06, "loss": 0.6768, "step": 5280 }, { "epoch": 0.9038746567803514, "grad_norm": 20.27167320251465, "learning_rate": 2.8254933051444683e-06, "loss": 0.7313, "step": 5290 }, { "epoch": 0.9055833045247377, "grad_norm": 23.12434196472168, "learning_rate": 2.8210887949260046e-06, "loss": 0.7167, "step": 5300 }, { "epoch": 0.907291952269124, "grad_norm": 28.580188751220703, "learning_rate": 2.816684284707541e-06, "loss": 0.707, "step": 5310 }, { "epoch": 0.9090006000135102, "grad_norm": 21.65957260131836, "learning_rate": 2.812279774489077e-06, "loss": 0.7243, "step": 5320 }, { "epoch": 0.9107092477578965, "grad_norm": 18.040910720825195, "learning_rate": 2.807875264270613e-06, "loss": 0.714, "step": 5330 }, { "epoch": 0.9124178955022828, "grad_norm": 25.710729598999023, "learning_rate": 2.803470754052149e-06, "loss": 0.7092, "step": 5340 }, { "epoch": 0.9141265432466691, "grad_norm": 23.372407913208008, "learning_rate": 2.799066243833686e-06, "loss": 0.6511, "step": 5350 }, { "epoch": 0.9158351909910555, "grad_norm": 24.783931732177734, "learning_rate": 2.794661733615222e-06, "loss": 0.6906, "step": 5360 }, { "epoch": 0.9175438387354418, "grad_norm": 25.27882194519043, "learning_rate": 2.7902572233967584e-06, "loss": 0.686, "step": 5370 }, { "epoch": 0.919252486479828, "grad_norm": 22.388492584228516, "learning_rate": 2.7858527131782947e-06, "loss": 0.6987, "step": 5380 }, { "epoch": 0.9209611342242143, "grad_norm": 20.66554832458496, "learning_rate": 2.781448202959831e-06, "loss": 0.6719, "step": 5390 }, { "epoch": 0.9226697819686006, "grad_norm": 21.613603591918945, "learning_rate": 2.7770436927413672e-06, "loss": 0.7096, "step": 5400 }, { "epoch": 0.9243784297129869, "grad_norm": 22.45414161682129, "learning_rate": 2.772639182522904e-06, "loss": 0.6965, "step": 5410 }, { "epoch": 0.9260870774573732, "grad_norm": 22.07771110534668, "learning_rate": 2.76823467230444e-06, "loss": 0.6987, "step": 5420 }, { "epoch": 0.9277957252017595, "grad_norm": 22.09317970275879, "learning_rate": 2.7638301620859764e-06, "loss": 0.7033, "step": 5430 }, { "epoch": 0.9295043729461459, "grad_norm": 22.29112434387207, "learning_rate": 2.7594256518675127e-06, "loss": 0.7126, "step": 5440 }, { "epoch": 0.9312130206905321, "grad_norm": 25.35603141784668, "learning_rate": 2.755021141649049e-06, "loss": 0.6872, "step": 5450 }, { "epoch": 0.9329216684349184, "grad_norm": 26.43601417541504, "learning_rate": 2.750616631430585e-06, "loss": 0.6884, "step": 5460 }, { "epoch": 0.9346303161793047, "grad_norm": 22.09392738342285, "learning_rate": 2.746212121212121e-06, "loss": 0.7547, "step": 5470 }, { "epoch": 0.936338963923691, "grad_norm": 18.14749526977539, "learning_rate": 2.7418076109936578e-06, "loss": 0.7237, "step": 5480 }, { "epoch": 0.9380476116680773, "grad_norm": 25.575664520263672, "learning_rate": 2.737403100775194e-06, "loss": 0.7274, "step": 5490 }, { "epoch": 0.9397562594124635, "grad_norm": 30.890182495117188, "learning_rate": 2.7329985905567303e-06, "loss": 0.657, "step": 5500 }, { "epoch": 0.9414649071568499, "grad_norm": 25.72110939025879, "learning_rate": 2.7285940803382666e-06, "loss": 0.6839, "step": 5510 }, { "epoch": 0.9431735549012362, "grad_norm": 19.578693389892578, "learning_rate": 2.724189570119803e-06, "loss": 0.7292, "step": 5520 }, { "epoch": 0.9448822026456225, "grad_norm": 19.946809768676758, "learning_rate": 2.719785059901339e-06, "loss": 0.7023, "step": 5530 }, { "epoch": 0.9465908503900088, "grad_norm": 29.049034118652344, "learning_rate": 2.7153805496828758e-06, "loss": 0.6711, "step": 5540 }, { "epoch": 0.9482994981343951, "grad_norm": 26.28841781616211, "learning_rate": 2.710976039464412e-06, "loss": 0.6878, "step": 5550 }, { "epoch": 0.9500081458787814, "grad_norm": 28.58267593383789, "learning_rate": 2.7065715292459483e-06, "loss": 0.6849, "step": 5560 }, { "epoch": 0.9517167936231676, "grad_norm": 23.82330894470215, "learning_rate": 2.7021670190274846e-06, "loss": 0.6599, "step": 5570 }, { "epoch": 0.9534254413675539, "grad_norm": 23.920379638671875, "learning_rate": 2.697762508809021e-06, "loss": 0.7053, "step": 5580 }, { "epoch": 0.9551340891119403, "grad_norm": 21.284543991088867, "learning_rate": 2.6933579985905567e-06, "loss": 0.6852, "step": 5590 }, { "epoch": 0.9568427368563266, "grad_norm": 25.53280258178711, "learning_rate": 2.688953488372093e-06, "loss": 0.7453, "step": 5600 }, { "epoch": 0.9585513846007129, "grad_norm": 25.06231689453125, "learning_rate": 2.684548978153629e-06, "loss": 0.7138, "step": 5610 }, { "epoch": 0.9602600323450992, "grad_norm": 23.394676208496094, "learning_rate": 2.680144467935166e-06, "loss": 0.7542, "step": 5620 }, { "epoch": 0.9619686800894854, "grad_norm": 18.197351455688477, "learning_rate": 2.675739957716702e-06, "loss": 0.6856, "step": 5630 }, { "epoch": 0.9636773278338717, "grad_norm": 20.14853286743164, "learning_rate": 2.6713354474982384e-06, "loss": 0.7383, "step": 5640 }, { "epoch": 0.965385975578258, "grad_norm": 19.874074935913086, "learning_rate": 2.6669309372797747e-06, "loss": 0.7068, "step": 5650 }, { "epoch": 0.9670946233226444, "grad_norm": 21.235719680786133, "learning_rate": 2.662526427061311e-06, "loss": 0.7039, "step": 5660 }, { "epoch": 0.9688032710670307, "grad_norm": 24.528348922729492, "learning_rate": 2.6581219168428472e-06, "loss": 0.7245, "step": 5670 }, { "epoch": 0.970511918811417, "grad_norm": 23.650028228759766, "learning_rate": 2.653717406624384e-06, "loss": 0.6733, "step": 5680 }, { "epoch": 0.9722205665558032, "grad_norm": 24.60836410522461, "learning_rate": 2.64931289640592e-06, "loss": 0.6852, "step": 5690 }, { "epoch": 0.9739292143001895, "grad_norm": 21.282657623291016, "learning_rate": 2.6449083861874564e-06, "loss": 0.6296, "step": 5700 }, { "epoch": 0.9756378620445758, "grad_norm": 25.645389556884766, "learning_rate": 2.6405038759689927e-06, "loss": 0.716, "step": 5710 }, { "epoch": 0.9773465097889621, "grad_norm": 27.467487335205078, "learning_rate": 2.6360993657505286e-06, "loss": 0.6401, "step": 5720 }, { "epoch": 0.9790551575333485, "grad_norm": 24.290742874145508, "learning_rate": 2.631694855532065e-06, "loss": 0.7524, "step": 5730 }, { "epoch": 0.9807638052777348, "grad_norm": 25.512126922607422, "learning_rate": 2.627290345313601e-06, "loss": 0.7001, "step": 5740 }, { "epoch": 0.982472453022121, "grad_norm": 25.331077575683594, "learning_rate": 2.6228858350951373e-06, "loss": 0.7483, "step": 5750 }, { "epoch": 0.9841811007665073, "grad_norm": 27.04343032836914, "learning_rate": 2.618481324876674e-06, "loss": 0.6901, "step": 5760 }, { "epoch": 0.9858897485108936, "grad_norm": 22.927780151367188, "learning_rate": 2.6140768146582103e-06, "loss": 0.6261, "step": 5770 }, { "epoch": 0.9875983962552799, "grad_norm": 19.579212188720703, "learning_rate": 2.6096723044397466e-06, "loss": 0.6957, "step": 5780 }, { "epoch": 0.9893070439996662, "grad_norm": 29.58092498779297, "learning_rate": 2.605267794221283e-06, "loss": 0.7329, "step": 5790 }, { "epoch": 0.9910156917440525, "grad_norm": 17.51485252380371, "learning_rate": 2.600863284002819e-06, "loss": 0.6811, "step": 5800 }, { "epoch": 0.9927243394884389, "grad_norm": 24.039936065673828, "learning_rate": 2.5964587737843554e-06, "loss": 0.7713, "step": 5810 }, { "epoch": 0.9944329872328251, "grad_norm": 21.120576858520508, "learning_rate": 2.592054263565892e-06, "loss": 0.675, "step": 5820 }, { "epoch": 0.9961416349772114, "grad_norm": 21.085262298583984, "learning_rate": 2.5876497533474283e-06, "loss": 0.6972, "step": 5830 }, { "epoch": 0.9978502827215977, "grad_norm": 23.86699104309082, "learning_rate": 2.5832452431289646e-06, "loss": 0.7248, "step": 5840 }, { "epoch": 0.999558930465984, "grad_norm": 22.0477237701416, "learning_rate": 2.5788407329105004e-06, "loss": 0.6543, "step": 5850 }, { "epoch": 0.9999006600148612, "eval_loss": 0.848216712474823, "eval_runtime": 139.044, "eval_samples_per_second": 70.906, "eval_steps_per_second": 8.868, "step": 5852 }, { "epoch": 1.001366918195509, "grad_norm": 17.0991268157959, "learning_rate": 2.5744362226920367e-06, "loss": 0.6552, "step": 5860 }, { "epoch": 1.0030755659398953, "grad_norm": 25.986722946166992, "learning_rate": 2.570031712473573e-06, "loss": 0.4587, "step": 5870 }, { "epoch": 1.0047842136842817, "grad_norm": 30.915128707885742, "learning_rate": 2.5656272022551092e-06, "loss": 0.429, "step": 5880 }, { "epoch": 1.006492861428668, "grad_norm": 25.264280319213867, "learning_rate": 2.5612226920366455e-06, "loss": 0.4217, "step": 5890 }, { "epoch": 1.0082015091730543, "grad_norm": 30.28125, "learning_rate": 2.556818181818182e-06, "loss": 0.4479, "step": 5900 }, { "epoch": 1.0099101569174405, "grad_norm": 22.753318786621094, "learning_rate": 2.5524136715997184e-06, "loss": 0.4314, "step": 5910 }, { "epoch": 1.0116188046618269, "grad_norm": 31.079761505126953, "learning_rate": 2.5480091613812547e-06, "loss": 0.4476, "step": 5920 }, { "epoch": 1.013327452406213, "grad_norm": 23.833829879760742, "learning_rate": 2.543604651162791e-06, "loss": 0.4658, "step": 5930 }, { "epoch": 1.0150361001505994, "grad_norm": 29.113054275512695, "learning_rate": 2.5392001409443272e-06, "loss": 0.4135, "step": 5940 }, { "epoch": 1.0167447478949858, "grad_norm": 33.021793365478516, "learning_rate": 2.5347956307258635e-06, "loss": 0.455, "step": 5950 }, { "epoch": 1.018453395639372, "grad_norm": 25.65928840637207, "learning_rate": 2.5303911205074e-06, "loss": 0.4224, "step": 5960 }, { "epoch": 1.0201620433837584, "grad_norm": 29.965845108032227, "learning_rate": 2.5259866102889365e-06, "loss": 0.4035, "step": 5970 }, { "epoch": 1.0218706911281445, "grad_norm": 27.758283615112305, "learning_rate": 2.5215821000704723e-06, "loss": 0.4181, "step": 5980 }, { "epoch": 1.023579338872531, "grad_norm": 27.05050277709961, "learning_rate": 2.5171775898520086e-06, "loss": 0.4385, "step": 5990 }, { "epoch": 1.0252879866169171, "grad_norm": 32.60606002807617, "learning_rate": 2.512773079633545e-06, "loss": 0.4366, "step": 6000 }, { "epoch": 1.0269966343613035, "grad_norm": 26.773223876953125, "learning_rate": 2.508368569415081e-06, "loss": 0.3887, "step": 6010 }, { "epoch": 1.02870528210569, "grad_norm": 28.4731502532959, "learning_rate": 2.5039640591966174e-06, "loss": 0.4236, "step": 6020 }, { "epoch": 1.030413929850076, "grad_norm": 27.751144409179688, "learning_rate": 2.4995595489781536e-06, "loss": 0.4405, "step": 6030 }, { "epoch": 1.0321225775944625, "grad_norm": 30.297574996948242, "learning_rate": 2.4951550387596903e-06, "loss": 0.4587, "step": 6040 }, { "epoch": 1.0338312253388486, "grad_norm": 27.601472854614258, "learning_rate": 2.4907505285412266e-06, "loss": 0.4233, "step": 6050 }, { "epoch": 1.035539873083235, "grad_norm": 28.992273330688477, "learning_rate": 2.486346018322763e-06, "loss": 0.3768, "step": 6060 }, { "epoch": 1.0372485208276212, "grad_norm": 30.652511596679688, "learning_rate": 2.481941508104299e-06, "loss": 0.4453, "step": 6070 }, { "epoch": 1.0389571685720076, "grad_norm": 27.534353256225586, "learning_rate": 2.4775369978858354e-06, "loss": 0.4561, "step": 6080 }, { "epoch": 1.0406658163163938, "grad_norm": 33.124420166015625, "learning_rate": 2.4731324876673716e-06, "loss": 0.4058, "step": 6090 }, { "epoch": 1.0423744640607802, "grad_norm": 27.169292449951172, "learning_rate": 2.468727977448908e-06, "loss": 0.3672, "step": 6100 }, { "epoch": 1.0440831118051666, "grad_norm": 34.9072151184082, "learning_rate": 2.464323467230444e-06, "loss": 0.3951, "step": 6110 }, { "epoch": 1.0457917595495527, "grad_norm": 19.93570899963379, "learning_rate": 2.4599189570119804e-06, "loss": 0.4027, "step": 6120 }, { "epoch": 1.0475004072939391, "grad_norm": 20.85097312927246, "learning_rate": 2.4555144467935167e-06, "loss": 0.3867, "step": 6130 }, { "epoch": 1.0492090550383253, "grad_norm": 39.88778305053711, "learning_rate": 2.451109936575053e-06, "loss": 0.4293, "step": 6140 }, { "epoch": 1.0509177027827117, "grad_norm": 24.23454475402832, "learning_rate": 2.4467054263565892e-06, "loss": 0.4324, "step": 6150 }, { "epoch": 1.0526263505270979, "grad_norm": 26.025646209716797, "learning_rate": 2.4423009161381255e-06, "loss": 0.4314, "step": 6160 }, { "epoch": 1.0543349982714842, "grad_norm": 32.79511260986328, "learning_rate": 2.4378964059196618e-06, "loss": 0.3892, "step": 6170 }, { "epoch": 1.0560436460158706, "grad_norm": 29.5976505279541, "learning_rate": 2.4334918957011984e-06, "loss": 0.4593, "step": 6180 }, { "epoch": 1.0577522937602568, "grad_norm": 31.08228874206543, "learning_rate": 2.4290873854827347e-06, "loss": 0.3868, "step": 6190 }, { "epoch": 1.0594609415046432, "grad_norm": 32.12119674682617, "learning_rate": 2.424682875264271e-06, "loss": 0.4333, "step": 6200 }, { "epoch": 1.0611695892490294, "grad_norm": 41.33872985839844, "learning_rate": 2.420278365045807e-06, "loss": 0.4146, "step": 6210 }, { "epoch": 1.0628782369934158, "grad_norm": 41.04495620727539, "learning_rate": 2.4158738548273435e-06, "loss": 0.465, "step": 6220 }, { "epoch": 1.064586884737802, "grad_norm": 36.077674865722656, "learning_rate": 2.4114693446088798e-06, "loss": 0.4483, "step": 6230 }, { "epoch": 1.0662955324821883, "grad_norm": 27.00971221923828, "learning_rate": 2.407064834390416e-06, "loss": 0.4006, "step": 6240 }, { "epoch": 1.0680041802265747, "grad_norm": 26.599790573120117, "learning_rate": 2.4026603241719523e-06, "loss": 0.4588, "step": 6250 }, { "epoch": 1.069712827970961, "grad_norm": 32.89334487915039, "learning_rate": 2.3982558139534886e-06, "loss": 0.4048, "step": 6260 }, { "epoch": 1.0714214757153473, "grad_norm": 28.60340118408203, "learning_rate": 2.393851303735025e-06, "loss": 0.4211, "step": 6270 }, { "epoch": 1.0731301234597335, "grad_norm": 33.43773651123047, "learning_rate": 2.389446793516561e-06, "loss": 0.449, "step": 6280 }, { "epoch": 1.0748387712041199, "grad_norm": 24.94864273071289, "learning_rate": 2.3850422832980974e-06, "loss": 0.4117, "step": 6290 }, { "epoch": 1.076547418948506, "grad_norm": 40.33943557739258, "learning_rate": 2.3806377730796336e-06, "loss": 0.399, "step": 6300 }, { "epoch": 1.0782560666928924, "grad_norm": 21.64677619934082, "learning_rate": 2.37623326286117e-06, "loss": 0.3691, "step": 6310 }, { "epoch": 1.0799647144372788, "grad_norm": 24.09543800354004, "learning_rate": 2.3718287526427066e-06, "loss": 0.4372, "step": 6320 }, { "epoch": 1.081673362181665, "grad_norm": 38.64820861816406, "learning_rate": 2.367424242424243e-06, "loss": 0.4074, "step": 6330 }, { "epoch": 1.0833820099260514, "grad_norm": 25.985990524291992, "learning_rate": 2.3630197322057787e-06, "loss": 0.4339, "step": 6340 }, { "epoch": 1.0850906576704376, "grad_norm": 24.800357818603516, "learning_rate": 2.358615221987315e-06, "loss": 0.4702, "step": 6350 }, { "epoch": 1.086799305414824, "grad_norm": 21.988859176635742, "learning_rate": 2.3542107117688516e-06, "loss": 0.4824, "step": 6360 }, { "epoch": 1.0885079531592101, "grad_norm": 39.72243881225586, "learning_rate": 2.349806201550388e-06, "loss": 0.4188, "step": 6370 }, { "epoch": 1.0902166009035965, "grad_norm": 23.119239807128906, "learning_rate": 2.345401691331924e-06, "loss": 0.4573, "step": 6380 }, { "epoch": 1.091925248647983, "grad_norm": 20.915830612182617, "learning_rate": 2.3409971811134604e-06, "loss": 0.4256, "step": 6390 }, { "epoch": 1.093633896392369, "grad_norm": 25.44793701171875, "learning_rate": 2.3365926708949967e-06, "loss": 0.416, "step": 6400 }, { "epoch": 1.0953425441367555, "grad_norm": 34.23642349243164, "learning_rate": 2.332188160676533e-06, "loss": 0.3993, "step": 6410 }, { "epoch": 1.0970511918811416, "grad_norm": 26.873048782348633, "learning_rate": 2.3277836504580692e-06, "loss": 0.4516, "step": 6420 }, { "epoch": 1.098759839625528, "grad_norm": 29.566207885742188, "learning_rate": 2.3233791402396055e-06, "loss": 0.4385, "step": 6430 }, { "epoch": 1.1004684873699142, "grad_norm": 38.95769500732422, "learning_rate": 2.3189746300211418e-06, "loss": 0.4334, "step": 6440 }, { "epoch": 1.1021771351143006, "grad_norm": 22.23900604248047, "learning_rate": 2.314570119802678e-06, "loss": 0.4133, "step": 6450 }, { "epoch": 1.1038857828586868, "grad_norm": 32.352516174316406, "learning_rate": 2.3101656095842147e-06, "loss": 0.4445, "step": 6460 }, { "epoch": 1.1055944306030732, "grad_norm": 27.49093246459961, "learning_rate": 2.3057610993657506e-06, "loss": 0.458, "step": 6470 }, { "epoch": 1.1073030783474596, "grad_norm": 22.2708740234375, "learning_rate": 2.301356589147287e-06, "loss": 0.4362, "step": 6480 }, { "epoch": 1.1090117260918457, "grad_norm": 28.47286605834961, "learning_rate": 2.296952078928823e-06, "loss": 0.4389, "step": 6490 }, { "epoch": 1.1107203738362321, "grad_norm": 33.60470199584961, "learning_rate": 2.2925475687103598e-06, "loss": 0.434, "step": 6500 }, { "epoch": 1.1124290215806183, "grad_norm": 25.99380874633789, "learning_rate": 2.288143058491896e-06, "loss": 0.4128, "step": 6510 }, { "epoch": 1.1141376693250047, "grad_norm": 23.311767578125, "learning_rate": 2.2837385482734323e-06, "loss": 0.3983, "step": 6520 }, { "epoch": 1.115846317069391, "grad_norm": 42.49270248413086, "learning_rate": 2.2793340380549686e-06, "loss": 0.4046, "step": 6530 }, { "epoch": 1.1175549648137773, "grad_norm": 34.39870834350586, "learning_rate": 2.274929527836505e-06, "loss": 0.4225, "step": 6540 }, { "epoch": 1.1192636125581636, "grad_norm": 58.568817138671875, "learning_rate": 2.270525017618041e-06, "loss": 0.403, "step": 6550 }, { "epoch": 1.1209722603025498, "grad_norm": 40.097511291503906, "learning_rate": 2.2661205073995774e-06, "loss": 0.4099, "step": 6560 }, { "epoch": 1.1226809080469362, "grad_norm": 22.363500595092773, "learning_rate": 2.2617159971811136e-06, "loss": 0.4422, "step": 6570 }, { "epoch": 1.1243895557913224, "grad_norm": 29.2000732421875, "learning_rate": 2.25731148696265e-06, "loss": 0.4302, "step": 6580 }, { "epoch": 1.1260982035357088, "grad_norm": 25.729015350341797, "learning_rate": 2.252906976744186e-06, "loss": 0.4165, "step": 6590 }, { "epoch": 1.127806851280095, "grad_norm": 41.546085357666016, "learning_rate": 2.2485024665257224e-06, "loss": 0.4259, "step": 6600 }, { "epoch": 1.1295154990244813, "grad_norm": 27.8181095123291, "learning_rate": 2.2440979563072587e-06, "loss": 0.4418, "step": 6610 }, { "epoch": 1.1312241467688677, "grad_norm": 27.532690048217773, "learning_rate": 2.239693446088795e-06, "loss": 0.4072, "step": 6620 }, { "epoch": 1.132932794513254, "grad_norm": 38.94101333618164, "learning_rate": 2.2352889358703312e-06, "loss": 0.4003, "step": 6630 }, { "epoch": 1.1346414422576403, "grad_norm": 28.348133087158203, "learning_rate": 2.230884425651868e-06, "loss": 0.3975, "step": 6640 }, { "epoch": 1.1363500900020265, "grad_norm": 37.84804916381836, "learning_rate": 2.226479915433404e-06, "loss": 0.4254, "step": 6650 }, { "epoch": 1.1380587377464129, "grad_norm": 26.082874298095703, "learning_rate": 2.2220754052149404e-06, "loss": 0.3791, "step": 6660 }, { "epoch": 1.139767385490799, "grad_norm": 28.30021095275879, "learning_rate": 2.2176708949964763e-06, "loss": 0.4773, "step": 6670 }, { "epoch": 1.1414760332351854, "grad_norm": 23.014328002929688, "learning_rate": 2.213266384778013e-06, "loss": 0.4389, "step": 6680 }, { "epoch": 1.1431846809795718, "grad_norm": 22.675397872924805, "learning_rate": 2.2088618745595492e-06, "loss": 0.4072, "step": 6690 }, { "epoch": 1.144893328723958, "grad_norm": 37.76887893676758, "learning_rate": 2.2044573643410855e-06, "loss": 0.4555, "step": 6700 }, { "epoch": 1.1466019764683444, "grad_norm": 25.542505264282227, "learning_rate": 2.2000528541226218e-06, "loss": 0.421, "step": 6710 }, { "epoch": 1.1483106242127306, "grad_norm": 26.092363357543945, "learning_rate": 2.195648343904158e-06, "loss": 0.433, "step": 6720 }, { "epoch": 1.150019271957117, "grad_norm": 38.503875732421875, "learning_rate": 2.1912438336856943e-06, "loss": 0.4213, "step": 6730 }, { "epoch": 1.1517279197015031, "grad_norm": 29.79505157470703, "learning_rate": 2.1868393234672306e-06, "loss": 0.3697, "step": 6740 }, { "epoch": 1.1534365674458895, "grad_norm": 31.861398696899414, "learning_rate": 2.182434813248767e-06, "loss": 0.4401, "step": 6750 }, { "epoch": 1.155145215190276, "grad_norm": 25.554759979248047, "learning_rate": 2.178030303030303e-06, "loss": 0.444, "step": 6760 }, { "epoch": 1.156853862934662, "grad_norm": 25.79574966430664, "learning_rate": 2.1736257928118394e-06, "loss": 0.4126, "step": 6770 }, { "epoch": 1.1585625106790485, "grad_norm": 30.038307189941406, "learning_rate": 2.169221282593376e-06, "loss": 0.4217, "step": 6780 }, { "epoch": 1.1602711584234346, "grad_norm": 29.3577823638916, "learning_rate": 2.1648167723749123e-06, "loss": 0.4702, "step": 6790 }, { "epoch": 1.161979806167821, "grad_norm": 28.816720962524414, "learning_rate": 2.160412262156448e-06, "loss": 0.4295, "step": 6800 }, { "epoch": 1.1636884539122072, "grad_norm": 27.419452667236328, "learning_rate": 2.1560077519379844e-06, "loss": 0.4234, "step": 6810 }, { "epoch": 1.1653971016565936, "grad_norm": 26.20050048828125, "learning_rate": 2.151603241719521e-06, "loss": 0.387, "step": 6820 }, { "epoch": 1.1671057494009798, "grad_norm": 25.682668685913086, "learning_rate": 2.1471987315010574e-06, "loss": 0.3946, "step": 6830 }, { "epoch": 1.1688143971453662, "grad_norm": 28.867799758911133, "learning_rate": 2.1427942212825936e-06, "loss": 0.4315, "step": 6840 }, { "epoch": 1.1705230448897526, "grad_norm": 27.76809310913086, "learning_rate": 2.13838971106413e-06, "loss": 0.4153, "step": 6850 }, { "epoch": 1.1722316926341387, "grad_norm": 55.45150375366211, "learning_rate": 2.133985200845666e-06, "loss": 0.4121, "step": 6860 }, { "epoch": 1.1739403403785251, "grad_norm": 35.20660400390625, "learning_rate": 2.1295806906272024e-06, "loss": 0.4489, "step": 6870 }, { "epoch": 1.1756489881229113, "grad_norm": 31.571155548095703, "learning_rate": 2.1251761804087387e-06, "loss": 0.4098, "step": 6880 }, { "epoch": 1.1773576358672977, "grad_norm": 39.88226318359375, "learning_rate": 2.120771670190275e-06, "loss": 0.3931, "step": 6890 }, { "epoch": 1.179066283611684, "grad_norm": 36.098209381103516, "learning_rate": 2.1163671599718112e-06, "loss": 0.3719, "step": 6900 }, { "epoch": 1.1807749313560703, "grad_norm": 27.275989532470703, "learning_rate": 2.1119626497533475e-06, "loss": 0.4229, "step": 6910 }, { "epoch": 1.1824835791004566, "grad_norm": 33.59117126464844, "learning_rate": 2.107558139534884e-06, "loss": 0.4148, "step": 6920 }, { "epoch": 1.1841922268448428, "grad_norm": 34.12028884887695, "learning_rate": 2.10315362931642e-06, "loss": 0.4032, "step": 6930 }, { "epoch": 1.1859008745892292, "grad_norm": 30.586210250854492, "learning_rate": 2.0987491190979563e-06, "loss": 0.3997, "step": 6940 }, { "epoch": 1.1876095223336154, "grad_norm": 37.81381607055664, "learning_rate": 2.0943446088794926e-06, "loss": 0.355, "step": 6950 }, { "epoch": 1.1893181700780018, "grad_norm": 24.5543270111084, "learning_rate": 2.0899400986610292e-06, "loss": 0.4053, "step": 6960 }, { "epoch": 1.191026817822388, "grad_norm": 30.73529052734375, "learning_rate": 2.0855355884425655e-06, "loss": 0.366, "step": 6970 }, { "epoch": 1.1927354655667743, "grad_norm": 39.325965881347656, "learning_rate": 2.0811310782241018e-06, "loss": 0.4275, "step": 6980 }, { "epoch": 1.1944441133111607, "grad_norm": 23.889474868774414, "learning_rate": 2.0767265680056376e-06, "loss": 0.3998, "step": 6990 }, { "epoch": 1.196152761055547, "grad_norm": 40.00243377685547, "learning_rate": 2.0723220577871743e-06, "loss": 0.4105, "step": 7000 }, { "epoch": 1.1978614087999333, "grad_norm": 29.528654098510742, "learning_rate": 2.0679175475687106e-06, "loss": 0.3773, "step": 7010 }, { "epoch": 1.1995700565443195, "grad_norm": 36.32196044921875, "learning_rate": 2.063513037350247e-06, "loss": 0.3841, "step": 7020 }, { "epoch": 1.2012787042887059, "grad_norm": 27.748289108276367, "learning_rate": 2.059108527131783e-06, "loss": 0.4538, "step": 7030 }, { "epoch": 1.202987352033092, "grad_norm": 28.619266510009766, "learning_rate": 2.0547040169133194e-06, "loss": 0.4644, "step": 7040 }, { "epoch": 1.2046959997774784, "grad_norm": 35.11726379394531, "learning_rate": 2.0502995066948556e-06, "loss": 0.4483, "step": 7050 }, { "epoch": 1.2064046475218648, "grad_norm": 26.959434509277344, "learning_rate": 2.045894996476392e-06, "loss": 0.4232, "step": 7060 }, { "epoch": 1.208113295266251, "grad_norm": 28.872108459472656, "learning_rate": 2.041490486257928e-06, "loss": 0.4432, "step": 7070 }, { "epoch": 1.2098219430106374, "grad_norm": 28.600481033325195, "learning_rate": 2.0370859760394644e-06, "loss": 0.4602, "step": 7080 }, { "epoch": 1.2115305907550236, "grad_norm": 29.9169864654541, "learning_rate": 2.0326814658210007e-06, "loss": 0.4027, "step": 7090 }, { "epoch": 1.21323923849941, "grad_norm": 25.34281349182129, "learning_rate": 2.0282769556025374e-06, "loss": 0.3764, "step": 7100 }, { "epoch": 1.2149478862437961, "grad_norm": 26.075227737426758, "learning_rate": 2.0238724453840736e-06, "loss": 0.4072, "step": 7110 }, { "epoch": 1.2166565339881825, "grad_norm": 36.243865966796875, "learning_rate": 2.0194679351656095e-06, "loss": 0.4358, "step": 7120 }, { "epoch": 1.218365181732569, "grad_norm": 30.791261672973633, "learning_rate": 2.0150634249471458e-06, "loss": 0.4268, "step": 7130 }, { "epoch": 1.220073829476955, "grad_norm": 31.967105865478516, "learning_rate": 2.0106589147286824e-06, "loss": 0.4535, "step": 7140 }, { "epoch": 1.2217824772213415, "grad_norm": 22.780460357666016, "learning_rate": 2.0062544045102187e-06, "loss": 0.3439, "step": 7150 }, { "epoch": 1.2234911249657276, "grad_norm": 24.721939086914062, "learning_rate": 2.001849894291755e-06, "loss": 0.3946, "step": 7160 }, { "epoch": 1.225199772710114, "grad_norm": 31.781126022338867, "learning_rate": 1.9974453840732912e-06, "loss": 0.4221, "step": 7170 }, { "epoch": 1.2269084204545002, "grad_norm": 39.08473587036133, "learning_rate": 1.9930408738548275e-06, "loss": 0.4108, "step": 7180 }, { "epoch": 1.2286170681988866, "grad_norm": 32.67459487915039, "learning_rate": 1.9886363636363638e-06, "loss": 0.4214, "step": 7190 }, { "epoch": 1.2303257159432728, "grad_norm": 36.37043762207031, "learning_rate": 1.9842318534179e-06, "loss": 0.3654, "step": 7200 }, { "epoch": 1.2320343636876592, "grad_norm": 30.632551193237305, "learning_rate": 1.9798273431994363e-06, "loss": 0.3435, "step": 7210 }, { "epoch": 1.2337430114320456, "grad_norm": 27.24967384338379, "learning_rate": 1.9754228329809726e-06, "loss": 0.3853, "step": 7220 }, { "epoch": 1.2354516591764317, "grad_norm": 34.78539276123047, "learning_rate": 1.971018322762509e-06, "loss": 0.4485, "step": 7230 }, { "epoch": 1.2371603069208181, "grad_norm": 30.56952476501465, "learning_rate": 1.9666138125440455e-06, "loss": 0.4489, "step": 7240 }, { "epoch": 1.2388689546652043, "grad_norm": 25.958833694458008, "learning_rate": 1.9622093023255814e-06, "loss": 0.3734, "step": 7250 }, { "epoch": 1.2405776024095907, "grad_norm": 21.95493507385254, "learning_rate": 1.9578047921071176e-06, "loss": 0.3972, "step": 7260 }, { "epoch": 1.242286250153977, "grad_norm": 30.268014907836914, "learning_rate": 1.9534002818886543e-06, "loss": 0.3786, "step": 7270 }, { "epoch": 1.2439948978983633, "grad_norm": 38.55772399902344, "learning_rate": 1.9489957716701906e-06, "loss": 0.4305, "step": 7280 }, { "epoch": 1.2457035456427497, "grad_norm": 34.68306350708008, "learning_rate": 1.944591261451727e-06, "loss": 0.4009, "step": 7290 }, { "epoch": 1.2474121933871358, "grad_norm": 31.926652908325195, "learning_rate": 1.940186751233263e-06, "loss": 0.3752, "step": 7300 }, { "epoch": 1.2491208411315222, "grad_norm": 25.892805099487305, "learning_rate": 1.9357822410147994e-06, "loss": 0.3809, "step": 7310 }, { "epoch": 1.2508294888759084, "grad_norm": 34.08556365966797, "learning_rate": 1.9313777307963356e-06, "loss": 0.4777, "step": 7320 }, { "epoch": 1.2525381366202948, "grad_norm": 22.77074432373047, "learning_rate": 1.926973220577872e-06, "loss": 0.4021, "step": 7330 }, { "epoch": 1.254246784364681, "grad_norm": 40.69630432128906, "learning_rate": 1.922568710359408e-06, "loss": 0.4507, "step": 7340 }, { "epoch": 1.2559554321090673, "grad_norm": 26.732057571411133, "learning_rate": 1.9181642001409444e-06, "loss": 0.3927, "step": 7350 }, { "epoch": 1.2576640798534537, "grad_norm": 27.998336791992188, "learning_rate": 1.9137596899224807e-06, "loss": 0.4073, "step": 7360 }, { "epoch": 1.25937272759784, "grad_norm": 29.810136795043945, "learning_rate": 1.9093551797040174e-06, "loss": 0.4041, "step": 7370 }, { "epoch": 1.2610813753422263, "grad_norm": 26.727005004882812, "learning_rate": 1.9049506694855532e-06, "loss": 0.3886, "step": 7380 }, { "epoch": 1.2627900230866125, "grad_norm": 36.07413101196289, "learning_rate": 1.9005461592670895e-06, "loss": 0.3573, "step": 7390 }, { "epoch": 1.2644986708309989, "grad_norm": 32.144283294677734, "learning_rate": 1.896141649048626e-06, "loss": 0.4074, "step": 7400 }, { "epoch": 1.2662073185753853, "grad_norm": 24.47068977355957, "learning_rate": 1.8917371388301622e-06, "loss": 0.3934, "step": 7410 }, { "epoch": 1.2679159663197714, "grad_norm": 29.83626365661621, "learning_rate": 1.8873326286116985e-06, "loss": 0.4235, "step": 7420 }, { "epoch": 1.2696246140641576, "grad_norm": 27.749542236328125, "learning_rate": 1.882928118393235e-06, "loss": 0.3754, "step": 7430 }, { "epoch": 1.271333261808544, "grad_norm": 25.998891830444336, "learning_rate": 1.8785236081747712e-06, "loss": 0.3815, "step": 7440 }, { "epoch": 1.2730419095529304, "grad_norm": 32.17466735839844, "learning_rate": 1.8741190979563073e-06, "loss": 0.4338, "step": 7450 }, { "epoch": 1.2747505572973166, "grad_norm": 28.768695831298828, "learning_rate": 1.8697145877378436e-06, "loss": 0.4261, "step": 7460 }, { "epoch": 1.276459205041703, "grad_norm": 29.64584732055664, "learning_rate": 1.86531007751938e-06, "loss": 0.3678, "step": 7470 }, { "epoch": 1.2781678527860891, "grad_norm": 32.0334587097168, "learning_rate": 1.8609055673009163e-06, "loss": 0.3834, "step": 7480 }, { "epoch": 1.2798765005304755, "grad_norm": 33.2336540222168, "learning_rate": 1.8565010570824526e-06, "loss": 0.4092, "step": 7490 }, { "epoch": 1.281585148274862, "grad_norm": 27.663143157958984, "learning_rate": 1.852096546863989e-06, "loss": 0.453, "step": 7500 }, { "epoch": 1.283293796019248, "grad_norm": 26.34569549560547, "learning_rate": 1.847692036645525e-06, "loss": 0.3915, "step": 7510 }, { "epoch": 1.2850024437636345, "grad_norm": 30.302072525024414, "learning_rate": 1.8432875264270614e-06, "loss": 0.4404, "step": 7520 }, { "epoch": 1.2867110915080207, "grad_norm": 29.25191879272461, "learning_rate": 1.8388830162085976e-06, "loss": 0.3705, "step": 7530 }, { "epoch": 1.288419739252407, "grad_norm": 25.125303268432617, "learning_rate": 1.8344785059901341e-06, "loss": 0.4077, "step": 7540 }, { "epoch": 1.2901283869967934, "grad_norm": 36.632869720458984, "learning_rate": 1.8300739957716704e-06, "loss": 0.4101, "step": 7550 }, { "epoch": 1.2918370347411796, "grad_norm": 34.67438507080078, "learning_rate": 1.8256694855532066e-06, "loss": 0.4094, "step": 7560 }, { "epoch": 1.2935456824855658, "grad_norm": 38.920654296875, "learning_rate": 1.8212649753347431e-06, "loss": 0.4094, "step": 7570 }, { "epoch": 1.2952543302299522, "grad_norm": 27.154075622558594, "learning_rate": 1.8168604651162792e-06, "loss": 0.3877, "step": 7580 }, { "epoch": 1.2969629779743386, "grad_norm": 28.526582717895508, "learning_rate": 1.8124559548978154e-06, "loss": 0.4418, "step": 7590 }, { "epoch": 1.2986716257187247, "grad_norm": 23.52269172668457, "learning_rate": 1.8080514446793517e-06, "loss": 0.3878, "step": 7600 }, { "epoch": 1.3003802734631111, "grad_norm": 24.462650299072266, "learning_rate": 1.8036469344608882e-06, "loss": 0.3884, "step": 7610 }, { "epoch": 1.3020889212074973, "grad_norm": 28.307111740112305, "learning_rate": 1.7992424242424244e-06, "loss": 0.3681, "step": 7620 }, { "epoch": 1.3037975689518837, "grad_norm": 27.19947624206543, "learning_rate": 1.7948379140239607e-06, "loss": 0.3905, "step": 7630 }, { "epoch": 1.30550621669627, "grad_norm": 40.48936080932617, "learning_rate": 1.7904334038054968e-06, "loss": 0.414, "step": 7640 }, { "epoch": 1.3072148644406563, "grad_norm": 30.751718521118164, "learning_rate": 1.7860288935870332e-06, "loss": 0.4488, "step": 7650 }, { "epoch": 1.3089235121850427, "grad_norm": 32.26466369628906, "learning_rate": 1.7816243833685695e-06, "loss": 0.4015, "step": 7660 }, { "epoch": 1.3106321599294288, "grad_norm": 32.198055267333984, "learning_rate": 1.7772198731501058e-06, "loss": 0.3984, "step": 7670 }, { "epoch": 1.3123408076738152, "grad_norm": 26.396881103515625, "learning_rate": 1.7728153629316422e-06, "loss": 0.3793, "step": 7680 }, { "epoch": 1.3140494554182014, "grad_norm": 37.478797912597656, "learning_rate": 1.7684108527131785e-06, "loss": 0.3862, "step": 7690 }, { "epoch": 1.3157581031625878, "grad_norm": 40.08991622924805, "learning_rate": 1.7640063424947148e-06, "loss": 0.4314, "step": 7700 }, { "epoch": 1.317466750906974, "grad_norm": 33.823116302490234, "learning_rate": 1.7596018322762508e-06, "loss": 0.3761, "step": 7710 }, { "epoch": 1.3191753986513604, "grad_norm": 30.793943405151367, "learning_rate": 1.7551973220577873e-06, "loss": 0.4022, "step": 7720 }, { "epoch": 1.3208840463957467, "grad_norm": 29.607755661010742, "learning_rate": 1.7507928118393236e-06, "loss": 0.4276, "step": 7730 }, { "epoch": 1.322592694140133, "grad_norm": 36.47589111328125, "learning_rate": 1.7463883016208598e-06, "loss": 0.3849, "step": 7740 }, { "epoch": 1.3243013418845193, "grad_norm": 29.826234817504883, "learning_rate": 1.7419837914023963e-06, "loss": 0.4171, "step": 7750 }, { "epoch": 1.3260099896289055, "grad_norm": 30.34208106994629, "learning_rate": 1.7375792811839326e-06, "loss": 0.4239, "step": 7760 }, { "epoch": 1.3277186373732919, "grad_norm": 32.37610626220703, "learning_rate": 1.7331747709654686e-06, "loss": 0.4417, "step": 7770 }, { "epoch": 1.3294272851176783, "grad_norm": 29.77751922607422, "learning_rate": 1.728770260747005e-06, "loss": 0.3529, "step": 7780 }, { "epoch": 1.3311359328620644, "grad_norm": 27.710689544677734, "learning_rate": 1.7243657505285414e-06, "loss": 0.396, "step": 7790 }, { "epoch": 1.3328445806064506, "grad_norm": 35.94044876098633, "learning_rate": 1.7199612403100776e-06, "loss": 0.4285, "step": 7800 }, { "epoch": 1.334553228350837, "grad_norm": 31.398242950439453, "learning_rate": 1.715556730091614e-06, "loss": 0.3621, "step": 7810 }, { "epoch": 1.3362618760952234, "grad_norm": 22.245264053344727, "learning_rate": 1.7111522198731504e-06, "loss": 0.4047, "step": 7820 }, { "epoch": 1.3379705238396096, "grad_norm": 25.29467010498047, "learning_rate": 1.7067477096546866e-06, "loss": 0.4099, "step": 7830 }, { "epoch": 1.339679171583996, "grad_norm": 28.86480140686035, "learning_rate": 1.7023431994362227e-06, "loss": 0.39, "step": 7840 }, { "epoch": 1.3413878193283821, "grad_norm": 32.23060607910156, "learning_rate": 1.697938689217759e-06, "loss": 0.4299, "step": 7850 }, { "epoch": 1.3430964670727685, "grad_norm": 37.11185836791992, "learning_rate": 1.6935341789992954e-06, "loss": 0.3409, "step": 7860 }, { "epoch": 1.344805114817155, "grad_norm": 31.354124069213867, "learning_rate": 1.6891296687808317e-06, "loss": 0.4066, "step": 7870 }, { "epoch": 1.346513762561541, "grad_norm": 29.411638259887695, "learning_rate": 1.684725158562368e-06, "loss": 0.4163, "step": 7880 }, { "epoch": 1.3482224103059275, "grad_norm": 29.95796775817871, "learning_rate": 1.6803206483439045e-06, "loss": 0.413, "step": 7890 }, { "epoch": 1.3499310580503137, "grad_norm": 26.26283073425293, "learning_rate": 1.6759161381254405e-06, "loss": 0.4122, "step": 7900 }, { "epoch": 1.3516397057947, "grad_norm": 23.130903244018555, "learning_rate": 1.6715116279069768e-06, "loss": 0.414, "step": 7910 }, { "epoch": 1.3533483535390864, "grad_norm": 33.57529830932617, "learning_rate": 1.667107117688513e-06, "loss": 0.4446, "step": 7920 }, { "epoch": 1.3550570012834726, "grad_norm": 27.545856475830078, "learning_rate": 1.6627026074700495e-06, "loss": 0.3671, "step": 7930 }, { "epoch": 1.3567656490278588, "grad_norm": 28.595279693603516, "learning_rate": 1.6582980972515858e-06, "loss": 0.3938, "step": 7940 }, { "epoch": 1.3584742967722452, "grad_norm": 34.10601806640625, "learning_rate": 1.653893587033122e-06, "loss": 0.4092, "step": 7950 }, { "epoch": 1.3601829445166316, "grad_norm": 36.68281936645508, "learning_rate": 1.6494890768146585e-06, "loss": 0.4359, "step": 7960 }, { "epoch": 1.3618915922610177, "grad_norm": 36.802757263183594, "learning_rate": 1.6450845665961946e-06, "loss": 0.3926, "step": 7970 }, { "epoch": 1.3636002400054041, "grad_norm": 38.538978576660156, "learning_rate": 1.6406800563777308e-06, "loss": 0.4068, "step": 7980 }, { "epoch": 1.3653088877497903, "grad_norm": 24.058565139770508, "learning_rate": 1.636275546159267e-06, "loss": 0.3744, "step": 7990 }, { "epoch": 1.3670175354941767, "grad_norm": 25.09589385986328, "learning_rate": 1.6318710359408036e-06, "loss": 0.4155, "step": 8000 }, { "epoch": 1.368726183238563, "grad_norm": 35.97821044921875, "learning_rate": 1.6274665257223398e-06, "loss": 0.4474, "step": 8010 }, { "epoch": 1.3704348309829493, "grad_norm": 30.579835891723633, "learning_rate": 1.6230620155038761e-06, "loss": 0.3728, "step": 8020 }, { "epoch": 1.3721434787273357, "grad_norm": 24.492128372192383, "learning_rate": 1.6186575052854122e-06, "loss": 0.4528, "step": 8030 }, { "epoch": 1.3738521264717218, "grad_norm": 29.143388748168945, "learning_rate": 1.6142529950669486e-06, "loss": 0.3942, "step": 8040 }, { "epoch": 1.3755607742161082, "grad_norm": 32.478759765625, "learning_rate": 1.609848484848485e-06, "loss": 0.3717, "step": 8050 }, { "epoch": 1.3772694219604944, "grad_norm": 30.298538208007812, "learning_rate": 1.6054439746300212e-06, "loss": 0.4021, "step": 8060 }, { "epoch": 1.3789780697048808, "grad_norm": 30.066699981689453, "learning_rate": 1.6010394644115576e-06, "loss": 0.3804, "step": 8070 }, { "epoch": 1.380686717449267, "grad_norm": 35.945133209228516, "learning_rate": 1.596634954193094e-06, "loss": 0.4372, "step": 8080 }, { "epoch": 1.3823953651936534, "grad_norm": 21.04485321044922, "learning_rate": 1.5922304439746302e-06, "loss": 0.3724, "step": 8090 }, { "epoch": 1.3841040129380398, "grad_norm": 27.374027252197266, "learning_rate": 1.5878259337561662e-06, "loss": 0.3829, "step": 8100 }, { "epoch": 1.385812660682426, "grad_norm": 27.289045333862305, "learning_rate": 1.5834214235377027e-06, "loss": 0.3279, "step": 8110 }, { "epoch": 1.3875213084268123, "grad_norm": 19.139402389526367, "learning_rate": 1.579016913319239e-06, "loss": 0.3983, "step": 8120 }, { "epoch": 1.3892299561711985, "grad_norm": 31.3995418548584, "learning_rate": 1.5746124031007752e-06, "loss": 0.4445, "step": 8130 }, { "epoch": 1.3909386039155849, "grad_norm": 23.96241569519043, "learning_rate": 1.5702078928823117e-06, "loss": 0.4473, "step": 8140 }, { "epoch": 1.3926472516599713, "grad_norm": 37.16488265991211, "learning_rate": 1.565803382663848e-06, "loss": 0.3865, "step": 8150 }, { "epoch": 1.3943558994043574, "grad_norm": 31.697296142578125, "learning_rate": 1.561398872445384e-06, "loss": 0.371, "step": 8160 }, { "epoch": 1.3960645471487436, "grad_norm": 24.636869430541992, "learning_rate": 1.5569943622269205e-06, "loss": 0.3392, "step": 8170 }, { "epoch": 1.39777319489313, "grad_norm": 36.1915168762207, "learning_rate": 1.5525898520084568e-06, "loss": 0.3992, "step": 8180 }, { "epoch": 1.3994818426375164, "grad_norm": 25.10267448425293, "learning_rate": 1.548185341789993e-06, "loss": 0.4454, "step": 8190 }, { "epoch": 1.4011904903819026, "grad_norm": 27.928958892822266, "learning_rate": 1.5437808315715295e-06, "loss": 0.3802, "step": 8200 }, { "epoch": 1.402899138126289, "grad_norm": 27.088727951049805, "learning_rate": 1.5393763213530658e-06, "loss": 0.3792, "step": 8210 }, { "epoch": 1.4046077858706751, "grad_norm": 28.89666175842285, "learning_rate": 1.534971811134602e-06, "loss": 0.3851, "step": 8220 }, { "epoch": 1.4063164336150615, "grad_norm": 35.841854095458984, "learning_rate": 1.530567300916138e-06, "loss": 0.4547, "step": 8230 }, { "epoch": 1.408025081359448, "grad_norm": 32.671783447265625, "learning_rate": 1.5261627906976746e-06, "loss": 0.3867, "step": 8240 }, { "epoch": 1.409733729103834, "grad_norm": 26.516185760498047, "learning_rate": 1.5217582804792108e-06, "loss": 0.4012, "step": 8250 }, { "epoch": 1.4114423768482205, "grad_norm": 37.364967346191406, "learning_rate": 1.5173537702607471e-06, "loss": 0.365, "step": 8260 }, { "epoch": 1.4131510245926067, "grad_norm": 27.502492904663086, "learning_rate": 1.5129492600422836e-06, "loss": 0.3835, "step": 8270 }, { "epoch": 1.414859672336993, "grad_norm": 30.40472412109375, "learning_rate": 1.5085447498238199e-06, "loss": 0.3815, "step": 8280 }, { "epoch": 1.4165683200813795, "grad_norm": 24.262475967407227, "learning_rate": 1.504140239605356e-06, "loss": 0.4278, "step": 8290 }, { "epoch": 1.4182769678257656, "grad_norm": 31.887592315673828, "learning_rate": 1.4997357293868922e-06, "loss": 0.4523, "step": 8300 }, { "epoch": 1.4199856155701518, "grad_norm": 25.971759796142578, "learning_rate": 1.4953312191684286e-06, "loss": 0.4228, "step": 8310 }, { "epoch": 1.4216942633145382, "grad_norm": 24.0732364654541, "learning_rate": 1.490926708949965e-06, "loss": 0.4284, "step": 8320 }, { "epoch": 1.4234029110589246, "grad_norm": 35.71511459350586, "learning_rate": 1.4865221987315012e-06, "loss": 0.3756, "step": 8330 }, { "epoch": 1.4251115588033108, "grad_norm": 25.345888137817383, "learning_rate": 1.4821176885130377e-06, "loss": 0.4182, "step": 8340 }, { "epoch": 1.4268202065476971, "grad_norm": 31.115188598632812, "learning_rate": 1.477713178294574e-06, "loss": 0.4363, "step": 8350 }, { "epoch": 1.4285288542920833, "grad_norm": 35.88517761230469, "learning_rate": 1.47330866807611e-06, "loss": 0.4125, "step": 8360 }, { "epoch": 1.4302375020364697, "grad_norm": 30.74094581604004, "learning_rate": 1.4689041578576462e-06, "loss": 0.3921, "step": 8370 }, { "epoch": 1.431946149780856, "grad_norm": 30.39889144897461, "learning_rate": 1.4644996476391827e-06, "loss": 0.4258, "step": 8380 }, { "epoch": 1.4336547975252423, "grad_norm": 30.968448638916016, "learning_rate": 1.460095137420719e-06, "loss": 0.3653, "step": 8390 }, { "epoch": 1.4353634452696287, "grad_norm": 29.428611755371094, "learning_rate": 1.4556906272022552e-06, "loss": 0.402, "step": 8400 }, { "epoch": 1.4370720930140148, "grad_norm": 29.114940643310547, "learning_rate": 1.4512861169837917e-06, "loss": 0.3934, "step": 8410 }, { "epoch": 1.4387807407584012, "grad_norm": 32.88404083251953, "learning_rate": 1.4468816067653278e-06, "loss": 0.401, "step": 8420 }, { "epoch": 1.4404893885027874, "grad_norm": 32.356021881103516, "learning_rate": 1.442477096546864e-06, "loss": 0.3543, "step": 8430 }, { "epoch": 1.4421980362471738, "grad_norm": 33.27191925048828, "learning_rate": 1.4380725863284003e-06, "loss": 0.4492, "step": 8440 }, { "epoch": 1.44390668399156, "grad_norm": 33.288536071777344, "learning_rate": 1.4336680761099368e-06, "loss": 0.3954, "step": 8450 }, { "epoch": 1.4456153317359464, "grad_norm": 30.489593505859375, "learning_rate": 1.429263565891473e-06, "loss": 0.4248, "step": 8460 }, { "epoch": 1.4473239794803328, "grad_norm": 38.16218566894531, "learning_rate": 1.4248590556730093e-06, "loss": 0.4624, "step": 8470 }, { "epoch": 1.449032627224719, "grad_norm": 25.624847412109375, "learning_rate": 1.4204545454545458e-06, "loss": 0.4115, "step": 8480 }, { "epoch": 1.4507412749691053, "grad_norm": 34.9322395324707, "learning_rate": 1.4160500352360818e-06, "loss": 0.4179, "step": 8490 }, { "epoch": 1.4524499227134915, "grad_norm": 31.277803421020508, "learning_rate": 1.4116455250176181e-06, "loss": 0.3662, "step": 8500 }, { "epoch": 1.4541585704578779, "grad_norm": 32.513633728027344, "learning_rate": 1.4072410147991544e-06, "loss": 0.4179, "step": 8510 }, { "epoch": 1.4558672182022643, "grad_norm": 31.79774284362793, "learning_rate": 1.4028365045806909e-06, "loss": 0.4221, "step": 8520 }, { "epoch": 1.4575758659466505, "grad_norm": 35.4056282043457, "learning_rate": 1.3984319943622271e-06, "loss": 0.4233, "step": 8530 }, { "epoch": 1.4592845136910366, "grad_norm": 32.08757019042969, "learning_rate": 1.3940274841437634e-06, "loss": 0.4619, "step": 8540 }, { "epoch": 1.460993161435423, "grad_norm": 31.21336555480957, "learning_rate": 1.3896229739252994e-06, "loss": 0.3375, "step": 8550 }, { "epoch": 1.4627018091798094, "grad_norm": 31.44502067565918, "learning_rate": 1.385218463706836e-06, "loss": 0.4121, "step": 8560 }, { "epoch": 1.4644104569241956, "grad_norm": 21.62190818786621, "learning_rate": 1.3808139534883722e-06, "loss": 0.3727, "step": 8570 }, { "epoch": 1.466119104668582, "grad_norm": 33.74460983276367, "learning_rate": 1.3764094432699084e-06, "loss": 0.426, "step": 8580 }, { "epoch": 1.4678277524129681, "grad_norm": 22.65791130065918, "learning_rate": 1.372004933051445e-06, "loss": 0.3669, "step": 8590 }, { "epoch": 1.4695364001573545, "grad_norm": 38.821624755859375, "learning_rate": 1.3676004228329812e-06, "loss": 0.3906, "step": 8600 }, { "epoch": 1.471245047901741, "grad_norm": 38.148475646972656, "learning_rate": 1.3631959126145175e-06, "loss": 0.4369, "step": 8610 }, { "epoch": 1.472953695646127, "grad_norm": 25.316579818725586, "learning_rate": 1.3587914023960535e-06, "loss": 0.3884, "step": 8620 }, { "epoch": 1.4746623433905135, "grad_norm": 40.01092529296875, "learning_rate": 1.35438689217759e-06, "loss": 0.4031, "step": 8630 }, { "epoch": 1.4763709911348997, "grad_norm": 23.749156951904297, "learning_rate": 1.3499823819591262e-06, "loss": 0.4174, "step": 8640 }, { "epoch": 1.478079638879286, "grad_norm": 25.226078033447266, "learning_rate": 1.3455778717406625e-06, "loss": 0.3646, "step": 8650 }, { "epoch": 1.4797882866236725, "grad_norm": 32.117034912109375, "learning_rate": 1.341173361522199e-06, "loss": 0.409, "step": 8660 }, { "epoch": 1.4814969343680586, "grad_norm": 27.94634437561035, "learning_rate": 1.3367688513037353e-06, "loss": 0.4158, "step": 8670 }, { "epoch": 1.4832055821124448, "grad_norm": 27.515697479248047, "learning_rate": 1.3323643410852713e-06, "loss": 0.3976, "step": 8680 }, { "epoch": 1.4849142298568312, "grad_norm": 41.739105224609375, "learning_rate": 1.3279598308668076e-06, "loss": 0.337, "step": 8690 }, { "epoch": 1.4866228776012176, "grad_norm": 37.13324737548828, "learning_rate": 1.323555320648344e-06, "loss": 0.381, "step": 8700 }, { "epoch": 1.4883315253456038, "grad_norm": 23.801599502563477, "learning_rate": 1.3191508104298803e-06, "loss": 0.4244, "step": 8710 }, { "epoch": 1.4900401730899901, "grad_norm": 28.293941497802734, "learning_rate": 1.3147463002114166e-06, "loss": 0.3995, "step": 8720 }, { "epoch": 1.4917488208343763, "grad_norm": 23.51873779296875, "learning_rate": 1.310341789992953e-06, "loss": 0.3845, "step": 8730 }, { "epoch": 1.4934574685787627, "grad_norm": 25.12767219543457, "learning_rate": 1.3059372797744893e-06, "loss": 0.3522, "step": 8740 }, { "epoch": 1.495166116323149, "grad_norm": 21.655824661254883, "learning_rate": 1.3015327695560254e-06, "loss": 0.3776, "step": 8750 }, { "epoch": 1.4968747640675353, "grad_norm": 32.18788146972656, "learning_rate": 1.2971282593375616e-06, "loss": 0.401, "step": 8760 }, { "epoch": 1.4985834118119217, "grad_norm": 45.1816520690918, "learning_rate": 1.2927237491190981e-06, "loss": 0.4374, "step": 8770 }, { "epoch": 1.5002920595563078, "grad_norm": 33.538047790527344, "learning_rate": 1.2883192389006344e-06, "loss": 0.4288, "step": 8780 }, { "epoch": 1.5020007073006942, "grad_norm": 31.226816177368164, "learning_rate": 1.2839147286821706e-06, "loss": 0.3961, "step": 8790 }, { "epoch": 1.5037093550450806, "grad_norm": 24.751720428466797, "learning_rate": 1.2795102184637071e-06, "loss": 0.351, "step": 8800 }, { "epoch": 1.5054180027894668, "grad_norm": 35.17796325683594, "learning_rate": 1.2751057082452432e-06, "loss": 0.4673, "step": 8810 }, { "epoch": 1.507126650533853, "grad_norm": 26.320959091186523, "learning_rate": 1.2707011980267794e-06, "loss": 0.4025, "step": 8820 }, { "epoch": 1.5088352982782394, "grad_norm": 25.2487735748291, "learning_rate": 1.2662966878083157e-06, "loss": 0.3631, "step": 8830 }, { "epoch": 1.5105439460226258, "grad_norm": 26.821157455444336, "learning_rate": 1.2618921775898522e-06, "loss": 0.3253, "step": 8840 }, { "epoch": 1.512252593767012, "grad_norm": 20.908111572265625, "learning_rate": 1.2574876673713885e-06, "loss": 0.3598, "step": 8850 }, { "epoch": 1.513961241511398, "grad_norm": 39.165706634521484, "learning_rate": 1.2530831571529247e-06, "loss": 0.4052, "step": 8860 }, { "epoch": 1.5156698892557845, "grad_norm": 32.390995025634766, "learning_rate": 1.248678646934461e-06, "loss": 0.4037, "step": 8870 }, { "epoch": 1.517378537000171, "grad_norm": 28.246858596801758, "learning_rate": 1.2442741367159972e-06, "loss": 0.3995, "step": 8880 }, { "epoch": 1.5190871847445573, "grad_norm": 31.864625930786133, "learning_rate": 1.2398696264975335e-06, "loss": 0.3803, "step": 8890 }, { "epoch": 1.5207958324889435, "grad_norm": 23.626855850219727, "learning_rate": 1.2354651162790698e-06, "loss": 0.3864, "step": 8900 }, { "epoch": 1.5225044802333296, "grad_norm": 24.359804153442383, "learning_rate": 1.2310606060606063e-06, "loss": 0.3669, "step": 8910 }, { "epoch": 1.524213127977716, "grad_norm": 27.360803604125977, "learning_rate": 1.2266560958421425e-06, "loss": 0.3916, "step": 8920 }, { "epoch": 1.5259217757221024, "grad_norm": 27.511882781982422, "learning_rate": 1.2222515856236788e-06, "loss": 0.3745, "step": 8930 }, { "epoch": 1.5276304234664888, "grad_norm": 26.44959831237793, "learning_rate": 1.217847075405215e-06, "loss": 0.3918, "step": 8940 }, { "epoch": 1.529339071210875, "grad_norm": 29.03026008605957, "learning_rate": 1.2134425651867513e-06, "loss": 0.3829, "step": 8950 }, { "epoch": 1.5310477189552611, "grad_norm": 31.914691925048828, "learning_rate": 1.2090380549682876e-06, "loss": 0.3987, "step": 8960 }, { "epoch": 1.5327563666996475, "grad_norm": 39.63639831542969, "learning_rate": 1.2046335447498238e-06, "loss": 0.4399, "step": 8970 }, { "epoch": 1.534465014444034, "grad_norm": 25.887651443481445, "learning_rate": 1.2002290345313603e-06, "loss": 0.4067, "step": 8980 }, { "epoch": 1.53617366218842, "grad_norm": 30.17310333251953, "learning_rate": 1.1958245243128964e-06, "loss": 0.397, "step": 8990 }, { "epoch": 1.5378823099328063, "grad_norm": 28.75864601135254, "learning_rate": 1.1914200140944329e-06, "loss": 0.4094, "step": 9000 }, { "epoch": 1.5395909576771927, "grad_norm": 40.84502029418945, "learning_rate": 1.1870155038759691e-06, "loss": 0.35, "step": 9010 }, { "epoch": 1.541299605421579, "grad_norm": 27.4794864654541, "learning_rate": 1.1826109936575054e-06, "loss": 0.4164, "step": 9020 }, { "epoch": 1.5430082531659655, "grad_norm": 35.87556076049805, "learning_rate": 1.1782064834390416e-06, "loss": 0.3561, "step": 9030 }, { "epoch": 1.5447169009103516, "grad_norm": 32.51176071166992, "learning_rate": 1.173801973220578e-06, "loss": 0.3813, "step": 9040 }, { "epoch": 1.5464255486547378, "grad_norm": 34.02533721923828, "learning_rate": 1.1693974630021144e-06, "loss": 0.4186, "step": 9050 }, { "epoch": 1.5481341963991242, "grad_norm": 25.257232666015625, "learning_rate": 1.1649929527836504e-06, "loss": 0.3428, "step": 9060 }, { "epoch": 1.5498428441435106, "grad_norm": 27.12441635131836, "learning_rate": 1.160588442565187e-06, "loss": 0.3731, "step": 9070 }, { "epoch": 1.5515514918878968, "grad_norm": 32.43393325805664, "learning_rate": 1.1561839323467232e-06, "loss": 0.3484, "step": 9080 }, { "epoch": 1.5532601396322832, "grad_norm": 35.18085479736328, "learning_rate": 1.1517794221282595e-06, "loss": 0.3964, "step": 9090 }, { "epoch": 1.5549687873766693, "grad_norm": 30.050132751464844, "learning_rate": 1.1473749119097957e-06, "loss": 0.3969, "step": 9100 }, { "epoch": 1.5566774351210557, "grad_norm": 34.45301818847656, "learning_rate": 1.142970401691332e-06, "loss": 0.3945, "step": 9110 }, { "epoch": 1.5583860828654421, "grad_norm": 31.908273696899414, "learning_rate": 1.1385658914728682e-06, "loss": 0.3823, "step": 9120 }, { "epoch": 1.5600947306098283, "grad_norm": 26.37557601928711, "learning_rate": 1.1341613812544045e-06, "loss": 0.3638, "step": 9130 }, { "epoch": 1.5618033783542145, "grad_norm": 25.550487518310547, "learning_rate": 1.129756871035941e-06, "loss": 0.3556, "step": 9140 }, { "epoch": 1.5635120260986008, "grad_norm": 28.921995162963867, "learning_rate": 1.1253523608174773e-06, "loss": 0.4134, "step": 9150 }, { "epoch": 1.5652206738429872, "grad_norm": 26.404720306396484, "learning_rate": 1.1209478505990135e-06, "loss": 0.3664, "step": 9160 }, { "epoch": 1.5669293215873736, "grad_norm": 29.88231086730957, "learning_rate": 1.1165433403805498e-06, "loss": 0.3848, "step": 9170 }, { "epoch": 1.5686379693317598, "grad_norm": 38.20869827270508, "learning_rate": 1.112138830162086e-06, "loss": 0.3823, "step": 9180 }, { "epoch": 1.570346617076146, "grad_norm": 42.82072448730469, "learning_rate": 1.1077343199436223e-06, "loss": 0.372, "step": 9190 }, { "epoch": 1.5720552648205324, "grad_norm": 27.147830963134766, "learning_rate": 1.1033298097251586e-06, "loss": 0.4172, "step": 9200 }, { "epoch": 1.5737639125649188, "grad_norm": 32.74360656738281, "learning_rate": 1.098925299506695e-06, "loss": 0.3772, "step": 9210 }, { "epoch": 1.575472560309305, "grad_norm": 23.909259796142578, "learning_rate": 1.0945207892882311e-06, "loss": 0.3215, "step": 9220 }, { "epoch": 1.5771812080536913, "grad_norm": 32.20122146606445, "learning_rate": 1.0901162790697676e-06, "loss": 0.3864, "step": 9230 }, { "epoch": 1.5788898557980775, "grad_norm": 29.837228775024414, "learning_rate": 1.0857117688513039e-06, "loss": 0.4026, "step": 9240 }, { "epoch": 1.580598503542464, "grad_norm": 34.161033630371094, "learning_rate": 1.0813072586328401e-06, "loss": 0.4217, "step": 9250 }, { "epoch": 1.5823071512868503, "grad_norm": 39.935638427734375, "learning_rate": 1.0769027484143764e-06, "loss": 0.3818, "step": 9260 }, { "epoch": 1.5840157990312365, "grad_norm": 29.2546443939209, "learning_rate": 1.0724982381959126e-06, "loss": 0.406, "step": 9270 }, { "epoch": 1.5857244467756226, "grad_norm": 33.643367767333984, "learning_rate": 1.0680937279774491e-06, "loss": 0.3627, "step": 9280 }, { "epoch": 1.587433094520009, "grad_norm": 48.66536331176758, "learning_rate": 1.0636892177589852e-06, "loss": 0.3924, "step": 9290 }, { "epoch": 1.5891417422643954, "grad_norm": 29.057153701782227, "learning_rate": 1.0592847075405217e-06, "loss": 0.3797, "step": 9300 }, { "epoch": 1.5908503900087818, "grad_norm": 30.0162296295166, "learning_rate": 1.054880197322058e-06, "loss": 0.4337, "step": 9310 }, { "epoch": 1.592559037753168, "grad_norm": 30.404836654663086, "learning_rate": 1.0504756871035942e-06, "loss": 0.3784, "step": 9320 }, { "epoch": 1.5942676854975542, "grad_norm": 41.39947509765625, "learning_rate": 1.0460711768851305e-06, "loss": 0.3421, "step": 9330 }, { "epoch": 1.5959763332419405, "grad_norm": 25.326269149780273, "learning_rate": 1.0416666666666667e-06, "loss": 0.3901, "step": 9340 }, { "epoch": 1.597684980986327, "grad_norm": 29.41655731201172, "learning_rate": 1.037262156448203e-06, "loss": 0.3393, "step": 9350 }, { "epoch": 1.5993936287307131, "grad_norm": 30.155683517456055, "learning_rate": 1.0328576462297392e-06, "loss": 0.3745, "step": 9360 }, { "epoch": 1.6011022764750993, "grad_norm": 27.556821823120117, "learning_rate": 1.0284531360112757e-06, "loss": 0.4145, "step": 9370 }, { "epoch": 1.6028109242194857, "grad_norm": 39.412540435791016, "learning_rate": 1.0240486257928118e-06, "loss": 0.3815, "step": 9380 }, { "epoch": 1.604519571963872, "grad_norm": 30.376188278198242, "learning_rate": 1.0196441155743483e-06, "loss": 0.3812, "step": 9390 }, { "epoch": 1.6062282197082585, "grad_norm": 35.539546966552734, "learning_rate": 1.0152396053558845e-06, "loss": 0.3949, "step": 9400 }, { "epoch": 1.6079368674526446, "grad_norm": 27.052183151245117, "learning_rate": 1.0108350951374208e-06, "loss": 0.4059, "step": 9410 }, { "epoch": 1.6096455151970308, "grad_norm": 22.53864860534668, "learning_rate": 1.006430584918957e-06, "loss": 0.3381, "step": 9420 }, { "epoch": 1.6113541629414172, "grad_norm": 33.662052154541016, "learning_rate": 1.0020260747004933e-06, "loss": 0.3852, "step": 9430 }, { "epoch": 1.6130628106858036, "grad_norm": 35.129295349121094, "learning_rate": 9.976215644820298e-07, "loss": 0.3737, "step": 9440 }, { "epoch": 1.6147714584301898, "grad_norm": 14.55792236328125, "learning_rate": 9.932170542635658e-07, "loss": 0.3682, "step": 9450 }, { "epoch": 1.6164801061745762, "grad_norm": 34.31297302246094, "learning_rate": 9.888125440451023e-07, "loss": 0.394, "step": 9460 }, { "epoch": 1.6181887539189623, "grad_norm": 28.12514305114746, "learning_rate": 9.844080338266386e-07, "loss": 0.4253, "step": 9470 }, { "epoch": 1.6198974016633487, "grad_norm": 31.71592140197754, "learning_rate": 9.800035236081749e-07, "loss": 0.3742, "step": 9480 }, { "epoch": 1.6216060494077351, "grad_norm": 33.897281646728516, "learning_rate": 9.755990133897111e-07, "loss": 0.3855, "step": 9490 }, { "epoch": 1.6233146971521213, "grad_norm": 26.927099227905273, "learning_rate": 9.711945031712474e-07, "loss": 0.4042, "step": 9500 }, { "epoch": 1.6250233448965075, "grad_norm": 31.36831283569336, "learning_rate": 9.667899929527836e-07, "loss": 0.4033, "step": 9510 }, { "epoch": 1.6267319926408939, "grad_norm": 32.52813720703125, "learning_rate": 9.6238548273432e-07, "loss": 0.4001, "step": 9520 }, { "epoch": 1.6284406403852802, "grad_norm": 29.446916580200195, "learning_rate": 9.579809725158564e-07, "loss": 0.3865, "step": 9530 }, { "epoch": 1.6301492881296666, "grad_norm": 22.962326049804688, "learning_rate": 9.535764622973927e-07, "loss": 0.3399, "step": 9540 }, { "epoch": 1.6318579358740528, "grad_norm": 22.97249984741211, "learning_rate": 9.491719520789289e-07, "loss": 0.3967, "step": 9550 }, { "epoch": 1.633566583618439, "grad_norm": 40.018470764160156, "learning_rate": 9.447674418604652e-07, "loss": 0.3935, "step": 9560 }, { "epoch": 1.6352752313628254, "grad_norm": 22.444059371948242, "learning_rate": 9.403629316420016e-07, "loss": 0.3224, "step": 9570 }, { "epoch": 1.6369838791072118, "grad_norm": 34.330078125, "learning_rate": 9.359584214235377e-07, "loss": 0.3486, "step": 9580 }, { "epoch": 1.638692526851598, "grad_norm": 35.540557861328125, "learning_rate": 9.315539112050741e-07, "loss": 0.3563, "step": 9590 }, { "epoch": 1.6404011745959843, "grad_norm": 24.032527923583984, "learning_rate": 9.271494009866105e-07, "loss": 0.3191, "step": 9600 }, { "epoch": 1.6421098223403705, "grad_norm": 38.39560317993164, "learning_rate": 9.227448907681466e-07, "loss": 0.3694, "step": 9610 }, { "epoch": 1.643818470084757, "grad_norm": 40.29669952392578, "learning_rate": 9.18340380549683e-07, "loss": 0.3968, "step": 9620 }, { "epoch": 1.6455271178291433, "grad_norm": 28.967849731445312, "learning_rate": 9.139358703312193e-07, "loss": 0.3501, "step": 9630 }, { "epoch": 1.6472357655735295, "grad_norm": 44.81010437011719, "learning_rate": 9.095313601127555e-07, "loss": 0.415, "step": 9640 }, { "epoch": 1.6489444133179156, "grad_norm": 25.93589210510254, "learning_rate": 9.051268498942918e-07, "loss": 0.3933, "step": 9650 }, { "epoch": 1.650653061062302, "grad_norm": 31.824234008789062, "learning_rate": 9.007223396758282e-07, "loss": 0.3739, "step": 9660 }, { "epoch": 1.6523617088066884, "grad_norm": 34.3546142578125, "learning_rate": 8.963178294573645e-07, "loss": 0.3996, "step": 9670 }, { "epoch": 1.6540703565510748, "grad_norm": 23.639925003051758, "learning_rate": 8.919133192389007e-07, "loss": 0.3853, "step": 9680 }, { "epoch": 1.655779004295461, "grad_norm": 30.642179489135742, "learning_rate": 8.875088090204371e-07, "loss": 0.3762, "step": 9690 }, { "epoch": 1.6574876520398472, "grad_norm": 30.923620223999023, "learning_rate": 8.831042988019733e-07, "loss": 0.3923, "step": 9700 }, { "epoch": 1.6591962997842336, "grad_norm": 27.91309356689453, "learning_rate": 8.786997885835096e-07, "loss": 0.3604, "step": 9710 }, { "epoch": 1.66090494752862, "grad_norm": 23.54095458984375, "learning_rate": 8.742952783650459e-07, "loss": 0.3512, "step": 9720 }, { "epoch": 1.6626135952730061, "grad_norm": 31.084632873535156, "learning_rate": 8.698907681465822e-07, "loss": 0.3475, "step": 9730 }, { "epoch": 1.6643222430173923, "grad_norm": 34.60007095336914, "learning_rate": 8.654862579281184e-07, "loss": 0.3999, "step": 9740 }, { "epoch": 1.6660308907617787, "grad_norm": 32.12785339355469, "learning_rate": 8.610817477096548e-07, "loss": 0.4169, "step": 9750 }, { "epoch": 1.667739538506165, "grad_norm": 26.730180740356445, "learning_rate": 8.566772374911911e-07, "loss": 0.3191, "step": 9760 }, { "epoch": 1.6694481862505515, "grad_norm": 29.191030502319336, "learning_rate": 8.522727272727273e-07, "loss": 0.3468, "step": 9770 }, { "epoch": 1.6711568339949376, "grad_norm": 30.42900848388672, "learning_rate": 8.478682170542637e-07, "loss": 0.3948, "step": 9780 }, { "epoch": 1.6728654817393238, "grad_norm": 36.10079574584961, "learning_rate": 8.434637068357999e-07, "loss": 0.3741, "step": 9790 }, { "epoch": 1.6745741294837102, "grad_norm": 24.84588050842285, "learning_rate": 8.390591966173363e-07, "loss": 0.3342, "step": 9800 }, { "epoch": 1.6762827772280966, "grad_norm": 28.362817764282227, "learning_rate": 8.346546863988725e-07, "loss": 0.3723, "step": 9810 }, { "epoch": 1.6779914249724828, "grad_norm": 31.064945220947266, "learning_rate": 8.302501761804088e-07, "loss": 0.3648, "step": 9820 }, { "epoch": 1.6797000727168692, "grad_norm": 43.73317337036133, "learning_rate": 8.258456659619452e-07, "loss": 0.3849, "step": 9830 }, { "epoch": 1.6814087204612553, "grad_norm": 36.45133590698242, "learning_rate": 8.214411557434814e-07, "loss": 0.4135, "step": 9840 }, { "epoch": 1.6831173682056417, "grad_norm": 24.040943145751953, "learning_rate": 8.170366455250177e-07, "loss": 0.3828, "step": 9850 }, { "epoch": 1.6848260159500281, "grad_norm": 34.76506805419922, "learning_rate": 8.12632135306554e-07, "loss": 0.3583, "step": 9860 }, { "epoch": 1.6865346636944143, "grad_norm": 34.13239669799805, "learning_rate": 8.082276250880903e-07, "loss": 0.3613, "step": 9870 }, { "epoch": 1.6882433114388005, "grad_norm": 25.49158477783203, "learning_rate": 8.038231148696265e-07, "loss": 0.4262, "step": 9880 }, { "epoch": 1.6899519591831869, "grad_norm": 35.75178909301758, "learning_rate": 7.994186046511629e-07, "loss": 0.4004, "step": 9890 }, { "epoch": 1.6916606069275733, "grad_norm": 32.253150939941406, "learning_rate": 7.95014094432699e-07, "loss": 0.3644, "step": 9900 }, { "epoch": 1.6933692546719596, "grad_norm": 37.89906692504883, "learning_rate": 7.906095842142354e-07, "loss": 0.3617, "step": 9910 }, { "epoch": 1.6950779024163458, "grad_norm": 21.644926071166992, "learning_rate": 7.862050739957718e-07, "loss": 0.3764, "step": 9920 }, { "epoch": 1.696786550160732, "grad_norm": 34.483253479003906, "learning_rate": 7.818005637773081e-07, "loss": 0.4131, "step": 9930 }, { "epoch": 1.6984951979051184, "grad_norm": 43.877708435058594, "learning_rate": 7.773960535588443e-07, "loss": 0.3782, "step": 9940 }, { "epoch": 1.7002038456495048, "grad_norm": 56.46201705932617, "learning_rate": 7.729915433403806e-07, "loss": 0.3922, "step": 9950 }, { "epoch": 1.701912493393891, "grad_norm": 30.294981002807617, "learning_rate": 7.68587033121917e-07, "loss": 0.3839, "step": 9960 }, { "epoch": 1.7036211411382773, "grad_norm": 36.37797927856445, "learning_rate": 7.641825229034531e-07, "loss": 0.37, "step": 9970 }, { "epoch": 1.7053297888826635, "grad_norm": 32.37224197387695, "learning_rate": 7.597780126849895e-07, "loss": 0.3552, "step": 9980 }, { "epoch": 1.70703843662705, "grad_norm": 37.46088790893555, "learning_rate": 7.553735024665259e-07, "loss": 0.3815, "step": 9990 }, { "epoch": 1.7087470843714363, "grad_norm": 32.850372314453125, "learning_rate": 7.50968992248062e-07, "loss": 0.3688, "step": 10000 }, { "epoch": 1.7104557321158225, "grad_norm": 47.176239013671875, "learning_rate": 7.465644820295984e-07, "loss": 0.3789, "step": 10010 }, { "epoch": 1.7121643798602086, "grad_norm": 24.945432662963867, "learning_rate": 7.421599718111347e-07, "loss": 0.382, "step": 10020 }, { "epoch": 1.713873027604595, "grad_norm": 21.04591941833496, "learning_rate": 7.377554615926709e-07, "loss": 0.3695, "step": 10030 }, { "epoch": 1.7155816753489814, "grad_norm": 33.52159881591797, "learning_rate": 7.333509513742072e-07, "loss": 0.3597, "step": 10040 }, { "epoch": 1.7172903230933678, "grad_norm": 30.122079849243164, "learning_rate": 7.289464411557436e-07, "loss": 0.3875, "step": 10050 }, { "epoch": 1.718998970837754, "grad_norm": 24.38621711730957, "learning_rate": 7.245419309372799e-07, "loss": 0.3351, "step": 10060 }, { "epoch": 1.7207076185821402, "grad_norm": 47.98723220825195, "learning_rate": 7.201374207188161e-07, "loss": 0.3836, "step": 10070 }, { "epoch": 1.7224162663265266, "grad_norm": 37.2187614440918, "learning_rate": 7.157329105003525e-07, "loss": 0.3614, "step": 10080 }, { "epoch": 1.724124914070913, "grad_norm": 30.610862731933594, "learning_rate": 7.113284002818887e-07, "loss": 0.3795, "step": 10090 }, { "epoch": 1.7258335618152991, "grad_norm": 22.508331298828125, "learning_rate": 7.06923890063425e-07, "loss": 0.3536, "step": 10100 }, { "epoch": 1.7275422095596853, "grad_norm": 37.26981735229492, "learning_rate": 7.025193798449613e-07, "loss": 0.3832, "step": 10110 }, { "epoch": 1.7292508573040717, "grad_norm": 19.104637145996094, "learning_rate": 6.981148696264976e-07, "loss": 0.3522, "step": 10120 }, { "epoch": 1.730959505048458, "grad_norm": 23.52967071533203, "learning_rate": 6.937103594080338e-07, "loss": 0.391, "step": 10130 }, { "epoch": 1.7326681527928445, "grad_norm": 27.223722457885742, "learning_rate": 6.893058491895702e-07, "loss": 0.3904, "step": 10140 }, { "epoch": 1.7343768005372306, "grad_norm": 30.344676971435547, "learning_rate": 6.849013389711065e-07, "loss": 0.391, "step": 10150 }, { "epoch": 1.7360854482816168, "grad_norm": 22.83699607849121, "learning_rate": 6.804968287526427e-07, "loss": 0.3653, "step": 10160 }, { "epoch": 1.7377940960260032, "grad_norm": 32.153663635253906, "learning_rate": 6.760923185341791e-07, "loss": 0.3531, "step": 10170 }, { "epoch": 1.7395027437703896, "grad_norm": 36.864925384521484, "learning_rate": 6.716878083157153e-07, "loss": 0.3391, "step": 10180 }, { "epoch": 1.7412113915147758, "grad_norm": 41.715576171875, "learning_rate": 6.672832980972517e-07, "loss": 0.3993, "step": 10190 }, { "epoch": 1.7429200392591622, "grad_norm": 35.69621276855469, "learning_rate": 6.628787878787879e-07, "loss": 0.3121, "step": 10200 }, { "epoch": 1.7446286870035483, "grad_norm": 33.0884895324707, "learning_rate": 6.584742776603242e-07, "loss": 0.3827, "step": 10210 }, { "epoch": 1.7463373347479347, "grad_norm": 26.627431869506836, "learning_rate": 6.540697674418606e-07, "loss": 0.3759, "step": 10220 }, { "epoch": 1.7480459824923211, "grad_norm": 32.8358039855957, "learning_rate": 6.496652572233968e-07, "loss": 0.3983, "step": 10230 }, { "epoch": 1.7497546302367073, "grad_norm": 26.243980407714844, "learning_rate": 6.452607470049331e-07, "loss": 0.3857, "step": 10240 }, { "epoch": 1.7514632779810935, "grad_norm": 26.84737205505371, "learning_rate": 6.408562367864694e-07, "loss": 0.3771, "step": 10250 }, { "epoch": 1.7531719257254799, "grad_norm": 31.410524368286133, "learning_rate": 6.364517265680057e-07, "loss": 0.3794, "step": 10260 }, { "epoch": 1.7548805734698663, "grad_norm": 38.454044342041016, "learning_rate": 6.320472163495419e-07, "loss": 0.3595, "step": 10270 }, { "epoch": 1.7565892212142526, "grad_norm": 18.861108779907227, "learning_rate": 6.276427061310783e-07, "loss": 0.3457, "step": 10280 }, { "epoch": 1.7582978689586388, "grad_norm": 28.400564193725586, "learning_rate": 6.232381959126146e-07, "loss": 0.3511, "step": 10290 }, { "epoch": 1.760006516703025, "grad_norm": 47.41775894165039, "learning_rate": 6.188336856941508e-07, "loss": 0.3544, "step": 10300 }, { "epoch": 1.7617151644474114, "grad_norm": 30.009010314941406, "learning_rate": 6.144291754756872e-07, "loss": 0.3273, "step": 10310 }, { "epoch": 1.7634238121917978, "grad_norm": 25.67041778564453, "learning_rate": 6.100246652572235e-07, "loss": 0.3619, "step": 10320 }, { "epoch": 1.765132459936184, "grad_norm": 28.06591796875, "learning_rate": 6.056201550387597e-07, "loss": 0.3577, "step": 10330 }, { "epoch": 1.7668411076805703, "grad_norm": 25.041889190673828, "learning_rate": 6.01215644820296e-07, "loss": 0.3688, "step": 10340 }, { "epoch": 1.7685497554249565, "grad_norm": 32.429443359375, "learning_rate": 5.968111346018323e-07, "loss": 0.4009, "step": 10350 }, { "epoch": 1.770258403169343, "grad_norm": 23.519460678100586, "learning_rate": 5.924066243833686e-07, "loss": 0.366, "step": 10360 }, { "epoch": 1.7719670509137293, "grad_norm": 36.32727813720703, "learning_rate": 5.880021141649049e-07, "loss": 0.4004, "step": 10370 }, { "epoch": 1.7736756986581155, "grad_norm": 30.008052825927734, "learning_rate": 5.835976039464412e-07, "loss": 0.3707, "step": 10380 }, { "epoch": 1.7753843464025016, "grad_norm": 34.22142791748047, "learning_rate": 5.791930937279775e-07, "loss": 0.3848, "step": 10390 }, { "epoch": 1.777092994146888, "grad_norm": 21.506912231445312, "learning_rate": 5.747885835095138e-07, "loss": 0.3677, "step": 10400 }, { "epoch": 1.7788016418912744, "grad_norm": 33.4599609375, "learning_rate": 5.703840732910502e-07, "loss": 0.394, "step": 10410 }, { "epoch": 1.7805102896356608, "grad_norm": 36.893394470214844, "learning_rate": 5.659795630725864e-07, "loss": 0.3871, "step": 10420 }, { "epoch": 1.782218937380047, "grad_norm": 46.39961624145508, "learning_rate": 5.615750528541227e-07, "loss": 0.4025, "step": 10430 }, { "epoch": 1.7839275851244332, "grad_norm": 23.366689682006836, "learning_rate": 5.57170542635659e-07, "loss": 0.3694, "step": 10440 }, { "epoch": 1.7856362328688196, "grad_norm": 61.47678756713867, "learning_rate": 5.527660324171952e-07, "loss": 0.3759, "step": 10450 }, { "epoch": 1.787344880613206, "grad_norm": 27.12241554260254, "learning_rate": 5.483615221987316e-07, "loss": 0.3552, "step": 10460 }, { "epoch": 1.7890535283575921, "grad_norm": 29.733963012695312, "learning_rate": 5.439570119802679e-07, "loss": 0.3966, "step": 10470 }, { "epoch": 1.7907621761019783, "grad_norm": 29.353418350219727, "learning_rate": 5.395525017618041e-07, "loss": 0.3642, "step": 10480 }, { "epoch": 1.7924708238463647, "grad_norm": 26.14151954650879, "learning_rate": 5.351479915433405e-07, "loss": 0.3776, "step": 10490 }, { "epoch": 1.794179471590751, "grad_norm": 33.61710739135742, "learning_rate": 5.307434813248768e-07, "loss": 0.3484, "step": 10500 }, { "epoch": 1.7958881193351375, "grad_norm": 25.79818344116211, "learning_rate": 5.26338971106413e-07, "loss": 0.3498, "step": 10510 }, { "epoch": 1.7975967670795236, "grad_norm": 34.903533935546875, "learning_rate": 5.219344608879493e-07, "loss": 0.4224, "step": 10520 }, { "epoch": 1.7993054148239098, "grad_norm": 42.71726608276367, "learning_rate": 5.175299506694856e-07, "loss": 0.3524, "step": 10530 }, { "epoch": 1.8010140625682962, "grad_norm": 35.538875579833984, "learning_rate": 5.131254404510219e-07, "loss": 0.3471, "step": 10540 }, { "epoch": 1.8027227103126826, "grad_norm": 34.071388244628906, "learning_rate": 5.087209302325582e-07, "loss": 0.3994, "step": 10550 }, { "epoch": 1.8044313580570688, "grad_norm": 30.25017738342285, "learning_rate": 5.043164200140945e-07, "loss": 0.4127, "step": 10560 }, { "epoch": 1.8061400058014552, "grad_norm": 27.28138542175293, "learning_rate": 4.999119097956308e-07, "loss": 0.3286, "step": 10570 }, { "epoch": 1.8078486535458413, "grad_norm": 38.14504623413086, "learning_rate": 4.955073995771671e-07, "loss": 0.3763, "step": 10580 }, { "epoch": 1.8095573012902277, "grad_norm": 30.078149795532227, "learning_rate": 4.911028893587034e-07, "loss": 0.352, "step": 10590 }, { "epoch": 1.8112659490346141, "grad_norm": 24.926767349243164, "learning_rate": 4.866983791402396e-07, "loss": 0.3421, "step": 10600 }, { "epoch": 1.8129745967790003, "grad_norm": 46.622650146484375, "learning_rate": 4.822938689217759e-07, "loss": 0.3835, "step": 10610 }, { "epoch": 1.8146832445233865, "grad_norm": 21.296682357788086, "learning_rate": 4.778893587033123e-07, "loss": 0.4098, "step": 10620 }, { "epoch": 1.8163918922677729, "grad_norm": 27.754459381103516, "learning_rate": 4.7348484848484853e-07, "loss": 0.3576, "step": 10630 }, { "epoch": 1.8181005400121593, "grad_norm": 26.44339942932129, "learning_rate": 4.690803382663848e-07, "loss": 0.3411, "step": 10640 }, { "epoch": 1.8198091877565457, "grad_norm": 24.727949142456055, "learning_rate": 4.646758280479211e-07, "loss": 0.333, "step": 10650 }, { "epoch": 1.8215178355009318, "grad_norm": 36.50139617919922, "learning_rate": 4.602713178294574e-07, "loss": 0.3346, "step": 10660 }, { "epoch": 1.823226483245318, "grad_norm": 32.99855041503906, "learning_rate": 4.5586680761099375e-07, "loss": 0.4303, "step": 10670 }, { "epoch": 1.8249351309897044, "grad_norm": 23.56210708618164, "learning_rate": 4.5146229739253e-07, "loss": 0.3567, "step": 10680 }, { "epoch": 1.8266437787340908, "grad_norm": 32.45067596435547, "learning_rate": 4.470577871740663e-07, "loss": 0.3615, "step": 10690 }, { "epoch": 1.828352426478477, "grad_norm": 26.946245193481445, "learning_rate": 4.426532769556026e-07, "loss": 0.3577, "step": 10700 }, { "epoch": 1.8300610742228633, "grad_norm": 25.937786102294922, "learning_rate": 4.3824876673713886e-07, "loss": 0.3657, "step": 10710 }, { "epoch": 1.8317697219672495, "grad_norm": 22.442626953125, "learning_rate": 4.338442565186752e-07, "loss": 0.3988, "step": 10720 }, { "epoch": 1.833478369711636, "grad_norm": 35.23172378540039, "learning_rate": 4.2943974630021144e-07, "loss": 0.3931, "step": 10730 }, { "epoch": 1.8351870174560223, "grad_norm": 36.66183090209961, "learning_rate": 4.250352360817477e-07, "loss": 0.4132, "step": 10740 }, { "epoch": 1.8368956652004085, "grad_norm": 38.16518783569336, "learning_rate": 4.206307258632841e-07, "loss": 0.3592, "step": 10750 }, { "epoch": 1.8386043129447946, "grad_norm": 29.179283142089844, "learning_rate": 4.1622621564482034e-07, "loss": 0.376, "step": 10760 }, { "epoch": 1.840312960689181, "grad_norm": 32.8124885559082, "learning_rate": 4.118217054263566e-07, "loss": 0.4341, "step": 10770 }, { "epoch": 1.8420216084335674, "grad_norm": 34.435943603515625, "learning_rate": 4.0741719520789293e-07, "loss": 0.3602, "step": 10780 }, { "epoch": 1.8437302561779538, "grad_norm": 23.411712646484375, "learning_rate": 4.030126849894292e-07, "loss": 0.326, "step": 10790 }, { "epoch": 1.84543890392234, "grad_norm": 39.64480209350586, "learning_rate": 3.986081747709655e-07, "loss": 0.393, "step": 10800 }, { "epoch": 1.8471475516667262, "grad_norm": 30.89308738708496, "learning_rate": 3.942036645525018e-07, "loss": 0.3746, "step": 10810 }, { "epoch": 1.8488561994111126, "grad_norm": 27.453231811523438, "learning_rate": 3.8979915433403804e-07, "loss": 0.3775, "step": 10820 }, { "epoch": 1.850564847155499, "grad_norm": 39.444679260253906, "learning_rate": 3.853946441155744e-07, "loss": 0.3238, "step": 10830 }, { "epoch": 1.8522734948998851, "grad_norm": 25.87100601196289, "learning_rate": 3.809901338971107e-07, "loss": 0.3351, "step": 10840 }, { "epoch": 1.8539821426442713, "grad_norm": 38.50906753540039, "learning_rate": 3.76585623678647e-07, "loss": 0.4047, "step": 10850 }, { "epoch": 1.8556907903886577, "grad_norm": 28.932676315307617, "learning_rate": 3.7218111346018326e-07, "loss": 0.3853, "step": 10860 }, { "epoch": 1.857399438133044, "grad_norm": 38.7553596496582, "learning_rate": 3.6777660324171953e-07, "loss": 0.4117, "step": 10870 }, { "epoch": 1.8591080858774305, "grad_norm": 37.8046760559082, "learning_rate": 3.6337209302325584e-07, "loss": 0.3444, "step": 10880 }, { "epoch": 1.8608167336218167, "grad_norm": 37.61636734008789, "learning_rate": 3.589675828047921e-07, "loss": 0.37, "step": 10890 }, { "epoch": 1.8625253813662028, "grad_norm": 31.169891357421875, "learning_rate": 3.545630725863284e-07, "loss": 0.3625, "step": 10900 }, { "epoch": 1.8642340291105892, "grad_norm": 33.97384262084961, "learning_rate": 3.5015856236786475e-07, "loss": 0.3595, "step": 10910 }, { "epoch": 1.8659426768549756, "grad_norm": 33.35996627807617, "learning_rate": 3.45754052149401e-07, "loss": 0.367, "step": 10920 }, { "epoch": 1.867651324599362, "grad_norm": 31.67682647705078, "learning_rate": 3.4134954193093733e-07, "loss": 0.3905, "step": 10930 }, { "epoch": 1.8693599723437482, "grad_norm": 34.16012954711914, "learning_rate": 3.369450317124736e-07, "loss": 0.3639, "step": 10940 }, { "epoch": 1.8710686200881343, "grad_norm": 38.885986328125, "learning_rate": 3.3254052149400986e-07, "loss": 0.3593, "step": 10950 }, { "epoch": 1.8727772678325207, "grad_norm": 35.09337615966797, "learning_rate": 3.281360112755462e-07, "loss": 0.4085, "step": 10960 }, { "epoch": 1.8744859155769071, "grad_norm": 36.90644073486328, "learning_rate": 3.2373150105708244e-07, "loss": 0.3615, "step": 10970 }, { "epoch": 1.8761945633212933, "grad_norm": 25.444183349609375, "learning_rate": 3.193269908386188e-07, "loss": 0.3644, "step": 10980 }, { "epoch": 1.8779032110656795, "grad_norm": 30.481740951538086, "learning_rate": 3.149224806201551e-07, "loss": 0.378, "step": 10990 }, { "epoch": 1.8796118588100659, "grad_norm": 37.78234100341797, "learning_rate": 3.1051797040169134e-07, "loss": 0.387, "step": 11000 }, { "epoch": 1.8813205065544523, "grad_norm": 25.613048553466797, "learning_rate": 3.061134601832276e-07, "loss": 0.3335, "step": 11010 }, { "epoch": 1.8830291542988387, "grad_norm": 39.77134323120117, "learning_rate": 3.0170894996476393e-07, "loss": 0.3843, "step": 11020 }, { "epoch": 1.8847378020432248, "grad_norm": 22.53700065612793, "learning_rate": 2.9730443974630025e-07, "loss": 0.3685, "step": 11030 }, { "epoch": 1.886446449787611, "grad_norm": 44.945308685302734, "learning_rate": 2.928999295278365e-07, "loss": 0.3302, "step": 11040 }, { "epoch": 1.8881550975319974, "grad_norm": 31.36821174621582, "learning_rate": 2.8849541930937283e-07, "loss": 0.3876, "step": 11050 }, { "epoch": 1.8898637452763838, "grad_norm": 38.52021408081055, "learning_rate": 2.840909090909091e-07, "loss": 0.3741, "step": 11060 }, { "epoch": 1.89157239302077, "grad_norm": 40.30624008178711, "learning_rate": 2.796863988724454e-07, "loss": 0.3921, "step": 11070 }, { "epoch": 1.8932810407651564, "grad_norm": 29.259140014648438, "learning_rate": 2.7528188865398173e-07, "loss": 0.3893, "step": 11080 }, { "epoch": 1.8949896885095425, "grad_norm": 25.17171287536621, "learning_rate": 2.70877378435518e-07, "loss": 0.3756, "step": 11090 }, { "epoch": 1.896698336253929, "grad_norm": 37.15606689453125, "learning_rate": 2.6647286821705426e-07, "loss": 0.4048, "step": 11100 }, { "epoch": 1.8984069839983153, "grad_norm": 24.475324630737305, "learning_rate": 2.620683579985906e-07, "loss": 0.3709, "step": 11110 }, { "epoch": 1.9001156317427015, "grad_norm": 28.089601516723633, "learning_rate": 2.576638477801269e-07, "loss": 0.3933, "step": 11120 }, { "epoch": 1.9018242794870877, "grad_norm": 24.580224990844727, "learning_rate": 2.5325933756166316e-07, "loss": 0.3557, "step": 11130 }, { "epoch": 1.903532927231474, "grad_norm": 31.057662963867188, "learning_rate": 2.4885482734319943e-07, "loss": 0.3912, "step": 11140 }, { "epoch": 1.9052415749758604, "grad_norm": 36.91437530517578, "learning_rate": 2.4445031712473575e-07, "loss": 0.363, "step": 11150 }, { "epoch": 1.9069502227202468, "grad_norm": 28.377185821533203, "learning_rate": 2.4004580690627206e-07, "loss": 0.41, "step": 11160 }, { "epoch": 1.908658870464633, "grad_norm": 33.51364517211914, "learning_rate": 2.3564129668780836e-07, "loss": 0.362, "step": 11170 }, { "epoch": 1.9103675182090192, "grad_norm": 23.851999282836914, "learning_rate": 2.3123678646934465e-07, "loss": 0.3516, "step": 11180 }, { "epoch": 1.9120761659534056, "grad_norm": 27.512645721435547, "learning_rate": 2.268322762508809e-07, "loss": 0.3736, "step": 11190 }, { "epoch": 1.913784813697792, "grad_norm": 33.09054183959961, "learning_rate": 2.224277660324172e-07, "loss": 0.3536, "step": 11200 }, { "epoch": 1.9154934614421781, "grad_norm": 39.228851318359375, "learning_rate": 2.1802325581395352e-07, "loss": 0.3922, "step": 11210 }, { "epoch": 1.9172021091865643, "grad_norm": 27.592710494995117, "learning_rate": 2.1361874559548981e-07, "loss": 0.3146, "step": 11220 }, { "epoch": 1.9189107569309507, "grad_norm": 50.0390739440918, "learning_rate": 2.0921423537702608e-07, "loss": 0.3441, "step": 11230 }, { "epoch": 1.920619404675337, "grad_norm": 39.61098098754883, "learning_rate": 2.0480972515856237e-07, "loss": 0.3493, "step": 11240 }, { "epoch": 1.9223280524197235, "grad_norm": 38.638954162597656, "learning_rate": 2.004052149400987e-07, "loss": 0.3626, "step": 11250 }, { "epoch": 1.9240367001641097, "grad_norm": 29.187583923339844, "learning_rate": 1.9600070472163498e-07, "loss": 0.3438, "step": 11260 }, { "epoch": 1.9257453479084958, "grad_norm": 28.951478958129883, "learning_rate": 1.9159619450317125e-07, "loss": 0.3602, "step": 11270 }, { "epoch": 1.9274539956528822, "grad_norm": 23.78569984436035, "learning_rate": 1.8719168428470754e-07, "loss": 0.4069, "step": 11280 }, { "epoch": 1.9291626433972686, "grad_norm": 28.406557083129883, "learning_rate": 1.8278717406624386e-07, "loss": 0.3304, "step": 11290 }, { "epoch": 1.930871291141655, "grad_norm": 36.53167724609375, "learning_rate": 1.7838266384778015e-07, "loss": 0.3741, "step": 11300 }, { "epoch": 1.9325799388860412, "grad_norm": 39.00297164916992, "learning_rate": 1.7397815362931644e-07, "loss": 0.3252, "step": 11310 }, { "epoch": 1.9342885866304274, "grad_norm": 37.43632888793945, "learning_rate": 1.695736434108527e-07, "loss": 0.3953, "step": 11320 }, { "epoch": 1.9359972343748137, "grad_norm": 34.39417266845703, "learning_rate": 1.6516913319238902e-07, "loss": 0.4112, "step": 11330 }, { "epoch": 1.9377058821192001, "grad_norm": 31.974533081054688, "learning_rate": 1.6076462297392531e-07, "loss": 0.3451, "step": 11340 }, { "epoch": 1.9394145298635863, "grad_norm": 26.460182189941406, "learning_rate": 1.563601127554616e-07, "loss": 0.318, "step": 11350 }, { "epoch": 1.9411231776079725, "grad_norm": 37.38439178466797, "learning_rate": 1.519556025369979e-07, "loss": 0.3795, "step": 11360 }, { "epoch": 1.9428318253523589, "grad_norm": 21.876747131347656, "learning_rate": 1.475510923185342e-07, "loss": 0.3331, "step": 11370 }, { "epoch": 1.9445404730967453, "grad_norm": 38.800811767578125, "learning_rate": 1.4314658210007048e-07, "loss": 0.4218, "step": 11380 }, { "epoch": 1.9462491208411317, "grad_norm": 32.05302810668945, "learning_rate": 1.3874207188160677e-07, "loss": 0.3484, "step": 11390 }, { "epoch": 1.9479577685855178, "grad_norm": 28.057106018066406, "learning_rate": 1.3433756166314306e-07, "loss": 0.3579, "step": 11400 }, { "epoch": 1.949666416329904, "grad_norm": 34.402259826660156, "learning_rate": 1.2993305144467938e-07, "loss": 0.3871, "step": 11410 }, { "epoch": 1.9513750640742904, "grad_norm": 23.892139434814453, "learning_rate": 1.2552854122621565e-07, "loss": 0.3813, "step": 11420 }, { "epoch": 1.9530837118186768, "grad_norm": 33.99159240722656, "learning_rate": 1.2112403100775197e-07, "loss": 0.3698, "step": 11430 }, { "epoch": 1.954792359563063, "grad_norm": 39.73171615600586, "learning_rate": 1.1671952078928824e-07, "loss": 0.3888, "step": 11440 }, { "epoch": 1.9565010073074494, "grad_norm": 40.77421951293945, "learning_rate": 1.1231501057082454e-07, "loss": 0.3514, "step": 11450 }, { "epoch": 1.9582096550518355, "grad_norm": 32.23746871948242, "learning_rate": 1.0791050035236083e-07, "loss": 0.3555, "step": 11460 }, { "epoch": 1.959918302796222, "grad_norm": 37.01948165893555, "learning_rate": 1.0350599013389712e-07, "loss": 0.3536, "step": 11470 }, { "epoch": 1.9616269505406083, "grad_norm": 29.169981002807617, "learning_rate": 9.910147991543341e-08, "loss": 0.3298, "step": 11480 }, { "epoch": 1.9633355982849945, "grad_norm": 21.912952423095703, "learning_rate": 9.46969696969697e-08, "loss": 0.2856, "step": 11490 }, { "epoch": 1.9650442460293807, "grad_norm": 31.11917495727539, "learning_rate": 9.0292459478506e-08, "loss": 0.3485, "step": 11500 }, { "epoch": 1.966752893773767, "grad_norm": 38.79056167602539, "learning_rate": 8.588794926004229e-08, "loss": 0.3723, "step": 11510 }, { "epoch": 1.9684615415181534, "grad_norm": 26.17238426208496, "learning_rate": 8.148343904157858e-08, "loss": 0.3234, "step": 11520 }, { "epoch": 1.9701701892625398, "grad_norm": 32.50604248046875, "learning_rate": 7.707892882311487e-08, "loss": 0.3453, "step": 11530 }, { "epoch": 1.971878837006926, "grad_norm": 42.897769927978516, "learning_rate": 7.267441860465117e-08, "loss": 0.3336, "step": 11540 }, { "epoch": 1.9735874847513122, "grad_norm": 31.65463638305664, "learning_rate": 6.826990838618747e-08, "loss": 0.3547, "step": 11550 }, { "epoch": 1.9752961324956986, "grad_norm": 36.5434684753418, "learning_rate": 6.386539816772376e-08, "loss": 0.3433, "step": 11560 }, { "epoch": 1.977004780240085, "grad_norm": 20.959672927856445, "learning_rate": 5.946088794926004e-08, "loss": 0.331, "step": 11570 }, { "epoch": 1.9787134279844711, "grad_norm": 32.63426971435547, "learning_rate": 5.5056377730796334e-08, "loss": 0.3522, "step": 11580 }, { "epoch": 1.9804220757288573, "grad_norm": 30.64798927307129, "learning_rate": 5.065186751233264e-08, "loss": 0.3582, "step": 11590 }, { "epoch": 1.9821307234732437, "grad_norm": 26.718338012695312, "learning_rate": 4.624735729386893e-08, "loss": 0.3232, "step": 11600 }, { "epoch": 1.98383937121763, "grad_norm": 34.196048736572266, "learning_rate": 4.184284707540522e-08, "loss": 0.329, "step": 11610 }, { "epoch": 1.9855480189620165, "grad_norm": 41.088165283203125, "learning_rate": 3.7438336856941514e-08, "loss": 0.3263, "step": 11620 }, { "epoch": 1.9872566667064027, "grad_norm": 30.549293518066406, "learning_rate": 3.3033826638477806e-08, "loss": 0.3245, "step": 11630 }, { "epoch": 1.9889653144507888, "grad_norm": 43.00529098510742, "learning_rate": 2.8629316420014098e-08, "loss": 0.3949, "step": 11640 }, { "epoch": 1.9906739621951752, "grad_norm": 38.130096435546875, "learning_rate": 2.422480620155039e-08, "loss": 0.3707, "step": 11650 }, { "epoch": 1.9923826099395616, "grad_norm": 32.19184112548828, "learning_rate": 1.982029598308668e-08, "loss": 0.3876, "step": 11660 }, { "epoch": 1.994091257683948, "grad_norm": 29.53122329711914, "learning_rate": 1.5415785764622976e-08, "loss": 0.3743, "step": 11670 }, { "epoch": 1.9957999054283342, "grad_norm": 28.583572387695312, "learning_rate": 1.1011275546159268e-08, "loss": 0.3703, "step": 11680 }, { "epoch": 1.9975085531727204, "grad_norm": 22.58035659790039, "learning_rate": 6.606765327695561e-09, "loss": 0.3334, "step": 11690 }, { "epoch": 1.9992172009171068, "grad_norm": 40.383663177490234, "learning_rate": 2.2022551092318538e-09, "loss": 0.3615, "step": 11700 }, { "epoch": 1.9999006600148612, "eval_loss": 0.954230546951294, "eval_runtime": 137.9658, "eval_samples_per_second": 71.46, "eval_steps_per_second": 8.937, "step": 11704 } ], "logging_steps": 10, "max_steps": 11704, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.0899612102793626e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }