{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.511662347278786, "eval_steps": 500, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037023324694557573, "grad_norm": 7.828796863555908, "learning_rate": 2.0000000000000002e-07, "loss": 2.2398, "step": 10 }, { "epoch": 0.0074046649389115145, "grad_norm": 8.8226900100708, "learning_rate": 4.0000000000000003e-07, "loss": 1.8938, "step": 20 }, { "epoch": 0.011106997408367271, "grad_norm": 9.546854972839355, "learning_rate": 6.000000000000001e-07, "loss": 2.3007, "step": 30 }, { "epoch": 0.014809329877823029, "grad_norm": 7.3556809425354, "learning_rate": 8.000000000000001e-07, "loss": 2.0526, "step": 40 }, { "epoch": 0.018511662347278787, "grad_norm": 12.601884841918945, "learning_rate": 1.0000000000000002e-06, "loss": 2.1123, "step": 50 }, { "epoch": 0.022213994816734542, "grad_norm": 8.209510803222656, "learning_rate": 1.2000000000000002e-06, "loss": 1.9249, "step": 60 }, { "epoch": 0.0259163272861903, "grad_norm": 8.863743782043457, "learning_rate": 1.4000000000000001e-06, "loss": 2.099, "step": 70 }, { "epoch": 0.029618659755646058, "grad_norm": 7.868948459625244, "learning_rate": 1.6000000000000001e-06, "loss": 1.9841, "step": 80 }, { "epoch": 0.03332099222510181, "grad_norm": 4.900806427001953, "learning_rate": 1.8e-06, "loss": 1.8462, "step": 90 }, { "epoch": 0.037023324694557574, "grad_norm": 6.918540000915527, "learning_rate": 2.0000000000000003e-06, "loss": 1.9919, "step": 100 }, { "epoch": 0.04072565716401333, "grad_norm": 5.5816650390625, "learning_rate": 2.2e-06, "loss": 1.827, "step": 110 }, { "epoch": 0.044427989633469084, "grad_norm": 4.087903022766113, "learning_rate": 2.4000000000000003e-06, "loss": 1.5708, "step": 120 }, { "epoch": 0.048130322102924845, "grad_norm": 2.740644931793213, "learning_rate": 2.6e-06, "loss": 1.4062, "step": 130 }, { "epoch": 0.0518326545723806, "grad_norm": 5.608458518981934, "learning_rate": 2.8000000000000003e-06, "loss": 1.4017, "step": 140 }, { "epoch": 0.055534987041836355, "grad_norm": 4.727485656738281, "learning_rate": 3e-06, "loss": 1.5339, "step": 150 }, { "epoch": 0.059237319511292116, "grad_norm": 1.7541900873184204, "learning_rate": 3.2000000000000003e-06, "loss": 1.0782, "step": 160 }, { "epoch": 0.06293965198074787, "grad_norm": 2.696484088897705, "learning_rate": 3.4000000000000005e-06, "loss": 1.0127, "step": 170 }, { "epoch": 0.06664198445020363, "grad_norm": 1.3211946487426758, "learning_rate": 3.6e-06, "loss": 0.8877, "step": 180 }, { "epoch": 0.07034431691965938, "grad_norm": 1.2911232709884644, "learning_rate": 3.8e-06, "loss": 0.7354, "step": 190 }, { "epoch": 0.07404664938911515, "grad_norm": 3.494590997695923, "learning_rate": 4.000000000000001e-06, "loss": 0.7249, "step": 200 }, { "epoch": 0.0777489818585709, "grad_norm": 1.9829717874526978, "learning_rate": 4.2000000000000004e-06, "loss": 1.0181, "step": 210 }, { "epoch": 0.08145131432802666, "grad_norm": 3.1482009887695312, "learning_rate": 4.4e-06, "loss": 0.8026, "step": 220 }, { "epoch": 0.08515364679748241, "grad_norm": 1.7549173831939697, "learning_rate": 4.6e-06, "loss": 0.7124, "step": 230 }, { "epoch": 0.08885597926693817, "grad_norm": 1.6905096769332886, "learning_rate": 4.800000000000001e-06, "loss": 0.7854, "step": 240 }, { "epoch": 0.09255831173639392, "grad_norm": 1.9769220352172852, "learning_rate": 5e-06, "loss": 0.602, "step": 250 }, { "epoch": 0.09626064420584969, "grad_norm": 2.3903021812438965, "learning_rate": 5.2e-06, "loss": 0.671, "step": 260 }, { "epoch": 0.09996297667530545, "grad_norm": 1.757415533065796, "learning_rate": 5.4e-06, "loss": 0.6388, "step": 270 }, { "epoch": 0.1036653091447612, "grad_norm": 3.8072926998138428, "learning_rate": 5.600000000000001e-06, "loss": 0.5105, "step": 280 }, { "epoch": 0.10736764161421695, "grad_norm": 2.558095932006836, "learning_rate": 5.8e-06, "loss": 0.4915, "step": 290 }, { "epoch": 0.11106997408367271, "grad_norm": 1.461645245552063, "learning_rate": 6e-06, "loss": 0.4086, "step": 300 }, { "epoch": 0.11477230655312846, "grad_norm": 4.776918411254883, "learning_rate": 6.2e-06, "loss": 0.4301, "step": 310 }, { "epoch": 0.11847463902258423, "grad_norm": 1.1484661102294922, "learning_rate": 6.4000000000000006e-06, "loss": 0.3627, "step": 320 }, { "epoch": 0.12217697149203999, "grad_norm": 2.4644663333892822, "learning_rate": 6.6e-06, "loss": 0.3916, "step": 330 }, { "epoch": 0.12587930396149574, "grad_norm": 1.826377272605896, "learning_rate": 6.800000000000001e-06, "loss": 0.2931, "step": 340 }, { "epoch": 0.1295816364309515, "grad_norm": 1.6970767974853516, "learning_rate": 7.000000000000001e-06, "loss": 0.3479, "step": 350 }, { "epoch": 0.13328396890040725, "grad_norm": 2.245983362197876, "learning_rate": 7.2e-06, "loss": 0.3675, "step": 360 }, { "epoch": 0.136986301369863, "grad_norm": 2.5633957386016846, "learning_rate": 7.4e-06, "loss": 0.317, "step": 370 }, { "epoch": 0.14068863383931876, "grad_norm": 1.5092730522155762, "learning_rate": 7.6e-06, "loss": 0.2988, "step": 380 }, { "epoch": 0.14439096630877452, "grad_norm": 1.4431575536727905, "learning_rate": 7.8e-06, "loss": 0.3035, "step": 390 }, { "epoch": 0.1480932987782303, "grad_norm": 2.0074455738067627, "learning_rate": 8.000000000000001e-06, "loss": 0.3222, "step": 400 }, { "epoch": 0.15179563124768605, "grad_norm": 4.691574573516846, "learning_rate": 8.200000000000001e-06, "loss": 0.3018, "step": 410 }, { "epoch": 0.1554979637171418, "grad_norm": 1.5947461128234863, "learning_rate": 8.400000000000001e-06, "loss": 0.2612, "step": 420 }, { "epoch": 0.15920029618659756, "grad_norm": 1.3304909467697144, "learning_rate": 8.599999999999999e-06, "loss": 0.2341, "step": 430 }, { "epoch": 0.16290262865605332, "grad_norm": 2.727384090423584, "learning_rate": 8.8e-06, "loss": 0.2526, "step": 440 }, { "epoch": 0.16660496112550907, "grad_norm": 1.5410687923431396, "learning_rate": 9e-06, "loss": 0.2494, "step": 450 }, { "epoch": 0.17030729359496483, "grad_norm": 2.0058510303497314, "learning_rate": 9.2e-06, "loss": 0.2189, "step": 460 }, { "epoch": 0.17400962606442058, "grad_norm": 1.541945219039917, "learning_rate": 9.4e-06, "loss": 0.2293, "step": 470 }, { "epoch": 0.17771195853387634, "grad_norm": 1.9422534704208374, "learning_rate": 9.600000000000001e-06, "loss": 0.2055, "step": 480 }, { "epoch": 0.1814142910033321, "grad_norm": 2.533372163772583, "learning_rate": 9.800000000000001e-06, "loss": 0.1905, "step": 490 }, { "epoch": 0.18511662347278784, "grad_norm": 1.3903000354766846, "learning_rate": 1e-05, "loss": 0.2073, "step": 500 }, { "epoch": 0.18881895594224363, "grad_norm": 1.296175241470337, "learning_rate": 1.02e-05, "loss": 0.1913, "step": 510 }, { "epoch": 0.19252128841169938, "grad_norm": 1.4335821866989136, "learning_rate": 1.04e-05, "loss": 0.2145, "step": 520 }, { "epoch": 0.19622362088115514, "grad_norm": 1.3062387704849243, "learning_rate": 1.06e-05, "loss": 0.2293, "step": 530 }, { "epoch": 0.1999259533506109, "grad_norm": 1.785085678100586, "learning_rate": 1.08e-05, "loss": 0.2111, "step": 540 }, { "epoch": 0.20362828582006665, "grad_norm": 1.518935203552246, "learning_rate": 1.1000000000000001e-05, "loss": 0.1964, "step": 550 }, { "epoch": 0.2073306182895224, "grad_norm": 3.290677785873413, "learning_rate": 1.1200000000000001e-05, "loss": 0.1701, "step": 560 }, { "epoch": 0.21103295075897816, "grad_norm": 1.172218680381775, "learning_rate": 1.1400000000000001e-05, "loss": 0.2093, "step": 570 }, { "epoch": 0.2147352832284339, "grad_norm": 1.457667350769043, "learning_rate": 1.16e-05, "loss": 0.1803, "step": 580 }, { "epoch": 0.21843761569788966, "grad_norm": 1.7684199810028076, "learning_rate": 1.18e-05, "loss": 0.205, "step": 590 }, { "epoch": 0.22213994816734542, "grad_norm": 1.0911297798156738, "learning_rate": 1.2e-05, "loss": 0.1813, "step": 600 }, { "epoch": 0.22584228063680117, "grad_norm": 1.9823895692825317, "learning_rate": 1.22e-05, "loss": 0.1964, "step": 610 }, { "epoch": 0.22954461310625693, "grad_norm": 1.5304362773895264, "learning_rate": 1.24e-05, "loss": 0.1812, "step": 620 }, { "epoch": 0.2332469455757127, "grad_norm": 1.8300291299819946, "learning_rate": 1.2600000000000001e-05, "loss": 0.1611, "step": 630 }, { "epoch": 0.23694927804516847, "grad_norm": 1.9234914779663086, "learning_rate": 1.2800000000000001e-05, "loss": 0.1702, "step": 640 }, { "epoch": 0.24065161051462422, "grad_norm": 1.0271013975143433, "learning_rate": 1.3000000000000001e-05, "loss": 0.1523, "step": 650 }, { "epoch": 0.24435394298407997, "grad_norm": 1.3445476293563843, "learning_rate": 1.32e-05, "loss": 0.1782, "step": 660 }, { "epoch": 0.24805627545353573, "grad_norm": 1.7473419904708862, "learning_rate": 1.3400000000000002e-05, "loss": 0.1481, "step": 670 }, { "epoch": 0.2517586079229915, "grad_norm": 1.9963879585266113, "learning_rate": 1.3600000000000002e-05, "loss": 0.1841, "step": 680 }, { "epoch": 0.25546094039244727, "grad_norm": 3.5533945560455322, "learning_rate": 1.3800000000000002e-05, "loss": 0.1496, "step": 690 }, { "epoch": 0.259163272861903, "grad_norm": 0.88379967212677, "learning_rate": 1.4000000000000001e-05, "loss": 0.145, "step": 700 }, { "epoch": 0.2628656053313588, "grad_norm": 1.5505783557891846, "learning_rate": 1.42e-05, "loss": 0.1641, "step": 710 }, { "epoch": 0.2665679378008145, "grad_norm": 1.660693645477295, "learning_rate": 1.44e-05, "loss": 0.1484, "step": 720 }, { "epoch": 0.2702702702702703, "grad_norm": 0.7512562274932861, "learning_rate": 1.4599999999999999e-05, "loss": 0.1501, "step": 730 }, { "epoch": 0.273972602739726, "grad_norm": 1.4029695987701416, "learning_rate": 1.48e-05, "loss": 0.1472, "step": 740 }, { "epoch": 0.2776749352091818, "grad_norm": 0.7520191073417664, "learning_rate": 1.5e-05, "loss": 0.1355, "step": 750 }, { "epoch": 0.2813772676786375, "grad_norm": 1.1688158512115479, "learning_rate": 1.52e-05, "loss": 0.1563, "step": 760 }, { "epoch": 0.2850796001480933, "grad_norm": 1.0042999982833862, "learning_rate": 1.54e-05, "loss": 0.1084, "step": 770 }, { "epoch": 0.28878193261754903, "grad_norm": 1.1872135400772095, "learning_rate": 1.56e-05, "loss": 0.123, "step": 780 }, { "epoch": 0.2924842650870048, "grad_norm": 1.2947394847869873, "learning_rate": 1.58e-05, "loss": 0.1394, "step": 790 }, { "epoch": 0.2961865975564606, "grad_norm": 1.27442467212677, "learning_rate": 1.6000000000000003e-05, "loss": 0.136, "step": 800 }, { "epoch": 0.2998889300259163, "grad_norm": 2.3196935653686523, "learning_rate": 1.62e-05, "loss": 0.1555, "step": 810 }, { "epoch": 0.3035912624953721, "grad_norm": 0.8781152963638306, "learning_rate": 1.6400000000000002e-05, "loss": 0.1566, "step": 820 }, { "epoch": 0.30729359496482783, "grad_norm": 1.0241625308990479, "learning_rate": 1.66e-05, "loss": 0.1676, "step": 830 }, { "epoch": 0.3109959274342836, "grad_norm": 0.8572613000869751, "learning_rate": 1.6800000000000002e-05, "loss": 0.1116, "step": 840 }, { "epoch": 0.31469825990373934, "grad_norm": 1.1864336729049683, "learning_rate": 1.7000000000000003e-05, "loss": 0.1247, "step": 850 }, { "epoch": 0.3184005923731951, "grad_norm": 1.4215986728668213, "learning_rate": 1.7199999999999998e-05, "loss": 0.168, "step": 860 }, { "epoch": 0.32210292484265085, "grad_norm": 1.0596152544021606, "learning_rate": 1.74e-05, "loss": 0.1151, "step": 870 }, { "epoch": 0.32580525731210663, "grad_norm": 1.6209696531295776, "learning_rate": 1.76e-05, "loss": 0.1499, "step": 880 }, { "epoch": 0.32950758978156236, "grad_norm": 1.8608975410461426, "learning_rate": 1.78e-05, "loss": 0.1465, "step": 890 }, { "epoch": 0.33320992225101814, "grad_norm": 0.8527662754058838, "learning_rate": 1.8e-05, "loss": 0.1234, "step": 900 }, { "epoch": 0.3369122547204739, "grad_norm": 1.1807057857513428, "learning_rate": 1.8200000000000002e-05, "loss": 0.1327, "step": 910 }, { "epoch": 0.34061458718992965, "grad_norm": 1.457122802734375, "learning_rate": 1.84e-05, "loss": 0.1588, "step": 920 }, { "epoch": 0.34431691965938543, "grad_norm": 1.6125288009643555, "learning_rate": 1.86e-05, "loss": 0.1332, "step": 930 }, { "epoch": 0.34801925212884116, "grad_norm": 1.30000638961792, "learning_rate": 1.88e-05, "loss": 0.1381, "step": 940 }, { "epoch": 0.35172158459829694, "grad_norm": 1.191200852394104, "learning_rate": 1.9e-05, "loss": 0.1225, "step": 950 }, { "epoch": 0.35542391706775267, "grad_norm": 2.6924760341644287, "learning_rate": 1.9200000000000003e-05, "loss": 0.1317, "step": 960 }, { "epoch": 0.35912624953720845, "grad_norm": 1.1938844919204712, "learning_rate": 1.94e-05, "loss": 0.1249, "step": 970 }, { "epoch": 0.3628285820066642, "grad_norm": 1.253490924835205, "learning_rate": 1.9600000000000002e-05, "loss": 0.1284, "step": 980 }, { "epoch": 0.36653091447611996, "grad_norm": 1.1216806173324585, "learning_rate": 1.9800000000000004e-05, "loss": 0.119, "step": 990 }, { "epoch": 0.3702332469455757, "grad_norm": 1.127566933631897, "learning_rate": 2e-05, "loss": 0.1041, "step": 1000 }, { "epoch": 0.37393557941503147, "grad_norm": 1.0156586170196533, "learning_rate": 2.0200000000000003e-05, "loss": 0.1209, "step": 1010 }, { "epoch": 0.37763791188448725, "grad_norm": 0.91090327501297, "learning_rate": 2.04e-05, "loss": 0.1026, "step": 1020 }, { "epoch": 0.381340244353943, "grad_norm": 1.7352076768875122, "learning_rate": 2.06e-05, "loss": 0.1086, "step": 1030 }, { "epoch": 0.38504257682339876, "grad_norm": 1.3340576887130737, "learning_rate": 2.08e-05, "loss": 0.1292, "step": 1040 }, { "epoch": 0.3887449092928545, "grad_norm": 3.069770336151123, "learning_rate": 2.1e-05, "loss": 0.1214, "step": 1050 }, { "epoch": 0.3924472417623103, "grad_norm": 0.97674161195755, "learning_rate": 2.12e-05, "loss": 0.1045, "step": 1060 }, { "epoch": 0.396149574231766, "grad_norm": 1.4582083225250244, "learning_rate": 2.1400000000000002e-05, "loss": 0.103, "step": 1070 }, { "epoch": 0.3998519067012218, "grad_norm": 0.7253577709197998, "learning_rate": 2.16e-05, "loss": 0.1139, "step": 1080 }, { "epoch": 0.4035542391706775, "grad_norm": 1.3638741970062256, "learning_rate": 2.18e-05, "loss": 0.1269, "step": 1090 }, { "epoch": 0.4072565716401333, "grad_norm": 0.7583422660827637, "learning_rate": 2.2000000000000003e-05, "loss": 0.104, "step": 1100 }, { "epoch": 0.410958904109589, "grad_norm": 1.5892572402954102, "learning_rate": 2.22e-05, "loss": 0.1036, "step": 1110 }, { "epoch": 0.4146612365790448, "grad_norm": 1.018933653831482, "learning_rate": 2.2400000000000002e-05, "loss": 0.1078, "step": 1120 }, { "epoch": 0.4183635690485005, "grad_norm": 0.5072354078292847, "learning_rate": 2.26e-05, "loss": 0.1086, "step": 1130 }, { "epoch": 0.4220659015179563, "grad_norm": 1.593956470489502, "learning_rate": 2.2800000000000002e-05, "loss": 0.1427, "step": 1140 }, { "epoch": 0.4257682339874121, "grad_norm": 1.3982990980148315, "learning_rate": 2.3000000000000003e-05, "loss": 0.1128, "step": 1150 }, { "epoch": 0.4294705664568678, "grad_norm": 1.1426753997802734, "learning_rate": 2.32e-05, "loss": 0.1026, "step": 1160 }, { "epoch": 0.4331728989263236, "grad_norm": 1.1700688600540161, "learning_rate": 2.3400000000000003e-05, "loss": 0.1065, "step": 1170 }, { "epoch": 0.43687523139577933, "grad_norm": 0.9543449282646179, "learning_rate": 2.36e-05, "loss": 0.0889, "step": 1180 }, { "epoch": 0.4405775638652351, "grad_norm": 0.6687193512916565, "learning_rate": 2.38e-05, "loss": 0.1129, "step": 1190 }, { "epoch": 0.44427989633469084, "grad_norm": 0.7762334942817688, "learning_rate": 2.4e-05, "loss": 0.0943, "step": 1200 }, { "epoch": 0.4479822288041466, "grad_norm": 0.99888014793396, "learning_rate": 2.4200000000000002e-05, "loss": 0.1005, "step": 1210 }, { "epoch": 0.45168456127360235, "grad_norm": 1.1133954524993896, "learning_rate": 2.44e-05, "loss": 0.1105, "step": 1220 }, { "epoch": 0.45538689374305813, "grad_norm": 1.148185133934021, "learning_rate": 2.46e-05, "loss": 0.1066, "step": 1230 }, { "epoch": 0.45908922621251386, "grad_norm": 0.9315077066421509, "learning_rate": 2.48e-05, "loss": 0.1116, "step": 1240 }, { "epoch": 0.46279155868196964, "grad_norm": 1.2880867719650269, "learning_rate": 2.5e-05, "loss": 0.09, "step": 1250 }, { "epoch": 0.4664938911514254, "grad_norm": 0.862191379070282, "learning_rate": 2.5200000000000003e-05, "loss": 0.0912, "step": 1260 }, { "epoch": 0.47019622362088115, "grad_norm": 0.41712453961372375, "learning_rate": 2.54e-05, "loss": 0.0883, "step": 1270 }, { "epoch": 0.47389855609033693, "grad_norm": 0.835080623626709, "learning_rate": 2.5600000000000002e-05, "loss": 0.1068, "step": 1280 }, { "epoch": 0.47760088855979266, "grad_norm": 0.7794963717460632, "learning_rate": 2.58e-05, "loss": 0.0927, "step": 1290 }, { "epoch": 0.48130322102924844, "grad_norm": 0.7698606252670288, "learning_rate": 2.6000000000000002e-05, "loss": 0.0933, "step": 1300 }, { "epoch": 0.48500555349870417, "grad_norm": 1.0623207092285156, "learning_rate": 2.6200000000000003e-05, "loss": 0.0978, "step": 1310 }, { "epoch": 0.48870788596815995, "grad_norm": 0.8132001161575317, "learning_rate": 2.64e-05, "loss": 0.1015, "step": 1320 }, { "epoch": 0.4924102184376157, "grad_norm": 1.1597167253494263, "learning_rate": 2.6600000000000003e-05, "loss": 0.0871, "step": 1330 }, { "epoch": 0.49611255090707146, "grad_norm": 1.1138064861297607, "learning_rate": 2.6800000000000004e-05, "loss": 0.1131, "step": 1340 }, { "epoch": 0.4998148833765272, "grad_norm": 0.9905324578285217, "learning_rate": 2.7000000000000002e-05, "loss": 0.1222, "step": 1350 }, { "epoch": 0.503517215845983, "grad_norm": 1.9129204750061035, "learning_rate": 2.7200000000000004e-05, "loss": 0.1358, "step": 1360 }, { "epoch": 0.5072195483154387, "grad_norm": 1.453940749168396, "learning_rate": 2.7400000000000002e-05, "loss": 0.0965, "step": 1370 }, { "epoch": 0.5109218807848945, "grad_norm": 1.0516706705093384, "learning_rate": 2.7600000000000003e-05, "loss": 0.1083, "step": 1380 }, { "epoch": 0.5146242132543503, "grad_norm": 1.0340856313705444, "learning_rate": 2.7800000000000005e-05, "loss": 0.1227, "step": 1390 }, { "epoch": 0.518326545723806, "grad_norm": 1.0512737035751343, "learning_rate": 2.8000000000000003e-05, "loss": 0.088, "step": 1400 }, { "epoch": 0.5220288781932617, "grad_norm": 1.0231947898864746, "learning_rate": 2.8199999999999998e-05, "loss": 0.1074, "step": 1410 }, { "epoch": 0.5257312106627176, "grad_norm": 0.9069696664810181, "learning_rate": 2.84e-05, "loss": 0.101, "step": 1420 }, { "epoch": 0.5294335431321733, "grad_norm": 1.3370598554611206, "learning_rate": 2.86e-05, "loss": 0.1018, "step": 1430 }, { "epoch": 0.533135875601629, "grad_norm": 0.9621454477310181, "learning_rate": 2.88e-05, "loss": 0.084, "step": 1440 }, { "epoch": 0.5368382080710847, "grad_norm": 1.030114769935608, "learning_rate": 2.9e-05, "loss": 0.1111, "step": 1450 }, { "epoch": 0.5405405405405406, "grad_norm": 0.6887611746788025, "learning_rate": 2.9199999999999998e-05, "loss": 0.1048, "step": 1460 }, { "epoch": 0.5442428730099963, "grad_norm": 1.5711811780929565, "learning_rate": 2.94e-05, "loss": 0.087, "step": 1470 }, { "epoch": 0.547945205479452, "grad_norm": 0.6075034141540527, "learning_rate": 2.96e-05, "loss": 0.0781, "step": 1480 }, { "epoch": 0.5516475379489079, "grad_norm": 0.9486133456230164, "learning_rate": 2.98e-05, "loss": 0.0858, "step": 1490 }, { "epoch": 0.5553498704183636, "grad_norm": 1.097822666168213, "learning_rate": 3e-05, "loss": 0.0784, "step": 1500 }, { "epoch": 0.5590522028878193, "grad_norm": 0.6440536975860596, "learning_rate": 3.02e-05, "loss": 0.0886, "step": 1510 }, { "epoch": 0.562754535357275, "grad_norm": 1.3130236864089966, "learning_rate": 3.04e-05, "loss": 0.1082, "step": 1520 }, { "epoch": 0.5664568678267309, "grad_norm": 0.7797932624816895, "learning_rate": 3.06e-05, "loss": 0.0896, "step": 1530 }, { "epoch": 0.5701592002961866, "grad_norm": 1.2153689861297607, "learning_rate": 3.08e-05, "loss": 0.0924, "step": 1540 }, { "epoch": 0.5738615327656423, "grad_norm": 1.0421634912490845, "learning_rate": 3.1e-05, "loss": 0.0894, "step": 1550 }, { "epoch": 0.5775638652350981, "grad_norm": 0.8021800518035889, "learning_rate": 3.12e-05, "loss": 0.0748, "step": 1560 }, { "epoch": 0.5812661977045539, "grad_norm": 0.47041162848472595, "learning_rate": 3.1400000000000004e-05, "loss": 0.0855, "step": 1570 }, { "epoch": 0.5849685301740096, "grad_norm": 0.8204790353775024, "learning_rate": 3.16e-05, "loss": 0.083, "step": 1580 }, { "epoch": 0.5886708626434654, "grad_norm": 0.64274662733078, "learning_rate": 3.18e-05, "loss": 0.0916, "step": 1590 }, { "epoch": 0.5923731951129212, "grad_norm": 0.6026656031608582, "learning_rate": 3.2000000000000005e-05, "loss": 0.0727, "step": 1600 }, { "epoch": 0.5960755275823769, "grad_norm": 0.7248187065124512, "learning_rate": 3.2200000000000003e-05, "loss": 0.0839, "step": 1610 }, { "epoch": 0.5997778600518326, "grad_norm": 0.8098479509353638, "learning_rate": 3.24e-05, "loss": 0.0776, "step": 1620 }, { "epoch": 0.6034801925212884, "grad_norm": 1.0797072649002075, "learning_rate": 3.26e-05, "loss": 0.0984, "step": 1630 }, { "epoch": 0.6071825249907442, "grad_norm": 0.9686888456344604, "learning_rate": 3.2800000000000004e-05, "loss": 0.0929, "step": 1640 }, { "epoch": 0.6108848574601999, "grad_norm": 1.8965777158737183, "learning_rate": 3.3e-05, "loss": 0.0981, "step": 1650 }, { "epoch": 0.6145871899296557, "grad_norm": 1.0512614250183105, "learning_rate": 3.32e-05, "loss": 0.0952, "step": 1660 }, { "epoch": 0.6182895223991114, "grad_norm": 0.9152876734733582, "learning_rate": 3.3400000000000005e-05, "loss": 0.0801, "step": 1670 }, { "epoch": 0.6219918548685672, "grad_norm": 1.700024962425232, "learning_rate": 3.3600000000000004e-05, "loss": 0.0976, "step": 1680 }, { "epoch": 0.625694187338023, "grad_norm": 1.4543591737747192, "learning_rate": 3.38e-05, "loss": 0.1175, "step": 1690 }, { "epoch": 0.6293965198074787, "grad_norm": 0.8551534414291382, "learning_rate": 3.4000000000000007e-05, "loss": 0.1019, "step": 1700 }, { "epoch": 0.6330988522769345, "grad_norm": 1.6461784839630127, "learning_rate": 3.4200000000000005e-05, "loss": 0.0894, "step": 1710 }, { "epoch": 0.6368011847463902, "grad_norm": 1.1104848384857178, "learning_rate": 3.4399999999999996e-05, "loss": 0.0842, "step": 1720 }, { "epoch": 0.640503517215846, "grad_norm": 1.3577065467834473, "learning_rate": 3.46e-05, "loss": 0.0825, "step": 1730 }, { "epoch": 0.6442058496853017, "grad_norm": 1.4395719766616821, "learning_rate": 3.48e-05, "loss": 0.0926, "step": 1740 }, { "epoch": 0.6479081821547575, "grad_norm": 0.9909624457359314, "learning_rate": 3.5e-05, "loss": 0.0997, "step": 1750 }, { "epoch": 0.6516105146242133, "grad_norm": 1.1013895273208618, "learning_rate": 3.52e-05, "loss": 0.103, "step": 1760 }, { "epoch": 0.655312847093669, "grad_norm": 0.7773327827453613, "learning_rate": 3.54e-05, "loss": 0.0808, "step": 1770 }, { "epoch": 0.6590151795631247, "grad_norm": 0.9390606880187988, "learning_rate": 3.56e-05, "loss": 0.0864, "step": 1780 }, { "epoch": 0.6627175120325806, "grad_norm": 0.5559529066085815, "learning_rate": 3.58e-05, "loss": 0.0858, "step": 1790 }, { "epoch": 0.6664198445020363, "grad_norm": 0.9162631034851074, "learning_rate": 3.6e-05, "loss": 0.0854, "step": 1800 }, { "epoch": 0.670122176971492, "grad_norm": 0.8399471640586853, "learning_rate": 3.62e-05, "loss": 0.0881, "step": 1810 }, { "epoch": 0.6738245094409478, "grad_norm": 1.509558081626892, "learning_rate": 3.6400000000000004e-05, "loss": 0.0951, "step": 1820 }, { "epoch": 0.6775268419104036, "grad_norm": 0.8585473895072937, "learning_rate": 3.66e-05, "loss": 0.0733, "step": 1830 }, { "epoch": 0.6812291743798593, "grad_norm": 0.9663869142532349, "learning_rate": 3.68e-05, "loss": 0.0895, "step": 1840 }, { "epoch": 0.684931506849315, "grad_norm": 0.7713764905929565, "learning_rate": 3.7e-05, "loss": 0.0766, "step": 1850 }, { "epoch": 0.6886338393187709, "grad_norm": 0.5924803018569946, "learning_rate": 3.72e-05, "loss": 0.068, "step": 1860 }, { "epoch": 0.6923361717882266, "grad_norm": 0.8274524211883545, "learning_rate": 3.74e-05, "loss": 0.0584, "step": 1870 }, { "epoch": 0.6960385042576823, "grad_norm": 0.7206615805625916, "learning_rate": 3.76e-05, "loss": 0.0796, "step": 1880 }, { "epoch": 0.699740836727138, "grad_norm": 1.694745421409607, "learning_rate": 3.7800000000000004e-05, "loss": 0.0947, "step": 1890 }, { "epoch": 0.7034431691965939, "grad_norm": 0.5733842849731445, "learning_rate": 3.8e-05, "loss": 0.0748, "step": 1900 }, { "epoch": 0.7071455016660496, "grad_norm": 1.0346266031265259, "learning_rate": 3.82e-05, "loss": 0.0779, "step": 1910 }, { "epoch": 0.7108478341355053, "grad_norm": 0.6124595403671265, "learning_rate": 3.8400000000000005e-05, "loss": 0.0865, "step": 1920 }, { "epoch": 0.7145501666049612, "grad_norm": 0.8405007719993591, "learning_rate": 3.86e-05, "loss": 0.0837, "step": 1930 }, { "epoch": 0.7182524990744169, "grad_norm": 1.1218745708465576, "learning_rate": 3.88e-05, "loss": 0.0631, "step": 1940 }, { "epoch": 0.7219548315438726, "grad_norm": 0.5776962637901306, "learning_rate": 3.9000000000000006e-05, "loss": 0.0808, "step": 1950 }, { "epoch": 0.7256571640133284, "grad_norm": 0.6330708861351013, "learning_rate": 3.9200000000000004e-05, "loss": 0.0738, "step": 1960 }, { "epoch": 0.7293594964827842, "grad_norm": 0.7523028254508972, "learning_rate": 3.94e-05, "loss": 0.0717, "step": 1970 }, { "epoch": 0.7330618289522399, "grad_norm": 0.6077460050582886, "learning_rate": 3.960000000000001e-05, "loss": 0.0879, "step": 1980 }, { "epoch": 0.7367641614216957, "grad_norm": 0.5729479193687439, "learning_rate": 3.9800000000000005e-05, "loss": 0.0753, "step": 1990 }, { "epoch": 0.7404664938911514, "grad_norm": 1.6984648704528809, "learning_rate": 4e-05, "loss": 0.0881, "step": 2000 }, { "epoch": 0.7441688263606072, "grad_norm": 1.5977784395217896, "learning_rate": 4.02e-05, "loss": 0.0788, "step": 2010 }, { "epoch": 0.7478711588300629, "grad_norm": 1.102576494216919, "learning_rate": 4.0400000000000006e-05, "loss": 0.087, "step": 2020 }, { "epoch": 0.7515734912995187, "grad_norm": 5.042935371398926, "learning_rate": 4.0600000000000004e-05, "loss": 0.0638, "step": 2030 }, { "epoch": 0.7552758237689745, "grad_norm": 0.7296939492225647, "learning_rate": 4.08e-05, "loss": 0.0584, "step": 2040 }, { "epoch": 0.7589781562384302, "grad_norm": 0.875615119934082, "learning_rate": 4.1e-05, "loss": 0.0755, "step": 2050 }, { "epoch": 0.762680488707886, "grad_norm": 0.7375776171684265, "learning_rate": 4.12e-05, "loss": 0.0995, "step": 2060 }, { "epoch": 0.7663828211773417, "grad_norm": 0.881028950214386, "learning_rate": 4.14e-05, "loss": 0.0712, "step": 2070 }, { "epoch": 0.7700851536467975, "grad_norm": 1.0653451681137085, "learning_rate": 4.16e-05, "loss": 0.0805, "step": 2080 }, { "epoch": 0.7737874861162533, "grad_norm": 0.685608983039856, "learning_rate": 4.18e-05, "loss": 0.0672, "step": 2090 }, { "epoch": 0.777489818585709, "grad_norm": 1.0025854110717773, "learning_rate": 4.2e-05, "loss": 0.0763, "step": 2100 }, { "epoch": 0.7811921510551647, "grad_norm": 0.4592123329639435, "learning_rate": 4.22e-05, "loss": 0.0741, "step": 2110 }, { "epoch": 0.7848944835246205, "grad_norm": 0.6735115647315979, "learning_rate": 4.24e-05, "loss": 0.0769, "step": 2120 }, { "epoch": 0.7885968159940763, "grad_norm": 12.2243070602417, "learning_rate": 4.26e-05, "loss": 0.0789, "step": 2130 }, { "epoch": 0.792299148463532, "grad_norm": 0.5446076989173889, "learning_rate": 4.2800000000000004e-05, "loss": 0.075, "step": 2140 }, { "epoch": 0.7960014809329878, "grad_norm": 1.9520716667175293, "learning_rate": 4.3e-05, "loss": 0.0729, "step": 2150 }, { "epoch": 0.7997038134024436, "grad_norm": 1.2135779857635498, "learning_rate": 4.32e-05, "loss": 0.0902, "step": 2160 }, { "epoch": 0.8034061458718993, "grad_norm": 0.47972410917282104, "learning_rate": 4.3400000000000005e-05, "loss": 0.0645, "step": 2170 }, { "epoch": 0.807108478341355, "grad_norm": 1.0668355226516724, "learning_rate": 4.36e-05, "loss": 0.0779, "step": 2180 }, { "epoch": 0.8108108108108109, "grad_norm": 0.7142060995101929, "learning_rate": 4.38e-05, "loss": 0.0704, "step": 2190 }, { "epoch": 0.8145131432802666, "grad_norm": 1.3185056447982788, "learning_rate": 4.4000000000000006e-05, "loss": 0.075, "step": 2200 }, { "epoch": 0.8182154757497223, "grad_norm": 0.8187047243118286, "learning_rate": 4.4200000000000004e-05, "loss": 0.0746, "step": 2210 }, { "epoch": 0.821917808219178, "grad_norm": 0.7806606888771057, "learning_rate": 4.44e-05, "loss": 0.0806, "step": 2220 }, { "epoch": 0.8256201406886339, "grad_norm": 0.6502716541290283, "learning_rate": 4.46e-05, "loss": 0.0847, "step": 2230 }, { "epoch": 0.8293224731580896, "grad_norm": 0.6993075609207153, "learning_rate": 4.4800000000000005e-05, "loss": 0.0703, "step": 2240 }, { "epoch": 0.8330248056275453, "grad_norm": 0.3910782039165497, "learning_rate": 4.5e-05, "loss": 0.0674, "step": 2250 }, { "epoch": 0.836727138097001, "grad_norm": 2.4554615020751953, "learning_rate": 4.52e-05, "loss": 0.0889, "step": 2260 }, { "epoch": 0.8404294705664569, "grad_norm": 0.678573727607727, "learning_rate": 4.5400000000000006e-05, "loss": 0.0655, "step": 2270 }, { "epoch": 0.8441318030359126, "grad_norm": 0.8971746563911438, "learning_rate": 4.5600000000000004e-05, "loss": 0.0811, "step": 2280 }, { "epoch": 0.8478341355053683, "grad_norm": 1.4997353553771973, "learning_rate": 4.58e-05, "loss": 0.089, "step": 2290 }, { "epoch": 0.8515364679748242, "grad_norm": 1.4162589311599731, "learning_rate": 4.600000000000001e-05, "loss": 0.0791, "step": 2300 }, { "epoch": 0.8552388004442799, "grad_norm": 0.6514264345169067, "learning_rate": 4.6200000000000005e-05, "loss": 0.0759, "step": 2310 }, { "epoch": 0.8589411329137356, "grad_norm": 0.6226025819778442, "learning_rate": 4.64e-05, "loss": 0.0681, "step": 2320 }, { "epoch": 0.8626434653831914, "grad_norm": 2.0695035457611084, "learning_rate": 4.660000000000001e-05, "loss": 0.0577, "step": 2330 }, { "epoch": 0.8663457978526472, "grad_norm": 1.143813967704773, "learning_rate": 4.6800000000000006e-05, "loss": 0.0974, "step": 2340 }, { "epoch": 0.8700481303221029, "grad_norm": 0.6336786150932312, "learning_rate": 4.7e-05, "loss": 0.0708, "step": 2350 }, { "epoch": 0.8737504627915587, "grad_norm": 0.5629249215126038, "learning_rate": 4.72e-05, "loss": 0.0731, "step": 2360 }, { "epoch": 0.8774527952610144, "grad_norm": 1.194870948791504, "learning_rate": 4.74e-05, "loss": 0.0788, "step": 2370 }, { "epoch": 0.8811551277304702, "grad_norm": 1.1911910772323608, "learning_rate": 4.76e-05, "loss": 0.088, "step": 2380 }, { "epoch": 0.884857460199926, "grad_norm": 0.5629979968070984, "learning_rate": 4.78e-05, "loss": 0.0881, "step": 2390 }, { "epoch": 0.8885597926693817, "grad_norm": 0.7456682324409485, "learning_rate": 4.8e-05, "loss": 0.0662, "step": 2400 }, { "epoch": 0.8922621251388375, "grad_norm": 0.8248304128646851, "learning_rate": 4.82e-05, "loss": 0.1037, "step": 2410 }, { "epoch": 0.8959644576082932, "grad_norm": 0.7122240662574768, "learning_rate": 4.8400000000000004e-05, "loss": 0.0896, "step": 2420 }, { "epoch": 0.899666790077749, "grad_norm": 0.7897347807884216, "learning_rate": 4.86e-05, "loss": 0.0824, "step": 2430 }, { "epoch": 0.9033691225472047, "grad_norm": 0.966989278793335, "learning_rate": 4.88e-05, "loss": 0.1078, "step": 2440 }, { "epoch": 0.9070714550166605, "grad_norm": 0.7865309715270996, "learning_rate": 4.9e-05, "loss": 0.0891, "step": 2450 }, { "epoch": 0.9107737874861163, "grad_norm": 0.45124945044517517, "learning_rate": 4.92e-05, "loss": 0.0757, "step": 2460 }, { "epoch": 0.914476119955572, "grad_norm": 0.7773862481117249, "learning_rate": 4.94e-05, "loss": 0.0862, "step": 2470 }, { "epoch": 0.9181784524250277, "grad_norm": 0.9926636815071106, "learning_rate": 4.96e-05, "loss": 0.0797, "step": 2480 }, { "epoch": 0.9218807848944836, "grad_norm": 0.7306967377662659, "learning_rate": 4.9800000000000004e-05, "loss": 0.0855, "step": 2490 }, { "epoch": 0.9255831173639393, "grad_norm": 0.8338248133659363, "learning_rate": 5e-05, "loss": 0.1027, "step": 2500 }, { "epoch": 0.929285449833395, "grad_norm": 1.0135060548782349, "learning_rate": 5.02e-05, "loss": 0.0685, "step": 2510 }, { "epoch": 0.9329877823028508, "grad_norm": 0.6353908777236938, "learning_rate": 5.0400000000000005e-05, "loss": 0.0551, "step": 2520 }, { "epoch": 0.9366901147723066, "grad_norm": 0.4269164800643921, "learning_rate": 5.0600000000000003e-05, "loss": 0.0777, "step": 2530 }, { "epoch": 0.9403924472417623, "grad_norm": 0.630162239074707, "learning_rate": 5.08e-05, "loss": 0.0747, "step": 2540 }, { "epoch": 0.944094779711218, "grad_norm": 0.5727850794792175, "learning_rate": 5.1000000000000006e-05, "loss": 0.0752, "step": 2550 }, { "epoch": 0.9477971121806739, "grad_norm": 0.7166389226913452, "learning_rate": 5.1200000000000004e-05, "loss": 0.0739, "step": 2560 }, { "epoch": 0.9514994446501296, "grad_norm": 0.6366196274757385, "learning_rate": 5.14e-05, "loss": 0.0582, "step": 2570 }, { "epoch": 0.9552017771195853, "grad_norm": 0.5067747831344604, "learning_rate": 5.16e-05, "loss": 0.0735, "step": 2580 }, { "epoch": 0.958904109589041, "grad_norm": 0.6684631109237671, "learning_rate": 5.1800000000000005e-05, "loss": 0.071, "step": 2590 }, { "epoch": 0.9626064420584969, "grad_norm": 2.2235560417175293, "learning_rate": 5.2000000000000004e-05, "loss": 0.076, "step": 2600 }, { "epoch": 0.9663087745279526, "grad_norm": 1.1507251262664795, "learning_rate": 5.22e-05, "loss": 0.068, "step": 2610 }, { "epoch": 0.9700111069974083, "grad_norm": 1.0106631517410278, "learning_rate": 5.2400000000000007e-05, "loss": 0.0645, "step": 2620 }, { "epoch": 0.9737134394668642, "grad_norm": 1.249396800994873, "learning_rate": 5.2600000000000005e-05, "loss": 0.0846, "step": 2630 }, { "epoch": 0.9774157719363199, "grad_norm": 0.8825678825378418, "learning_rate": 5.28e-05, "loss": 0.0684, "step": 2640 }, { "epoch": 0.9811181044057756, "grad_norm": 1.440587043762207, "learning_rate": 5.300000000000001e-05, "loss": 0.0641, "step": 2650 }, { "epoch": 0.9848204368752314, "grad_norm": 1.0670156478881836, "learning_rate": 5.3200000000000006e-05, "loss": 0.0805, "step": 2660 }, { "epoch": 0.9885227693446872, "grad_norm": 0.6590127944946289, "learning_rate": 5.3400000000000004e-05, "loss": 0.0759, "step": 2670 }, { "epoch": 0.9922251018141429, "grad_norm": 1.2742676734924316, "learning_rate": 5.360000000000001e-05, "loss": 0.0591, "step": 2680 }, { "epoch": 0.9959274342835986, "grad_norm": 1.3640694618225098, "learning_rate": 5.380000000000001e-05, "loss": 0.0814, "step": 2690 }, { "epoch": 0.9996297667530544, "grad_norm": 0.4869108498096466, "learning_rate": 5.4000000000000005e-05, "loss": 0.086, "step": 2700 }, { "epoch": 1.00333209922251, "grad_norm": 0.8933056592941284, "learning_rate": 5.420000000000001e-05, "loss": 0.0605, "step": 2710 }, { "epoch": 1.007034431691966, "grad_norm": 0.6190754175186157, "learning_rate": 5.440000000000001e-05, "loss": 0.0767, "step": 2720 }, { "epoch": 1.0107367641614218, "grad_norm": 0.5870069861412048, "learning_rate": 5.4600000000000006e-05, "loss": 0.0608, "step": 2730 }, { "epoch": 1.0144390966308774, "grad_norm": 0.47044599056243896, "learning_rate": 5.4800000000000004e-05, "loss": 0.0798, "step": 2740 }, { "epoch": 1.0181414291003332, "grad_norm": 0.8412430882453918, "learning_rate": 5.500000000000001e-05, "loss": 0.0707, "step": 2750 }, { "epoch": 1.021843761569789, "grad_norm": 0.7604972124099731, "learning_rate": 5.520000000000001e-05, "loss": 0.0736, "step": 2760 }, { "epoch": 1.0255460940392447, "grad_norm": 0.4134213924407959, "learning_rate": 5.5400000000000005e-05, "loss": 0.0725, "step": 2770 }, { "epoch": 1.0292484265087005, "grad_norm": 1.1310441493988037, "learning_rate": 5.560000000000001e-05, "loss": 0.0729, "step": 2780 }, { "epoch": 1.0329507589781561, "grad_norm": 0.998735249042511, "learning_rate": 5.580000000000001e-05, "loss": 0.0779, "step": 2790 }, { "epoch": 1.036653091447612, "grad_norm": 1.286402702331543, "learning_rate": 5.6000000000000006e-05, "loss": 0.0806, "step": 2800 }, { "epoch": 1.0403554239170678, "grad_norm": 0.7598419785499573, "learning_rate": 5.620000000000001e-05, "loss": 0.0557, "step": 2810 }, { "epoch": 1.0440577563865234, "grad_norm": 0.5296675562858582, "learning_rate": 5.6399999999999995e-05, "loss": 0.0748, "step": 2820 }, { "epoch": 1.0477600888559793, "grad_norm": 0.7779732942581177, "learning_rate": 5.66e-05, "loss": 0.0671, "step": 2830 }, { "epoch": 1.051462421325435, "grad_norm": 0.5745750069618225, "learning_rate": 5.68e-05, "loss": 0.0664, "step": 2840 }, { "epoch": 1.0551647537948907, "grad_norm": 0.7464932203292847, "learning_rate": 5.6999999999999996e-05, "loss": 0.0616, "step": 2850 }, { "epoch": 1.0588670862643466, "grad_norm": 2.0333192348480225, "learning_rate": 5.72e-05, "loss": 0.0543, "step": 2860 }, { "epoch": 1.0625694187338024, "grad_norm": 0.5955972671508789, "learning_rate": 5.74e-05, "loss": 0.0771, "step": 2870 }, { "epoch": 1.066271751203258, "grad_norm": 2.857065439224243, "learning_rate": 5.76e-05, "loss": 0.0628, "step": 2880 }, { "epoch": 1.0699740836727138, "grad_norm": 0.6842355728149414, "learning_rate": 5.7799999999999995e-05, "loss": 0.0656, "step": 2890 }, { "epoch": 1.0736764161421695, "grad_norm": 0.6913069486618042, "learning_rate": 5.8e-05, "loss": 0.0761, "step": 2900 }, { "epoch": 1.0773787486116253, "grad_norm": 1.7421122789382935, "learning_rate": 5.82e-05, "loss": 0.094, "step": 2910 }, { "epoch": 1.0810810810810811, "grad_norm": 0.562433660030365, "learning_rate": 5.8399999999999997e-05, "loss": 0.0809, "step": 2920 }, { "epoch": 1.0847834135505368, "grad_norm": 0.47511911392211914, "learning_rate": 5.86e-05, "loss": 0.0637, "step": 2930 }, { "epoch": 1.0884857460199926, "grad_norm": 0.5060316324234009, "learning_rate": 5.88e-05, "loss": 0.0617, "step": 2940 }, { "epoch": 1.0921880784894484, "grad_norm": 1.0001723766326904, "learning_rate": 5.9e-05, "loss": 0.0769, "step": 2950 }, { "epoch": 1.095890410958904, "grad_norm": 0.7079387307167053, "learning_rate": 5.92e-05, "loss": 0.0699, "step": 2960 }, { "epoch": 1.0995927434283599, "grad_norm": 0.8719462752342224, "learning_rate": 5.94e-05, "loss": 0.0673, "step": 2970 }, { "epoch": 1.1032950758978157, "grad_norm": 0.46147292852401733, "learning_rate": 5.96e-05, "loss": 0.0559, "step": 2980 }, { "epoch": 1.1069974083672713, "grad_norm": 0.7180224061012268, "learning_rate": 5.9800000000000003e-05, "loss": 0.0844, "step": 2990 }, { "epoch": 1.1106997408367272, "grad_norm": 0.6582174301147461, "learning_rate": 6e-05, "loss": 0.0728, "step": 3000 }, { "epoch": 1.1144020733061828, "grad_norm": 0.5128276348114014, "learning_rate": 6.02e-05, "loss": 0.0551, "step": 3010 }, { "epoch": 1.1181044057756386, "grad_norm": 0.6706491112709045, "learning_rate": 6.04e-05, "loss": 0.0736, "step": 3020 }, { "epoch": 1.1218067382450945, "grad_norm": 0.4908059239387512, "learning_rate": 6.06e-05, "loss": 0.0685, "step": 3030 }, { "epoch": 1.12550907071455, "grad_norm": 0.5453070402145386, "learning_rate": 6.08e-05, "loss": 0.0528, "step": 3040 }, { "epoch": 1.129211403184006, "grad_norm": 1.059179663658142, "learning_rate": 6.1e-05, "loss": 0.072, "step": 3050 }, { "epoch": 1.1329137356534618, "grad_norm": 1.0880593061447144, "learning_rate": 6.12e-05, "loss": 0.0762, "step": 3060 }, { "epoch": 1.1366160681229174, "grad_norm": 0.7174161076545715, "learning_rate": 6.14e-05, "loss": 0.0657, "step": 3070 }, { "epoch": 1.1403184005923732, "grad_norm": 1.287573218345642, "learning_rate": 6.16e-05, "loss": 0.0901, "step": 3080 }, { "epoch": 1.1440207330618288, "grad_norm": 0.7359610795974731, "learning_rate": 6.18e-05, "loss": 0.0738, "step": 3090 }, { "epoch": 1.1477230655312847, "grad_norm": 0.6233002543449402, "learning_rate": 6.2e-05, "loss": 0.082, "step": 3100 }, { "epoch": 1.1514253980007405, "grad_norm": 0.4811764359474182, "learning_rate": 6.220000000000001e-05, "loss": 0.0617, "step": 3110 }, { "epoch": 1.1551277304701961, "grad_norm": 0.595496654510498, "learning_rate": 6.24e-05, "loss": 0.0816, "step": 3120 }, { "epoch": 1.158830062939652, "grad_norm": 0.7390142679214478, "learning_rate": 6.26e-05, "loss": 0.0438, "step": 3130 }, { "epoch": 1.1625323954091078, "grad_norm": 0.6736302375793457, "learning_rate": 6.280000000000001e-05, "loss": 0.0641, "step": 3140 }, { "epoch": 1.1662347278785634, "grad_norm": 0.41487616300582886, "learning_rate": 6.3e-05, "loss": 0.066, "step": 3150 }, { "epoch": 1.1699370603480193, "grad_norm": 0.7759371399879456, "learning_rate": 6.32e-05, "loss": 0.0685, "step": 3160 }, { "epoch": 1.173639392817475, "grad_norm": 1.3681451082229614, "learning_rate": 6.340000000000001e-05, "loss": 0.1228, "step": 3170 }, { "epoch": 1.1773417252869307, "grad_norm": 1.0424679517745972, "learning_rate": 6.36e-05, "loss": 0.0703, "step": 3180 }, { "epoch": 1.1810440577563865, "grad_norm": 0.8835127949714661, "learning_rate": 6.38e-05, "loss": 0.0752, "step": 3190 }, { "epoch": 1.1847463902258424, "grad_norm": 0.8338480591773987, "learning_rate": 6.400000000000001e-05, "loss": 0.0827, "step": 3200 }, { "epoch": 1.188448722695298, "grad_norm": 0.6632696986198425, "learning_rate": 6.42e-05, "loss": 0.0669, "step": 3210 }, { "epoch": 1.1921510551647538, "grad_norm": 0.5312180519104004, "learning_rate": 6.440000000000001e-05, "loss": 0.0813, "step": 3220 }, { "epoch": 1.1958533876342097, "grad_norm": 0.6518285870552063, "learning_rate": 6.460000000000001e-05, "loss": 0.0859, "step": 3230 }, { "epoch": 1.1995557201036653, "grad_norm": 0.5502805113792419, "learning_rate": 6.48e-05, "loss": 0.0685, "step": 3240 }, { "epoch": 1.2032580525731211, "grad_norm": 1.665433406829834, "learning_rate": 6.500000000000001e-05, "loss": 0.0845, "step": 3250 }, { "epoch": 1.2069603850425767, "grad_norm": 0.7776069641113281, "learning_rate": 6.52e-05, "loss": 0.066, "step": 3260 }, { "epoch": 1.2106627175120326, "grad_norm": 1.250301480293274, "learning_rate": 6.54e-05, "loss": 0.0748, "step": 3270 }, { "epoch": 1.2143650499814884, "grad_norm": 0.7368317246437073, "learning_rate": 6.560000000000001e-05, "loss": 0.0662, "step": 3280 }, { "epoch": 1.218067382450944, "grad_norm": 0.9778260588645935, "learning_rate": 6.58e-05, "loss": 0.0926, "step": 3290 }, { "epoch": 1.2217697149203999, "grad_norm": 0.7283994555473328, "learning_rate": 6.6e-05, "loss": 0.0621, "step": 3300 }, { "epoch": 1.2254720473898555, "grad_norm": 0.7713655829429626, "learning_rate": 6.620000000000001e-05, "loss": 0.0597, "step": 3310 }, { "epoch": 1.2291743798593113, "grad_norm": 0.7390691041946411, "learning_rate": 6.64e-05, "loss": 0.0678, "step": 3320 }, { "epoch": 1.2328767123287672, "grad_norm": 0.3494139015674591, "learning_rate": 6.66e-05, "loss": 0.051, "step": 3330 }, { "epoch": 1.2365790447982228, "grad_norm": 0.43940427899360657, "learning_rate": 6.680000000000001e-05, "loss": 0.0735, "step": 3340 }, { "epoch": 1.2402813772676786, "grad_norm": 0.42999139428138733, "learning_rate": 6.7e-05, "loss": 0.0591, "step": 3350 }, { "epoch": 1.2439837097371345, "grad_norm": 0.7179346680641174, "learning_rate": 6.720000000000001e-05, "loss": 0.0912, "step": 3360 }, { "epoch": 1.24768604220659, "grad_norm": 0.7530249357223511, "learning_rate": 6.740000000000001e-05, "loss": 0.0735, "step": 3370 }, { "epoch": 1.251388374676046, "grad_norm": 0.8911184072494507, "learning_rate": 6.76e-05, "loss": 0.0816, "step": 3380 }, { "epoch": 1.2550907071455017, "grad_norm": 1.6254115104675293, "learning_rate": 6.780000000000001e-05, "loss": 0.0711, "step": 3390 }, { "epoch": 1.2587930396149574, "grad_norm": 0.7928197979927063, "learning_rate": 6.800000000000001e-05, "loss": 0.0542, "step": 3400 }, { "epoch": 1.2624953720844132, "grad_norm": 0.6970255970954895, "learning_rate": 6.82e-05, "loss": 0.0662, "step": 3410 }, { "epoch": 1.266197704553869, "grad_norm": 0.4685273766517639, "learning_rate": 6.840000000000001e-05, "loss": 0.0685, "step": 3420 }, { "epoch": 1.2699000370233247, "grad_norm": 0.3618628680706024, "learning_rate": 6.860000000000001e-05, "loss": 0.06, "step": 3430 }, { "epoch": 1.2736023694927805, "grad_norm": 1.8244625329971313, "learning_rate": 6.879999999999999e-05, "loss": 0.0692, "step": 3440 }, { "epoch": 1.2773047019622363, "grad_norm": 0.40677013993263245, "learning_rate": 6.9e-05, "loss": 0.0619, "step": 3450 }, { "epoch": 1.281007034431692, "grad_norm": 0.3562806248664856, "learning_rate": 6.92e-05, "loss": 0.063, "step": 3460 }, { "epoch": 1.2847093669011478, "grad_norm": 0.5669035315513611, "learning_rate": 6.939999999999999e-05, "loss": 0.0551, "step": 3470 }, { "epoch": 1.2884116993706034, "grad_norm": 0.5732166171073914, "learning_rate": 6.96e-05, "loss": 0.0538, "step": 3480 }, { "epoch": 1.2921140318400592, "grad_norm": 0.7917014360427856, "learning_rate": 6.98e-05, "loss": 0.0729, "step": 3490 }, { "epoch": 1.2958163643095149, "grad_norm": 0.7337891459465027, "learning_rate": 7e-05, "loss": 0.0504, "step": 3500 }, { "epoch": 1.2995186967789707, "grad_norm": 0.7491916418075562, "learning_rate": 7.02e-05, "loss": 0.0707, "step": 3510 }, { "epoch": 1.3032210292484265, "grad_norm": 2.3522109985351562, "learning_rate": 7.04e-05, "loss": 0.0777, "step": 3520 }, { "epoch": 1.3069233617178821, "grad_norm": 0.9382653832435608, "learning_rate": 7.06e-05, "loss": 0.07, "step": 3530 }, { "epoch": 1.310625694187338, "grad_norm": 0.9387719035148621, "learning_rate": 7.08e-05, "loss": 0.0716, "step": 3540 }, { "epoch": 1.3143280266567938, "grad_norm": 0.6674622297286987, "learning_rate": 7.1e-05, "loss": 0.0577, "step": 3550 }, { "epoch": 1.3180303591262494, "grad_norm": 1.6137107610702515, "learning_rate": 7.12e-05, "loss": 0.077, "step": 3560 }, { "epoch": 1.3217326915957053, "grad_norm": 0.6806771755218506, "learning_rate": 7.14e-05, "loss": 0.0791, "step": 3570 }, { "epoch": 1.3254350240651611, "grad_norm": 1.0449469089508057, "learning_rate": 7.16e-05, "loss": 0.0798, "step": 3580 }, { "epoch": 1.3291373565346167, "grad_norm": 0.5177164077758789, "learning_rate": 7.18e-05, "loss": 0.0721, "step": 3590 }, { "epoch": 1.3328396890040726, "grad_norm": 0.617209255695343, "learning_rate": 7.2e-05, "loss": 0.0631, "step": 3600 }, { "epoch": 1.3365420214735284, "grad_norm": 0.7034630179405212, "learning_rate": 7.22e-05, "loss": 0.066, "step": 3610 }, { "epoch": 1.340244353942984, "grad_norm": 0.7461096048355103, "learning_rate": 7.24e-05, "loss": 0.0902, "step": 3620 }, { "epoch": 1.3439466864124399, "grad_norm": 0.42178311944007874, "learning_rate": 7.26e-05, "loss": 0.0633, "step": 3630 }, { "epoch": 1.3476490188818957, "grad_norm": 0.6374081969261169, "learning_rate": 7.280000000000001e-05, "loss": 0.0727, "step": 3640 }, { "epoch": 1.3513513513513513, "grad_norm": 0.452883243560791, "learning_rate": 7.3e-05, "loss": 0.0745, "step": 3650 }, { "epoch": 1.3550536838208072, "grad_norm": 0.6694507598876953, "learning_rate": 7.32e-05, "loss": 0.0592, "step": 3660 }, { "epoch": 1.358756016290263, "grad_norm": 0.48653385043144226, "learning_rate": 7.340000000000001e-05, "loss": 0.0706, "step": 3670 }, { "epoch": 1.3624583487597186, "grad_norm": 0.42234811186790466, "learning_rate": 7.36e-05, "loss": 0.0529, "step": 3680 }, { "epoch": 1.3661606812291744, "grad_norm": 0.6036973595619202, "learning_rate": 7.38e-05, "loss": 0.0527, "step": 3690 }, { "epoch": 1.36986301369863, "grad_norm": 0.7706220746040344, "learning_rate": 7.4e-05, "loss": 0.0567, "step": 3700 }, { "epoch": 1.373565346168086, "grad_norm": 0.6481570601463318, "learning_rate": 7.42e-05, "loss": 0.0642, "step": 3710 }, { "epoch": 1.3772676786375415, "grad_norm": 0.5031047463417053, "learning_rate": 7.44e-05, "loss": 0.066, "step": 3720 }, { "epoch": 1.3809700111069974, "grad_norm": 0.7614627480506897, "learning_rate": 7.46e-05, "loss": 0.0637, "step": 3730 }, { "epoch": 1.3846723435764532, "grad_norm": 0.5621228814125061, "learning_rate": 7.48e-05, "loss": 0.0791, "step": 3740 }, { "epoch": 1.3883746760459088, "grad_norm": 0.5119731426239014, "learning_rate": 7.500000000000001e-05, "loss": 0.0727, "step": 3750 }, { "epoch": 1.3920770085153646, "grad_norm": 0.5974960327148438, "learning_rate": 7.52e-05, "loss": 0.0611, "step": 3760 }, { "epoch": 1.3957793409848205, "grad_norm": 0.5426230430603027, "learning_rate": 7.54e-05, "loss": 0.064, "step": 3770 }, { "epoch": 1.399481673454276, "grad_norm": 0.3557775914669037, "learning_rate": 7.560000000000001e-05, "loss": 0.0675, "step": 3780 }, { "epoch": 1.403184005923732, "grad_norm": 0.5942485332489014, "learning_rate": 7.58e-05, "loss": 0.0765, "step": 3790 }, { "epoch": 1.4068863383931878, "grad_norm": 0.483223557472229, "learning_rate": 7.6e-05, "loss": 0.0618, "step": 3800 }, { "epoch": 1.4105886708626434, "grad_norm": 0.4963008463382721, "learning_rate": 7.620000000000001e-05, "loss": 0.068, "step": 3810 }, { "epoch": 1.4142910033320992, "grad_norm": 1.3447846174240112, "learning_rate": 7.64e-05, "loss": 0.0709, "step": 3820 }, { "epoch": 1.417993335801555, "grad_norm": 0.9625268578529358, "learning_rate": 7.66e-05, "loss": 0.0476, "step": 3830 }, { "epoch": 1.4216956682710107, "grad_norm": 0.877081573009491, "learning_rate": 7.680000000000001e-05, "loss": 0.0763, "step": 3840 }, { "epoch": 1.4253980007404665, "grad_norm": 1.2372366189956665, "learning_rate": 7.7e-05, "loss": 0.0574, "step": 3850 }, { "epoch": 1.4291003332099224, "grad_norm": 0.9819295406341553, "learning_rate": 7.72e-05, "loss": 0.0726, "step": 3860 }, { "epoch": 1.432802665679378, "grad_norm": 0.4720100462436676, "learning_rate": 7.740000000000001e-05, "loss": 0.0668, "step": 3870 }, { "epoch": 1.4365049981488338, "grad_norm": 0.46679970622062683, "learning_rate": 7.76e-05, "loss": 0.0578, "step": 3880 }, { "epoch": 1.4402073306182896, "grad_norm": 0.5964988470077515, "learning_rate": 7.780000000000001e-05, "loss": 0.0672, "step": 3890 }, { "epoch": 1.4439096630877453, "grad_norm": 0.5359032154083252, "learning_rate": 7.800000000000001e-05, "loss": 0.062, "step": 3900 }, { "epoch": 1.447611995557201, "grad_norm": 0.8125001192092896, "learning_rate": 7.82e-05, "loss": 0.0566, "step": 3910 }, { "epoch": 1.4513143280266567, "grad_norm": 0.536466658115387, "learning_rate": 7.840000000000001e-05, "loss": 0.0558, "step": 3920 }, { "epoch": 1.4550166604961126, "grad_norm": 0.9360922574996948, "learning_rate": 7.860000000000001e-05, "loss": 0.0606, "step": 3930 }, { "epoch": 1.4587189929655682, "grad_norm": 0.40558168292045593, "learning_rate": 7.88e-05, "loss": 0.0693, "step": 3940 }, { "epoch": 1.462421325435024, "grad_norm": 0.6655269861221313, "learning_rate": 7.900000000000001e-05, "loss": 0.0673, "step": 3950 }, { "epoch": 1.4661236579044798, "grad_norm": 0.5840831398963928, "learning_rate": 7.920000000000001e-05, "loss": 0.0751, "step": 3960 }, { "epoch": 1.4698259903739355, "grad_norm": 0.4681914150714874, "learning_rate": 7.94e-05, "loss": 0.0526, "step": 3970 }, { "epoch": 1.4735283228433913, "grad_norm": 0.6261281371116638, "learning_rate": 7.960000000000001e-05, "loss": 0.0494, "step": 3980 }, { "epoch": 1.4772306553128471, "grad_norm": 0.6617164611816406, "learning_rate": 7.98e-05, "loss": 0.0501, "step": 3990 }, { "epoch": 1.4809329877823028, "grad_norm": 0.27041196823120117, "learning_rate": 8e-05, "loss": 0.0471, "step": 4000 }, { "epoch": 1.4846353202517586, "grad_norm": 3.60581374168396, "learning_rate": 8.020000000000001e-05, "loss": 0.0555, "step": 4010 }, { "epoch": 1.4883376527212144, "grad_norm": 0.558165967464447, "learning_rate": 8.04e-05, "loss": 0.0626, "step": 4020 }, { "epoch": 1.49203998519067, "grad_norm": 0.5027239918708801, "learning_rate": 8.060000000000001e-05, "loss": 0.0484, "step": 4030 }, { "epoch": 1.4957423176601259, "grad_norm": 0.5151306390762329, "learning_rate": 8.080000000000001e-05, "loss": 0.0726, "step": 4040 }, { "epoch": 1.4994446501295817, "grad_norm": 1.38392174243927, "learning_rate": 8.1e-05, "loss": 0.0526, "step": 4050 }, { "epoch": 1.5031469825990373, "grad_norm": 0.762907087802887, "learning_rate": 8.120000000000001e-05, "loss": 0.0668, "step": 4060 }, { "epoch": 1.5068493150684932, "grad_norm": 0.4064861536026001, "learning_rate": 8.14e-05, "loss": 0.0622, "step": 4070 }, { "epoch": 1.510551647537949, "grad_norm": 0.49630188941955566, "learning_rate": 8.16e-05, "loss": 0.0691, "step": 4080 }, { "epoch": 1.5142539800074046, "grad_norm": 0.5293760299682617, "learning_rate": 8.18e-05, "loss": 0.0766, "step": 4090 }, { "epoch": 1.5179563124768605, "grad_norm": 0.8031710982322693, "learning_rate": 8.2e-05, "loss": 0.0729, "step": 4100 }, { "epoch": 1.5216586449463163, "grad_norm": 2.3503975868225098, "learning_rate": 8.22e-05, "loss": 0.0909, "step": 4110 }, { "epoch": 1.525360977415772, "grad_norm": 0.8841959238052368, "learning_rate": 8.24e-05, "loss": 0.0622, "step": 4120 }, { "epoch": 1.5290633098852275, "grad_norm": 0.45459797978401184, "learning_rate": 8.26e-05, "loss": 0.058, "step": 4130 }, { "epoch": 1.5327656423546836, "grad_norm": 0.6228151321411133, "learning_rate": 8.28e-05, "loss": 0.0694, "step": 4140 }, { "epoch": 1.5364679748241392, "grad_norm": 0.46429064869880676, "learning_rate": 8.3e-05, "loss": 0.0613, "step": 4150 }, { "epoch": 1.5401703072935948, "grad_norm": 1.2689656019210815, "learning_rate": 8.32e-05, "loss": 0.0586, "step": 4160 }, { "epoch": 1.5438726397630507, "grad_norm": 0.5041284561157227, "learning_rate": 8.34e-05, "loss": 0.0627, "step": 4170 }, { "epoch": 1.5475749722325065, "grad_norm": 0.5469565391540527, "learning_rate": 8.36e-05, "loss": 0.0629, "step": 4180 }, { "epoch": 1.5512773047019621, "grad_norm": 0.4145233929157257, "learning_rate": 8.38e-05, "loss": 0.0547, "step": 4190 }, { "epoch": 1.554979637171418, "grad_norm": 2.6377787590026855, "learning_rate": 8.4e-05, "loss": 0.0577, "step": 4200 }, { "epoch": 1.5586819696408738, "grad_norm": 1.0077848434448242, "learning_rate": 8.42e-05, "loss": 0.0584, "step": 4210 }, { "epoch": 1.5623843021103294, "grad_norm": 0.7473629713058472, "learning_rate": 8.44e-05, "loss": 0.069, "step": 4220 }, { "epoch": 1.5660866345797853, "grad_norm": 1.4622706174850464, "learning_rate": 8.46e-05, "loss": 0.0632, "step": 4230 }, { "epoch": 1.569788967049241, "grad_norm": 0.45410504937171936, "learning_rate": 8.48e-05, "loss": 0.0719, "step": 4240 }, { "epoch": 1.5734912995186967, "grad_norm": 0.639041543006897, "learning_rate": 8.5e-05, "loss": 0.0715, "step": 4250 }, { "epoch": 1.5771936319881525, "grad_norm": 0.43846261501312256, "learning_rate": 8.52e-05, "loss": 0.0539, "step": 4260 }, { "epoch": 1.5808959644576084, "grad_norm": 0.6029319763183594, "learning_rate": 8.54e-05, "loss": 0.0575, "step": 4270 }, { "epoch": 1.584598296927064, "grad_norm": 1.1370168924331665, "learning_rate": 8.560000000000001e-05, "loss": 0.0624, "step": 4280 }, { "epoch": 1.5883006293965198, "grad_norm": 1.0093122720718384, "learning_rate": 8.58e-05, "loss": 0.0726, "step": 4290 }, { "epoch": 1.5920029618659757, "grad_norm": 0.38476455211639404, "learning_rate": 8.6e-05, "loss": 0.0484, "step": 4300 }, { "epoch": 1.5957052943354313, "grad_norm": 1.2300890684127808, "learning_rate": 8.620000000000001e-05, "loss": 0.059, "step": 4310 }, { "epoch": 1.5994076268048871, "grad_norm": 0.9110909700393677, "learning_rate": 8.64e-05, "loss": 0.0441, "step": 4320 }, { "epoch": 1.603109959274343, "grad_norm": 0.8963673114776611, "learning_rate": 8.66e-05, "loss": 0.0536, "step": 4330 }, { "epoch": 1.6068122917437986, "grad_norm": 0.4457288980484009, "learning_rate": 8.680000000000001e-05, "loss": 0.0531, "step": 4340 }, { "epoch": 1.6105146242132542, "grad_norm": 0.36408546566963196, "learning_rate": 8.7e-05, "loss": 0.0727, "step": 4350 }, { "epoch": 1.6142169566827103, "grad_norm": 0.8042135238647461, "learning_rate": 8.72e-05, "loss": 0.0582, "step": 4360 }, { "epoch": 1.6179192891521659, "grad_norm": 0.4815595746040344, "learning_rate": 8.740000000000001e-05, "loss": 0.0372, "step": 4370 }, { "epoch": 1.6216216216216215, "grad_norm": 0.7964209914207458, "learning_rate": 8.76e-05, "loss": 0.0539, "step": 4380 }, { "epoch": 1.6253239540910773, "grad_norm": 0.49673840403556824, "learning_rate": 8.78e-05, "loss": 0.0769, "step": 4390 }, { "epoch": 1.6290262865605332, "grad_norm": 1.0407365560531616, "learning_rate": 8.800000000000001e-05, "loss": 0.0687, "step": 4400 }, { "epoch": 1.6327286190299888, "grad_norm": 0.983752429485321, "learning_rate": 8.82e-05, "loss": 0.0706, "step": 4410 }, { "epoch": 1.6364309514994446, "grad_norm": 0.5486028790473938, "learning_rate": 8.840000000000001e-05, "loss": 0.0659, "step": 4420 }, { "epoch": 1.6401332839689005, "grad_norm": 0.4712786376476288, "learning_rate": 8.86e-05, "loss": 0.0656, "step": 4430 }, { "epoch": 1.643835616438356, "grad_norm": 0.3405558466911316, "learning_rate": 8.88e-05, "loss": 0.0519, "step": 4440 }, { "epoch": 1.647537948907812, "grad_norm": 0.4447254240512848, "learning_rate": 8.900000000000001e-05, "loss": 0.0533, "step": 4450 }, { "epoch": 1.6512402813772677, "grad_norm": 0.7940601110458374, "learning_rate": 8.92e-05, "loss": 0.0542, "step": 4460 }, { "epoch": 1.6549426138467234, "grad_norm": 0.785750150680542, "learning_rate": 8.94e-05, "loss": 0.0715, "step": 4470 }, { "epoch": 1.6586449463161792, "grad_norm": 0.6868900656700134, "learning_rate": 8.960000000000001e-05, "loss": 0.0533, "step": 4480 }, { "epoch": 1.662347278785635, "grad_norm": 0.844917356967926, "learning_rate": 8.98e-05, "loss": 0.0596, "step": 4490 }, { "epoch": 1.6660496112550907, "grad_norm": 0.719018816947937, "learning_rate": 9e-05, "loss": 0.0578, "step": 4500 }, { "epoch": 1.6697519437245465, "grad_norm": 1.0960392951965332, "learning_rate": 9.020000000000001e-05, "loss": 0.0561, "step": 4510 }, { "epoch": 1.6734542761940023, "grad_norm": 0.7648434638977051, "learning_rate": 9.04e-05, "loss": 0.0608, "step": 4520 }, { "epoch": 1.677156608663458, "grad_norm": 0.5149823427200317, "learning_rate": 9.06e-05, "loss": 0.0506, "step": 4530 }, { "epoch": 1.6808589411329138, "grad_norm": 0.2779891788959503, "learning_rate": 9.080000000000001e-05, "loss": 0.05, "step": 4540 }, { "epoch": 1.6845612736023696, "grad_norm": 0.5541907548904419, "learning_rate": 9.1e-05, "loss": 0.0592, "step": 4550 }, { "epoch": 1.6882636060718252, "grad_norm": 0.4604206383228302, "learning_rate": 9.120000000000001e-05, "loss": 0.0522, "step": 4560 }, { "epoch": 1.6919659385412809, "grad_norm": 0.33472198247909546, "learning_rate": 9.140000000000001e-05, "loss": 0.0562, "step": 4570 }, { "epoch": 1.695668271010737, "grad_norm": 0.7447779178619385, "learning_rate": 9.16e-05, "loss": 0.0595, "step": 4580 }, { "epoch": 1.6993706034801925, "grad_norm": 0.3424501419067383, "learning_rate": 9.180000000000001e-05, "loss": 0.0582, "step": 4590 }, { "epoch": 1.7030729359496481, "grad_norm": 0.468113511800766, "learning_rate": 9.200000000000001e-05, "loss": 0.0588, "step": 4600 }, { "epoch": 1.706775268419104, "grad_norm": 0.5728756189346313, "learning_rate": 9.22e-05, "loss": 0.0654, "step": 4610 }, { "epoch": 1.7104776008885598, "grad_norm": 0.6442884802818298, "learning_rate": 9.240000000000001e-05, "loss": 0.0669, "step": 4620 }, { "epoch": 1.7141799333580154, "grad_norm": 1.0347211360931396, "learning_rate": 9.260000000000001e-05, "loss": 0.0529, "step": 4630 }, { "epoch": 1.7178822658274713, "grad_norm": 0.8499534726142883, "learning_rate": 9.28e-05, "loss": 0.057, "step": 4640 }, { "epoch": 1.7215845982969271, "grad_norm": 0.5046526193618774, "learning_rate": 9.300000000000001e-05, "loss": 0.0723, "step": 4650 }, { "epoch": 1.7252869307663827, "grad_norm": 0.5424455404281616, "learning_rate": 9.320000000000002e-05, "loss": 0.0687, "step": 4660 }, { "epoch": 1.7289892632358386, "grad_norm": 0.6024232506752014, "learning_rate": 9.340000000000001e-05, "loss": 0.0549, "step": 4670 }, { "epoch": 1.7326915957052944, "grad_norm": 0.42882370948791504, "learning_rate": 9.360000000000001e-05, "loss": 0.0418, "step": 4680 }, { "epoch": 1.73639392817475, "grad_norm": 0.5678403973579407, "learning_rate": 9.38e-05, "loss": 0.0646, "step": 4690 }, { "epoch": 1.7400962606442059, "grad_norm": 0.7145687937736511, "learning_rate": 9.4e-05, "loss": 0.0521, "step": 4700 }, { "epoch": 1.7437985931136617, "grad_norm": 0.5542314648628235, "learning_rate": 9.42e-05, "loss": 0.0624, "step": 4710 }, { "epoch": 1.7475009255831173, "grad_norm": 0.5584444999694824, "learning_rate": 9.44e-05, "loss": 0.0482, "step": 4720 }, { "epoch": 1.7512032580525732, "grad_norm": 0.5657766461372375, "learning_rate": 9.46e-05, "loss": 0.053, "step": 4730 }, { "epoch": 1.754905590522029, "grad_norm": 1.101801872253418, "learning_rate": 9.48e-05, "loss": 0.0552, "step": 4740 }, { "epoch": 1.7586079229914846, "grad_norm": 0.4111199676990509, "learning_rate": 9.5e-05, "loss": 0.06, "step": 4750 }, { "epoch": 1.7623102554609404, "grad_norm": 0.5006869435310364, "learning_rate": 9.52e-05, "loss": 0.0731, "step": 4760 }, { "epoch": 1.7660125879303963, "grad_norm": 0.48495784401893616, "learning_rate": 9.54e-05, "loss": 0.0485, "step": 4770 }, { "epoch": 1.769714920399852, "grad_norm": 0.7870677709579468, "learning_rate": 9.56e-05, "loss": 0.0737, "step": 4780 }, { "epoch": 1.7734172528693075, "grad_norm": 0.46306851506233215, "learning_rate": 9.58e-05, "loss": 0.052, "step": 4790 }, { "epoch": 1.7771195853387636, "grad_norm": 0.3599608242511749, "learning_rate": 9.6e-05, "loss": 0.0538, "step": 4800 }, { "epoch": 1.7808219178082192, "grad_norm": 0.7135095596313477, "learning_rate": 9.620000000000001e-05, "loss": 0.0724, "step": 4810 }, { "epoch": 1.7845242502776748, "grad_norm": 0.44129109382629395, "learning_rate": 9.64e-05, "loss": 0.0675, "step": 4820 }, { "epoch": 1.7882265827471306, "grad_norm": 0.504058301448822, "learning_rate": 9.66e-05, "loss": 0.0591, "step": 4830 }, { "epoch": 1.7919289152165865, "grad_norm": 0.39418289065361023, "learning_rate": 9.680000000000001e-05, "loss": 0.0616, "step": 4840 }, { "epoch": 1.795631247686042, "grad_norm": 0.7345573306083679, "learning_rate": 9.7e-05, "loss": 0.0517, "step": 4850 }, { "epoch": 1.799333580155498, "grad_norm": 0.7205158472061157, "learning_rate": 9.72e-05, "loss": 0.0631, "step": 4860 }, { "epoch": 1.8030359126249538, "grad_norm": 0.4907108545303345, "learning_rate": 9.74e-05, "loss": 0.0462, "step": 4870 }, { "epoch": 1.8067382450944094, "grad_norm": 0.6083633303642273, "learning_rate": 9.76e-05, "loss": 0.0608, "step": 4880 }, { "epoch": 1.8104405775638652, "grad_norm": 0.9930976629257202, "learning_rate": 9.78e-05, "loss": 0.0661, "step": 4890 }, { "epoch": 1.814142910033321, "grad_norm": 0.5373798608779907, "learning_rate": 9.8e-05, "loss": 0.0642, "step": 4900 }, { "epoch": 1.8178452425027767, "grad_norm": 3.540651559829712, "learning_rate": 9.82e-05, "loss": 0.0885, "step": 4910 }, { "epoch": 1.8215475749722325, "grad_norm": 0.5413001775741577, "learning_rate": 9.84e-05, "loss": 0.0505, "step": 4920 }, { "epoch": 1.8252499074416884, "grad_norm": 0.6821585297584534, "learning_rate": 9.86e-05, "loss": 0.0685, "step": 4930 }, { "epoch": 1.828952239911144, "grad_norm": 0.6551912426948547, "learning_rate": 9.88e-05, "loss": 0.0556, "step": 4940 }, { "epoch": 1.8326545723805998, "grad_norm": 1.5381443500518799, "learning_rate": 9.900000000000001e-05, "loss": 0.0636, "step": 4950 }, { "epoch": 1.8363569048500556, "grad_norm": 0.7474282383918762, "learning_rate": 9.92e-05, "loss": 0.0672, "step": 4960 }, { "epoch": 1.8400592373195113, "grad_norm": 0.4646322429180145, "learning_rate": 9.94e-05, "loss": 0.0474, "step": 4970 }, { "epoch": 1.843761569788967, "grad_norm": 0.43209877610206604, "learning_rate": 9.960000000000001e-05, "loss": 0.0409, "step": 4980 }, { "epoch": 1.847463902258423, "grad_norm": 0.8244059681892395, "learning_rate": 9.98e-05, "loss": 0.0564, "step": 4990 }, { "epoch": 1.8511662347278786, "grad_norm": 0.3655155897140503, "learning_rate": 0.0001, "loss": 0.058, "step": 5000 }, { "epoch": 1.8548685671973342, "grad_norm": 0.7321513295173645, "learning_rate": 9.999999726603759e-05, "loss": 0.0591, "step": 5010 }, { "epoch": 1.8585708996667902, "grad_norm": 0.9724626541137695, "learning_rate": 9.999998906415065e-05, "loss": 0.066, "step": 5020 }, { "epoch": 1.8622732321362458, "grad_norm": 0.3442176580429077, "learning_rate": 9.999997539434007e-05, "loss": 0.0459, "step": 5030 }, { "epoch": 1.8659755646057015, "grad_norm": 0.46101999282836914, "learning_rate": 9.999995625660738e-05, "loss": 0.0535, "step": 5040 }, { "epoch": 1.8696778970751573, "grad_norm": 0.6077836751937866, "learning_rate": 9.999993165095463e-05, "loss": 0.0553, "step": 5050 }, { "epoch": 1.8733802295446131, "grad_norm": 0.4042462110519409, "learning_rate": 9.999990157738453e-05, "loss": 0.066, "step": 5060 }, { "epoch": 1.8770825620140688, "grad_norm": 0.4415290653705597, "learning_rate": 9.999986603590037e-05, "loss": 0.0733, "step": 5070 }, { "epoch": 1.8807848944835246, "grad_norm": 0.41534191370010376, "learning_rate": 9.999982502650602e-05, "loss": 0.0571, "step": 5080 }, { "epoch": 1.8844872269529804, "grad_norm": 0.49111345410346985, "learning_rate": 9.9999778549206e-05, "loss": 0.0627, "step": 5090 }, { "epoch": 1.888189559422436, "grad_norm": 0.32555460929870605, "learning_rate": 9.999972660400536e-05, "loss": 0.0564, "step": 5100 }, { "epoch": 1.8918918918918919, "grad_norm": 0.4351123869419098, "learning_rate": 9.99996691909098e-05, "loss": 0.0604, "step": 5110 }, { "epoch": 1.8955942243613477, "grad_norm": 0.5916552543640137, "learning_rate": 9.999960630992558e-05, "loss": 0.0658, "step": 5120 }, { "epoch": 1.8992965568308033, "grad_norm": 0.7157463431358337, "learning_rate": 9.999953796105959e-05, "loss": 0.0803, "step": 5130 }, { "epoch": 1.9029988893002592, "grad_norm": 0.5787460207939148, "learning_rate": 9.999946414431931e-05, "loss": 0.0772, "step": 5140 }, { "epoch": 1.906701221769715, "grad_norm": 0.4720671772956848, "learning_rate": 9.999938485971279e-05, "loss": 0.0642, "step": 5150 }, { "epoch": 1.9104035542391706, "grad_norm": 1.1181657314300537, "learning_rate": 9.999930010724872e-05, "loss": 0.0643, "step": 5160 }, { "epoch": 1.9141058867086265, "grad_norm": 0.7451878190040588, "learning_rate": 9.999920988693637e-05, "loss": 0.0799, "step": 5170 }, { "epoch": 1.9178082191780823, "grad_norm": 0.4678128659725189, "learning_rate": 9.999911419878559e-05, "loss": 0.073, "step": 5180 }, { "epoch": 1.921510551647538, "grad_norm": 0.5699427127838135, "learning_rate": 9.999901304280685e-05, "loss": 0.0579, "step": 5190 }, { "epoch": 1.9252128841169938, "grad_norm": 0.6569998860359192, "learning_rate": 9.999890641901125e-05, "loss": 0.0625, "step": 5200 }, { "epoch": 1.9289152165864496, "grad_norm": 0.5025046467781067, "learning_rate": 9.999879432741037e-05, "loss": 0.0535, "step": 5210 }, { "epoch": 1.9326175490559052, "grad_norm": 0.5750059485435486, "learning_rate": 9.999867676801655e-05, "loss": 0.0483, "step": 5220 }, { "epoch": 1.9363198815253608, "grad_norm": 0.7907508015632629, "learning_rate": 9.99985537408426e-05, "loss": 0.0691, "step": 5230 }, { "epoch": 1.940022213994817, "grad_norm": 0.5490067005157471, "learning_rate": 9.999842524590197e-05, "loss": 0.0513, "step": 5240 }, { "epoch": 1.9437245464642725, "grad_norm": 0.8603996634483337, "learning_rate": 9.999829128320874e-05, "loss": 0.0622, "step": 5250 }, { "epoch": 1.9474268789337281, "grad_norm": 1.259075403213501, "learning_rate": 9.999815185277755e-05, "loss": 0.0541, "step": 5260 }, { "epoch": 1.951129211403184, "grad_norm": 0.5384097099304199, "learning_rate": 9.999800695462362e-05, "loss": 0.0544, "step": 5270 }, { "epoch": 1.9548315438726398, "grad_norm": 0.8031284213066101, "learning_rate": 9.999785658876284e-05, "loss": 0.0488, "step": 5280 }, { "epoch": 1.9585338763420954, "grad_norm": 0.5232800245285034, "learning_rate": 9.999770075521164e-05, "loss": 0.0693, "step": 5290 }, { "epoch": 1.9622362088115513, "grad_norm": 0.3506251573562622, "learning_rate": 9.999753945398704e-05, "loss": 0.0485, "step": 5300 }, { "epoch": 1.965938541281007, "grad_norm": 0.2773245871067047, "learning_rate": 9.999737268510672e-05, "loss": 0.0522, "step": 5310 }, { "epoch": 1.9696408737504627, "grad_norm": 0.6592091917991638, "learning_rate": 9.999720044858886e-05, "loss": 0.0562, "step": 5320 }, { "epoch": 1.9733432062199185, "grad_norm": 1.011357307434082, "learning_rate": 9.999702274445236e-05, "loss": 0.074, "step": 5330 }, { "epoch": 1.9770455386893744, "grad_norm": 1.6354997158050537, "learning_rate": 9.999683957271659e-05, "loss": 0.0668, "step": 5340 }, { "epoch": 1.98074787115883, "grad_norm": 0.4745696485042572, "learning_rate": 9.999665093340165e-05, "loss": 0.0521, "step": 5350 }, { "epoch": 1.9844502036282858, "grad_norm": 0.4188210666179657, "learning_rate": 9.999645682652811e-05, "loss": 0.0623, "step": 5360 }, { "epoch": 1.9881525360977417, "grad_norm": 0.7739410400390625, "learning_rate": 9.999625725211721e-05, "loss": 0.0618, "step": 5370 }, { "epoch": 1.9918548685671973, "grad_norm": 1.178892731666565, "learning_rate": 9.999605221019081e-05, "loss": 0.0597, "step": 5380 }, { "epoch": 1.9955572010366531, "grad_norm": 0.41377919912338257, "learning_rate": 9.99958417007713e-05, "loss": 0.0561, "step": 5390 }, { "epoch": 1.999259533506109, "grad_norm": 0.7987796664237976, "learning_rate": 9.99956257238817e-05, "loss": 0.0658, "step": 5400 }, { "epoch": 2.0029618659755646, "grad_norm": 0.3465999364852905, "learning_rate": 9.999540427954565e-05, "loss": 0.0619, "step": 5410 }, { "epoch": 2.00666419844502, "grad_norm": 0.6324620246887207, "learning_rate": 9.999517736778735e-05, "loss": 0.0582, "step": 5420 }, { "epoch": 2.0103665309144763, "grad_norm": 0.2627883553504944, "learning_rate": 9.999494498863162e-05, "loss": 0.0551, "step": 5430 }, { "epoch": 2.014068863383932, "grad_norm": 1.1785273551940918, "learning_rate": 9.999470714210387e-05, "loss": 0.0515, "step": 5440 }, { "epoch": 2.0177711958533875, "grad_norm": 0.5090134739875793, "learning_rate": 9.999446382823013e-05, "loss": 0.0539, "step": 5450 }, { "epoch": 2.0214735283228435, "grad_norm": 0.3405911326408386, "learning_rate": 9.999421504703696e-05, "loss": 0.053, "step": 5460 }, { "epoch": 2.025175860792299, "grad_norm": 1.753676176071167, "learning_rate": 9.999396079855162e-05, "loss": 0.071, "step": 5470 }, { "epoch": 2.028878193261755, "grad_norm": 1.0460823774337769, "learning_rate": 9.999370108280188e-05, "loss": 0.059, "step": 5480 }, { "epoch": 2.032580525731211, "grad_norm": 0.9330565929412842, "learning_rate": 9.999343589981615e-05, "loss": 0.0557, "step": 5490 }, { "epoch": 2.0362828582006665, "grad_norm": 0.4037414789199829, "learning_rate": 9.999316524962345e-05, "loss": 0.0525, "step": 5500 }, { "epoch": 2.039985190670122, "grad_norm": 0.5043405294418335, "learning_rate": 9.999288913225335e-05, "loss": 0.0423, "step": 5510 }, { "epoch": 2.043687523139578, "grad_norm": 0.42634275555610657, "learning_rate": 9.999260754773607e-05, "loss": 0.0531, "step": 5520 }, { "epoch": 2.0473898556090337, "grad_norm": 0.5342641472816467, "learning_rate": 9.999232049610238e-05, "loss": 0.0783, "step": 5530 }, { "epoch": 2.0510921880784894, "grad_norm": 0.5899149179458618, "learning_rate": 9.999202797738369e-05, "loss": 0.0627, "step": 5540 }, { "epoch": 2.0547945205479454, "grad_norm": 0.6272445917129517, "learning_rate": 9.999172999161198e-05, "loss": 0.0552, "step": 5550 }, { "epoch": 2.058496853017401, "grad_norm": 0.38071388006210327, "learning_rate": 9.999142653881985e-05, "loss": 0.0463, "step": 5560 }, { "epoch": 2.0621991854868567, "grad_norm": 0.2892961800098419, "learning_rate": 9.999111761904046e-05, "loss": 0.051, "step": 5570 }, { "epoch": 2.0659015179563123, "grad_norm": 0.7310361862182617, "learning_rate": 9.999080323230761e-05, "loss": 0.0601, "step": 5580 }, { "epoch": 2.0696038504257683, "grad_norm": 0.6904370188713074, "learning_rate": 9.999048337865568e-05, "loss": 0.0437, "step": 5590 }, { "epoch": 2.073306182895224, "grad_norm": 0.768850564956665, "learning_rate": 9.999015805811965e-05, "loss": 0.0536, "step": 5600 }, { "epoch": 2.0770085153646796, "grad_norm": 0.6570295691490173, "learning_rate": 9.998982727073509e-05, "loss": 0.0534, "step": 5610 }, { "epoch": 2.0807108478341356, "grad_norm": 0.46489813923835754, "learning_rate": 9.998949101653817e-05, "loss": 0.0834, "step": 5620 }, { "epoch": 2.0844131803035912, "grad_norm": 0.750434935092926, "learning_rate": 9.998914929556569e-05, "loss": 0.0563, "step": 5630 }, { "epoch": 2.088115512773047, "grad_norm": 0.59445720911026, "learning_rate": 9.998880210785498e-05, "loss": 0.0544, "step": 5640 }, { "epoch": 2.091817845242503, "grad_norm": 0.46442338824272156, "learning_rate": 9.998844945344405e-05, "loss": 0.0434, "step": 5650 }, { "epoch": 2.0955201777119585, "grad_norm": 1.014467716217041, "learning_rate": 9.998809133237143e-05, "loss": 0.0606, "step": 5660 }, { "epoch": 2.099222510181414, "grad_norm": 1.0003297328948975, "learning_rate": 9.99877277446763e-05, "loss": 0.0589, "step": 5670 }, { "epoch": 2.10292484265087, "grad_norm": 0.4850516617298126, "learning_rate": 9.998735869039842e-05, "loss": 0.0533, "step": 5680 }, { "epoch": 2.106627175120326, "grad_norm": 0.6938893795013428, "learning_rate": 9.998698416957815e-05, "loss": 0.0596, "step": 5690 }, { "epoch": 2.1103295075897814, "grad_norm": 0.5484058260917664, "learning_rate": 9.998660418225645e-05, "loss": 0.057, "step": 5700 }, { "epoch": 2.1140318400592375, "grad_norm": 0.349052757024765, "learning_rate": 9.998621872847485e-05, "loss": 0.0586, "step": 5710 }, { "epoch": 2.117734172528693, "grad_norm": 0.5302473902702332, "learning_rate": 9.998582780827554e-05, "loss": 0.052, "step": 5720 }, { "epoch": 2.1214365049981487, "grad_norm": 0.47345638275146484, "learning_rate": 9.998543142170126e-05, "loss": 0.0546, "step": 5730 }, { "epoch": 2.125138837467605, "grad_norm": 0.8401845097541809, "learning_rate": 9.998502956879534e-05, "loss": 0.0478, "step": 5740 }, { "epoch": 2.1288411699370604, "grad_norm": 1.3141708374023438, "learning_rate": 9.998462224960175e-05, "loss": 0.0628, "step": 5750 }, { "epoch": 2.132543502406516, "grad_norm": 0.44975998997688293, "learning_rate": 9.9984209464165e-05, "loss": 0.0838, "step": 5760 }, { "epoch": 2.1362458348759716, "grad_norm": 0.5372299551963806, "learning_rate": 9.998379121253028e-05, "loss": 0.0751, "step": 5770 }, { "epoch": 2.1399481673454277, "grad_norm": 0.7908897995948792, "learning_rate": 9.998336749474329e-05, "loss": 0.0786, "step": 5780 }, { "epoch": 2.1436504998148833, "grad_norm": 0.3124464154243469, "learning_rate": 9.998293831085037e-05, "loss": 0.0652, "step": 5790 }, { "epoch": 2.147352832284339, "grad_norm": 0.6454853415489197, "learning_rate": 9.998250366089848e-05, "loss": 0.0563, "step": 5800 }, { "epoch": 2.151055164753795, "grad_norm": 0.5251432061195374, "learning_rate": 9.998206354493511e-05, "loss": 0.0661, "step": 5810 }, { "epoch": 2.1547574972232506, "grad_norm": 0.7137133479118347, "learning_rate": 9.998161796300845e-05, "loss": 0.0773, "step": 5820 }, { "epoch": 2.1584598296927062, "grad_norm": 0.5445514917373657, "learning_rate": 9.998116691516718e-05, "loss": 0.0466, "step": 5830 }, { "epoch": 2.1621621621621623, "grad_norm": 1.3937071561813354, "learning_rate": 9.998071040146064e-05, "loss": 0.0554, "step": 5840 }, { "epoch": 2.165864494631618, "grad_norm": 1.3480521440505981, "learning_rate": 9.998024842193876e-05, "loss": 0.0786, "step": 5850 }, { "epoch": 2.1695668271010735, "grad_norm": 0.44326072931289673, "learning_rate": 9.997978097665205e-05, "loss": 0.0472, "step": 5860 }, { "epoch": 2.1732691595705296, "grad_norm": 0.4386208951473236, "learning_rate": 9.997930806565166e-05, "loss": 0.045, "step": 5870 }, { "epoch": 2.176971492039985, "grad_norm": 0.6164724230766296, "learning_rate": 9.997882968898926e-05, "loss": 0.0633, "step": 5880 }, { "epoch": 2.180673824509441, "grad_norm": 0.6433140635490417, "learning_rate": 9.997834584671719e-05, "loss": 0.0469, "step": 5890 }, { "epoch": 2.184376156978897, "grad_norm": 0.4600784480571747, "learning_rate": 9.997785653888835e-05, "loss": 0.0438, "step": 5900 }, { "epoch": 2.1880784894483525, "grad_norm": 0.48180341720581055, "learning_rate": 9.997736176555628e-05, "loss": 0.0447, "step": 5910 }, { "epoch": 2.191780821917808, "grad_norm": 0.39204514026641846, "learning_rate": 9.997686152677506e-05, "loss": 0.0461, "step": 5920 }, { "epoch": 2.195483154387264, "grad_norm": 0.7759292125701904, "learning_rate": 9.99763558225994e-05, "loss": 0.0477, "step": 5930 }, { "epoch": 2.1991854868567198, "grad_norm": 0.4019148647785187, "learning_rate": 9.997584465308461e-05, "loss": 0.0601, "step": 5940 }, { "epoch": 2.2028878193261754, "grad_norm": 0.683978259563446, "learning_rate": 9.997532801828658e-05, "loss": 0.0496, "step": 5950 }, { "epoch": 2.2065901517956314, "grad_norm": 0.43661221861839294, "learning_rate": 9.997480591826183e-05, "loss": 0.049, "step": 5960 }, { "epoch": 2.210292484265087, "grad_norm": 0.5614250302314758, "learning_rate": 9.997427835306742e-05, "loss": 0.0578, "step": 5970 }, { "epoch": 2.2139948167345427, "grad_norm": 0.2551342248916626, "learning_rate": 9.997374532276107e-05, "loss": 0.0428, "step": 5980 }, { "epoch": 2.2176971492039987, "grad_norm": 0.5877577662467957, "learning_rate": 9.997320682740107e-05, "loss": 0.0559, "step": 5990 }, { "epoch": 2.2213994816734544, "grad_norm": 0.7145906686782837, "learning_rate": 9.997266286704631e-05, "loss": 0.0489, "step": 6000 }, { "epoch": 2.22510181414291, "grad_norm": 0.3701966404914856, "learning_rate": 9.997211344175626e-05, "loss": 0.0622, "step": 6010 }, { "epoch": 2.2288041466123656, "grad_norm": 0.3622521162033081, "learning_rate": 9.997155855159101e-05, "loss": 0.0545, "step": 6020 }, { "epoch": 2.2325064790818216, "grad_norm": 0.7279003858566284, "learning_rate": 9.997099819661127e-05, "loss": 0.0535, "step": 6030 }, { "epoch": 2.2362088115512773, "grad_norm": 0.5309158563613892, "learning_rate": 9.997043237687829e-05, "loss": 0.0658, "step": 6040 }, { "epoch": 2.239911144020733, "grad_norm": 0.3844463527202606, "learning_rate": 9.996986109245395e-05, "loss": 0.0443, "step": 6050 }, { "epoch": 2.243613476490189, "grad_norm": 0.5929186940193176, "learning_rate": 9.996928434340073e-05, "loss": 0.0555, "step": 6060 }, { "epoch": 2.2473158089596446, "grad_norm": 0.637549877166748, "learning_rate": 9.996870212978171e-05, "loss": 0.0557, "step": 6070 }, { "epoch": 2.2510181414291, "grad_norm": 0.5780567526817322, "learning_rate": 9.996811445166054e-05, "loss": 0.0505, "step": 6080 }, { "epoch": 2.2547204738985562, "grad_norm": 0.5982688665390015, "learning_rate": 9.996752130910149e-05, "loss": 0.0747, "step": 6090 }, { "epoch": 2.258422806368012, "grad_norm": 0.36574792861938477, "learning_rate": 9.996692270216947e-05, "loss": 0.043, "step": 6100 }, { "epoch": 2.2621251388374675, "grad_norm": 0.8327367305755615, "learning_rate": 9.99663186309299e-05, "loss": 0.0575, "step": 6110 }, { "epoch": 2.2658274713069235, "grad_norm": 0.7570099234580994, "learning_rate": 9.996570909544882e-05, "loss": 0.0506, "step": 6120 }, { "epoch": 2.269529803776379, "grad_norm": 0.4615638852119446, "learning_rate": 9.996509409579293e-05, "loss": 0.0513, "step": 6130 }, { "epoch": 2.2732321362458348, "grad_norm": 0.45306918025016785, "learning_rate": 9.996447363202946e-05, "loss": 0.0518, "step": 6140 }, { "epoch": 2.276934468715291, "grad_norm": 1.1733237504959106, "learning_rate": 9.996384770422629e-05, "loss": 0.0536, "step": 6150 }, { "epoch": 2.2806368011847464, "grad_norm": 0.6908769011497498, "learning_rate": 9.996321631245184e-05, "loss": 0.0591, "step": 6160 }, { "epoch": 2.284339133654202, "grad_norm": 0.6480868458747864, "learning_rate": 9.99625794567752e-05, "loss": 0.0476, "step": 6170 }, { "epoch": 2.2880414661236577, "grad_norm": 0.3427181839942932, "learning_rate": 9.996193713726596e-05, "loss": 0.0339, "step": 6180 }, { "epoch": 2.2917437985931137, "grad_norm": 0.18233607709407806, "learning_rate": 9.99612893539944e-05, "loss": 0.0435, "step": 6190 }, { "epoch": 2.2954461310625693, "grad_norm": 0.5500887036323547, "learning_rate": 9.996063610703137e-05, "loss": 0.0475, "step": 6200 }, { "epoch": 2.299148463532025, "grad_norm": 0.2822166681289673, "learning_rate": 9.995997739644826e-05, "loss": 0.047, "step": 6210 }, { "epoch": 2.302850796001481, "grad_norm": 0.5579500198364258, "learning_rate": 9.995931322231715e-05, "loss": 0.0543, "step": 6220 }, { "epoch": 2.3065531284709366, "grad_norm": 0.850584864616394, "learning_rate": 9.995864358471066e-05, "loss": 0.0638, "step": 6230 }, { "epoch": 2.3102554609403922, "grad_norm": 0.23522913455963135, "learning_rate": 9.995796848370203e-05, "loss": 0.0394, "step": 6240 }, { "epoch": 2.3139577934098483, "grad_norm": 0.2213016301393509, "learning_rate": 9.995728791936504e-05, "loss": 0.0566, "step": 6250 }, { "epoch": 2.317660125879304, "grad_norm": 0.43188080191612244, "learning_rate": 9.995660189177419e-05, "loss": 0.0597, "step": 6260 }, { "epoch": 2.3213624583487595, "grad_norm": 0.3667723536491394, "learning_rate": 9.995591040100446e-05, "loss": 0.0507, "step": 6270 }, { "epoch": 2.3250647908182156, "grad_norm": 0.638522207736969, "learning_rate": 9.995521344713147e-05, "loss": 0.0623, "step": 6280 }, { "epoch": 2.328767123287671, "grad_norm": 0.5070501565933228, "learning_rate": 9.995451103023144e-05, "loss": 0.0525, "step": 6290 }, { "epoch": 2.332469455757127, "grad_norm": 0.3448793590068817, "learning_rate": 9.995380315038119e-05, "loss": 0.0475, "step": 6300 }, { "epoch": 2.336171788226583, "grad_norm": 0.7250033020973206, "learning_rate": 9.995308980765815e-05, "loss": 0.0743, "step": 6310 }, { "epoch": 2.3398741206960385, "grad_norm": 1.1361966133117676, "learning_rate": 9.995237100214027e-05, "loss": 0.0495, "step": 6320 }, { "epoch": 2.343576453165494, "grad_norm": 1.5409963130950928, "learning_rate": 9.995164673390625e-05, "loss": 0.0625, "step": 6330 }, { "epoch": 2.34727878563495, "grad_norm": 0.5113590955734253, "learning_rate": 9.995091700303521e-05, "loss": 0.0695, "step": 6340 }, { "epoch": 2.350981118104406, "grad_norm": 0.686586320400238, "learning_rate": 9.9950181809607e-05, "loss": 0.0625, "step": 6350 }, { "epoch": 2.3546834505738614, "grad_norm": 0.49706733226776123, "learning_rate": 9.994944115370199e-05, "loss": 0.0536, "step": 6360 }, { "epoch": 2.3583857830433175, "grad_norm": 0.5875659584999084, "learning_rate": 9.994869503540122e-05, "loss": 0.0769, "step": 6370 }, { "epoch": 2.362088115512773, "grad_norm": 0.4846583306789398, "learning_rate": 9.994794345478624e-05, "loss": 0.0423, "step": 6380 }, { "epoch": 2.3657904479822287, "grad_norm": 0.46868041157722473, "learning_rate": 9.994718641193928e-05, "loss": 0.0567, "step": 6390 }, { "epoch": 2.3694927804516848, "grad_norm": 0.37351879477500916, "learning_rate": 9.994642390694308e-05, "loss": 0.0569, "step": 6400 }, { "epoch": 2.3731951129211404, "grad_norm": 0.32245203852653503, "learning_rate": 9.994565593988107e-05, "loss": 0.0507, "step": 6410 }, { "epoch": 2.376897445390596, "grad_norm": 0.6445807814598083, "learning_rate": 9.994488251083721e-05, "loss": 0.0525, "step": 6420 }, { "epoch": 2.380599777860052, "grad_norm": 0.5831187963485718, "learning_rate": 9.99441036198961e-05, "loss": 0.0571, "step": 6430 }, { "epoch": 2.3843021103295077, "grad_norm": 0.5170648097991943, "learning_rate": 9.99433192671429e-05, "loss": 0.0702, "step": 6440 }, { "epoch": 2.3880044427989633, "grad_norm": 0.5387572646141052, "learning_rate": 9.99425294526634e-05, "loss": 0.0616, "step": 6450 }, { "epoch": 2.3917067752684193, "grad_norm": 0.4130139946937561, "learning_rate": 9.994173417654395e-05, "loss": 0.0454, "step": 6460 }, { "epoch": 2.395409107737875, "grad_norm": 0.502524197101593, "learning_rate": 9.994093343887156e-05, "loss": 0.0527, "step": 6470 }, { "epoch": 2.3991114402073306, "grad_norm": 0.6072599291801453, "learning_rate": 9.994012723973376e-05, "loss": 0.0561, "step": 6480 }, { "epoch": 2.402813772676786, "grad_norm": 0.6158693432807922, "learning_rate": 9.993931557921874e-05, "loss": 0.0736, "step": 6490 }, { "epoch": 2.4065161051462423, "grad_norm": 0.5443888902664185, "learning_rate": 9.993849845741524e-05, "loss": 0.0557, "step": 6500 }, { "epoch": 2.410218437615698, "grad_norm": 0.49967968463897705, "learning_rate": 9.993767587441265e-05, "loss": 0.0665, "step": 6510 }, { "epoch": 2.4139207700851535, "grad_norm": 0.3511490821838379, "learning_rate": 9.993684783030088e-05, "loss": 0.0548, "step": 6520 }, { "epoch": 2.4176231025546095, "grad_norm": 0.38472792506217957, "learning_rate": 9.993601432517053e-05, "loss": 0.0462, "step": 6530 }, { "epoch": 2.421325435024065, "grad_norm": 0.524956464767456, "learning_rate": 9.993517535911273e-05, "loss": 0.0737, "step": 6540 }, { "epoch": 2.425027767493521, "grad_norm": 0.9004432559013367, "learning_rate": 9.99343309322192e-05, "loss": 0.0569, "step": 6550 }, { "epoch": 2.428730099962977, "grad_norm": 0.4326704740524292, "learning_rate": 9.993348104458234e-05, "loss": 0.0413, "step": 6560 }, { "epoch": 2.4324324324324325, "grad_norm": 0.6986762881278992, "learning_rate": 9.993262569629507e-05, "loss": 0.0615, "step": 6570 }, { "epoch": 2.436134764901888, "grad_norm": 0.3240361213684082, "learning_rate": 9.99317648874509e-05, "loss": 0.0417, "step": 6580 }, { "epoch": 2.439837097371344, "grad_norm": 0.4755275547504425, "learning_rate": 9.993089861814402e-05, "loss": 0.0405, "step": 6590 }, { "epoch": 2.4435394298407997, "grad_norm": 0.7781282663345337, "learning_rate": 9.993002688846913e-05, "loss": 0.053, "step": 6600 }, { "epoch": 2.4472417623102554, "grad_norm": 0.5590863823890686, "learning_rate": 9.992914969852158e-05, "loss": 0.0391, "step": 6610 }, { "epoch": 2.450944094779711, "grad_norm": 0.4796781539916992, "learning_rate": 9.992826704839727e-05, "loss": 0.0559, "step": 6620 }, { "epoch": 2.454646427249167, "grad_norm": 0.635891318321228, "learning_rate": 9.992737893819273e-05, "loss": 0.0737, "step": 6630 }, { "epoch": 2.4583487597186227, "grad_norm": 0.7815940380096436, "learning_rate": 9.992648536800512e-05, "loss": 0.0311, "step": 6640 }, { "epoch": 2.4620510921880783, "grad_norm": 0.4359890818595886, "learning_rate": 9.992558633793212e-05, "loss": 0.0485, "step": 6650 }, { "epoch": 2.4657534246575343, "grad_norm": 0.2714034914970398, "learning_rate": 9.992468184807206e-05, "loss": 0.0518, "step": 6660 }, { "epoch": 2.46945575712699, "grad_norm": 0.6843683123588562, "learning_rate": 9.992377189852387e-05, "loss": 0.0459, "step": 6670 }, { "epoch": 2.4731580895964456, "grad_norm": 0.34739699959754944, "learning_rate": 9.992285648938702e-05, "loss": 0.0446, "step": 6680 }, { "epoch": 2.4768604220659016, "grad_norm": 0.48650041222572327, "learning_rate": 9.992193562076166e-05, "loss": 0.0483, "step": 6690 }, { "epoch": 2.4805627545353572, "grad_norm": 1.3241970539093018, "learning_rate": 9.992100929274846e-05, "loss": 0.0538, "step": 6700 }, { "epoch": 2.484265087004813, "grad_norm": 0.36573997139930725, "learning_rate": 9.992007750544876e-05, "loss": 0.0423, "step": 6710 }, { "epoch": 2.487967419474269, "grad_norm": 0.44266849756240845, "learning_rate": 9.991914025896443e-05, "loss": 0.0537, "step": 6720 }, { "epoch": 2.4916697519437245, "grad_norm": 1.0741809606552124, "learning_rate": 9.991819755339796e-05, "loss": 0.0457, "step": 6730 }, { "epoch": 2.49537208441318, "grad_norm": 0.4157111346721649, "learning_rate": 9.991724938885248e-05, "loss": 0.0471, "step": 6740 }, { "epoch": 2.499074416882636, "grad_norm": 0.4052101969718933, "learning_rate": 9.991629576543163e-05, "loss": 0.054, "step": 6750 }, { "epoch": 2.502776749352092, "grad_norm": 0.3649333715438843, "learning_rate": 9.991533668323974e-05, "loss": 0.0552, "step": 6760 }, { "epoch": 2.5064790818215474, "grad_norm": 0.31294742226600647, "learning_rate": 9.991437214238167e-05, "loss": 0.0539, "step": 6770 }, { "epoch": 2.5101814142910035, "grad_norm": 0.8612927794456482, "learning_rate": 9.991340214296292e-05, "loss": 0.0561, "step": 6780 }, { "epoch": 2.513883746760459, "grad_norm": 0.5272373557090759, "learning_rate": 9.991242668508954e-05, "loss": 0.0603, "step": 6790 }, { "epoch": 2.5175860792299147, "grad_norm": 0.2939218282699585, "learning_rate": 9.991144576886823e-05, "loss": 0.0527, "step": 6800 }, { "epoch": 2.521288411699371, "grad_norm": 0.5364477038383484, "learning_rate": 9.991045939440625e-05, "loss": 0.068, "step": 6810 }, { "epoch": 2.5249907441688264, "grad_norm": 0.5472286343574524, "learning_rate": 9.990946756181146e-05, "loss": 0.0541, "step": 6820 }, { "epoch": 2.528693076638282, "grad_norm": 0.3843435049057007, "learning_rate": 9.990847027119234e-05, "loss": 0.0621, "step": 6830 }, { "epoch": 2.532395409107738, "grad_norm": 0.3598906695842743, "learning_rate": 9.990746752265796e-05, "loss": 0.0523, "step": 6840 }, { "epoch": 2.5360977415771937, "grad_norm": 1.1238840818405151, "learning_rate": 9.990645931631796e-05, "loss": 0.0455, "step": 6850 }, { "epoch": 2.5398000740466493, "grad_norm": 0.30649876594543457, "learning_rate": 9.990544565228259e-05, "loss": 0.0439, "step": 6860 }, { "epoch": 2.5435024065161054, "grad_norm": 0.4952414929866791, "learning_rate": 9.990442653066272e-05, "loss": 0.0587, "step": 6870 }, { "epoch": 2.547204738985561, "grad_norm": 0.4642448425292969, "learning_rate": 9.99034019515698e-05, "loss": 0.0539, "step": 6880 }, { "epoch": 2.5509070714550166, "grad_norm": 0.7735468745231628, "learning_rate": 9.990237191511587e-05, "loss": 0.0442, "step": 6890 }, { "epoch": 2.5546094039244727, "grad_norm": 0.9689661860466003, "learning_rate": 9.990133642141359e-05, "loss": 0.0484, "step": 6900 }, { "epoch": 2.5583117363939283, "grad_norm": 0.39976319670677185, "learning_rate": 9.990029547057615e-05, "loss": 0.0547, "step": 6910 }, { "epoch": 2.562014068863384, "grad_norm": 0.7647582292556763, "learning_rate": 9.989924906271746e-05, "loss": 0.0554, "step": 6920 }, { "epoch": 2.56571640133284, "grad_norm": 0.4299500584602356, "learning_rate": 9.989819719795188e-05, "loss": 0.0487, "step": 6930 }, { "epoch": 2.5694187338022956, "grad_norm": 0.5367316007614136, "learning_rate": 9.989713987639451e-05, "loss": 0.0557, "step": 6940 }, { "epoch": 2.573121066271751, "grad_norm": 0.9338808655738831, "learning_rate": 9.989607709816091e-05, "loss": 0.0638, "step": 6950 }, { "epoch": 2.576823398741207, "grad_norm": 0.4693072736263275, "learning_rate": 9.989500886336736e-05, "loss": 0.051, "step": 6960 }, { "epoch": 2.580525731210663, "grad_norm": 0.4323706030845642, "learning_rate": 9.989393517213064e-05, "loss": 0.0543, "step": 6970 }, { "epoch": 2.5842280636801185, "grad_norm": 0.31819948554039, "learning_rate": 9.989285602456819e-05, "loss": 0.0425, "step": 6980 }, { "epoch": 2.587930396149574, "grad_norm": 0.24610276520252228, "learning_rate": 9.989177142079802e-05, "loss": 0.0565, "step": 6990 }, { "epoch": 2.5916327286190297, "grad_norm": 0.23157992959022522, "learning_rate": 9.989068136093873e-05, "loss": 0.0504, "step": 7000 }, { "epoch": 2.5953350610884858, "grad_norm": 0.541242778301239, "learning_rate": 9.988958584510954e-05, "loss": 0.0345, "step": 7010 }, { "epoch": 2.5990373935579414, "grad_norm": 0.4953152537345886, "learning_rate": 9.988848487343026e-05, "loss": 0.0433, "step": 7020 }, { "epoch": 2.602739726027397, "grad_norm": 0.826622486114502, "learning_rate": 9.988737844602128e-05, "loss": 0.0519, "step": 7030 }, { "epoch": 2.606442058496853, "grad_norm": 1.0058649778366089, "learning_rate": 9.988626656300359e-05, "loss": 0.061, "step": 7040 }, { "epoch": 2.6101443909663087, "grad_norm": 0.3704119324684143, "learning_rate": 9.988514922449879e-05, "loss": 0.0456, "step": 7050 }, { "epoch": 2.6138467234357643, "grad_norm": 0.3223683536052704, "learning_rate": 9.988402643062907e-05, "loss": 0.0365, "step": 7060 }, { "epoch": 2.6175490559052204, "grad_norm": 0.5015390515327454, "learning_rate": 9.988289818151722e-05, "loss": 0.0394, "step": 7070 }, { "epoch": 2.621251388374676, "grad_norm": 0.5646992325782776, "learning_rate": 9.988176447728664e-05, "loss": 0.0479, "step": 7080 }, { "epoch": 2.6249537208441316, "grad_norm": 0.5799932479858398, "learning_rate": 9.988062531806126e-05, "loss": 0.0703, "step": 7090 }, { "epoch": 2.6286560533135876, "grad_norm": 0.6010279655456543, "learning_rate": 9.987948070396571e-05, "loss": 0.0577, "step": 7100 }, { "epoch": 2.6323583857830433, "grad_norm": 0.442953497171402, "learning_rate": 9.987833063512515e-05, "loss": 0.0484, "step": 7110 }, { "epoch": 2.636060718252499, "grad_norm": 0.7676416635513306, "learning_rate": 9.987717511166533e-05, "loss": 0.0549, "step": 7120 }, { "epoch": 2.639763050721955, "grad_norm": 0.4441109299659729, "learning_rate": 9.987601413371264e-05, "loss": 0.0535, "step": 7130 }, { "epoch": 2.6434653831914106, "grad_norm": 1.3590419292449951, "learning_rate": 9.987484770139401e-05, "loss": 0.0495, "step": 7140 }, { "epoch": 2.647167715660866, "grad_norm": 0.3216463625431061, "learning_rate": 9.987367581483705e-05, "loss": 0.05, "step": 7150 }, { "epoch": 2.6508700481303222, "grad_norm": 0.4827708303928375, "learning_rate": 9.987249847416987e-05, "loss": 0.0416, "step": 7160 }, { "epoch": 2.654572380599778, "grad_norm": 0.6856722831726074, "learning_rate": 9.987131567952124e-05, "loss": 0.0423, "step": 7170 }, { "epoch": 2.6582747130692335, "grad_norm": 1.5040169954299927, "learning_rate": 9.98701274310205e-05, "loss": 0.0503, "step": 7180 }, { "epoch": 2.6619770455386895, "grad_norm": 0.4276619851589203, "learning_rate": 9.986893372879762e-05, "loss": 0.0444, "step": 7190 }, { "epoch": 2.665679378008145, "grad_norm": 1.1035451889038086, "learning_rate": 9.986773457298311e-05, "loss": 0.0447, "step": 7200 }, { "epoch": 2.6693817104776008, "grad_norm": 1.0426604747772217, "learning_rate": 9.986652996370813e-05, "loss": 0.0539, "step": 7210 }, { "epoch": 2.673084042947057, "grad_norm": 0.7548753619194031, "learning_rate": 9.986531990110442e-05, "loss": 0.0439, "step": 7220 }, { "epoch": 2.6767863754165124, "grad_norm": 0.3933066725730896, "learning_rate": 9.986410438530427e-05, "loss": 0.0361, "step": 7230 }, { "epoch": 2.680488707885968, "grad_norm": 0.5383368134498596, "learning_rate": 9.986288341644064e-05, "loss": 0.059, "step": 7240 }, { "epoch": 2.684191040355424, "grad_norm": 0.477131724357605, "learning_rate": 9.986165699464705e-05, "loss": 0.05, "step": 7250 }, { "epoch": 2.6878933728248797, "grad_norm": 0.5520297884941101, "learning_rate": 9.986042512005763e-05, "loss": 0.0474, "step": 7260 }, { "epoch": 2.6915957052943353, "grad_norm": 0.5703570246696472, "learning_rate": 9.985918779280707e-05, "loss": 0.042, "step": 7270 }, { "epoch": 2.6952980377637914, "grad_norm": 0.4390963912010193, "learning_rate": 9.98579450130307e-05, "loss": 0.0355, "step": 7280 }, { "epoch": 2.699000370233247, "grad_norm": 0.3190418481826782, "learning_rate": 9.985669678086443e-05, "loss": 0.0559, "step": 7290 }, { "epoch": 2.7027027027027026, "grad_norm": 0.26831766963005066, "learning_rate": 9.985544309644475e-05, "loss": 0.0404, "step": 7300 }, { "epoch": 2.7064050351721587, "grad_norm": 0.6928516030311584, "learning_rate": 9.985418395990876e-05, "loss": 0.0352, "step": 7310 }, { "epoch": 2.7101073676416143, "grad_norm": 0.31640127301216125, "learning_rate": 9.985291937139418e-05, "loss": 0.0487, "step": 7320 }, { "epoch": 2.71380970011107, "grad_norm": 0.3296556770801544, "learning_rate": 9.985164933103929e-05, "loss": 0.042, "step": 7330 }, { "epoch": 2.717512032580526, "grad_norm": 0.4852615296840668, "learning_rate": 9.985037383898296e-05, "loss": 0.0576, "step": 7340 }, { "epoch": 2.7212143650499816, "grad_norm": 0.3478875756263733, "learning_rate": 9.984909289536473e-05, "loss": 0.0463, "step": 7350 }, { "epoch": 2.724916697519437, "grad_norm": 0.6485427021980286, "learning_rate": 9.984780650032463e-05, "loss": 0.042, "step": 7360 }, { "epoch": 2.7286190299888933, "grad_norm": 1.216952919960022, "learning_rate": 9.984651465400335e-05, "loss": 0.055, "step": 7370 }, { "epoch": 2.732321362458349, "grad_norm": 0.3620198369026184, "learning_rate": 9.984521735654218e-05, "loss": 0.0492, "step": 7380 }, { "epoch": 2.7360236949278045, "grad_norm": 0.6761988997459412, "learning_rate": 9.984391460808298e-05, "loss": 0.0516, "step": 7390 }, { "epoch": 2.73972602739726, "grad_norm": 0.3656267821788788, "learning_rate": 9.984260640876821e-05, "loss": 0.0463, "step": 7400 }, { "epoch": 2.743428359866716, "grad_norm": 0.3038039207458496, "learning_rate": 9.984129275874095e-05, "loss": 0.0417, "step": 7410 }, { "epoch": 2.747130692336172, "grad_norm": 0.36750689148902893, "learning_rate": 9.983997365814486e-05, "loss": 0.0582, "step": 7420 }, { "epoch": 2.7508330248056274, "grad_norm": 0.3524929881095886, "learning_rate": 9.983864910712416e-05, "loss": 0.0414, "step": 7430 }, { "epoch": 2.754535357275083, "grad_norm": 0.29835090041160583, "learning_rate": 9.983731910582373e-05, "loss": 0.0435, "step": 7440 }, { "epoch": 2.758237689744539, "grad_norm": 0.3539535701274872, "learning_rate": 9.983598365438902e-05, "loss": 0.0576, "step": 7450 }, { "epoch": 2.7619400222139947, "grad_norm": 0.5970607399940491, "learning_rate": 9.983464275296605e-05, "loss": 0.0591, "step": 7460 }, { "epoch": 2.7656423546834503, "grad_norm": 0.2369842529296875, "learning_rate": 9.983329640170149e-05, "loss": 0.0499, "step": 7470 }, { "epoch": 2.7693446871529064, "grad_norm": 0.8572582602500916, "learning_rate": 9.983194460074257e-05, "loss": 0.0509, "step": 7480 }, { "epoch": 2.773047019622362, "grad_norm": 0.2882023751735687, "learning_rate": 9.983058735023709e-05, "loss": 0.0561, "step": 7490 }, { "epoch": 2.7767493520918176, "grad_norm": 0.3670226037502289, "learning_rate": 9.98292246503335e-05, "loss": 0.0482, "step": 7500 }, { "epoch": 2.7804516845612737, "grad_norm": 0.2872715890407562, "learning_rate": 9.982785650118082e-05, "loss": 0.0368, "step": 7510 }, { "epoch": 2.7841540170307293, "grad_norm": 1.0136018991470337, "learning_rate": 9.982648290292867e-05, "loss": 0.0578, "step": 7520 }, { "epoch": 2.787856349500185, "grad_norm": 0.28085020184516907, "learning_rate": 9.982510385572725e-05, "loss": 0.0578, "step": 7530 }, { "epoch": 2.791558681969641, "grad_norm": 0.22957351803779602, "learning_rate": 9.98237193597274e-05, "loss": 0.0676, "step": 7540 }, { "epoch": 2.7952610144390966, "grad_norm": 0.8864136934280396, "learning_rate": 9.98223294150805e-05, "loss": 0.0588, "step": 7550 }, { "epoch": 2.798963346908552, "grad_norm": 0.6719812750816345, "learning_rate": 9.982093402193857e-05, "loss": 0.0732, "step": 7560 }, { "epoch": 2.8026656793780083, "grad_norm": 1.7380998134613037, "learning_rate": 9.981953318045419e-05, "loss": 0.0327, "step": 7570 }, { "epoch": 2.806368011847464, "grad_norm": 1.8229141235351562, "learning_rate": 9.981812689078057e-05, "loss": 0.0506, "step": 7580 }, { "epoch": 2.8100703443169195, "grad_norm": 0.27888596057891846, "learning_rate": 9.98167151530715e-05, "loss": 0.0449, "step": 7590 }, { "epoch": 2.8137726767863755, "grad_norm": 0.38934797048568726, "learning_rate": 9.981529796748134e-05, "loss": 0.0543, "step": 7600 }, { "epoch": 2.817475009255831, "grad_norm": 0.4639911353588104, "learning_rate": 9.981387533416511e-05, "loss": 0.0451, "step": 7610 }, { "epoch": 2.821177341725287, "grad_norm": 0.9491060376167297, "learning_rate": 9.981244725327836e-05, "loss": 0.0472, "step": 7620 }, { "epoch": 2.824879674194743, "grad_norm": 1.1310651302337646, "learning_rate": 9.981101372497727e-05, "loss": 0.0495, "step": 7630 }, { "epoch": 2.8285820066641985, "grad_norm": 0.5887041091918945, "learning_rate": 9.980957474941861e-05, "loss": 0.0489, "step": 7640 }, { "epoch": 2.832284339133654, "grad_norm": 0.8702982664108276, "learning_rate": 9.980813032675974e-05, "loss": 0.0572, "step": 7650 }, { "epoch": 2.83598667160311, "grad_norm": 0.8773539662361145, "learning_rate": 9.980668045715864e-05, "loss": 0.0489, "step": 7660 }, { "epoch": 2.8396890040725657, "grad_norm": 0.7576608657836914, "learning_rate": 9.980522514077382e-05, "loss": 0.0437, "step": 7670 }, { "epoch": 2.8433913365420214, "grad_norm": 0.27604958415031433, "learning_rate": 9.980376437776447e-05, "loss": 0.0415, "step": 7680 }, { "epoch": 2.8470936690114774, "grad_norm": 0.5393282175064087, "learning_rate": 9.980229816829034e-05, "loss": 0.0515, "step": 7690 }, { "epoch": 2.850796001480933, "grad_norm": 2.601621150970459, "learning_rate": 9.980082651251175e-05, "loss": 0.0482, "step": 7700 }, { "epoch": 2.8544983339503887, "grad_norm": 0.3446357846260071, "learning_rate": 9.979934941058964e-05, "loss": 0.0484, "step": 7710 }, { "epoch": 2.8582006664198447, "grad_norm": 1.2420684099197388, "learning_rate": 9.979786686268555e-05, "loss": 0.0579, "step": 7720 }, { "epoch": 2.8619029988893003, "grad_norm": 0.2946189045906067, "learning_rate": 9.979637886896163e-05, "loss": 0.0474, "step": 7730 }, { "epoch": 2.865605331358756, "grad_norm": 0.3234107196331024, "learning_rate": 9.979488542958057e-05, "loss": 0.0585, "step": 7740 }, { "epoch": 2.869307663828212, "grad_norm": 0.8774400353431702, "learning_rate": 9.979338654470569e-05, "loss": 0.0508, "step": 7750 }, { "epoch": 2.8730099962976676, "grad_norm": 2.2826132774353027, "learning_rate": 9.979188221450094e-05, "loss": 0.08, "step": 7760 }, { "epoch": 2.8767123287671232, "grad_norm": 0.43348217010498047, "learning_rate": 9.979037243913081e-05, "loss": 0.0403, "step": 7770 }, { "epoch": 2.8804146612365793, "grad_norm": 0.4661199748516083, "learning_rate": 9.978885721876041e-05, "loss": 0.0431, "step": 7780 }, { "epoch": 2.884116993706035, "grad_norm": 0.7607512474060059, "learning_rate": 9.978733655355544e-05, "loss": 0.0462, "step": 7790 }, { "epoch": 2.8878193261754905, "grad_norm": 0.8472355008125305, "learning_rate": 9.97858104436822e-05, "loss": 0.05, "step": 7800 }, { "epoch": 2.8915216586449466, "grad_norm": 0.4451065957546234, "learning_rate": 9.978427888930757e-05, "loss": 0.0465, "step": 7810 }, { "epoch": 2.895223991114402, "grad_norm": 1.378222107887268, "learning_rate": 9.978274189059906e-05, "loss": 0.0501, "step": 7820 }, { "epoch": 2.898926323583858, "grad_norm": 0.34215420484542847, "learning_rate": 9.978119944772475e-05, "loss": 0.0545, "step": 7830 }, { "epoch": 2.9026286560533134, "grad_norm": 0.8728203773498535, "learning_rate": 9.97796515608533e-05, "loss": 0.0464, "step": 7840 }, { "epoch": 2.9063309885227695, "grad_norm": 0.6420557498931885, "learning_rate": 9.977809823015401e-05, "loss": 0.063, "step": 7850 }, { "epoch": 2.910033320992225, "grad_norm": 0.7320551872253418, "learning_rate": 9.977653945579673e-05, "loss": 0.0501, "step": 7860 }, { "epoch": 2.9137356534616807, "grad_norm": 0.3070026934146881, "learning_rate": 9.977497523795194e-05, "loss": 0.0614, "step": 7870 }, { "epoch": 2.9174379859311363, "grad_norm": 0.6688214540481567, "learning_rate": 9.977340557679068e-05, "loss": 0.0422, "step": 7880 }, { "epoch": 2.9211403184005924, "grad_norm": 0.364985853433609, "learning_rate": 9.977183047248464e-05, "loss": 0.0435, "step": 7890 }, { "epoch": 2.924842650870048, "grad_norm": 0.4868282973766327, "learning_rate": 9.977024992520602e-05, "loss": 0.0574, "step": 7900 }, { "epoch": 2.9285449833395036, "grad_norm": 0.3718990087509155, "learning_rate": 9.976866393512771e-05, "loss": 0.0456, "step": 7910 }, { "epoch": 2.9322473158089597, "grad_norm": 1.2489930391311646, "learning_rate": 9.976707250242315e-05, "loss": 0.0515, "step": 7920 }, { "epoch": 2.9359496482784153, "grad_norm": 0.39183154702186584, "learning_rate": 9.976547562726636e-05, "loss": 0.05, "step": 7930 }, { "epoch": 2.939651980747871, "grad_norm": 0.5810664296150208, "learning_rate": 9.976387330983198e-05, "loss": 0.0557, "step": 7940 }, { "epoch": 2.943354313217327, "grad_norm": 0.7715959548950195, "learning_rate": 9.976226555029522e-05, "loss": 0.0581, "step": 7950 }, { "epoch": 2.9470566456867826, "grad_norm": 0.49081262946128845, "learning_rate": 9.976065234883193e-05, "loss": 0.0601, "step": 7960 }, { "epoch": 2.950758978156238, "grad_norm": 0.7034128904342651, "learning_rate": 9.97590337056185e-05, "loss": 0.0719, "step": 7970 }, { "epoch": 2.9544613106256943, "grad_norm": 1.7578150033950806, "learning_rate": 9.975740962083198e-05, "loss": 0.0483, "step": 7980 }, { "epoch": 2.95816364309515, "grad_norm": 0.5053656697273254, "learning_rate": 9.975578009464992e-05, "loss": 0.0727, "step": 7990 }, { "epoch": 2.9618659755646055, "grad_norm": 0.45432376861572266, "learning_rate": 9.975414512725057e-05, "loss": 0.0511, "step": 8000 }, { "epoch": 2.9655683080340616, "grad_norm": 1.811920166015625, "learning_rate": 9.975250471881273e-05, "loss": 0.044, "step": 8010 }, { "epoch": 2.969270640503517, "grad_norm": 0.37090957164764404, "learning_rate": 9.975085886951575e-05, "loss": 0.0397, "step": 8020 }, { "epoch": 2.972972972972973, "grad_norm": 0.4516374170780182, "learning_rate": 9.974920757953965e-05, "loss": 0.0492, "step": 8030 }, { "epoch": 2.976675305442429, "grad_norm": 0.39773133397102356, "learning_rate": 9.974755084906502e-05, "loss": 0.0457, "step": 8040 }, { "epoch": 2.9803776379118845, "grad_norm": 0.7112022042274475, "learning_rate": 9.974588867827301e-05, "loss": 0.0524, "step": 8050 }, { "epoch": 2.98407997038134, "grad_norm": 0.37746259570121765, "learning_rate": 9.97442210673454e-05, "loss": 0.0481, "step": 8060 }, { "epoch": 2.987782302850796, "grad_norm": 0.5700675249099731, "learning_rate": 9.974254801646457e-05, "loss": 0.0509, "step": 8070 }, { "epoch": 2.9914846353202518, "grad_norm": 0.3275069296360016, "learning_rate": 9.974086952581348e-05, "loss": 0.0385, "step": 8080 }, { "epoch": 2.9951869677897074, "grad_norm": 0.49255290627479553, "learning_rate": 9.97391855955757e-05, "loss": 0.0446, "step": 8090 }, { "epoch": 2.9988893002591634, "grad_norm": 3.170539617538452, "learning_rate": 9.973749622593534e-05, "loss": 0.056, "step": 8100 }, { "epoch": 3.002591632728619, "grad_norm": 0.44546711444854736, "learning_rate": 9.973580141707717e-05, "loss": 0.0372, "step": 8110 }, { "epoch": 3.0062939651980747, "grad_norm": 0.476161926984787, "learning_rate": 9.973410116918653e-05, "loss": 0.0507, "step": 8120 }, { "epoch": 3.0099962976675307, "grad_norm": 0.2712325155735016, "learning_rate": 9.973239548244939e-05, "loss": 0.0377, "step": 8130 }, { "epoch": 3.0136986301369864, "grad_norm": 0.4373539388179779, "learning_rate": 9.973068435705222e-05, "loss": 0.0433, "step": 8140 }, { "epoch": 3.017400962606442, "grad_norm": 1.1624771356582642, "learning_rate": 9.972896779318219e-05, "loss": 0.0396, "step": 8150 }, { "epoch": 3.021103295075898, "grad_norm": 0.4215792715549469, "learning_rate": 9.972724579102702e-05, "loss": 0.034, "step": 8160 }, { "epoch": 3.0248056275453536, "grad_norm": 1.2334634065628052, "learning_rate": 9.9725518350775e-05, "loss": 0.0465, "step": 8170 }, { "epoch": 3.0285079600148093, "grad_norm": 0.5117453336715698, "learning_rate": 9.972378547261504e-05, "loss": 0.046, "step": 8180 }, { "epoch": 3.032210292484265, "grad_norm": 0.7486348152160645, "learning_rate": 9.972204715673669e-05, "loss": 0.0521, "step": 8190 }, { "epoch": 3.035912624953721, "grad_norm": 0.4859667420387268, "learning_rate": 9.972030340333001e-05, "loss": 0.0461, "step": 8200 }, { "epoch": 3.0396149574231766, "grad_norm": 0.32663974165916443, "learning_rate": 9.97185542125857e-05, "loss": 0.048, "step": 8210 }, { "epoch": 3.043317289892632, "grad_norm": 0.4456329345703125, "learning_rate": 9.971679958469505e-05, "loss": 0.0458, "step": 8220 }, { "epoch": 3.0470196223620882, "grad_norm": 1.3441894054412842, "learning_rate": 9.971503951984995e-05, "loss": 0.0496, "step": 8230 }, { "epoch": 3.050721954831544, "grad_norm": 0.3215849995613098, "learning_rate": 9.971327401824287e-05, "loss": 0.0597, "step": 8240 }, { "epoch": 3.0544242873009995, "grad_norm": 0.4057064950466156, "learning_rate": 9.97115030800669e-05, "loss": 0.044, "step": 8250 }, { "epoch": 3.0581266197704555, "grad_norm": 0.7969862222671509, "learning_rate": 9.970972670551566e-05, "loss": 0.0396, "step": 8260 }, { "epoch": 3.061828952239911, "grad_norm": 1.3189862966537476, "learning_rate": 9.970794489478348e-05, "loss": 0.0544, "step": 8270 }, { "epoch": 3.0655312847093668, "grad_norm": 0.31267067790031433, "learning_rate": 9.970615764806516e-05, "loss": 0.0452, "step": 8280 }, { "epoch": 3.069233617178823, "grad_norm": 0.37295666337013245, "learning_rate": 9.970436496555617e-05, "loss": 0.0404, "step": 8290 }, { "epoch": 3.0729359496482784, "grad_norm": 0.6406019330024719, "learning_rate": 9.970256684745258e-05, "loss": 0.0408, "step": 8300 }, { "epoch": 3.076638282117734, "grad_norm": 0.45908844470977783, "learning_rate": 9.970076329395098e-05, "loss": 0.0268, "step": 8310 }, { "epoch": 3.08034061458719, "grad_norm": 0.7154117822647095, "learning_rate": 9.969895430524865e-05, "loss": 0.0702, "step": 8320 }, { "epoch": 3.0840429470566457, "grad_norm": 1.1496729850769043, "learning_rate": 9.969713988154339e-05, "loss": 0.0592, "step": 8330 }, { "epoch": 3.0877452795261013, "grad_norm": 1.6149482727050781, "learning_rate": 9.969532002303363e-05, "loss": 0.0568, "step": 8340 }, { "epoch": 3.0914476119955574, "grad_norm": 0.5557804107666016, "learning_rate": 9.969349472991838e-05, "loss": 0.0408, "step": 8350 }, { "epoch": 3.095149944465013, "grad_norm": 0.5655856728553772, "learning_rate": 9.969166400239726e-05, "loss": 0.0471, "step": 8360 }, { "epoch": 3.0988522769344686, "grad_norm": 0.42767566442489624, "learning_rate": 9.968982784067049e-05, "loss": 0.0607, "step": 8370 }, { "epoch": 3.1025546094039242, "grad_norm": 0.3109833598136902, "learning_rate": 9.968798624493885e-05, "loss": 0.0626, "step": 8380 }, { "epoch": 3.1062569418733803, "grad_norm": 1.5858838558197021, "learning_rate": 9.968613921540373e-05, "loss": 0.0541, "step": 8390 }, { "epoch": 3.109959274342836, "grad_norm": 0.9221243858337402, "learning_rate": 9.968428675226714e-05, "loss": 0.047, "step": 8400 }, { "epoch": 3.1136616068122915, "grad_norm": 0.46590495109558105, "learning_rate": 9.968242885573165e-05, "loss": 0.0542, "step": 8410 }, { "epoch": 3.1173639392817476, "grad_norm": 0.5895950198173523, "learning_rate": 9.968056552600043e-05, "loss": 0.0436, "step": 8420 }, { "epoch": 3.121066271751203, "grad_norm": 0.4294150769710541, "learning_rate": 9.967869676327726e-05, "loss": 0.049, "step": 8430 }, { "epoch": 3.124768604220659, "grad_norm": 0.3577798306941986, "learning_rate": 9.967682256776649e-05, "loss": 0.0334, "step": 8440 }, { "epoch": 3.128470936690115, "grad_norm": 0.18765830993652344, "learning_rate": 9.967494293967312e-05, "loss": 0.0437, "step": 8450 }, { "epoch": 3.1321732691595705, "grad_norm": 0.7858066558837891, "learning_rate": 9.967305787920264e-05, "loss": 0.0562, "step": 8460 }, { "epoch": 3.135875601629026, "grad_norm": 0.6371629238128662, "learning_rate": 9.967116738656126e-05, "loss": 0.0496, "step": 8470 }, { "epoch": 3.139577934098482, "grad_norm": 0.3670298457145691, "learning_rate": 9.966927146195568e-05, "loss": 0.0506, "step": 8480 }, { "epoch": 3.143280266567938, "grad_norm": 1.1010178327560425, "learning_rate": 9.966737010559326e-05, "loss": 0.0614, "step": 8490 }, { "epoch": 3.1469825990373934, "grad_norm": 0.3833625614643097, "learning_rate": 9.966546331768191e-05, "loss": 0.0607, "step": 8500 }, { "epoch": 3.1506849315068495, "grad_norm": 0.5338154435157776, "learning_rate": 9.966355109843018e-05, "loss": 0.0646, "step": 8510 }, { "epoch": 3.154387263976305, "grad_norm": 0.46006524562835693, "learning_rate": 9.966163344804716e-05, "loss": 0.0565, "step": 8520 }, { "epoch": 3.1580895964457607, "grad_norm": 0.5559606552124023, "learning_rate": 9.965971036674255e-05, "loss": 0.0476, "step": 8530 }, { "epoch": 3.1617919289152168, "grad_norm": 0.3462209105491638, "learning_rate": 9.96577818547267e-05, "loss": 0.0491, "step": 8540 }, { "epoch": 3.1654942613846724, "grad_norm": 0.630509614944458, "learning_rate": 9.965584791221048e-05, "loss": 0.0612, "step": 8550 }, { "epoch": 3.169196593854128, "grad_norm": 0.35430237650871277, "learning_rate": 9.96539085394054e-05, "loss": 0.0431, "step": 8560 }, { "epoch": 3.172898926323584, "grad_norm": 0.6210508942604065, "learning_rate": 9.965196373652351e-05, "loss": 0.0439, "step": 8570 }, { "epoch": 3.1766012587930397, "grad_norm": 0.31630268692970276, "learning_rate": 9.965001350377753e-05, "loss": 0.0475, "step": 8580 }, { "epoch": 3.1803035912624953, "grad_norm": 0.6861318349838257, "learning_rate": 9.964805784138072e-05, "loss": 0.0338, "step": 8590 }, { "epoch": 3.1840059237319513, "grad_norm": 0.2325112372636795, "learning_rate": 9.964609674954696e-05, "loss": 0.0431, "step": 8600 }, { "epoch": 3.187708256201407, "grad_norm": 0.3484036922454834, "learning_rate": 9.964413022849068e-05, "loss": 0.039, "step": 8610 }, { "epoch": 3.1914105886708626, "grad_norm": 0.37834542989730835, "learning_rate": 9.964215827842698e-05, "loss": 0.0591, "step": 8620 }, { "epoch": 3.195112921140318, "grad_norm": 0.49438267946243286, "learning_rate": 9.964018089957147e-05, "loss": 0.0523, "step": 8630 }, { "epoch": 3.1988152536097743, "grad_norm": 0.6419568061828613, "learning_rate": 9.963819809214041e-05, "loss": 0.0526, "step": 8640 }, { "epoch": 3.20251758607923, "grad_norm": 0.6687449216842651, "learning_rate": 9.963620985635065e-05, "loss": 0.0454, "step": 8650 }, { "epoch": 3.2062199185486855, "grad_norm": 0.9991720914840698, "learning_rate": 9.96342161924196e-05, "loss": 0.053, "step": 8660 }, { "epoch": 3.2099222510181415, "grad_norm": 0.25290447473526, "learning_rate": 9.963221710056529e-05, "loss": 0.0536, "step": 8670 }, { "epoch": 3.213624583487597, "grad_norm": 0.8883000612258911, "learning_rate": 9.963021258100633e-05, "loss": 0.046, "step": 8680 }, { "epoch": 3.217326915957053, "grad_norm": 0.6249110102653503, "learning_rate": 9.962820263396195e-05, "loss": 0.0464, "step": 8690 }, { "epoch": 3.221029248426509, "grad_norm": 0.6721606850624084, "learning_rate": 9.962618725965196e-05, "loss": 0.0518, "step": 8700 }, { "epoch": 3.2247315808959645, "grad_norm": 0.34805232286453247, "learning_rate": 9.962416645829672e-05, "loss": 0.0498, "step": 8710 }, { "epoch": 3.22843391336542, "grad_norm": 1.1840654611587524, "learning_rate": 9.962214023011725e-05, "loss": 0.0494, "step": 8720 }, { "epoch": 3.232136245834876, "grad_norm": 0.3208483159542084, "learning_rate": 9.962010857533514e-05, "loss": 0.0397, "step": 8730 }, { "epoch": 3.2358385783043317, "grad_norm": 0.3602789044380188, "learning_rate": 9.961807149417256e-05, "loss": 0.0544, "step": 8740 }, { "epoch": 3.2395409107737874, "grad_norm": 0.38687050342559814, "learning_rate": 9.961602898685226e-05, "loss": 0.0451, "step": 8750 }, { "epoch": 3.2432432432432434, "grad_norm": 0.44546279311180115, "learning_rate": 9.961398105359764e-05, "loss": 0.0483, "step": 8760 }, { "epoch": 3.246945575712699, "grad_norm": 0.45689454674720764, "learning_rate": 9.961192769463264e-05, "loss": 0.0452, "step": 8770 }, { "epoch": 3.2506479081821547, "grad_norm": 0.7935534715652466, "learning_rate": 9.960986891018183e-05, "loss": 0.0516, "step": 8780 }, { "epoch": 3.2543502406516103, "grad_norm": 0.5152283906936646, "learning_rate": 9.960780470047033e-05, "loss": 0.056, "step": 8790 }, { "epoch": 3.2580525731210663, "grad_norm": 0.4295962452888489, "learning_rate": 9.96057350657239e-05, "loss": 0.0405, "step": 8800 }, { "epoch": 3.261754905590522, "grad_norm": 0.4544573426246643, "learning_rate": 9.960366000616885e-05, "loss": 0.0465, "step": 8810 }, { "epoch": 3.2654572380599776, "grad_norm": 0.7920482158660889, "learning_rate": 9.960157952203214e-05, "loss": 0.0472, "step": 8820 }, { "epoch": 3.2691595705294336, "grad_norm": 0.8529848456382751, "learning_rate": 9.959949361354126e-05, "loss": 0.0616, "step": 8830 }, { "epoch": 3.2728619029988892, "grad_norm": 0.33752381801605225, "learning_rate": 9.959740228092434e-05, "loss": 0.04, "step": 8840 }, { "epoch": 3.276564235468345, "grad_norm": 0.4068237245082855, "learning_rate": 9.959530552441005e-05, "loss": 0.0505, "step": 8850 }, { "epoch": 3.280266567937801, "grad_norm": 0.3404857814311981, "learning_rate": 9.959320334422772e-05, "loss": 0.044, "step": 8860 }, { "epoch": 3.2839689004072565, "grad_norm": 0.29635435342788696, "learning_rate": 9.959109574060726e-05, "loss": 0.0388, "step": 8870 }, { "epoch": 3.287671232876712, "grad_norm": 0.5862019658088684, "learning_rate": 9.958898271377911e-05, "loss": 0.0493, "step": 8880 }, { "epoch": 3.291373565346168, "grad_norm": 0.7364274263381958, "learning_rate": 9.958686426397437e-05, "loss": 0.0443, "step": 8890 }, { "epoch": 3.295075897815624, "grad_norm": 1.2766162157058716, "learning_rate": 9.95847403914247e-05, "loss": 0.0541, "step": 8900 }, { "epoch": 3.2987782302850794, "grad_norm": 0.32108375430107117, "learning_rate": 9.958261109636238e-05, "loss": 0.0467, "step": 8910 }, { "epoch": 3.3024805627545355, "grad_norm": 0.5035872459411621, "learning_rate": 9.958047637902025e-05, "loss": 0.0448, "step": 8920 }, { "epoch": 3.306182895223991, "grad_norm": 0.631062388420105, "learning_rate": 9.957833623963177e-05, "loss": 0.0404, "step": 8930 }, { "epoch": 3.3098852276934467, "grad_norm": 0.4304696023464203, "learning_rate": 9.957619067843098e-05, "loss": 0.0397, "step": 8940 }, { "epoch": 3.313587560162903, "grad_norm": 0.29231688380241394, "learning_rate": 9.95740396956525e-05, "loss": 0.0467, "step": 8950 }, { "epoch": 3.3172898926323584, "grad_norm": 0.36338841915130615, "learning_rate": 9.95718832915316e-05, "loss": 0.0599, "step": 8960 }, { "epoch": 3.320992225101814, "grad_norm": 0.33326008915901184, "learning_rate": 9.956972146630405e-05, "loss": 0.0456, "step": 8970 }, { "epoch": 3.32469455757127, "grad_norm": 0.31440269947052, "learning_rate": 9.95675542202063e-05, "loss": 0.0533, "step": 8980 }, { "epoch": 3.3283968900407257, "grad_norm": 0.19008393585681915, "learning_rate": 9.956538155347534e-05, "loss": 0.0402, "step": 8990 }, { "epoch": 3.3320992225101813, "grad_norm": 0.27238529920578003, "learning_rate": 9.956320346634876e-05, "loss": 0.0446, "step": 9000 }, { "epoch": 3.3358015549796374, "grad_norm": 0.993145227432251, "learning_rate": 9.956101995906479e-05, "loss": 0.0497, "step": 9010 }, { "epoch": 3.339503887449093, "grad_norm": 0.2236570566892624, "learning_rate": 9.955883103186218e-05, "loss": 0.0378, "step": 9020 }, { "epoch": 3.3432062199185486, "grad_norm": 0.35319218039512634, "learning_rate": 9.955663668498032e-05, "loss": 0.039, "step": 9030 }, { "epoch": 3.3469085523880047, "grad_norm": 0.6803251504898071, "learning_rate": 9.955443691865917e-05, "loss": 0.0508, "step": 9040 }, { "epoch": 3.3506108848574603, "grad_norm": 0.24138985574245453, "learning_rate": 9.955223173313931e-05, "loss": 0.0517, "step": 9050 }, { "epoch": 3.354313217326916, "grad_norm": 0.4137996435165405, "learning_rate": 9.95500211286619e-05, "loss": 0.0394, "step": 9060 }, { "epoch": 3.358015549796372, "grad_norm": 0.7139816284179688, "learning_rate": 9.954780510546866e-05, "loss": 0.0449, "step": 9070 }, { "epoch": 3.3617178822658276, "grad_norm": 0.4769621789455414, "learning_rate": 9.954558366380195e-05, "loss": 0.0504, "step": 9080 }, { "epoch": 3.365420214735283, "grad_norm": 0.5325402021408081, "learning_rate": 9.95433568039047e-05, "loss": 0.0601, "step": 9090 }, { "epoch": 3.369122547204739, "grad_norm": 0.24619315564632416, "learning_rate": 9.954112452602045e-05, "loss": 0.0585, "step": 9100 }, { "epoch": 3.372824879674195, "grad_norm": 0.5308611989021301, "learning_rate": 9.95388868303933e-05, "loss": 0.042, "step": 9110 }, { "epoch": 3.3765272121436505, "grad_norm": 0.4345863163471222, "learning_rate": 9.953664371726797e-05, "loss": 0.0376, "step": 9120 }, { "epoch": 3.380229544613106, "grad_norm": 0.799601674079895, "learning_rate": 9.953439518688974e-05, "loss": 0.0599, "step": 9130 }, { "epoch": 3.383931877082562, "grad_norm": 0.35244646668434143, "learning_rate": 9.953214123950454e-05, "loss": 0.0518, "step": 9140 }, { "epoch": 3.3876342095520178, "grad_norm": 0.32283830642700195, "learning_rate": 9.952988187535886e-05, "loss": 0.0594, "step": 9150 }, { "epoch": 3.3913365420214734, "grad_norm": 0.290268212556839, "learning_rate": 9.952761709469975e-05, "loss": 0.0492, "step": 9160 }, { "epoch": 3.3950388744909294, "grad_norm": 0.28423768281936646, "learning_rate": 9.952534689777489e-05, "loss": 0.0487, "step": 9170 }, { "epoch": 3.398741206960385, "grad_norm": 0.4903419315814972, "learning_rate": 9.952307128483256e-05, "loss": 0.0452, "step": 9180 }, { "epoch": 3.4024435394298407, "grad_norm": 0.3382343649864197, "learning_rate": 9.952079025612162e-05, "loss": 0.0442, "step": 9190 }, { "epoch": 3.4061458718992967, "grad_norm": 0.26325491070747375, "learning_rate": 9.95185038118915e-05, "loss": 0.0352, "step": 9200 }, { "epoch": 3.4098482043687524, "grad_norm": 0.2575511932373047, "learning_rate": 9.951621195239227e-05, "loss": 0.0354, "step": 9210 }, { "epoch": 3.413550536838208, "grad_norm": 0.34924963116645813, "learning_rate": 9.951391467787452e-05, "loss": 0.0409, "step": 9220 }, { "epoch": 3.4172528693076636, "grad_norm": 0.41855478286743164, "learning_rate": 9.951161198858953e-05, "loss": 0.0474, "step": 9230 }, { "epoch": 3.4209552017771196, "grad_norm": 0.6499406695365906, "learning_rate": 9.950930388478908e-05, "loss": 0.0425, "step": 9240 }, { "epoch": 3.4246575342465753, "grad_norm": 1.5365420579910278, "learning_rate": 9.950699036672559e-05, "loss": 0.0513, "step": 9250 }, { "epoch": 3.428359866716031, "grad_norm": 0.5783532857894897, "learning_rate": 9.950467143465207e-05, "loss": 0.0406, "step": 9260 }, { "epoch": 3.432062199185487, "grad_norm": 0.21111617982387543, "learning_rate": 9.950234708882212e-05, "loss": 0.0378, "step": 9270 }, { "epoch": 3.4357645316549426, "grad_norm": 0.5452075004577637, "learning_rate": 9.95000173294899e-05, "loss": 0.0422, "step": 9280 }, { "epoch": 3.439466864124398, "grad_norm": 0.38843458890914917, "learning_rate": 9.949768215691022e-05, "loss": 0.0461, "step": 9290 }, { "epoch": 3.4431691965938542, "grad_norm": 0.4740297794342041, "learning_rate": 9.949534157133844e-05, "loss": 0.0524, "step": 9300 }, { "epoch": 3.44687152906331, "grad_norm": 0.3487686812877655, "learning_rate": 9.949299557303051e-05, "loss": 0.0589, "step": 9310 }, { "epoch": 3.4505738615327655, "grad_norm": 0.3194868266582489, "learning_rate": 9.949064416224301e-05, "loss": 0.0472, "step": 9320 }, { "epoch": 3.4542761940022215, "grad_norm": 0.34994328022003174, "learning_rate": 9.948828733923305e-05, "loss": 0.0573, "step": 9330 }, { "epoch": 3.457978526471677, "grad_norm": 0.5668218731880188, "learning_rate": 9.948592510425842e-05, "loss": 0.0364, "step": 9340 }, { "epoch": 3.4616808589411328, "grad_norm": 0.5993531942367554, "learning_rate": 9.948355745757741e-05, "loss": 0.045, "step": 9350 }, { "epoch": 3.465383191410589, "grad_norm": 0.41019952297210693, "learning_rate": 9.948118439944895e-05, "loss": 0.0377, "step": 9360 }, { "epoch": 3.4690855238800444, "grad_norm": 0.3183557987213135, "learning_rate": 9.947880593013255e-05, "loss": 0.0484, "step": 9370 }, { "epoch": 3.4727878563495, "grad_norm": 0.4042440950870514, "learning_rate": 9.947642204988835e-05, "loss": 0.0474, "step": 9380 }, { "epoch": 3.476490188818956, "grad_norm": 0.31576457619667053, "learning_rate": 9.9474032758977e-05, "loss": 0.053, "step": 9390 }, { "epoch": 3.4801925212884117, "grad_norm": 0.5087863206863403, "learning_rate": 9.94716380576598e-05, "loss": 0.0415, "step": 9400 }, { "epoch": 3.4838948537578673, "grad_norm": 0.33447012305259705, "learning_rate": 9.946923794619867e-05, "loss": 0.0427, "step": 9410 }, { "epoch": 3.4875971862273234, "grad_norm": 0.4139365255832672, "learning_rate": 9.946683242485604e-05, "loss": 0.0507, "step": 9420 }, { "epoch": 3.491299518696779, "grad_norm": 0.3293374180793762, "learning_rate": 9.946442149389497e-05, "loss": 0.0351, "step": 9430 }, { "epoch": 3.4950018511662346, "grad_norm": 0.4002557396888733, "learning_rate": 9.946200515357916e-05, "loss": 0.0624, "step": 9440 }, { "epoch": 3.4987041836356907, "grad_norm": 0.4885154962539673, "learning_rate": 9.945958340417283e-05, "loss": 0.0501, "step": 9450 }, { "epoch": 3.5024065161051463, "grad_norm": 0.37228238582611084, "learning_rate": 9.945715624594081e-05, "loss": 0.0428, "step": 9460 }, { "epoch": 3.506108848574602, "grad_norm": 0.20963998138904572, "learning_rate": 9.945472367914855e-05, "loss": 0.0337, "step": 9470 }, { "epoch": 3.509811181044058, "grad_norm": 0.39341792464256287, "learning_rate": 9.945228570406205e-05, "loss": 0.0369, "step": 9480 }, { "epoch": 3.5135135135135136, "grad_norm": 0.19017550349235535, "learning_rate": 9.944984232094794e-05, "loss": 0.042, "step": 9490 }, { "epoch": 3.517215845982969, "grad_norm": 0.3878214657306671, "learning_rate": 9.944739353007344e-05, "loss": 0.0447, "step": 9500 }, { "epoch": 3.5209181784524253, "grad_norm": 0.29828569293022156, "learning_rate": 9.94449393317063e-05, "loss": 0.0375, "step": 9510 }, { "epoch": 3.524620510921881, "grad_norm": 1.4864635467529297, "learning_rate": 9.944247972611494e-05, "loss": 0.0543, "step": 9520 }, { "epoch": 3.5283228433913365, "grad_norm": 0.45949506759643555, "learning_rate": 9.944001471356835e-05, "loss": 0.0448, "step": 9530 }, { "epoch": 3.5320251758607926, "grad_norm": 0.2900116443634033, "learning_rate": 9.943754429433606e-05, "loss": 0.0498, "step": 9540 }, { "epoch": 3.535727508330248, "grad_norm": 0.4369143843650818, "learning_rate": 9.943506846868826e-05, "loss": 0.0434, "step": 9550 }, { "epoch": 3.539429840799704, "grad_norm": 0.2815883457660675, "learning_rate": 9.94325872368957e-05, "loss": 0.0413, "step": 9560 }, { "epoch": 3.5431321732691594, "grad_norm": 1.0715023279190063, "learning_rate": 9.943010059922973e-05, "loss": 0.0437, "step": 9570 }, { "epoch": 3.5468345057386155, "grad_norm": 0.30114156007766724, "learning_rate": 9.942760855596226e-05, "loss": 0.0496, "step": 9580 }, { "epoch": 3.550536838208071, "grad_norm": 0.8074672222137451, "learning_rate": 9.942511110736584e-05, "loss": 0.0405, "step": 9590 }, { "epoch": 3.5542391706775267, "grad_norm": 0.5031940340995789, "learning_rate": 9.942260825371358e-05, "loss": 0.0534, "step": 9600 }, { "epoch": 3.5579415031469828, "grad_norm": 0.3794572353363037, "learning_rate": 9.94200999952792e-05, "loss": 0.0756, "step": 9610 }, { "epoch": 3.5616438356164384, "grad_norm": 0.49994346499443054, "learning_rate": 9.941758633233696e-05, "loss": 0.0602, "step": 9620 }, { "epoch": 3.565346168085894, "grad_norm": 0.42696917057037354, "learning_rate": 9.941506726516179e-05, "loss": 0.0504, "step": 9630 }, { "epoch": 3.5690485005553496, "grad_norm": 0.572075366973877, "learning_rate": 9.941254279402915e-05, "loss": 0.062, "step": 9640 }, { "epoch": 3.5727508330248057, "grad_norm": 0.25338518619537354, "learning_rate": 9.941001291921512e-05, "loss": 0.0459, "step": 9650 }, { "epoch": 3.5764531654942613, "grad_norm": 0.28398460149765015, "learning_rate": 9.940747764099638e-05, "loss": 0.0354, "step": 9660 }, { "epoch": 3.580155497963717, "grad_norm": 0.5692498683929443, "learning_rate": 9.940493695965016e-05, "loss": 0.0577, "step": 9670 }, { "epoch": 3.583857830433173, "grad_norm": 0.5029468536376953, "learning_rate": 9.940239087545431e-05, "loss": 0.0431, "step": 9680 }, { "epoch": 3.5875601629026286, "grad_norm": 0.4097803831100464, "learning_rate": 9.939983938868726e-05, "loss": 0.0628, "step": 9690 }, { "epoch": 3.591262495372084, "grad_norm": 1.1207470893859863, "learning_rate": 9.939728249962807e-05, "loss": 0.0589, "step": 9700 }, { "epoch": 3.5949648278415403, "grad_norm": 0.36201655864715576, "learning_rate": 9.939472020855633e-05, "loss": 0.0437, "step": 9710 }, { "epoch": 3.598667160310996, "grad_norm": 0.5178046822547913, "learning_rate": 9.939215251575224e-05, "loss": 0.0412, "step": 9720 }, { "epoch": 3.6023694927804515, "grad_norm": 0.2805769145488739, "learning_rate": 9.93895794214966e-05, "loss": 0.0428, "step": 9730 }, { "epoch": 3.6060718252499075, "grad_norm": 1.0265041589736938, "learning_rate": 9.938700092607083e-05, "loss": 0.0393, "step": 9740 }, { "epoch": 3.609774157719363, "grad_norm": 0.27804049849510193, "learning_rate": 9.938441702975689e-05, "loss": 0.0454, "step": 9750 }, { "epoch": 3.613476490188819, "grad_norm": 0.29519858956336975, "learning_rate": 9.938182773283735e-05, "loss": 0.0408, "step": 9760 }, { "epoch": 3.617178822658275, "grad_norm": 0.3501887321472168, "learning_rate": 9.937923303559538e-05, "loss": 0.0415, "step": 9770 }, { "epoch": 3.6208811551277305, "grad_norm": 0.2487824708223343, "learning_rate": 9.937663293831471e-05, "loss": 0.0348, "step": 9780 }, { "epoch": 3.624583487597186, "grad_norm": 0.4012928307056427, "learning_rate": 9.93740274412797e-05, "loss": 0.056, "step": 9790 }, { "epoch": 3.628285820066642, "grad_norm": 0.3310798704624176, "learning_rate": 9.937141654477528e-05, "loss": 0.0392, "step": 9800 }, { "epoch": 3.6319881525360977, "grad_norm": 0.2311433106660843, "learning_rate": 9.9368800249087e-05, "loss": 0.0407, "step": 9810 }, { "epoch": 3.6356904850055534, "grad_norm": 0.4627179503440857, "learning_rate": 9.936617855450092e-05, "loss": 0.0517, "step": 9820 }, { "epoch": 3.6393928174750094, "grad_norm": 0.3468697965145111, "learning_rate": 9.936355146130379e-05, "loss": 0.0444, "step": 9830 }, { "epoch": 3.643095149944465, "grad_norm": 0.47202494740486145, "learning_rate": 9.936091896978289e-05, "loss": 0.036, "step": 9840 }, { "epoch": 3.6467974824139207, "grad_norm": 0.40438759326934814, "learning_rate": 9.93582810802261e-05, "loss": 0.0542, "step": 9850 }, { "epoch": 3.6504998148833767, "grad_norm": 0.38317063450813293, "learning_rate": 9.93556377929219e-05, "loss": 0.0415, "step": 9860 }, { "epoch": 3.6542021473528323, "grad_norm": 0.882982611656189, "learning_rate": 9.935298910815936e-05, "loss": 0.0422, "step": 9870 }, { "epoch": 3.657904479822288, "grad_norm": 0.2890973389148712, "learning_rate": 9.935033502622813e-05, "loss": 0.0469, "step": 9880 }, { "epoch": 3.661606812291744, "grad_norm": 0.3368619382381439, "learning_rate": 9.934767554741846e-05, "loss": 0.0571, "step": 9890 }, { "epoch": 3.6653091447611996, "grad_norm": 1.4884029626846313, "learning_rate": 9.934501067202117e-05, "loss": 0.0427, "step": 9900 }, { "epoch": 3.6690114772306552, "grad_norm": 0.7529737949371338, "learning_rate": 9.934234040032773e-05, "loss": 0.0583, "step": 9910 }, { "epoch": 3.6727138097001113, "grad_norm": 0.291808158159256, "learning_rate": 9.93396647326301e-05, "loss": 0.0415, "step": 9920 }, { "epoch": 3.676416142169567, "grad_norm": 0.37896764278411865, "learning_rate": 9.933698366922093e-05, "loss": 0.0509, "step": 9930 }, { "epoch": 3.6801184746390225, "grad_norm": 0.3226650357246399, "learning_rate": 9.93342972103934e-05, "loss": 0.0343, "step": 9940 }, { "epoch": 3.6838208071084786, "grad_norm": 0.3963494896888733, "learning_rate": 9.93316053564413e-05, "loss": 0.0436, "step": 9950 }, { "epoch": 3.687523139577934, "grad_norm": 0.4156726896762848, "learning_rate": 9.932890810765902e-05, "loss": 0.037, "step": 9960 }, { "epoch": 3.69122547204739, "grad_norm": 0.451204776763916, "learning_rate": 9.932620546434151e-05, "loss": 0.0301, "step": 9970 }, { "epoch": 3.694927804516846, "grad_norm": 0.2114568054676056, "learning_rate": 9.932349742678433e-05, "loss": 0.0301, "step": 9980 }, { "epoch": 3.6986301369863015, "grad_norm": 0.7390180230140686, "learning_rate": 9.932078399528361e-05, "loss": 0.0584, "step": 9990 }, { "epoch": 3.702332469455757, "grad_norm": 0.5608471035957336, "learning_rate": 9.931806517013612e-05, "loss": 0.0445, "step": 10000 }, { "epoch": 3.7060348019252127, "grad_norm": 0.28636929392814636, "learning_rate": 9.931534095163916e-05, "loss": 0.0341, "step": 10010 }, { "epoch": 3.709737134394669, "grad_norm": 0.39831414818763733, "learning_rate": 9.931261134009069e-05, "loss": 0.0737, "step": 10020 }, { "epoch": 3.7134394668641244, "grad_norm": 0.47219353914260864, "learning_rate": 9.930987633578915e-05, "loss": 0.0501, "step": 10030 }, { "epoch": 3.71714179933358, "grad_norm": 0.33501482009887695, "learning_rate": 9.930713593903369e-05, "loss": 0.0481, "step": 10040 }, { "epoch": 3.7208441318030356, "grad_norm": 0.26657384634017944, "learning_rate": 9.930439015012396e-05, "loss": 0.0422, "step": 10050 }, { "epoch": 3.7245464642724917, "grad_norm": 0.7021580338478088, "learning_rate": 9.930163896936027e-05, "loss": 0.0411, "step": 10060 }, { "epoch": 3.7282487967419473, "grad_norm": 0.37857306003570557, "learning_rate": 9.929888239704345e-05, "loss": 0.0325, "step": 10070 }, { "epoch": 3.731951129211403, "grad_norm": 0.26796677708625793, "learning_rate": 9.929612043347498e-05, "loss": 0.0443, "step": 10080 }, { "epoch": 3.735653461680859, "grad_norm": 0.26812517642974854, "learning_rate": 9.929335307895689e-05, "loss": 0.0414, "step": 10090 }, { "epoch": 3.7393557941503146, "grad_norm": 0.8585242629051208, "learning_rate": 9.929058033379181e-05, "loss": 0.0543, "step": 10100 }, { "epoch": 3.74305812661977, "grad_norm": 0.20859141647815704, "learning_rate": 9.9287802198283e-05, "loss": 0.0519, "step": 10110 }, { "epoch": 3.7467604590892263, "grad_norm": 0.3133762776851654, "learning_rate": 9.928501867273423e-05, "loss": 0.0455, "step": 10120 }, { "epoch": 3.750462791558682, "grad_norm": 0.3743823766708374, "learning_rate": 9.928222975744991e-05, "loss": 0.043, "step": 10130 }, { "epoch": 3.7541651240281375, "grad_norm": 0.7152413129806519, "learning_rate": 9.927943545273504e-05, "loss": 0.0352, "step": 10140 }, { "epoch": 3.7578674564975936, "grad_norm": 0.34856945276260376, "learning_rate": 9.927663575889521e-05, "loss": 0.0515, "step": 10150 }, { "epoch": 3.761569788967049, "grad_norm": 0.6770338416099548, "learning_rate": 9.927383067623657e-05, "loss": 0.038, "step": 10160 }, { "epoch": 3.765272121436505, "grad_norm": 0.7418534755706787, "learning_rate": 9.927102020506588e-05, "loss": 0.0423, "step": 10170 }, { "epoch": 3.768974453905961, "grad_norm": 0.25797131657600403, "learning_rate": 9.926820434569051e-05, "loss": 0.0476, "step": 10180 }, { "epoch": 3.7726767863754165, "grad_norm": 0.2747720777988434, "learning_rate": 9.926538309841839e-05, "loss": 0.0573, "step": 10190 }, { "epoch": 3.776379118844872, "grad_norm": 0.39335164427757263, "learning_rate": 9.926255646355804e-05, "loss": 0.0369, "step": 10200 }, { "epoch": 3.780081451314328, "grad_norm": 0.334306925535202, "learning_rate": 9.925972444141858e-05, "loss": 0.032, "step": 10210 }, { "epoch": 3.7837837837837838, "grad_norm": 0.30151796340942383, "learning_rate": 9.92568870323097e-05, "loss": 0.0461, "step": 10220 }, { "epoch": 3.7874861162532394, "grad_norm": 0.368055522441864, "learning_rate": 9.925404423654174e-05, "loss": 0.0499, "step": 10230 }, { "epoch": 3.7911884487226954, "grad_norm": 0.23567283153533936, "learning_rate": 9.925119605442554e-05, "loss": 0.055, "step": 10240 }, { "epoch": 3.794890781192151, "grad_norm": 0.35681676864624023, "learning_rate": 9.92483424862726e-05, "loss": 0.0356, "step": 10250 }, { "epoch": 3.7985931136616067, "grad_norm": 0.7239909172058105, "learning_rate": 9.924548353239495e-05, "loss": 0.0539, "step": 10260 }, { "epoch": 3.8022954461310627, "grad_norm": 0.32610049843788147, "learning_rate": 9.924261919310527e-05, "loss": 0.0302, "step": 10270 }, { "epoch": 3.8059977786005184, "grad_norm": 0.4758892357349396, "learning_rate": 9.923974946871679e-05, "loss": 0.0438, "step": 10280 }, { "epoch": 3.809700111069974, "grad_norm": 0.5681472420692444, "learning_rate": 9.923687435954334e-05, "loss": 0.0465, "step": 10290 }, { "epoch": 3.81340244353943, "grad_norm": 0.5661081671714783, "learning_rate": 9.923399386589933e-05, "loss": 0.0391, "step": 10300 }, { "epoch": 3.8171047760088856, "grad_norm": 0.1776338517665863, "learning_rate": 9.923110798809978e-05, "loss": 0.0458, "step": 10310 }, { "epoch": 3.8208071084783413, "grad_norm": 0.31126439571380615, "learning_rate": 9.922821672646027e-05, "loss": 0.0391, "step": 10320 }, { "epoch": 3.8245094409477973, "grad_norm": 0.5467144250869751, "learning_rate": 9.9225320081297e-05, "loss": 0.043, "step": 10330 }, { "epoch": 3.828211773417253, "grad_norm": 0.3818420469760895, "learning_rate": 9.922241805292674e-05, "loss": 0.0536, "step": 10340 }, { "epoch": 3.8319141058867086, "grad_norm": 1.2595306634902954, "learning_rate": 9.921951064166684e-05, "loss": 0.0386, "step": 10350 }, { "epoch": 3.8356164383561646, "grad_norm": 0.24238716065883636, "learning_rate": 9.921659784783526e-05, "loss": 0.051, "step": 10360 }, { "epoch": 3.8393187708256202, "grad_norm": 0.3219453990459442, "learning_rate": 9.921367967175052e-05, "loss": 0.0435, "step": 10370 }, { "epoch": 3.843021103295076, "grad_norm": 0.37658238410949707, "learning_rate": 9.921075611373179e-05, "loss": 0.0414, "step": 10380 }, { "epoch": 3.846723435764532, "grad_norm": 0.4870176911354065, "learning_rate": 9.920782717409873e-05, "loss": 0.0394, "step": 10390 }, { "epoch": 3.8504257682339875, "grad_norm": 0.4594362676143646, "learning_rate": 9.92048928531717e-05, "loss": 0.0414, "step": 10400 }, { "epoch": 3.854128100703443, "grad_norm": 0.38341495394706726, "learning_rate": 9.920195315127155e-05, "loss": 0.0456, "step": 10410 }, { "epoch": 3.857830433172899, "grad_norm": 0.40470513701438904, "learning_rate": 9.919900806871976e-05, "loss": 0.0393, "step": 10420 }, { "epoch": 3.861532765642355, "grad_norm": 0.1972712278366089, "learning_rate": 9.919605760583845e-05, "loss": 0.0341, "step": 10430 }, { "epoch": 3.8652350981118104, "grad_norm": 0.5341705679893494, "learning_rate": 9.919310176295022e-05, "loss": 0.0381, "step": 10440 }, { "epoch": 3.868937430581266, "grad_norm": 0.9306214451789856, "learning_rate": 9.919014054037836e-05, "loss": 0.0464, "step": 10450 }, { "epoch": 3.872639763050722, "grad_norm": 0.19788560271263123, "learning_rate": 9.918717393844669e-05, "loss": 0.0564, "step": 10460 }, { "epoch": 3.8763420955201777, "grad_norm": 0.4614839255809784, "learning_rate": 9.918420195747962e-05, "loss": 0.045, "step": 10470 }, { "epoch": 3.8800444279896333, "grad_norm": 0.2777542769908905, "learning_rate": 9.918122459780217e-05, "loss": 0.0458, "step": 10480 }, { "epoch": 3.883746760459089, "grad_norm": 0.4761795103549957, "learning_rate": 9.917824185973994e-05, "loss": 0.0376, "step": 10490 }, { "epoch": 3.887449092928545, "grad_norm": 0.6137073040008545, "learning_rate": 9.917525374361912e-05, "loss": 0.0391, "step": 10500 }, { "epoch": 3.8911514253980006, "grad_norm": 1.4308817386627197, "learning_rate": 9.917226024976649e-05, "loss": 0.0555, "step": 10510 }, { "epoch": 3.8948537578674562, "grad_norm": 0.9913435578346252, "learning_rate": 9.91692613785094e-05, "loss": 0.0413, "step": 10520 }, { "epoch": 3.8985560903369123, "grad_norm": 0.6160308718681335, "learning_rate": 9.916625713017583e-05, "loss": 0.0633, "step": 10530 }, { "epoch": 3.902258422806368, "grad_norm": 0.7191548943519592, "learning_rate": 9.916324750509427e-05, "loss": 0.0646, "step": 10540 }, { "epoch": 3.9059607552758235, "grad_norm": 0.40342530608177185, "learning_rate": 9.91602325035939e-05, "loss": 0.0492, "step": 10550 }, { "epoch": 3.9096630877452796, "grad_norm": 0.30707457661628723, "learning_rate": 9.915721212600441e-05, "loss": 0.0442, "step": 10560 }, { "epoch": 3.913365420214735, "grad_norm": 0.3921012878417969, "learning_rate": 9.91541863726561e-05, "loss": 0.0531, "step": 10570 }, { "epoch": 3.917067752684191, "grad_norm": 0.25046592950820923, "learning_rate": 9.915115524387988e-05, "loss": 0.0359, "step": 10580 }, { "epoch": 3.920770085153647, "grad_norm": 0.5641696453094482, "learning_rate": 9.914811874000723e-05, "loss": 0.0336, "step": 10590 }, { "epoch": 3.9244724176231025, "grad_norm": 0.3551032245159149, "learning_rate": 9.914507686137019e-05, "loss": 0.044, "step": 10600 }, { "epoch": 3.928174750092558, "grad_norm": 0.4770016372203827, "learning_rate": 9.914202960830144e-05, "loss": 0.0376, "step": 10610 }, { "epoch": 3.931877082562014, "grad_norm": 0.1637192666530609, "learning_rate": 9.913897698113422e-05, "loss": 0.0261, "step": 10620 }, { "epoch": 3.93557941503147, "grad_norm": 0.3228137493133545, "learning_rate": 9.913591898020235e-05, "loss": 0.0463, "step": 10630 }, { "epoch": 3.9392817475009254, "grad_norm": 1.354215145111084, "learning_rate": 9.913285560584025e-05, "loss": 0.0451, "step": 10640 }, { "epoch": 3.9429840799703815, "grad_norm": 0.570547342300415, "learning_rate": 9.912978685838294e-05, "loss": 0.0392, "step": 10650 }, { "epoch": 3.946686412439837, "grad_norm": 1.0891202688217163, "learning_rate": 9.912671273816601e-05, "loss": 0.0464, "step": 10660 }, { "epoch": 3.9503887449092927, "grad_norm": 0.2339106947183609, "learning_rate": 9.912363324552563e-05, "loss": 0.0431, "step": 10670 }, { "epoch": 3.9540910773787488, "grad_norm": 0.38278475403785706, "learning_rate": 9.912054838079856e-05, "loss": 0.0428, "step": 10680 }, { "epoch": 3.9577934098482044, "grad_norm": 1.0752979516983032, "learning_rate": 9.911745814432218e-05, "loss": 0.0481, "step": 10690 }, { "epoch": 3.96149574231766, "grad_norm": 0.9817699790000916, "learning_rate": 9.911436253643445e-05, "loss": 0.0369, "step": 10700 }, { "epoch": 3.965198074787116, "grad_norm": 0.45461368560791016, "learning_rate": 9.911126155747385e-05, "loss": 0.0324, "step": 10710 }, { "epoch": 3.9689004072565717, "grad_norm": 0.3775395452976227, "learning_rate": 9.910815520777952e-05, "loss": 0.0408, "step": 10720 }, { "epoch": 3.9726027397260273, "grad_norm": 0.7538658380508423, "learning_rate": 9.910504348769118e-05, "loss": 0.0515, "step": 10730 }, { "epoch": 3.9763050721954833, "grad_norm": 0.5082865953445435, "learning_rate": 9.910192639754911e-05, "loss": 0.041, "step": 10740 }, { "epoch": 3.980007404664939, "grad_norm": 0.36482319235801697, "learning_rate": 9.90988039376942e-05, "loss": 0.0393, "step": 10750 }, { "epoch": 3.9837097371343946, "grad_norm": 0.2972713112831116, "learning_rate": 9.909567610846788e-05, "loss": 0.0453, "step": 10760 }, { "epoch": 3.9874120696038506, "grad_norm": 0.6491296887397766, "learning_rate": 9.909254291021228e-05, "loss": 0.0456, "step": 10770 }, { "epoch": 3.9911144020733063, "grad_norm": 0.19626876711845398, "learning_rate": 9.908940434326997e-05, "loss": 0.0447, "step": 10780 }, { "epoch": 3.994816734542762, "grad_norm": 0.4287501573562622, "learning_rate": 9.90862604079842e-05, "loss": 0.0334, "step": 10790 }, { "epoch": 3.998519067012218, "grad_norm": 0.25744253396987915, "learning_rate": 9.90831111046988e-05, "loss": 0.0425, "step": 10800 }, { "epoch": 4.002221399481673, "grad_norm": 0.4075479209423065, "learning_rate": 9.907995643375818e-05, "loss": 0.0386, "step": 10810 }, { "epoch": 4.005923731951129, "grad_norm": 0.42011111974716187, "learning_rate": 9.907679639550729e-05, "loss": 0.0395, "step": 10820 }, { "epoch": 4.009626064420585, "grad_norm": 0.391236811876297, "learning_rate": 9.907363099029175e-05, "loss": 0.0406, "step": 10830 }, { "epoch": 4.01332839689004, "grad_norm": 0.30385634303092957, "learning_rate": 9.907046021845769e-05, "loss": 0.0526, "step": 10840 }, { "epoch": 4.0170307293594965, "grad_norm": 0.3955989181995392, "learning_rate": 9.90672840803519e-05, "loss": 0.0501, "step": 10850 }, { "epoch": 4.0207330618289525, "grad_norm": 0.35471734404563904, "learning_rate": 9.906410257632168e-05, "loss": 0.045, "step": 10860 }, { "epoch": 4.024435394298408, "grad_norm": 0.43937554955482483, "learning_rate": 9.906091570671497e-05, "loss": 0.0379, "step": 10870 }, { "epoch": 4.028137726767864, "grad_norm": 0.3660019040107727, "learning_rate": 9.905772347188029e-05, "loss": 0.0499, "step": 10880 }, { "epoch": 4.03184005923732, "grad_norm": 0.4237229526042938, "learning_rate": 9.90545258721667e-05, "loss": 0.0503, "step": 10890 }, { "epoch": 4.035542391706775, "grad_norm": 1.0714974403381348, "learning_rate": 9.905132290792394e-05, "loss": 0.053, "step": 10900 }, { "epoch": 4.039244724176231, "grad_norm": 0.4847368597984314, "learning_rate": 9.904811457950225e-05, "loss": 0.0421, "step": 10910 }, { "epoch": 4.042947056645687, "grad_norm": 0.1863749772310257, "learning_rate": 9.904490088725249e-05, "loss": 0.0445, "step": 10920 }, { "epoch": 4.046649389115142, "grad_norm": 0.4583654999732971, "learning_rate": 9.90416818315261e-05, "loss": 0.0518, "step": 10930 }, { "epoch": 4.050351721584598, "grad_norm": 0.27936431765556335, "learning_rate": 9.903845741267513e-05, "loss": 0.037, "step": 10940 }, { "epoch": 4.054054054054054, "grad_norm": 0.3661942481994629, "learning_rate": 9.903522763105218e-05, "loss": 0.0376, "step": 10950 }, { "epoch": 4.05775638652351, "grad_norm": 0.30436912178993225, "learning_rate": 9.903199248701044e-05, "loss": 0.0326, "step": 10960 }, { "epoch": 4.061458718992966, "grad_norm": 0.46762505173683167, "learning_rate": 9.902875198090375e-05, "loss": 0.0454, "step": 10970 }, { "epoch": 4.065161051462422, "grad_norm": 0.28114989399909973, "learning_rate": 9.902550611308645e-05, "loss": 0.0468, "step": 10980 }, { "epoch": 4.068863383931877, "grad_norm": 0.7953407168388367, "learning_rate": 9.90222548839135e-05, "loss": 0.0388, "step": 10990 }, { "epoch": 4.072565716401333, "grad_norm": 0.7961453199386597, "learning_rate": 9.901899829374047e-05, "loss": 0.0374, "step": 11000 }, { "epoch": 4.076268048870789, "grad_norm": 0.2528325617313385, "learning_rate": 9.901573634292348e-05, "loss": 0.033, "step": 11010 }, { "epoch": 4.079970381340244, "grad_norm": 0.9981855154037476, "learning_rate": 9.901246903181926e-05, "loss": 0.0525, "step": 11020 }, { "epoch": 4.0836727138097, "grad_norm": 0.30865567922592163, "learning_rate": 9.900919636078512e-05, "loss": 0.0422, "step": 11030 }, { "epoch": 4.087375046279156, "grad_norm": 0.35715293884277344, "learning_rate": 9.900591833017894e-05, "loss": 0.0417, "step": 11040 }, { "epoch": 4.091077378748611, "grad_norm": 0.7212464809417725, "learning_rate": 9.900263494035921e-05, "loss": 0.0431, "step": 11050 }, { "epoch": 4.0947797112180675, "grad_norm": 0.40731534361839294, "learning_rate": 9.899934619168501e-05, "loss": 0.0378, "step": 11060 }, { "epoch": 4.098482043687524, "grad_norm": 0.3766903877258301, "learning_rate": 9.899605208451598e-05, "loss": 0.0461, "step": 11070 }, { "epoch": 4.102184376156979, "grad_norm": 0.5072706341743469, "learning_rate": 9.899275261921234e-05, "loss": 0.0536, "step": 11080 }, { "epoch": 4.105886708626435, "grad_norm": 0.297306627035141, "learning_rate": 9.898944779613495e-05, "loss": 0.049, "step": 11090 }, { "epoch": 4.109589041095891, "grad_norm": 0.29051604866981506, "learning_rate": 9.89861376156452e-05, "loss": 0.0362, "step": 11100 }, { "epoch": 4.113291373565346, "grad_norm": 0.7263736724853516, "learning_rate": 9.89828220781051e-05, "loss": 0.0378, "step": 11110 }, { "epoch": 4.116993706034802, "grad_norm": 0.3375471830368042, "learning_rate": 9.89795011838772e-05, "loss": 0.0467, "step": 11120 }, { "epoch": 4.120696038504257, "grad_norm": 0.4616067707538605, "learning_rate": 9.89761749333247e-05, "loss": 0.0423, "step": 11130 }, { "epoch": 4.124398370973713, "grad_norm": 0.22792908549308777, "learning_rate": 9.897284332681134e-05, "loss": 0.0449, "step": 11140 }, { "epoch": 4.128100703443169, "grad_norm": 0.4961409568786621, "learning_rate": 9.896950636470147e-05, "loss": 0.0367, "step": 11150 }, { "epoch": 4.1318030359126245, "grad_norm": 0.38282066583633423, "learning_rate": 9.896616404736001e-05, "loss": 0.0443, "step": 11160 }, { "epoch": 4.135505368382081, "grad_norm": 0.41254183650016785, "learning_rate": 9.896281637515246e-05, "loss": 0.0579, "step": 11170 }, { "epoch": 4.139207700851537, "grad_norm": 0.6006028056144714, "learning_rate": 9.895946334844494e-05, "loss": 0.0383, "step": 11180 }, { "epoch": 4.142910033320992, "grad_norm": 0.23027925193309784, "learning_rate": 9.89561049676041e-05, "loss": 0.0525, "step": 11190 }, { "epoch": 4.146612365790448, "grad_norm": 0.1779184192419052, "learning_rate": 9.895274123299723e-05, "loss": 0.0539, "step": 11200 }, { "epoch": 4.150314698259904, "grad_norm": 0.23164507746696472, "learning_rate": 9.89493721449922e-05, "loss": 0.0449, "step": 11210 }, { "epoch": 4.154017030729359, "grad_norm": 0.2562166154384613, "learning_rate": 9.894599770395741e-05, "loss": 0.0497, "step": 11220 }, { "epoch": 4.157719363198815, "grad_norm": 0.5311615467071533, "learning_rate": 9.894261791026189e-05, "loss": 0.0419, "step": 11230 }, { "epoch": 4.161421695668271, "grad_norm": 0.2831803560256958, "learning_rate": 9.893923276427527e-05, "loss": 0.0372, "step": 11240 }, { "epoch": 4.165124028137726, "grad_norm": 0.4038902819156647, "learning_rate": 9.893584226636772e-05, "loss": 0.0505, "step": 11250 }, { "epoch": 4.1688263606071825, "grad_norm": 0.4051962196826935, "learning_rate": 9.893244641691006e-05, "loss": 0.0404, "step": 11260 }, { "epoch": 4.1725286930766385, "grad_norm": 0.13896960020065308, "learning_rate": 9.892904521627361e-05, "loss": 0.0567, "step": 11270 }, { "epoch": 4.176231025546094, "grad_norm": 0.59201979637146, "learning_rate": 9.892563866483035e-05, "loss": 0.0582, "step": 11280 }, { "epoch": 4.17993335801555, "grad_norm": 0.2605248987674713, "learning_rate": 9.89222267629528e-05, "loss": 0.0335, "step": 11290 }, { "epoch": 4.183635690485006, "grad_norm": 0.43937188386917114, "learning_rate": 9.891880951101407e-05, "loss": 0.0432, "step": 11300 }, { "epoch": 4.187338022954461, "grad_norm": 0.4371347427368164, "learning_rate": 9.89153869093879e-05, "loss": 0.0352, "step": 11310 }, { "epoch": 4.191040355423917, "grad_norm": 1.1726341247558594, "learning_rate": 9.891195895844855e-05, "loss": 0.0438, "step": 11320 }, { "epoch": 4.194742687893373, "grad_norm": 0.5153020620346069, "learning_rate": 9.890852565857092e-05, "loss": 0.0443, "step": 11330 }, { "epoch": 4.198445020362828, "grad_norm": 0.6052640080451965, "learning_rate": 9.890508701013044e-05, "loss": 0.0396, "step": 11340 }, { "epoch": 4.202147352832284, "grad_norm": 0.26795780658721924, "learning_rate": 9.890164301350318e-05, "loss": 0.0391, "step": 11350 }, { "epoch": 4.20584968530174, "grad_norm": 0.29687023162841797, "learning_rate": 9.889819366906577e-05, "loss": 0.0478, "step": 11360 }, { "epoch": 4.209552017771196, "grad_norm": 0.5270669460296631, "learning_rate": 9.889473897719539e-05, "loss": 0.0431, "step": 11370 }, { "epoch": 4.213254350240652, "grad_norm": 0.2869660556316376, "learning_rate": 9.889127893826989e-05, "loss": 0.0413, "step": 11380 }, { "epoch": 4.216956682710108, "grad_norm": 0.6478520035743713, "learning_rate": 9.888781355266763e-05, "loss": 0.0421, "step": 11390 }, { "epoch": 4.220659015179563, "grad_norm": 2.287658452987671, "learning_rate": 9.888434282076758e-05, "loss": 0.042, "step": 11400 }, { "epoch": 4.224361347649019, "grad_norm": 0.6547231078147888, "learning_rate": 9.888086674294929e-05, "loss": 0.0539, "step": 11410 }, { "epoch": 4.228063680118475, "grad_norm": 0.41066229343414307, "learning_rate": 9.887738531959292e-05, "loss": 0.0407, "step": 11420 }, { "epoch": 4.23176601258793, "grad_norm": 0.2792793810367584, "learning_rate": 9.887389855107916e-05, "loss": 0.0546, "step": 11430 }, { "epoch": 4.235468345057386, "grad_norm": 0.5446696281433105, "learning_rate": 9.887040643778936e-05, "loss": 0.0442, "step": 11440 }, { "epoch": 4.239170677526842, "grad_norm": 0.5462769269943237, "learning_rate": 9.886690898010535e-05, "loss": 0.0407, "step": 11450 }, { "epoch": 4.2428730099962975, "grad_norm": 0.6039749383926392, "learning_rate": 9.886340617840968e-05, "loss": 0.0425, "step": 11460 }, { "epoch": 4.2465753424657535, "grad_norm": 0.32524004578590393, "learning_rate": 9.885989803308535e-05, "loss": 0.0357, "step": 11470 }, { "epoch": 4.25027767493521, "grad_norm": 0.3395145535469055, "learning_rate": 9.885638454451604e-05, "loss": 0.0577, "step": 11480 }, { "epoch": 4.253980007404665, "grad_norm": 0.949039876461029, "learning_rate": 9.885286571308598e-05, "loss": 0.042, "step": 11490 }, { "epoch": 4.257682339874121, "grad_norm": 0.278865247964859, "learning_rate": 9.884934153917997e-05, "loss": 0.0325, "step": 11500 }, { "epoch": 4.261384672343576, "grad_norm": 0.5357979536056519, "learning_rate": 9.884581202318341e-05, "loss": 0.0345, "step": 11510 }, { "epoch": 4.265087004813032, "grad_norm": 0.3160979747772217, "learning_rate": 9.884227716548228e-05, "loss": 0.04, "step": 11520 }, { "epoch": 4.268789337282488, "grad_norm": 0.44170624017715454, "learning_rate": 9.883873696646316e-05, "loss": 0.0445, "step": 11530 }, { "epoch": 4.272491669751943, "grad_norm": 0.44453707337379456, "learning_rate": 9.88351914265132e-05, "loss": 0.037, "step": 11540 }, { "epoch": 4.276194002221399, "grad_norm": 0.2809351980686188, "learning_rate": 9.883164054602012e-05, "loss": 0.0413, "step": 11550 }, { "epoch": 4.279896334690855, "grad_norm": 0.1617765724658966, "learning_rate": 9.882808432537224e-05, "loss": 0.0364, "step": 11560 }, { "epoch": 4.283598667160311, "grad_norm": 0.5973482728004456, "learning_rate": 9.882452276495848e-05, "loss": 0.0351, "step": 11570 }, { "epoch": 4.287300999629767, "grad_norm": 0.49270468950271606, "learning_rate": 9.882095586516831e-05, "loss": 0.0396, "step": 11580 }, { "epoch": 4.291003332099223, "grad_norm": 0.2542863190174103, "learning_rate": 9.881738362639182e-05, "loss": 0.0404, "step": 11590 }, { "epoch": 4.294705664568678, "grad_norm": 0.4031014144420624, "learning_rate": 9.881380604901964e-05, "loss": 0.0382, "step": 11600 }, { "epoch": 4.298407997038134, "grad_norm": 0.4366876184940338, "learning_rate": 9.881022313344302e-05, "loss": 0.0344, "step": 11610 }, { "epoch": 4.30211032950759, "grad_norm": 0.35278838872909546, "learning_rate": 9.880663488005379e-05, "loss": 0.0444, "step": 11620 }, { "epoch": 4.305812661977045, "grad_norm": 0.6750023365020752, "learning_rate": 9.880304128924434e-05, "loss": 0.0369, "step": 11630 }, { "epoch": 4.309514994446501, "grad_norm": 0.3653678894042969, "learning_rate": 9.879944236140768e-05, "loss": 0.033, "step": 11640 }, { "epoch": 4.313217326915957, "grad_norm": 0.3099222183227539, "learning_rate": 9.879583809693738e-05, "loss": 0.0482, "step": 11650 }, { "epoch": 4.3169196593854124, "grad_norm": 0.978136420249939, "learning_rate": 9.879222849622758e-05, "loss": 0.044, "step": 11660 }, { "epoch": 4.3206219918548685, "grad_norm": 0.6735360622406006, "learning_rate": 9.878861355967302e-05, "loss": 0.0465, "step": 11670 }, { "epoch": 4.324324324324325, "grad_norm": 0.43065229058265686, "learning_rate": 9.878499328766904e-05, "loss": 0.0347, "step": 11680 }, { "epoch": 4.32802665679378, "grad_norm": 0.4424453377723694, "learning_rate": 9.878136768061154e-05, "loss": 0.0653, "step": 11690 }, { "epoch": 4.331728989263236, "grad_norm": 0.24775971472263336, "learning_rate": 9.877773673889701e-05, "loss": 0.0487, "step": 11700 }, { "epoch": 4.335431321732692, "grad_norm": 0.30000728368759155, "learning_rate": 9.877410046292255e-05, "loss": 0.033, "step": 11710 }, { "epoch": 4.339133654202147, "grad_norm": 0.42017072439193726, "learning_rate": 9.877045885308577e-05, "loss": 0.0597, "step": 11720 }, { "epoch": 4.342835986671603, "grad_norm": 0.28831538558006287, "learning_rate": 9.876681190978494e-05, "loss": 0.0314, "step": 11730 }, { "epoch": 4.346538319141059, "grad_norm": 0.4300960898399353, "learning_rate": 9.876315963341887e-05, "loss": 0.0323, "step": 11740 }, { "epoch": 4.350240651610514, "grad_norm": 0.3726477026939392, "learning_rate": 9.8759502024387e-05, "loss": 0.0512, "step": 11750 }, { "epoch": 4.35394298407997, "grad_norm": 0.21438486874103546, "learning_rate": 9.875583908308928e-05, "loss": 0.0489, "step": 11760 }, { "epoch": 4.357645316549426, "grad_norm": 0.34607234597206116, "learning_rate": 9.875217080992631e-05, "loss": 0.0499, "step": 11770 }, { "epoch": 4.361347649018882, "grad_norm": 1.0786375999450684, "learning_rate": 9.874849720529921e-05, "loss": 0.0423, "step": 11780 }, { "epoch": 4.365049981488338, "grad_norm": 1.1195790767669678, "learning_rate": 9.874481826960979e-05, "loss": 0.0503, "step": 11790 }, { "epoch": 4.368752313957794, "grad_norm": 0.42606592178344727, "learning_rate": 9.87411340032603e-05, "loss": 0.0452, "step": 11800 }, { "epoch": 4.372454646427249, "grad_norm": 0.3193686604499817, "learning_rate": 9.873744440665369e-05, "loss": 0.042, "step": 11810 }, { "epoch": 4.376156978896705, "grad_norm": 0.20542915165424347, "learning_rate": 9.873374948019343e-05, "loss": 0.0364, "step": 11820 }, { "epoch": 4.379859311366161, "grad_norm": 0.23099495470523834, "learning_rate": 9.873004922428361e-05, "loss": 0.0344, "step": 11830 }, { "epoch": 4.383561643835616, "grad_norm": 0.45145153999328613, "learning_rate": 9.872634363932887e-05, "loss": 0.0436, "step": 11840 }, { "epoch": 4.387263976305072, "grad_norm": 0.33801138401031494, "learning_rate": 9.872263272573443e-05, "loss": 0.0397, "step": 11850 }, { "epoch": 4.390966308774528, "grad_norm": 0.8124662637710571, "learning_rate": 9.871891648390614e-05, "loss": 0.0477, "step": 11860 }, { "epoch": 4.3946686412439835, "grad_norm": 0.28956758975982666, "learning_rate": 9.871519491425038e-05, "loss": 0.0373, "step": 11870 }, { "epoch": 4.3983709737134395, "grad_norm": 0.3310887813568115, "learning_rate": 9.871146801717417e-05, "loss": 0.0399, "step": 11880 }, { "epoch": 4.402073306182896, "grad_norm": 0.45273247361183167, "learning_rate": 9.870773579308503e-05, "loss": 0.0607, "step": 11890 }, { "epoch": 4.405775638652351, "grad_norm": 0.5494795441627502, "learning_rate": 9.870399824239117e-05, "loss": 0.0354, "step": 11900 }, { "epoch": 4.409477971121807, "grad_norm": 0.40476536750793457, "learning_rate": 9.870025536550125e-05, "loss": 0.0384, "step": 11910 }, { "epoch": 4.413180303591263, "grad_norm": 0.36139532923698425, "learning_rate": 9.869650716282464e-05, "loss": 0.0371, "step": 11920 }, { "epoch": 4.416882636060718, "grad_norm": 0.30705147981643677, "learning_rate": 9.869275363477122e-05, "loss": 0.0368, "step": 11930 }, { "epoch": 4.420584968530174, "grad_norm": 0.41305050253868103, "learning_rate": 9.868899478175147e-05, "loss": 0.0317, "step": 11940 }, { "epoch": 4.42428730099963, "grad_norm": 0.40575841069221497, "learning_rate": 9.868523060417646e-05, "loss": 0.0579, "step": 11950 }, { "epoch": 4.427989633469085, "grad_norm": 0.24244552850723267, "learning_rate": 9.86814611024578e-05, "loss": 0.0463, "step": 11960 }, { "epoch": 4.431691965938541, "grad_norm": 0.24754585325717926, "learning_rate": 9.867768627700776e-05, "loss": 0.0512, "step": 11970 }, { "epoch": 4.4353942984079975, "grad_norm": 0.8864213228225708, "learning_rate": 9.867390612823914e-05, "loss": 0.0352, "step": 11980 }, { "epoch": 4.439096630877453, "grad_norm": 0.37776169180870056, "learning_rate": 9.867012065656533e-05, "loss": 0.0353, "step": 11990 }, { "epoch": 4.442798963346909, "grad_norm": 0.3489532768726349, "learning_rate": 9.86663298624003e-05, "loss": 0.0391, "step": 12000 }, { "epoch": 4.446501295816365, "grad_norm": 0.7855028510093689, "learning_rate": 9.86625337461586e-05, "loss": 0.0378, "step": 12010 }, { "epoch": 4.45020362828582, "grad_norm": 1.9475842714309692, "learning_rate": 9.865873230825538e-05, "loss": 0.0354, "step": 12020 }, { "epoch": 4.453905960755276, "grad_norm": 0.8662212491035461, "learning_rate": 9.865492554910633e-05, "loss": 0.0372, "step": 12030 }, { "epoch": 4.457608293224731, "grad_norm": 0.2637218236923218, "learning_rate": 9.865111346912779e-05, "loss": 0.0261, "step": 12040 }, { "epoch": 4.461310625694187, "grad_norm": 0.34643012285232544, "learning_rate": 9.864729606873663e-05, "loss": 0.0313, "step": 12050 }, { "epoch": 4.465012958163643, "grad_norm": 0.20442858338356018, "learning_rate": 9.86434733483503e-05, "loss": 0.0295, "step": 12060 }, { "epoch": 4.4687152906330985, "grad_norm": 0.20726414024829865, "learning_rate": 9.863964530838687e-05, "loss": 0.0404, "step": 12070 }, { "epoch": 4.4724176231025545, "grad_norm": 0.5898615717887878, "learning_rate": 9.863581194926495e-05, "loss": 0.0467, "step": 12080 }, { "epoch": 4.476119955572011, "grad_norm": 0.3302665948867798, "learning_rate": 9.863197327140376e-05, "loss": 0.0474, "step": 12090 }, { "epoch": 4.479822288041466, "grad_norm": 0.23538997769355774, "learning_rate": 9.862812927522309e-05, "loss": 0.0405, "step": 12100 }, { "epoch": 4.483524620510922, "grad_norm": 1.2154473066329956, "learning_rate": 9.862427996114332e-05, "loss": 0.0423, "step": 12110 }, { "epoch": 4.487226952980378, "grad_norm": 0.6051502227783203, "learning_rate": 9.86204253295854e-05, "loss": 0.0406, "step": 12120 }, { "epoch": 4.490929285449833, "grad_norm": 0.36276936531066895, "learning_rate": 9.861656538097086e-05, "loss": 0.0302, "step": 12130 }, { "epoch": 4.494631617919289, "grad_norm": 1.1374493837356567, "learning_rate": 9.861270011572182e-05, "loss": 0.0377, "step": 12140 }, { "epoch": 4.498333950388745, "grad_norm": 0.4048938751220703, "learning_rate": 9.860882953426099e-05, "loss": 0.0442, "step": 12150 }, { "epoch": 4.5020362828582, "grad_norm": 0.5019957423210144, "learning_rate": 9.860495363701164e-05, "loss": 0.0358, "step": 12160 }, { "epoch": 4.505738615327656, "grad_norm": 0.2428198754787445, "learning_rate": 9.860107242439764e-05, "loss": 0.042, "step": 12170 }, { "epoch": 4.5094409477971125, "grad_norm": 0.5933547616004944, "learning_rate": 9.859718589684344e-05, "loss": 0.0449, "step": 12180 }, { "epoch": 4.513143280266568, "grad_norm": 0.8898560404777527, "learning_rate": 9.859329405477403e-05, "loss": 0.0496, "step": 12190 }, { "epoch": 4.516845612736024, "grad_norm": 0.792847216129303, "learning_rate": 9.858939689861506e-05, "loss": 0.053, "step": 12200 }, { "epoch": 4.52054794520548, "grad_norm": 0.21596506237983704, "learning_rate": 9.858549442879269e-05, "loss": 0.0341, "step": 12210 }, { "epoch": 4.524250277674935, "grad_norm": 0.3379891812801361, "learning_rate": 9.85815866457337e-05, "loss": 0.0418, "step": 12220 }, { "epoch": 4.527952610144391, "grad_norm": 1.2101670503616333, "learning_rate": 9.857767354986545e-05, "loss": 0.0535, "step": 12230 }, { "epoch": 4.531654942613847, "grad_norm": 0.3365684747695923, "learning_rate": 9.857375514161583e-05, "loss": 0.0395, "step": 12240 }, { "epoch": 4.535357275083302, "grad_norm": 0.27503034472465515, "learning_rate": 9.856983142141339e-05, "loss": 0.038, "step": 12250 }, { "epoch": 4.539059607552758, "grad_norm": 0.3356274664402008, "learning_rate": 9.856590238968721e-05, "loss": 0.0463, "step": 12260 }, { "epoch": 4.542761940022214, "grad_norm": 0.41449061036109924, "learning_rate": 9.856196804686696e-05, "loss": 0.0444, "step": 12270 }, { "epoch": 4.5464642724916695, "grad_norm": 0.40115228295326233, "learning_rate": 9.85580283933829e-05, "loss": 0.0386, "step": 12280 }, { "epoch": 4.550166604961126, "grad_norm": 0.9528221487998962, "learning_rate": 9.855408342966585e-05, "loss": 0.0302, "step": 12290 }, { "epoch": 4.553868937430582, "grad_norm": 0.23996932804584503, "learning_rate": 9.855013315614725e-05, "loss": 0.0403, "step": 12300 }, { "epoch": 4.557571269900037, "grad_norm": 0.5597036480903625, "learning_rate": 9.854617757325908e-05, "loss": 0.0652, "step": 12310 }, { "epoch": 4.561273602369493, "grad_norm": 0.3432157337665558, "learning_rate": 9.854221668143391e-05, "loss": 0.045, "step": 12320 }, { "epoch": 4.564975934838949, "grad_norm": 0.20778366923332214, "learning_rate": 9.85382504811049e-05, "loss": 0.0328, "step": 12330 }, { "epoch": 4.568678267308404, "grad_norm": 0.8993721604347229, "learning_rate": 9.853427897270582e-05, "loss": 0.0487, "step": 12340 }, { "epoch": 4.57238059977786, "grad_norm": 0.4159921407699585, "learning_rate": 9.853030215667093e-05, "loss": 0.039, "step": 12350 }, { "epoch": 4.576082932247315, "grad_norm": 0.21833036839962006, "learning_rate": 9.852632003343518e-05, "loss": 0.034, "step": 12360 }, { "epoch": 4.579785264716771, "grad_norm": 0.28287604451179504, "learning_rate": 9.852233260343403e-05, "loss": 0.0302, "step": 12370 }, { "epoch": 4.5834875971862274, "grad_norm": 0.36655136942863464, "learning_rate": 9.851833986710353e-05, "loss": 0.0551, "step": 12380 }, { "epoch": 4.587189929655683, "grad_norm": 0.4113033711910248, "learning_rate": 9.851434182488033e-05, "loss": 0.0375, "step": 12390 }, { "epoch": 4.590892262125139, "grad_norm": 0.49815472960472107, "learning_rate": 9.851033847720166e-05, "loss": 0.0432, "step": 12400 }, { "epoch": 4.594594594594595, "grad_norm": 0.4025605320930481, "learning_rate": 9.85063298245053e-05, "loss": 0.0321, "step": 12410 }, { "epoch": 4.59829692706405, "grad_norm": 0.46160605549812317, "learning_rate": 9.850231586722963e-05, "loss": 0.0288, "step": 12420 }, { "epoch": 4.601999259533506, "grad_norm": 0.5430002808570862, "learning_rate": 9.849829660581363e-05, "loss": 0.0461, "step": 12430 }, { "epoch": 4.605701592002962, "grad_norm": 0.3386933505535126, "learning_rate": 9.849427204069684e-05, "loss": 0.0428, "step": 12440 }, { "epoch": 4.609403924472417, "grad_norm": 0.39253854751586914, "learning_rate": 9.849024217231935e-05, "loss": 0.0372, "step": 12450 }, { "epoch": 4.613106256941873, "grad_norm": 0.44929781556129456, "learning_rate": 9.848620700112188e-05, "loss": 0.0384, "step": 12460 }, { "epoch": 4.616808589411329, "grad_norm": 0.2821360230445862, "learning_rate": 9.848216652754571e-05, "loss": 0.0368, "step": 12470 }, { "epoch": 4.6205109218807845, "grad_norm": 0.33656391501426697, "learning_rate": 9.847812075203271e-05, "loss": 0.031, "step": 12480 }, { "epoch": 4.6242132543502406, "grad_norm": 0.2767808437347412, "learning_rate": 9.84740696750253e-05, "loss": 0.0441, "step": 12490 }, { "epoch": 4.627915586819697, "grad_norm": 0.5707509517669678, "learning_rate": 9.847001329696653e-05, "loss": 0.0526, "step": 12500 }, { "epoch": 4.631617919289152, "grad_norm": 1.976943850517273, "learning_rate": 9.846595161829996e-05, "loss": 0.0362, "step": 12510 }, { "epoch": 4.635320251758608, "grad_norm": 0.4927927255630493, "learning_rate": 9.84618846394698e-05, "loss": 0.0447, "step": 12520 }, { "epoch": 4.639022584228064, "grad_norm": 0.40919098258018494, "learning_rate": 9.845781236092078e-05, "loss": 0.0421, "step": 12530 }, { "epoch": 4.642724916697519, "grad_norm": 0.3780827820301056, "learning_rate": 9.845373478309827e-05, "loss": 0.0348, "step": 12540 }, { "epoch": 4.646427249166975, "grad_norm": 0.9501088261604309, "learning_rate": 9.844965190644817e-05, "loss": 0.0495, "step": 12550 }, { "epoch": 4.650129581636431, "grad_norm": 0.3671375513076782, "learning_rate": 9.844556373141699e-05, "loss": 0.0399, "step": 12560 }, { "epoch": 4.653831914105886, "grad_norm": 0.6778060793876648, "learning_rate": 9.844147025845178e-05, "loss": 0.0401, "step": 12570 }, { "epoch": 4.657534246575342, "grad_norm": 1.058711051940918, "learning_rate": 9.843737148800023e-05, "loss": 0.0389, "step": 12580 }, { "epoch": 4.6612365790447985, "grad_norm": 5.907308578491211, "learning_rate": 9.843326742051055e-05, "loss": 0.0448, "step": 12590 }, { "epoch": 4.664938911514254, "grad_norm": 0.3658979535102844, "learning_rate": 9.842915805643155e-05, "loss": 0.0346, "step": 12600 }, { "epoch": 4.66864124398371, "grad_norm": 0.962878406047821, "learning_rate": 9.842504339621266e-05, "loss": 0.0513, "step": 12610 }, { "epoch": 4.672343576453166, "grad_norm": 0.3146485984325409, "learning_rate": 9.842092344030382e-05, "loss": 0.0412, "step": 12620 }, { "epoch": 4.676045908922621, "grad_norm": 0.2750859260559082, "learning_rate": 9.841679818915559e-05, "loss": 0.0323, "step": 12630 }, { "epoch": 4.679748241392077, "grad_norm": 1.45376718044281, "learning_rate": 9.84126676432191e-05, "loss": 0.0453, "step": 12640 }, { "epoch": 4.683450573861533, "grad_norm": 0.31723785400390625, "learning_rate": 9.840853180294608e-05, "loss": 0.0276, "step": 12650 }, { "epoch": 4.687152906330988, "grad_norm": 0.5612754821777344, "learning_rate": 9.84043906687888e-05, "loss": 0.0359, "step": 12660 }, { "epoch": 4.690855238800444, "grad_norm": 0.19044950604438782, "learning_rate": 9.84002442412001e-05, "loss": 0.043, "step": 12670 }, { "epoch": 4.6945575712699, "grad_norm": 0.5323657393455505, "learning_rate": 9.839609252063351e-05, "loss": 0.0338, "step": 12680 }, { "epoch": 4.6982599037393555, "grad_norm": 0.8756014108657837, "learning_rate": 9.839193550754297e-05, "loss": 0.046, "step": 12690 }, { "epoch": 4.701962236208812, "grad_norm": 0.5290841460227966, "learning_rate": 9.838777320238312e-05, "loss": 0.0348, "step": 12700 }, { "epoch": 4.705664568678268, "grad_norm": 0.2768500745296478, "learning_rate": 9.838360560560915e-05, "loss": 0.0319, "step": 12710 }, { "epoch": 4.709366901147723, "grad_norm": 0.6183340549468994, "learning_rate": 9.837943271767682e-05, "loss": 0.0355, "step": 12720 }, { "epoch": 4.713069233617179, "grad_norm": 0.6569205522537231, "learning_rate": 9.837525453904246e-05, "loss": 0.0516, "step": 12730 }, { "epoch": 4.716771566086635, "grad_norm": 1.0956566333770752, "learning_rate": 9.837107107016299e-05, "loss": 0.0373, "step": 12740 }, { "epoch": 4.72047389855609, "grad_norm": 1.571851134300232, "learning_rate": 9.836688231149592e-05, "loss": 0.0465, "step": 12750 }, { "epoch": 4.724176231025546, "grad_norm": 1.0270299911499023, "learning_rate": 9.836268826349933e-05, "loss": 0.0302, "step": 12760 }, { "epoch": 4.727878563495002, "grad_norm": 0.18089310824871063, "learning_rate": 9.835848892663184e-05, "loss": 0.0413, "step": 12770 }, { "epoch": 4.731580895964457, "grad_norm": 0.34993448853492737, "learning_rate": 9.835428430135271e-05, "loss": 0.0392, "step": 12780 }, { "epoch": 4.7352832284339135, "grad_norm": 0.4056856334209442, "learning_rate": 9.835007438812177e-05, "loss": 0.0355, "step": 12790 }, { "epoch": 4.7389855609033695, "grad_norm": 0.35660526156425476, "learning_rate": 9.834585918739936e-05, "loss": 0.0429, "step": 12800 }, { "epoch": 4.742687893372825, "grad_norm": 0.1707174926996231, "learning_rate": 9.834163869964649e-05, "loss": 0.0414, "step": 12810 }, { "epoch": 4.746390225842281, "grad_norm": 0.20487700402736664, "learning_rate": 9.833741292532469e-05, "loss": 0.0273, "step": 12820 }, { "epoch": 4.750092558311737, "grad_norm": 0.31882354617118835, "learning_rate": 9.833318186489609e-05, "loss": 0.0296, "step": 12830 }, { "epoch": 4.753794890781192, "grad_norm": 0.21337322890758514, "learning_rate": 9.832894551882339e-05, "loss": 0.0466, "step": 12840 }, { "epoch": 4.757497223250648, "grad_norm": 0.26227816939353943, "learning_rate": 9.832470388756987e-05, "loss": 0.0392, "step": 12850 }, { "epoch": 4.761199555720104, "grad_norm": 0.7037726044654846, "learning_rate": 9.832045697159938e-05, "loss": 0.0301, "step": 12860 }, { "epoch": 4.764901888189559, "grad_norm": 0.34246835112571716, "learning_rate": 9.831620477137638e-05, "loss": 0.0396, "step": 12870 }, { "epoch": 4.768604220659015, "grad_norm": 0.2239648401737213, "learning_rate": 9.831194728736585e-05, "loss": 0.028, "step": 12880 }, { "epoch": 4.772306553128471, "grad_norm": 1.0103567838668823, "learning_rate": 9.830768452003341e-05, "loss": 0.0671, "step": 12890 }, { "epoch": 4.776008885597927, "grad_norm": 0.3315214514732361, "learning_rate": 9.830341646984521e-05, "loss": 0.0391, "step": 12900 }, { "epoch": 4.779711218067383, "grad_norm": 0.2442672699689865, "learning_rate": 9.829914313726802e-05, "loss": 0.0336, "step": 12910 }, { "epoch": 4.783413550536839, "grad_norm": 0.6658567786216736, "learning_rate": 9.829486452276915e-05, "loss": 0.0602, "step": 12920 }, { "epoch": 4.787115883006294, "grad_norm": 0.5691308379173279, "learning_rate": 9.82905806268165e-05, "loss": 0.0352, "step": 12930 }, { "epoch": 4.79081821547575, "grad_norm": 0.3166933059692383, "learning_rate": 9.828629144987857e-05, "loss": 0.0534, "step": 12940 }, { "epoch": 4.794520547945205, "grad_norm": 0.21193470060825348, "learning_rate": 9.82819969924244e-05, "loss": 0.0395, "step": 12950 }, { "epoch": 4.798222880414661, "grad_norm": 0.44033563137054443, "learning_rate": 9.827769725492362e-05, "loss": 0.0512, "step": 12960 }, { "epoch": 4.801925212884117, "grad_norm": 0.29194557666778564, "learning_rate": 9.827339223784646e-05, "loss": 0.0271, "step": 12970 }, { "epoch": 4.805627545353572, "grad_norm": 0.7171773910522461, "learning_rate": 9.82690819416637e-05, "loss": 0.0432, "step": 12980 }, { "epoch": 4.8093298778230285, "grad_norm": 0.3875778317451477, "learning_rate": 9.826476636684671e-05, "loss": 0.0477, "step": 12990 }, { "epoch": 4.8130322102924845, "grad_norm": 2.645402431488037, "learning_rate": 9.826044551386744e-05, "loss": 0.0367, "step": 13000 }, { "epoch": 4.81673454276194, "grad_norm": 0.6007974743843079, "learning_rate": 9.825611938319841e-05, "loss": 0.0482, "step": 13010 }, { "epoch": 4.820436875231396, "grad_norm": 0.5358085632324219, "learning_rate": 9.82517879753127e-05, "loss": 0.0401, "step": 13020 }, { "epoch": 4.824139207700852, "grad_norm": 0.2744055688381195, "learning_rate": 9.824745129068402e-05, "loss": 0.0369, "step": 13030 }, { "epoch": 4.827841540170307, "grad_norm": 0.29470574855804443, "learning_rate": 9.82431093297866e-05, "loss": 0.0511, "step": 13040 }, { "epoch": 4.831543872639763, "grad_norm": 0.4158346354961395, "learning_rate": 9.823876209309527e-05, "loss": 0.053, "step": 13050 }, { "epoch": 4.835246205109219, "grad_norm": 0.6694441437721252, "learning_rate": 9.823440958108545e-05, "loss": 0.04, "step": 13060 }, { "epoch": 4.838948537578674, "grad_norm": 0.8820638060569763, "learning_rate": 9.823005179423311e-05, "loss": 0.0517, "step": 13070 }, { "epoch": 4.84265087004813, "grad_norm": 0.36223071813583374, "learning_rate": 9.822568873301484e-05, "loss": 0.0446, "step": 13080 }, { "epoch": 4.846353202517586, "grad_norm": 0.4024738669395447, "learning_rate": 9.822132039790773e-05, "loss": 0.0361, "step": 13090 }, { "epoch": 4.850055534987042, "grad_norm": 0.2936398983001709, "learning_rate": 9.821694678938953e-05, "loss": 0.0372, "step": 13100 }, { "epoch": 4.853757867456498, "grad_norm": 0.3762844502925873, "learning_rate": 9.821256790793853e-05, "loss": 0.0536, "step": 13110 }, { "epoch": 4.857460199925954, "grad_norm": 0.34429386258125305, "learning_rate": 9.820818375403357e-05, "loss": 0.0346, "step": 13120 }, { "epoch": 4.861162532395409, "grad_norm": 0.45081526041030884, "learning_rate": 9.820379432815414e-05, "loss": 0.0413, "step": 13130 }, { "epoch": 4.864864864864865, "grad_norm": 0.5748254060745239, "learning_rate": 9.819939963078022e-05, "loss": 0.0468, "step": 13140 }, { "epoch": 4.868567197334321, "grad_norm": 0.3474520742893219, "learning_rate": 9.819499966239243e-05, "loss": 0.0522, "step": 13150 }, { "epoch": 4.872269529803776, "grad_norm": 0.6037681102752686, "learning_rate": 9.819059442347193e-05, "loss": 0.0458, "step": 13160 }, { "epoch": 4.875971862273232, "grad_norm": 0.24622498452663422, "learning_rate": 9.81861839145005e-05, "loss": 0.0434, "step": 13170 }, { "epoch": 4.879674194742688, "grad_norm": 0.4303992986679077, "learning_rate": 9.818176813596041e-05, "loss": 0.0444, "step": 13180 }, { "epoch": 4.883376527212143, "grad_norm": 2.1123404502868652, "learning_rate": 9.817734708833461e-05, "loss": 0.0396, "step": 13190 }, { "epoch": 4.8870788596815995, "grad_norm": 0.35068976879119873, "learning_rate": 9.817292077210659e-05, "loss": 0.0377, "step": 13200 }, { "epoch": 4.890781192151055, "grad_norm": 0.4693909287452698, "learning_rate": 9.816848918776035e-05, "loss": 0.0355, "step": 13210 }, { "epoch": 4.894483524620511, "grad_norm": 0.24239595234394073, "learning_rate": 9.816405233578057e-05, "loss": 0.0352, "step": 13220 }, { "epoch": 4.898185857089967, "grad_norm": 0.5567657351493835, "learning_rate": 9.815961021665243e-05, "loss": 0.0543, "step": 13230 }, { "epoch": 4.901888189559422, "grad_norm": 0.24888746440410614, "learning_rate": 9.815516283086172e-05, "loss": 0.0372, "step": 13240 }, { "epoch": 4.905590522028878, "grad_norm": 0.5209914445877075, "learning_rate": 9.815071017889482e-05, "loss": 0.0383, "step": 13250 }, { "epoch": 4.909292854498334, "grad_norm": 0.2629760205745697, "learning_rate": 9.814625226123862e-05, "loss": 0.0496, "step": 13260 }, { "epoch": 4.912995186967789, "grad_norm": 0.5409017205238342, "learning_rate": 9.814178907838069e-05, "loss": 0.0437, "step": 13270 }, { "epoch": 4.916697519437245, "grad_norm": 0.7028839588165283, "learning_rate": 9.813732063080907e-05, "loss": 0.0324, "step": 13280 }, { "epoch": 4.920399851906701, "grad_norm": 0.21834196150302887, "learning_rate": 9.813284691901243e-05, "loss": 0.0357, "step": 13290 }, { "epoch": 4.9241021843761565, "grad_norm": 0.32022494077682495, "learning_rate": 9.812836794348004e-05, "loss": 0.0372, "step": 13300 }, { "epoch": 4.927804516845613, "grad_norm": 0.5559567809104919, "learning_rate": 9.81238837047017e-05, "loss": 0.0522, "step": 13310 }, { "epoch": 4.931506849315069, "grad_norm": 0.9398505091667175, "learning_rate": 9.811939420316776e-05, "loss": 0.0338, "step": 13320 }, { "epoch": 4.935209181784524, "grad_norm": 0.46268174052238464, "learning_rate": 9.811489943936922e-05, "loss": 0.032, "step": 13330 }, { "epoch": 4.93891151425398, "grad_norm": 0.3357439637184143, "learning_rate": 9.811039941379763e-05, "loss": 0.0373, "step": 13340 }, { "epoch": 4.942613846723436, "grad_norm": 0.43305787444114685, "learning_rate": 9.81058941269451e-05, "loss": 0.047, "step": 13350 }, { "epoch": 4.946316179192891, "grad_norm": 0.11118819564580917, "learning_rate": 9.81013835793043e-05, "loss": 0.0292, "step": 13360 }, { "epoch": 4.950018511662347, "grad_norm": 0.25074589252471924, "learning_rate": 9.809686777136852e-05, "loss": 0.0371, "step": 13370 }, { "epoch": 4.953720844131803, "grad_norm": 0.24006715416908264, "learning_rate": 9.809234670363159e-05, "loss": 0.0344, "step": 13380 }, { "epoch": 4.957423176601258, "grad_norm": 0.71839839220047, "learning_rate": 9.808782037658792e-05, "loss": 0.0606, "step": 13390 }, { "epoch": 4.9611255090707145, "grad_norm": 0.6262936592102051, "learning_rate": 9.808328879073251e-05, "loss": 0.0459, "step": 13400 }, { "epoch": 4.9648278415401705, "grad_norm": 0.3356289863586426, "learning_rate": 9.807875194656096e-05, "loss": 0.0303, "step": 13410 }, { "epoch": 4.968530174009626, "grad_norm": 0.588992714881897, "learning_rate": 9.807420984456937e-05, "loss": 0.0342, "step": 13420 }, { "epoch": 4.972232506479082, "grad_norm": 0.2957456707954407, "learning_rate": 9.806966248525445e-05, "loss": 0.037, "step": 13430 }, { "epoch": 4.975934838948538, "grad_norm": 0.37708166241645813, "learning_rate": 9.806510986911353e-05, "loss": 0.0446, "step": 13440 }, { "epoch": 4.979637171417993, "grad_norm": 0.45606350898742676, "learning_rate": 9.806055199664446e-05, "loss": 0.0371, "step": 13450 }, { "epoch": 4.983339503887449, "grad_norm": 0.4124199151992798, "learning_rate": 9.805598886834567e-05, "loss": 0.0399, "step": 13460 }, { "epoch": 4.987041836356905, "grad_norm": 0.3298896849155426, "learning_rate": 9.805142048471619e-05, "loss": 0.0341, "step": 13470 }, { "epoch": 4.99074416882636, "grad_norm": 0.7033610343933105, "learning_rate": 9.804684684625563e-05, "loss": 0.0422, "step": 13480 }, { "epoch": 4.994446501295816, "grad_norm": 0.622135579586029, "learning_rate": 9.804226795346411e-05, "loss": 0.0369, "step": 13490 }, { "epoch": 4.998148833765272, "grad_norm": 0.5078036785125732, "learning_rate": 9.803768380684242e-05, "loss": 0.0511, "step": 13500 }, { "epoch": 5.001851166234728, "grad_norm": 0.334186851978302, "learning_rate": 9.803309440689184e-05, "loss": 0.0355, "step": 13510 }, { "epoch": 5.005553498704184, "grad_norm": 0.5290921926498413, "learning_rate": 9.802849975411426e-05, "loss": 0.0455, "step": 13520 }, { "epoch": 5.00925583117364, "grad_norm": 0.3891530930995941, "learning_rate": 9.802389984901218e-05, "loss": 0.037, "step": 13530 }, { "epoch": 5.012958163643095, "grad_norm": 2.598414182662964, "learning_rate": 9.80192946920886e-05, "loss": 0.0392, "step": 13540 }, { "epoch": 5.016660496112551, "grad_norm": 0.3142007887363434, "learning_rate": 9.801468428384716e-05, "loss": 0.0315, "step": 13550 }, { "epoch": 5.020362828582007, "grad_norm": 0.3127557337284088, "learning_rate": 9.801006862479202e-05, "loss": 0.046, "step": 13560 }, { "epoch": 5.024065161051462, "grad_norm": 0.4564449191093445, "learning_rate": 9.800544771542797e-05, "loss": 0.034, "step": 13570 }, { "epoch": 5.027767493520918, "grad_norm": 0.3254350423812866, "learning_rate": 9.800082155626034e-05, "loss": 0.0421, "step": 13580 }, { "epoch": 5.031469825990374, "grad_norm": 0.4054834842681885, "learning_rate": 9.799619014779503e-05, "loss": 0.0412, "step": 13590 }, { "epoch": 5.0351721584598295, "grad_norm": 0.32222580909729004, "learning_rate": 9.799155349053851e-05, "loss": 0.0472, "step": 13600 }, { "epoch": 5.0388744909292855, "grad_norm": 0.4467261731624603, "learning_rate": 9.798691158499787e-05, "loss": 0.0346, "step": 13610 }, { "epoch": 5.042576823398742, "grad_norm": 0.26683735847473145, "learning_rate": 9.798226443168072e-05, "loss": 0.0377, "step": 13620 }, { "epoch": 5.046279155868197, "grad_norm": 0.7709314823150635, "learning_rate": 9.797761203109527e-05, "loss": 0.0473, "step": 13630 }, { "epoch": 5.049981488337653, "grad_norm": 0.623724639415741, "learning_rate": 9.79729543837503e-05, "loss": 0.0363, "step": 13640 }, { "epoch": 5.053683820807109, "grad_norm": 0.6134189963340759, "learning_rate": 9.796829149015517e-05, "loss": 0.0278, "step": 13650 }, { "epoch": 5.057386153276564, "grad_norm": 0.3226659893989563, "learning_rate": 9.79636233508198e-05, "loss": 0.0498, "step": 13660 }, { "epoch": 5.06108848574602, "grad_norm": 0.4612640142440796, "learning_rate": 9.79589499662547e-05, "loss": 0.0376, "step": 13670 }, { "epoch": 5.064790818215476, "grad_norm": 0.3677539825439453, "learning_rate": 9.795427133697092e-05, "loss": 0.0336, "step": 13680 }, { "epoch": 5.068493150684931, "grad_norm": 0.280361145734787, "learning_rate": 9.794958746348013e-05, "loss": 0.0433, "step": 13690 }, { "epoch": 5.072195483154387, "grad_norm": 0.5246485471725464, "learning_rate": 9.794489834629455e-05, "loss": 0.032, "step": 13700 }, { "epoch": 5.0758978156238435, "grad_norm": 0.29233473539352417, "learning_rate": 9.794020398592699e-05, "loss": 0.0309, "step": 13710 }, { "epoch": 5.079600148093299, "grad_norm": 4.449918270111084, "learning_rate": 9.793550438289076e-05, "loss": 0.0571, "step": 13720 }, { "epoch": 5.083302480562755, "grad_norm": 0.2531052529811859, "learning_rate": 9.793079953769987e-05, "loss": 0.0362, "step": 13730 }, { "epoch": 5.08700481303221, "grad_norm": 0.8351958990097046, "learning_rate": 9.79260894508688e-05, "loss": 0.0427, "step": 13740 }, { "epoch": 5.090707145501666, "grad_norm": 0.34126415848731995, "learning_rate": 9.792137412291265e-05, "loss": 0.035, "step": 13750 }, { "epoch": 5.094409477971122, "grad_norm": 0.49799150228500366, "learning_rate": 9.791665355434705e-05, "loss": 0.0347, "step": 13760 }, { "epoch": 5.098111810440577, "grad_norm": 0.33583953976631165, "learning_rate": 9.791192774568827e-05, "loss": 0.0353, "step": 13770 }, { "epoch": 5.101814142910033, "grad_norm": 0.23866772651672363, "learning_rate": 9.790719669745312e-05, "loss": 0.0387, "step": 13780 }, { "epoch": 5.105516475379489, "grad_norm": 0.495891809463501, "learning_rate": 9.790246041015896e-05, "loss": 0.0414, "step": 13790 }, { "epoch": 5.109218807848944, "grad_norm": 0.20961828529834747, "learning_rate": 9.789771888432375e-05, "loss": 0.0276, "step": 13800 }, { "epoch": 5.1129211403184005, "grad_norm": 0.4432090222835541, "learning_rate": 9.7892972120466e-05, "loss": 0.0235, "step": 13810 }, { "epoch": 5.116623472787857, "grad_norm": 0.17148946225643158, "learning_rate": 9.788822011910485e-05, "loss": 0.0471, "step": 13820 }, { "epoch": 5.120325805257312, "grad_norm": 0.3721024692058563, "learning_rate": 9.788346288075994e-05, "loss": 0.0357, "step": 13830 }, { "epoch": 5.124028137726768, "grad_norm": 0.3830939531326294, "learning_rate": 9.787870040595151e-05, "loss": 0.048, "step": 13840 }, { "epoch": 5.127730470196224, "grad_norm": 0.22573009133338928, "learning_rate": 9.787393269520039e-05, "loss": 0.0333, "step": 13850 }, { "epoch": 5.131432802665679, "grad_norm": 0.48888567090034485, "learning_rate": 9.786915974902798e-05, "loss": 0.0442, "step": 13860 }, { "epoch": 5.135135135135135, "grad_norm": 0.3571733236312866, "learning_rate": 9.786438156795621e-05, "loss": 0.0419, "step": 13870 }, { "epoch": 5.138837467604591, "grad_norm": 0.4489794969558716, "learning_rate": 9.785959815250765e-05, "loss": 0.0257, "step": 13880 }, { "epoch": 5.142539800074046, "grad_norm": 0.2528819739818573, "learning_rate": 9.785480950320538e-05, "loss": 0.0438, "step": 13890 }, { "epoch": 5.146242132543502, "grad_norm": 0.4707394540309906, "learning_rate": 9.785001562057309e-05, "loss": 0.0412, "step": 13900 }, { "epoch": 5.149944465012958, "grad_norm": 0.17212767899036407, "learning_rate": 9.784521650513505e-05, "loss": 0.0315, "step": 13910 }, { "epoch": 5.153646797482414, "grad_norm": 1.1750742197036743, "learning_rate": 9.784041215741606e-05, "loss": 0.0489, "step": 13920 }, { "epoch": 5.15734912995187, "grad_norm": 0.2802017629146576, "learning_rate": 9.783560257794154e-05, "loss": 0.0281, "step": 13930 }, { "epoch": 5.161051462421326, "grad_norm": 0.8771383762359619, "learning_rate": 9.783078776723742e-05, "loss": 0.047, "step": 13940 }, { "epoch": 5.164753794890781, "grad_norm": 0.5565239787101746, "learning_rate": 9.782596772583026e-05, "loss": 0.0403, "step": 13950 }, { "epoch": 5.168456127360237, "grad_norm": 0.27493467926979065, "learning_rate": 9.782114245424718e-05, "loss": 0.0377, "step": 13960 }, { "epoch": 5.172158459829693, "grad_norm": 0.40921515226364136, "learning_rate": 9.781631195301586e-05, "loss": 0.0401, "step": 13970 }, { "epoch": 5.175860792299148, "grad_norm": 1.5148049592971802, "learning_rate": 9.781147622266455e-05, "loss": 0.0543, "step": 13980 }, { "epoch": 5.179563124768604, "grad_norm": 0.2153245061635971, "learning_rate": 9.78066352637221e-05, "loss": 0.0297, "step": 13990 }, { "epoch": 5.18326545723806, "grad_norm": 1.0064527988433838, "learning_rate": 9.780178907671789e-05, "loss": 0.0434, "step": 14000 }, { "epoch": 5.1869677897075155, "grad_norm": 0.19441038370132446, "learning_rate": 9.779693766218189e-05, "loss": 0.0271, "step": 14010 }, { "epoch": 5.1906701221769715, "grad_norm": 0.4712996184825897, "learning_rate": 9.779208102064465e-05, "loss": 0.0352, "step": 14020 }, { "epoch": 5.194372454646428, "grad_norm": 0.5416314005851746, "learning_rate": 9.778721915263727e-05, "loss": 0.034, "step": 14030 }, { "epoch": 5.198074787115883, "grad_norm": 0.5610817670822144, "learning_rate": 9.778235205869148e-05, "loss": 0.0298, "step": 14040 }, { "epoch": 5.201777119585339, "grad_norm": 0.7416524291038513, "learning_rate": 9.777747973933948e-05, "loss": 0.0372, "step": 14050 }, { "epoch": 5.205479452054795, "grad_norm": 0.5152773857116699, "learning_rate": 9.777260219511415e-05, "loss": 0.0444, "step": 14060 }, { "epoch": 5.20918178452425, "grad_norm": 0.2872082591056824, "learning_rate": 9.776771942654885e-05, "loss": 0.0495, "step": 14070 }, { "epoch": 5.212884116993706, "grad_norm": 0.44778281450271606, "learning_rate": 9.776283143417759e-05, "loss": 0.0307, "step": 14080 }, { "epoch": 5.216586449463162, "grad_norm": 0.4376593232154846, "learning_rate": 9.775793821853488e-05, "loss": 0.0486, "step": 14090 }, { "epoch": 5.220288781932617, "grad_norm": 0.30007144808769226, "learning_rate": 9.775303978015585e-05, "loss": 0.0411, "step": 14100 }, { "epoch": 5.223991114402073, "grad_norm": 0.24668386578559875, "learning_rate": 9.774813611957618e-05, "loss": 0.0348, "step": 14110 }, { "epoch": 5.2276934468715295, "grad_norm": 0.5565990209579468, "learning_rate": 9.774322723733216e-05, "loss": 0.0392, "step": 14120 }, { "epoch": 5.231395779340985, "grad_norm": 0.3717818558216095, "learning_rate": 9.773831313396055e-05, "loss": 0.0368, "step": 14130 }, { "epoch": 5.235098111810441, "grad_norm": 0.6933141946792603, "learning_rate": 9.773339380999882e-05, "loss": 0.0427, "step": 14140 }, { "epoch": 5.238800444279896, "grad_norm": 0.20644691586494446, "learning_rate": 9.772846926598491e-05, "loss": 0.0302, "step": 14150 }, { "epoch": 5.242502776749352, "grad_norm": 0.9931360483169556, "learning_rate": 9.772353950245734e-05, "loss": 0.0391, "step": 14160 }, { "epoch": 5.246205109218808, "grad_norm": 0.9853276014328003, "learning_rate": 9.771860451995525e-05, "loss": 0.0359, "step": 14170 }, { "epoch": 5.249907441688263, "grad_norm": 0.604703426361084, "learning_rate": 9.771366431901831e-05, "loss": 0.0347, "step": 14180 }, { "epoch": 5.253609774157719, "grad_norm": 0.38489648699760437, "learning_rate": 9.77087189001868e-05, "loss": 0.0449, "step": 14190 }, { "epoch": 5.257312106627175, "grad_norm": 0.37190812826156616, "learning_rate": 9.77037682640015e-05, "loss": 0.0334, "step": 14200 }, { "epoch": 5.2610144390966305, "grad_norm": 0.29955556988716125, "learning_rate": 9.769881241100383e-05, "loss": 0.0324, "step": 14210 }, { "epoch": 5.2647167715660865, "grad_norm": 0.6281033158302307, "learning_rate": 9.769385134173573e-05, "loss": 0.0456, "step": 14220 }, { "epoch": 5.268419104035543, "grad_norm": 0.3858572244644165, "learning_rate": 9.768888505673976e-05, "loss": 0.0303, "step": 14230 }, { "epoch": 5.272121436504998, "grad_norm": 0.3038386106491089, "learning_rate": 9.768391355655903e-05, "loss": 0.038, "step": 14240 }, { "epoch": 5.275823768974454, "grad_norm": 0.23065948486328125, "learning_rate": 9.767893684173721e-05, "loss": 0.0335, "step": 14250 }, { "epoch": 5.27952610144391, "grad_norm": 0.49098479747772217, "learning_rate": 9.767395491281855e-05, "loss": 0.0454, "step": 14260 }, { "epoch": 5.283228433913365, "grad_norm": 0.4085569679737091, "learning_rate": 9.766896777034781e-05, "loss": 0.0259, "step": 14270 }, { "epoch": 5.286930766382821, "grad_norm": 0.6155716776847839, "learning_rate": 9.766397541487047e-05, "loss": 0.0298, "step": 14280 }, { "epoch": 5.290633098852277, "grad_norm": 0.3332420289516449, "learning_rate": 9.765897784693243e-05, "loss": 0.0599, "step": 14290 }, { "epoch": 5.294335431321732, "grad_norm": 0.31821179389953613, "learning_rate": 9.765397506708023e-05, "loss": 0.0356, "step": 14300 }, { "epoch": 5.298037763791188, "grad_norm": 0.5388307571411133, "learning_rate": 9.764896707586096e-05, "loss": 0.0338, "step": 14310 }, { "epoch": 5.3017400962606445, "grad_norm": 0.28174489736557007, "learning_rate": 9.764395387382227e-05, "loss": 0.0303, "step": 14320 }, { "epoch": 5.3054424287301, "grad_norm": 0.3525988757610321, "learning_rate": 9.763893546151244e-05, "loss": 0.0303, "step": 14330 }, { "epoch": 5.309144761199556, "grad_norm": 1.404644250869751, "learning_rate": 9.763391183948023e-05, "loss": 0.051, "step": 14340 }, { "epoch": 5.312847093669012, "grad_norm": 0.3305329382419586, "learning_rate": 9.762888300827507e-05, "loss": 0.0358, "step": 14350 }, { "epoch": 5.316549426138467, "grad_norm": 0.3386758267879486, "learning_rate": 9.762384896844684e-05, "loss": 0.0417, "step": 14360 }, { "epoch": 5.320251758607923, "grad_norm": 0.5891600847244263, "learning_rate": 9.76188097205461e-05, "loss": 0.0564, "step": 14370 }, { "epoch": 5.323954091077379, "grad_norm": 0.35766392946243286, "learning_rate": 9.761376526512394e-05, "loss": 0.043, "step": 14380 }, { "epoch": 5.327656423546834, "grad_norm": 0.45796218514442444, "learning_rate": 9.760871560273197e-05, "loss": 0.0498, "step": 14390 }, { "epoch": 5.33135875601629, "grad_norm": 3.0268421173095703, "learning_rate": 9.760366073392246e-05, "loss": 0.0477, "step": 14400 }, { "epoch": 5.335061088485746, "grad_norm": 0.30799534916877747, "learning_rate": 9.759860065924818e-05, "loss": 0.0423, "step": 14410 }, { "epoch": 5.3387634209552015, "grad_norm": 0.34366559982299805, "learning_rate": 9.759353537926247e-05, "loss": 0.0349, "step": 14420 }, { "epoch": 5.342465753424658, "grad_norm": 0.5641450881958008, "learning_rate": 9.758846489451931e-05, "loss": 0.0397, "step": 14430 }, { "epoch": 5.346168085894114, "grad_norm": 0.8382695913314819, "learning_rate": 9.758338920557318e-05, "loss": 0.0339, "step": 14440 }, { "epoch": 5.349870418363569, "grad_norm": 0.5334224104881287, "learning_rate": 9.757830831297914e-05, "loss": 0.0403, "step": 14450 }, { "epoch": 5.353572750833025, "grad_norm": 2.3245322704315186, "learning_rate": 9.757322221729283e-05, "loss": 0.0434, "step": 14460 }, { "epoch": 5.357275083302481, "grad_norm": 0.49561408162117004, "learning_rate": 9.756813091907049e-05, "loss": 0.0397, "step": 14470 }, { "epoch": 5.360977415771936, "grad_norm": 0.35379454493522644, "learning_rate": 9.756303441886885e-05, "loss": 0.0501, "step": 14480 }, { "epoch": 5.364679748241392, "grad_norm": 0.3526187241077423, "learning_rate": 9.755793271724526e-05, "loss": 0.0336, "step": 14490 }, { "epoch": 5.368382080710848, "grad_norm": 0.5160998702049255, "learning_rate": 9.755282581475769e-05, "loss": 0.0377, "step": 14500 }, { "epoch": 5.372084413180303, "grad_norm": 0.4107300043106079, "learning_rate": 9.754771371196456e-05, "loss": 0.0293, "step": 14510 }, { "epoch": 5.3757867456497594, "grad_norm": 0.3015038073062897, "learning_rate": 9.754259640942493e-05, "loss": 0.0484, "step": 14520 }, { "epoch": 5.3794890781192155, "grad_norm": 0.5979242324829102, "learning_rate": 9.753747390769847e-05, "loss": 0.0418, "step": 14530 }, { "epoch": 5.383191410588671, "grad_norm": 0.3456868827342987, "learning_rate": 9.75323462073453e-05, "loss": 0.0494, "step": 14540 }, { "epoch": 5.386893743058127, "grad_norm": 0.6747062802314758, "learning_rate": 9.752721330892624e-05, "loss": 0.0396, "step": 14550 }, { "epoch": 5.390596075527583, "grad_norm": 0.5396828651428223, "learning_rate": 9.752207521300258e-05, "loss": 0.0402, "step": 14560 }, { "epoch": 5.394298407997038, "grad_norm": 0.3512726426124573, "learning_rate": 9.751693192013623e-05, "loss": 0.0404, "step": 14570 }, { "epoch": 5.398000740466494, "grad_norm": 0.5015305876731873, "learning_rate": 9.751178343088963e-05, "loss": 0.0373, "step": 14580 }, { "epoch": 5.40170307293595, "grad_norm": 0.95047527551651, "learning_rate": 9.750662974582584e-05, "loss": 0.0474, "step": 14590 }, { "epoch": 5.405405405405405, "grad_norm": 0.45271649956703186, "learning_rate": 9.750147086550844e-05, "loss": 0.0375, "step": 14600 }, { "epoch": 5.409107737874861, "grad_norm": 1.6235988140106201, "learning_rate": 9.74963067905016e-05, "loss": 0.0414, "step": 14610 }, { "epoch": 5.412810070344317, "grad_norm": 0.38294100761413574, "learning_rate": 9.749113752137007e-05, "loss": 0.0341, "step": 14620 }, { "epoch": 5.4165124028137726, "grad_norm": 0.3867463767528534, "learning_rate": 9.748596305867913e-05, "loss": 0.0335, "step": 14630 }, { "epoch": 5.420214735283229, "grad_norm": 0.28683337569236755, "learning_rate": 9.748078340299466e-05, "loss": 0.0343, "step": 14640 }, { "epoch": 5.423917067752684, "grad_norm": 0.324318528175354, "learning_rate": 9.747559855488313e-05, "loss": 0.0427, "step": 14650 }, { "epoch": 5.42761940022214, "grad_norm": 0.1262185424566269, "learning_rate": 9.747040851491149e-05, "loss": 0.0373, "step": 14660 }, { "epoch": 5.431321732691596, "grad_norm": 0.18249976634979248, "learning_rate": 9.746521328364738e-05, "loss": 0.0403, "step": 14670 }, { "epoch": 5.435024065161051, "grad_norm": 0.3577211797237396, "learning_rate": 9.746001286165887e-05, "loss": 0.0254, "step": 14680 }, { "epoch": 5.438726397630507, "grad_norm": 0.18920058012008667, "learning_rate": 9.745480724951473e-05, "loss": 0.0327, "step": 14690 }, { "epoch": 5.442428730099963, "grad_norm": 0.24698929488658905, "learning_rate": 9.744959644778422e-05, "loss": 0.0252, "step": 14700 }, { "epoch": 5.446131062569418, "grad_norm": 0.5864214301109314, "learning_rate": 9.744438045703717e-05, "loss": 0.031, "step": 14710 }, { "epoch": 5.449833395038874, "grad_norm": 0.29562172293663025, "learning_rate": 9.743915927784402e-05, "loss": 0.0292, "step": 14720 }, { "epoch": 5.4535357275083305, "grad_norm": 0.22318384051322937, "learning_rate": 9.743393291077572e-05, "loss": 0.0366, "step": 14730 }, { "epoch": 5.457238059977786, "grad_norm": 0.26374393701553345, "learning_rate": 9.742870135640382e-05, "loss": 0.0288, "step": 14740 }, { "epoch": 5.460940392447242, "grad_norm": 0.27356788516044617, "learning_rate": 9.742346461530048e-05, "loss": 0.0354, "step": 14750 }, { "epoch": 5.464642724916698, "grad_norm": 0.3913300931453705, "learning_rate": 9.741822268803833e-05, "loss": 0.0457, "step": 14760 }, { "epoch": 5.468345057386153, "grad_norm": 0.39870306849479675, "learning_rate": 9.741297557519064e-05, "loss": 0.038, "step": 14770 }, { "epoch": 5.472047389855609, "grad_norm": 1.4568631649017334, "learning_rate": 9.740772327733123e-05, "loss": 0.035, "step": 14780 }, { "epoch": 5.475749722325065, "grad_norm": 0.46844518184661865, "learning_rate": 9.740246579503447e-05, "loss": 0.0354, "step": 14790 }, { "epoch": 5.47945205479452, "grad_norm": 1.1958684921264648, "learning_rate": 9.739720312887535e-05, "loss": 0.031, "step": 14800 }, { "epoch": 5.483154387263976, "grad_norm": 0.3721141815185547, "learning_rate": 9.739193527942932e-05, "loss": 0.0463, "step": 14810 }, { "epoch": 5.486856719733432, "grad_norm": 0.4626169800758362, "learning_rate": 9.73866622472725e-05, "loss": 0.0356, "step": 14820 }, { "epoch": 5.4905590522028875, "grad_norm": 0.23886245489120483, "learning_rate": 9.738138403298157e-05, "loss": 0.0429, "step": 14830 }, { "epoch": 5.494261384672344, "grad_norm": 1.164699912071228, "learning_rate": 9.737610063713371e-05, "loss": 0.0367, "step": 14840 }, { "epoch": 5.4979637171418, "grad_norm": 0.5211086273193359, "learning_rate": 9.73708120603067e-05, "loss": 0.05, "step": 14850 }, { "epoch": 5.501666049611255, "grad_norm": 0.20329831540584564, "learning_rate": 9.736551830307892e-05, "loss": 0.0452, "step": 14860 }, { "epoch": 5.505368382080711, "grad_norm": 0.41560643911361694, "learning_rate": 9.736021936602926e-05, "loss": 0.034, "step": 14870 }, { "epoch": 5.509070714550167, "grad_norm": 0.9589786529541016, "learning_rate": 9.735491524973722e-05, "loss": 0.0392, "step": 14880 }, { "epoch": 5.512773047019622, "grad_norm": 0.44319236278533936, "learning_rate": 9.734960595478284e-05, "loss": 0.039, "step": 14890 }, { "epoch": 5.516475379489078, "grad_norm": 0.2313593477010727, "learning_rate": 9.734429148174675e-05, "loss": 0.0316, "step": 14900 }, { "epoch": 5.520177711958534, "grad_norm": 3.0369813442230225, "learning_rate": 9.733897183121012e-05, "loss": 0.0333, "step": 14910 }, { "epoch": 5.523880044427989, "grad_norm": 0.6545285582542419, "learning_rate": 9.73336470037547e-05, "loss": 0.0425, "step": 14920 }, { "epoch": 5.5275823768974455, "grad_norm": 0.4394074082374573, "learning_rate": 9.73283169999628e-05, "loss": 0.0482, "step": 14930 }, { "epoch": 5.5312847093669015, "grad_norm": 0.6787770390510559, "learning_rate": 9.732298182041734e-05, "loss": 0.0508, "step": 14940 }, { "epoch": 5.534987041836357, "grad_norm": 1.749817132949829, "learning_rate": 9.731764146570173e-05, "loss": 0.0472, "step": 14950 }, { "epoch": 5.538689374305813, "grad_norm": 0.3023727834224701, "learning_rate": 9.731229593639997e-05, "loss": 0.0429, "step": 14960 }, { "epoch": 5.542391706775268, "grad_norm": 0.4154205322265625, "learning_rate": 9.730694523309668e-05, "loss": 0.0486, "step": 14970 }, { "epoch": 5.546094039244724, "grad_norm": 0.26423853635787964, "learning_rate": 9.730158935637697e-05, "loss": 0.0341, "step": 14980 }, { "epoch": 5.54979637171418, "grad_norm": 0.4516962766647339, "learning_rate": 9.729622830682657e-05, "loss": 0.0389, "step": 14990 }, { "epoch": 5.553498704183635, "grad_norm": 0.5246504545211792, "learning_rate": 9.729086208503174e-05, "loss": 0.0318, "step": 15000 }, { "epoch": 5.557201036653091, "grad_norm": 0.36818090081214905, "learning_rate": 9.728549069157934e-05, "loss": 0.0377, "step": 15010 }, { "epoch": 5.560903369122547, "grad_norm": 0.20630581676959991, "learning_rate": 9.728011412705678e-05, "loss": 0.0312, "step": 15020 }, { "epoch": 5.5646057015920025, "grad_norm": 0.12490003556013107, "learning_rate": 9.727473239205201e-05, "loss": 0.047, "step": 15030 }, { "epoch": 5.568308034061459, "grad_norm": 0.5457134246826172, "learning_rate": 9.726934548715358e-05, "loss": 0.0447, "step": 15040 }, { "epoch": 5.572010366530915, "grad_norm": 0.30024972558021545, "learning_rate": 9.726395341295062e-05, "loss": 0.0492, "step": 15050 }, { "epoch": 5.57571269900037, "grad_norm": 0.3822610080242157, "learning_rate": 9.725855617003275e-05, "loss": 0.0435, "step": 15060 }, { "epoch": 5.579415031469826, "grad_norm": 0.28521785140037537, "learning_rate": 9.725315375899024e-05, "loss": 0.0298, "step": 15070 }, { "epoch": 5.583117363939282, "grad_norm": 0.37411460280418396, "learning_rate": 9.724774618041388e-05, "loss": 0.0297, "step": 15080 }, { "epoch": 5.586819696408737, "grad_norm": 0.5386850237846375, "learning_rate": 9.724233343489504e-05, "loss": 0.0435, "step": 15090 }, { "epoch": 5.590522028878193, "grad_norm": 0.8217758536338806, "learning_rate": 9.723691552302562e-05, "loss": 0.0373, "step": 15100 }, { "epoch": 5.594224361347649, "grad_norm": 0.6294009685516357, "learning_rate": 9.723149244539817e-05, "loss": 0.0241, "step": 15110 }, { "epoch": 5.597926693817104, "grad_norm": 0.5231125354766846, "learning_rate": 9.72260642026057e-05, "loss": 0.0264, "step": 15120 }, { "epoch": 5.6016290262865605, "grad_norm": 0.2294943928718567, "learning_rate": 9.722063079524185e-05, "loss": 0.0322, "step": 15130 }, { "epoch": 5.6053313587560165, "grad_norm": 0.22882111370563507, "learning_rate": 9.721519222390082e-05, "loss": 0.0379, "step": 15140 }, { "epoch": 5.609033691225472, "grad_norm": 0.4310031831264496, "learning_rate": 9.720974848917735e-05, "loss": 0.033, "step": 15150 }, { "epoch": 5.612736023694928, "grad_norm": 0.13932617008686066, "learning_rate": 9.720429959166675e-05, "loss": 0.0375, "step": 15160 }, { "epoch": 5.616438356164384, "grad_norm": 0.11925756186246872, "learning_rate": 9.719884553196495e-05, "loss": 0.0238, "step": 15170 }, { "epoch": 5.620140688633839, "grad_norm": 0.24261805415153503, "learning_rate": 9.719338631066834e-05, "loss": 0.0282, "step": 15180 }, { "epoch": 5.623843021103295, "grad_norm": 0.3845974802970886, "learning_rate": 9.718792192837396e-05, "loss": 0.0458, "step": 15190 }, { "epoch": 5.627545353572751, "grad_norm": 0.47789695858955383, "learning_rate": 9.718245238567939e-05, "loss": 0.0508, "step": 15200 }, { "epoch": 5.631247686042206, "grad_norm": 0.31964144110679626, "learning_rate": 9.717697768318276e-05, "loss": 0.0373, "step": 15210 }, { "epoch": 5.634950018511662, "grad_norm": 0.24289806187152863, "learning_rate": 9.717149782148278e-05, "loss": 0.0408, "step": 15220 }, { "epoch": 5.638652350981118, "grad_norm": 0.8231571316719055, "learning_rate": 9.716601280117873e-05, "loss": 0.0357, "step": 15230 }, { "epoch": 5.642354683450574, "grad_norm": 0.2610478401184082, "learning_rate": 9.716052262287043e-05, "loss": 0.0357, "step": 15240 }, { "epoch": 5.64605701592003, "grad_norm": 0.25202634930610657, "learning_rate": 9.715502728715826e-05, "loss": 0.0598, "step": 15250 }, { "epoch": 5.649759348389486, "grad_norm": 0.6371153593063354, "learning_rate": 9.714952679464323e-05, "loss": 0.0365, "step": 15260 }, { "epoch": 5.653461680858941, "grad_norm": 0.2375907599925995, "learning_rate": 9.714402114592682e-05, "loss": 0.031, "step": 15270 }, { "epoch": 5.657164013328397, "grad_norm": 0.515634298324585, "learning_rate": 9.713851034161114e-05, "loss": 0.0281, "step": 15280 }, { "epoch": 5.660866345797853, "grad_norm": 0.45679914951324463, "learning_rate": 9.713299438229886e-05, "loss": 0.0376, "step": 15290 }, { "epoch": 5.664568678267308, "grad_norm": 0.18195930123329163, "learning_rate": 9.712747326859315e-05, "loss": 0.0402, "step": 15300 }, { "epoch": 5.668271010736764, "grad_norm": 0.22995096445083618, "learning_rate": 9.712194700109784e-05, "loss": 0.043, "step": 15310 }, { "epoch": 5.67197334320622, "grad_norm": 1.3459807634353638, "learning_rate": 9.711641558041724e-05, "loss": 0.0421, "step": 15320 }, { "epoch": 5.675675675675675, "grad_norm": 0.4052985608577728, "learning_rate": 9.711087900715627e-05, "loss": 0.0434, "step": 15330 }, { "epoch": 5.6793780081451315, "grad_norm": 0.385602742433548, "learning_rate": 9.710533728192041e-05, "loss": 0.0278, "step": 15340 }, { "epoch": 5.6830803406145876, "grad_norm": 0.33064964413642883, "learning_rate": 9.709979040531569e-05, "loss": 0.0465, "step": 15350 }, { "epoch": 5.686782673084043, "grad_norm": 0.38488683104515076, "learning_rate": 9.709423837794869e-05, "loss": 0.0394, "step": 15360 }, { "epoch": 5.690485005553499, "grad_norm": 0.1594313085079193, "learning_rate": 9.70886812004266e-05, "loss": 0.0364, "step": 15370 }, { "epoch": 5.694187338022955, "grad_norm": 0.2828812897205353, "learning_rate": 9.708311887335713e-05, "loss": 0.0348, "step": 15380 }, { "epoch": 5.69788967049241, "grad_norm": 0.23148606717586517, "learning_rate": 9.707755139734855e-05, "loss": 0.0343, "step": 15390 }, { "epoch": 5.701592002961866, "grad_norm": 1.250565528869629, "learning_rate": 9.707197877300974e-05, "loss": 0.0467, "step": 15400 }, { "epoch": 5.705294335431322, "grad_norm": 0.24087318778038025, "learning_rate": 9.70664010009501e-05, "loss": 0.0385, "step": 15410 }, { "epoch": 5.708996667900777, "grad_norm": 0.49067211151123047, "learning_rate": 9.706081808177963e-05, "loss": 0.0402, "step": 15420 }, { "epoch": 5.712699000370233, "grad_norm": 0.3807332515716553, "learning_rate": 9.705523001610883e-05, "loss": 0.0297, "step": 15430 }, { "epoch": 5.716401332839689, "grad_norm": 0.2509849965572357, "learning_rate": 9.704963680454883e-05, "loss": 0.0448, "step": 15440 }, { "epoch": 5.720103665309145, "grad_norm": 1.4573026895523071, "learning_rate": 9.704403844771128e-05, "loss": 0.0383, "step": 15450 }, { "epoch": 5.723805997778601, "grad_norm": 0.6331608295440674, "learning_rate": 9.70384349462084e-05, "loss": 0.0453, "step": 15460 }, { "epoch": 5.727508330248057, "grad_norm": 0.27385959029197693, "learning_rate": 9.703282630065302e-05, "loss": 0.0352, "step": 15470 }, { "epoch": 5.731210662717512, "grad_norm": 0.39436665177345276, "learning_rate": 9.702721251165848e-05, "loss": 0.0571, "step": 15480 }, { "epoch": 5.734912995186968, "grad_norm": 0.4851819574832916, "learning_rate": 9.702159357983866e-05, "loss": 0.0354, "step": 15490 }, { "epoch": 5.738615327656424, "grad_norm": 0.33027273416519165, "learning_rate": 9.701596950580806e-05, "loss": 0.0352, "step": 15500 }, { "epoch": 5.742317660125879, "grad_norm": 0.2917264401912689, "learning_rate": 9.701034029018174e-05, "loss": 0.049, "step": 15510 }, { "epoch": 5.746019992595335, "grad_norm": 0.6656366586685181, "learning_rate": 9.700470593357526e-05, "loss": 0.0274, "step": 15520 }, { "epoch": 5.749722325064791, "grad_norm": 0.23660966753959656, "learning_rate": 9.699906643660483e-05, "loss": 0.0268, "step": 15530 }, { "epoch": 5.7534246575342465, "grad_norm": 0.2203638255596161, "learning_rate": 9.699342179988716e-05, "loss": 0.0331, "step": 15540 }, { "epoch": 5.7571269900037025, "grad_norm": 0.20430423319339752, "learning_rate": 9.698777202403953e-05, "loss": 0.0343, "step": 15550 }, { "epoch": 5.760829322473158, "grad_norm": 0.315231055021286, "learning_rate": 9.69821171096798e-05, "loss": 0.0374, "step": 15560 }, { "epoch": 5.764531654942614, "grad_norm": 1.1536118984222412, "learning_rate": 9.697645705742637e-05, "loss": 0.0345, "step": 15570 }, { "epoch": 5.76823398741207, "grad_norm": 0.4724923372268677, "learning_rate": 9.697079186789823e-05, "loss": 0.0322, "step": 15580 }, { "epoch": 5.771936319881525, "grad_norm": 0.6091195940971375, "learning_rate": 9.696512154171492e-05, "loss": 0.0525, "step": 15590 }, { "epoch": 5.775638652350981, "grad_norm": 0.25102925300598145, "learning_rate": 9.695944607949649e-05, "loss": 0.028, "step": 15600 }, { "epoch": 5.779340984820437, "grad_norm": 0.17833241820335388, "learning_rate": 9.695376548186368e-05, "loss": 0.0298, "step": 15610 }, { "epoch": 5.783043317289892, "grad_norm": 1.107251524925232, "learning_rate": 9.694807974943767e-05, "loss": 0.044, "step": 15620 }, { "epoch": 5.786745649759348, "grad_norm": 0.18355421721935272, "learning_rate": 9.694238888284022e-05, "loss": 0.0372, "step": 15630 }, { "epoch": 5.790447982228804, "grad_norm": 0.17884956300258636, "learning_rate": 9.693669288269372e-05, "loss": 0.0434, "step": 15640 }, { "epoch": 5.79415031469826, "grad_norm": 0.19738052785396576, "learning_rate": 9.693099174962103e-05, "loss": 0.034, "step": 15650 }, { "epoch": 5.797852647167716, "grad_norm": 0.2931630313396454, "learning_rate": 9.692528548424567e-05, "loss": 0.0547, "step": 15660 }, { "epoch": 5.801554979637172, "grad_norm": 0.30799078941345215, "learning_rate": 9.691957408719162e-05, "loss": 0.0315, "step": 15670 }, { "epoch": 5.805257312106627, "grad_norm": 0.3385046720504761, "learning_rate": 9.69138575590835e-05, "loss": 0.0478, "step": 15680 }, { "epoch": 5.808959644576083, "grad_norm": 0.29224011301994324, "learning_rate": 9.690813590054645e-05, "loss": 0.0329, "step": 15690 }, { "epoch": 5.812661977045539, "grad_norm": 0.22590769827365875, "learning_rate": 9.690240911220618e-05, "loss": 0.0342, "step": 15700 }, { "epoch": 5.816364309514994, "grad_norm": 0.267183393239975, "learning_rate": 9.689667719468897e-05, "loss": 0.0406, "step": 15710 }, { "epoch": 5.82006664198445, "grad_norm": 0.337457537651062, "learning_rate": 9.689094014862165e-05, "loss": 0.0395, "step": 15720 }, { "epoch": 5.823768974453906, "grad_norm": 0.6789131760597229, "learning_rate": 9.688519797463161e-05, "loss": 0.0355, "step": 15730 }, { "epoch": 5.8274713069233615, "grad_norm": 0.3839409351348877, "learning_rate": 9.68794506733468e-05, "loss": 0.0593, "step": 15740 }, { "epoch": 5.8311736393928175, "grad_norm": 1.139883279800415, "learning_rate": 9.687369824539577e-05, "loss": 0.0335, "step": 15750 }, { "epoch": 5.834875971862274, "grad_norm": 0.15372563898563385, "learning_rate": 9.686794069140756e-05, "loss": 0.036, "step": 15760 }, { "epoch": 5.838578304331729, "grad_norm": 0.17333881556987762, "learning_rate": 9.686217801201182e-05, "loss": 0.0357, "step": 15770 }, { "epoch": 5.842280636801185, "grad_norm": 0.6726700067520142, "learning_rate": 9.685641020783876e-05, "loss": 0.0358, "step": 15780 }, { "epoch": 5.845982969270641, "grad_norm": 0.5687008500099182, "learning_rate": 9.685063727951914e-05, "loss": 0.0291, "step": 15790 }, { "epoch": 5.849685301740096, "grad_norm": 0.36156028509140015, "learning_rate": 9.684485922768422e-05, "loss": 0.0351, "step": 15800 }, { "epoch": 5.853387634209552, "grad_norm": 0.4800468981266022, "learning_rate": 9.683907605296597e-05, "loss": 0.0426, "step": 15810 }, { "epoch": 5.857089966679007, "grad_norm": 0.1666531264781952, "learning_rate": 9.683328775599676e-05, "loss": 0.0469, "step": 15820 }, { "epoch": 5.860792299148463, "grad_norm": 0.24136528372764587, "learning_rate": 9.682749433740962e-05, "loss": 0.0327, "step": 15830 }, { "epoch": 5.864494631617919, "grad_norm": 0.3582218289375305, "learning_rate": 9.68216957978381e-05, "loss": 0.03, "step": 15840 }, { "epoch": 5.868196964087375, "grad_norm": 0.9970892071723938, "learning_rate": 9.681589213791633e-05, "loss": 0.0271, "step": 15850 }, { "epoch": 5.871899296556831, "grad_norm": 0.5047776103019714, "learning_rate": 9.681008335827898e-05, "loss": 0.0359, "step": 15860 }, { "epoch": 5.875601629026287, "grad_norm": 0.1673145443201065, "learning_rate": 9.680426945956129e-05, "loss": 0.047, "step": 15870 }, { "epoch": 5.879303961495742, "grad_norm": 0.6962110996246338, "learning_rate": 9.679845044239906e-05, "loss": 0.0434, "step": 15880 }, { "epoch": 5.883006293965198, "grad_norm": 0.23283414542675018, "learning_rate": 9.679262630742865e-05, "loss": 0.0508, "step": 15890 }, { "epoch": 5.886708626434654, "grad_norm": 0.24731235206127167, "learning_rate": 9.6786797055287e-05, "loss": 0.047, "step": 15900 }, { "epoch": 5.890410958904109, "grad_norm": 0.4676780700683594, "learning_rate": 9.678096268661153e-05, "loss": 0.0347, "step": 15910 }, { "epoch": 5.894113291373565, "grad_norm": 0.48476555943489075, "learning_rate": 9.677512320204035e-05, "loss": 0.0459, "step": 15920 }, { "epoch": 5.897815623843021, "grad_norm": 0.6579380035400391, "learning_rate": 9.676927860221199e-05, "loss": 0.0337, "step": 15930 }, { "epoch": 5.901517956312476, "grad_norm": 0.5175186991691589, "learning_rate": 9.676342888776566e-05, "loss": 0.043, "step": 15940 }, { "epoch": 5.9052202887819325, "grad_norm": 0.21840262413024902, "learning_rate": 9.675757405934103e-05, "loss": 0.0438, "step": 15950 }, { "epoch": 5.908922621251389, "grad_norm": 0.2999378740787506, "learning_rate": 9.675171411757842e-05, "loss": 0.0476, "step": 15960 }, { "epoch": 5.912624953720844, "grad_norm": 0.3655230402946472, "learning_rate": 9.674584906311865e-05, "loss": 0.0331, "step": 15970 }, { "epoch": 5.9163272861903, "grad_norm": 0.5317322611808777, "learning_rate": 9.67399788966031e-05, "loss": 0.0431, "step": 15980 }, { "epoch": 5.920029618659756, "grad_norm": 0.6204783916473389, "learning_rate": 9.673410361867373e-05, "loss": 0.0408, "step": 15990 }, { "epoch": 5.923731951129211, "grad_norm": 0.5740066766738892, "learning_rate": 9.672822322997305e-05, "loss": 0.0341, "step": 16000 }, { "epoch": 5.927434283598667, "grad_norm": 0.6559865474700928, "learning_rate": 9.672233773114413e-05, "loss": 0.0353, "step": 16010 }, { "epoch": 5.931136616068123, "grad_norm": 0.17292578518390656, "learning_rate": 9.671644712283061e-05, "loss": 0.0347, "step": 16020 }, { "epoch": 5.934838948537578, "grad_norm": 0.34373748302459717, "learning_rate": 9.671055140567667e-05, "loss": 0.0392, "step": 16030 }, { "epoch": 5.938541281007034, "grad_norm": 0.4248723089694977, "learning_rate": 9.670465058032705e-05, "loss": 0.0342, "step": 16040 }, { "epoch": 5.94224361347649, "grad_norm": 0.5599302053451538, "learning_rate": 9.669874464742705e-05, "loss": 0.0449, "step": 16050 }, { "epoch": 5.945945945945946, "grad_norm": 0.6548541784286499, "learning_rate": 9.669283360762258e-05, "loss": 0.0437, "step": 16060 }, { "epoch": 5.949648278415402, "grad_norm": 0.4954637587070465, "learning_rate": 9.668691746156e-05, "loss": 0.0354, "step": 16070 }, { "epoch": 5.953350610884858, "grad_norm": 1.3086881637573242, "learning_rate": 9.668099620988631e-05, "loss": 0.0321, "step": 16080 }, { "epoch": 5.957052943354313, "grad_norm": 0.21692593395709991, "learning_rate": 9.667506985324909e-05, "loss": 0.0319, "step": 16090 }, { "epoch": 5.960755275823769, "grad_norm": 0.2744145691394806, "learning_rate": 9.66691383922964e-05, "loss": 0.025, "step": 16100 }, { "epoch": 5.964457608293225, "grad_norm": 0.4217912554740906, "learning_rate": 9.666320182767689e-05, "loss": 0.0317, "step": 16110 }, { "epoch": 5.96815994076268, "grad_norm": 0.3989556133747101, "learning_rate": 9.665726016003977e-05, "loss": 0.0416, "step": 16120 }, { "epoch": 5.971862273232136, "grad_norm": 0.3858644366264343, "learning_rate": 9.665131339003486e-05, "loss": 0.0392, "step": 16130 }, { "epoch": 5.975564605701592, "grad_norm": 0.3333354890346527, "learning_rate": 9.664536151831245e-05, "loss": 0.0414, "step": 16140 }, { "epoch": 5.9792669381710475, "grad_norm": 0.1583453267812729, "learning_rate": 9.663940454552342e-05, "loss": 0.0324, "step": 16150 }, { "epoch": 5.9829692706405035, "grad_norm": 0.17916513979434967, "learning_rate": 9.663344247231922e-05, "loss": 0.0476, "step": 16160 }, { "epoch": 5.98667160310996, "grad_norm": 0.3771452009677887, "learning_rate": 9.66274752993519e-05, "loss": 0.0422, "step": 16170 }, { "epoch": 5.990373935579415, "grad_norm": 0.27213534712791443, "learning_rate": 9.662150302727395e-05, "loss": 0.0357, "step": 16180 }, { "epoch": 5.994076268048871, "grad_norm": 0.26840025186538696, "learning_rate": 9.661552565673855e-05, "loss": 0.0426, "step": 16190 }, { "epoch": 5.997778600518327, "grad_norm": 0.7138798236846924, "learning_rate": 9.660954318839933e-05, "loss": 0.0536, "step": 16200 }, { "epoch": 6.001480932987782, "grad_norm": 0.25824373960494995, "learning_rate": 9.660355562291055e-05, "loss": 0.035, "step": 16210 }, { "epoch": 6.005183265457238, "grad_norm": 0.33051925897598267, "learning_rate": 9.659756296092699e-05, "loss": 0.0425, "step": 16220 }, { "epoch": 6.008885597926694, "grad_norm": 0.37002530694007874, "learning_rate": 9.659156520310402e-05, "loss": 0.0358, "step": 16230 }, { "epoch": 6.012587930396149, "grad_norm": 0.5423300266265869, "learning_rate": 9.658556235009752e-05, "loss": 0.0275, "step": 16240 }, { "epoch": 6.016290262865605, "grad_norm": 0.24624310433864594, "learning_rate": 9.657955440256395e-05, "loss": 0.0363, "step": 16250 }, { "epoch": 6.0199925953350615, "grad_norm": 0.19062738120555878, "learning_rate": 9.657354136116035e-05, "loss": 0.0283, "step": 16260 }, { "epoch": 6.023694927804517, "grad_norm": 0.36930760741233826, "learning_rate": 9.65675232265443e-05, "loss": 0.0277, "step": 16270 }, { "epoch": 6.027397260273973, "grad_norm": 0.8005725741386414, "learning_rate": 9.656149999937391e-05, "loss": 0.0362, "step": 16280 }, { "epoch": 6.031099592743429, "grad_norm": 0.3685515522956848, "learning_rate": 9.655547168030789e-05, "loss": 0.0297, "step": 16290 }, { "epoch": 6.034801925212884, "grad_norm": 0.2758430242538452, "learning_rate": 9.654943827000548e-05, "loss": 0.0384, "step": 16300 }, { "epoch": 6.03850425768234, "grad_norm": 0.2078869342803955, "learning_rate": 9.654339976912648e-05, "loss": 0.0445, "step": 16310 }, { "epoch": 6.042206590151796, "grad_norm": 0.3275933265686035, "learning_rate": 9.653735617833126e-05, "loss": 0.0392, "step": 16320 }, { "epoch": 6.045908922621251, "grad_norm": 0.33614253997802734, "learning_rate": 9.653130749828075e-05, "loss": 0.0292, "step": 16330 }, { "epoch": 6.049611255090707, "grad_norm": 0.4759649634361267, "learning_rate": 9.652525372963638e-05, "loss": 0.0377, "step": 16340 }, { "epoch": 6.0533135875601625, "grad_norm": 0.3318818509578705, "learning_rate": 9.651919487306025e-05, "loss": 0.0408, "step": 16350 }, { "epoch": 6.0570159200296185, "grad_norm": 0.352183073759079, "learning_rate": 9.65131309292149e-05, "loss": 0.0416, "step": 16360 }, { "epoch": 6.060718252499075, "grad_norm": 0.3496416211128235, "learning_rate": 9.650706189876346e-05, "loss": 0.0321, "step": 16370 }, { "epoch": 6.06442058496853, "grad_norm": 0.24956470727920532, "learning_rate": 9.650098778236968e-05, "loss": 0.029, "step": 16380 }, { "epoch": 6.068122917437986, "grad_norm": 0.20541685819625854, "learning_rate": 9.649490858069777e-05, "loss": 0.0314, "step": 16390 }, { "epoch": 6.071825249907442, "grad_norm": 0.6646993160247803, "learning_rate": 9.648882429441257e-05, "loss": 0.0231, "step": 16400 }, { "epoch": 6.075527582376897, "grad_norm": 0.4097200036048889, "learning_rate": 9.648273492417946e-05, "loss": 0.0468, "step": 16410 }, { "epoch": 6.079229914846353, "grad_norm": 0.8726889491081238, "learning_rate": 9.647664047066431e-05, "loss": 0.0522, "step": 16420 }, { "epoch": 6.082932247315809, "grad_norm": 0.3308892846107483, "learning_rate": 9.647054093453365e-05, "loss": 0.0276, "step": 16430 }, { "epoch": 6.086634579785264, "grad_norm": 0.3006655275821686, "learning_rate": 9.64644363164545e-05, "loss": 0.037, "step": 16440 }, { "epoch": 6.09033691225472, "grad_norm": 0.3908838629722595, "learning_rate": 9.645832661709444e-05, "loss": 0.0337, "step": 16450 }, { "epoch": 6.0940392447241765, "grad_norm": 0.541049063205719, "learning_rate": 9.645221183712165e-05, "loss": 0.0261, "step": 16460 }, { "epoch": 6.097741577193632, "grad_norm": 0.2969602346420288, "learning_rate": 9.644609197720481e-05, "loss": 0.0459, "step": 16470 }, { "epoch": 6.101443909663088, "grad_norm": 0.3268207907676697, "learning_rate": 9.643996703801317e-05, "loss": 0.0408, "step": 16480 }, { "epoch": 6.105146242132544, "grad_norm": 0.3666388988494873, "learning_rate": 9.643383702021658e-05, "loss": 0.0376, "step": 16490 }, { "epoch": 6.108848574601999, "grad_norm": 0.2687675654888153, "learning_rate": 9.642770192448536e-05, "loss": 0.029, "step": 16500 }, { "epoch": 6.112550907071455, "grad_norm": 0.6128365397453308, "learning_rate": 9.642156175149046e-05, "loss": 0.0379, "step": 16510 }, { "epoch": 6.116253239540911, "grad_norm": 0.32871121168136597, "learning_rate": 9.641541650190338e-05, "loss": 0.0347, "step": 16520 }, { "epoch": 6.119955572010366, "grad_norm": 0.3759867250919342, "learning_rate": 9.640926617639613e-05, "loss": 0.0341, "step": 16530 }, { "epoch": 6.123657904479822, "grad_norm": 0.2742760479450226, "learning_rate": 9.640311077564131e-05, "loss": 0.0463, "step": 16540 }, { "epoch": 6.127360236949278, "grad_norm": 0.8179171085357666, "learning_rate": 9.639695030031204e-05, "loss": 0.0479, "step": 16550 }, { "epoch": 6.1310625694187335, "grad_norm": 0.5033558011054993, "learning_rate": 9.639078475108206e-05, "loss": 0.0388, "step": 16560 }, { "epoch": 6.13476490188819, "grad_norm": 0.5876268744468689, "learning_rate": 9.63846141286256e-05, "loss": 0.0433, "step": 16570 }, { "epoch": 6.138467234357646, "grad_norm": 0.2539377510547638, "learning_rate": 9.637843843361749e-05, "loss": 0.0313, "step": 16580 }, { "epoch": 6.142169566827101, "grad_norm": 0.3438958525657654, "learning_rate": 9.637225766673307e-05, "loss": 0.0388, "step": 16590 }, { "epoch": 6.145871899296557, "grad_norm": 0.41825249791145325, "learning_rate": 9.636607182864827e-05, "loss": 0.0372, "step": 16600 }, { "epoch": 6.149574231766013, "grad_norm": 0.3539368808269501, "learning_rate": 9.635988092003958e-05, "loss": 0.0371, "step": 16610 }, { "epoch": 6.153276564235468, "grad_norm": 0.6668055057525635, "learning_rate": 9.6353684941584e-05, "loss": 0.0547, "step": 16620 }, { "epoch": 6.156978896704924, "grad_norm": 0.6268014907836914, "learning_rate": 9.634748389395914e-05, "loss": 0.0284, "step": 16630 }, { "epoch": 6.16068122917438, "grad_norm": 0.2415703535079956, "learning_rate": 9.63412777778431e-05, "loss": 0.0294, "step": 16640 }, { "epoch": 6.164383561643835, "grad_norm": 0.3317531943321228, "learning_rate": 9.63350665939146e-05, "loss": 0.046, "step": 16650 }, { "epoch": 6.1680858941132914, "grad_norm": 0.4195694327354431, "learning_rate": 9.632885034285291e-05, "loss": 0.03, "step": 16660 }, { "epoch": 6.1717882265827475, "grad_norm": 0.3410828709602356, "learning_rate": 9.632262902533778e-05, "loss": 0.0471, "step": 16670 }, { "epoch": 6.175490559052203, "grad_norm": 0.3127780258655548, "learning_rate": 9.631640264204958e-05, "loss": 0.0368, "step": 16680 }, { "epoch": 6.179192891521659, "grad_norm": 0.4655129015445709, "learning_rate": 9.631017119366922e-05, "loss": 0.0354, "step": 16690 }, { "epoch": 6.182895223991115, "grad_norm": 0.5220103859901428, "learning_rate": 9.630393468087818e-05, "loss": 0.0294, "step": 16700 }, { "epoch": 6.18659755646057, "grad_norm": 0.35622021555900574, "learning_rate": 9.629769310435844e-05, "loss": 0.0346, "step": 16710 }, { "epoch": 6.190299888930026, "grad_norm": 0.16808712482452393, "learning_rate": 9.62914464647926e-05, "loss": 0.0315, "step": 16720 }, { "epoch": 6.194002221399482, "grad_norm": 1.6080045700073242, "learning_rate": 9.628519476286379e-05, "loss": 0.0347, "step": 16730 }, { "epoch": 6.197704553868937, "grad_norm": 0.6969470381736755, "learning_rate": 9.627893799925565e-05, "loss": 0.0389, "step": 16740 }, { "epoch": 6.201406886338393, "grad_norm": 0.548959493637085, "learning_rate": 9.627267617465243e-05, "loss": 0.0374, "step": 16750 }, { "epoch": 6.2051092188078485, "grad_norm": 0.17218267917633057, "learning_rate": 9.626640928973892e-05, "loss": 0.0501, "step": 16760 }, { "epoch": 6.2088115512773046, "grad_norm": 0.8369432687759399, "learning_rate": 9.626013734520046e-05, "loss": 0.0404, "step": 16770 }, { "epoch": 6.212513883746761, "grad_norm": 0.2733405828475952, "learning_rate": 9.62538603417229e-05, "loss": 0.0385, "step": 16780 }, { "epoch": 6.216216216216216, "grad_norm": 0.7804387807846069, "learning_rate": 9.624757827999273e-05, "loss": 0.049, "step": 16790 }, { "epoch": 6.219918548685672, "grad_norm": 0.19090330600738525, "learning_rate": 9.624129116069694e-05, "loss": 0.0341, "step": 16800 }, { "epoch": 6.223620881155128, "grad_norm": 0.38678011298179626, "learning_rate": 9.623499898452307e-05, "loss": 0.0351, "step": 16810 }, { "epoch": 6.227323213624583, "grad_norm": 0.259072482585907, "learning_rate": 9.622870175215922e-05, "loss": 0.0299, "step": 16820 }, { "epoch": 6.231025546094039, "grad_norm": 0.8292836546897888, "learning_rate": 9.622239946429406e-05, "loss": 0.0315, "step": 16830 }, { "epoch": 6.234727878563495, "grad_norm": 0.5124112963676453, "learning_rate": 9.621609212161677e-05, "loss": 0.0413, "step": 16840 }, { "epoch": 6.23843021103295, "grad_norm": 0.2750075161457062, "learning_rate": 9.620977972481716e-05, "loss": 0.0303, "step": 16850 }, { "epoch": 6.242132543502406, "grad_norm": 0.7554794549942017, "learning_rate": 9.620346227458547e-05, "loss": 0.034, "step": 16860 }, { "epoch": 6.2458348759718625, "grad_norm": 0.4528452754020691, "learning_rate": 9.619713977161265e-05, "loss": 0.0423, "step": 16870 }, { "epoch": 6.249537208441318, "grad_norm": 0.26291462779045105, "learning_rate": 9.619081221659007e-05, "loss": 0.0241, "step": 16880 }, { "epoch": 6.253239540910774, "grad_norm": 0.13799603283405304, "learning_rate": 9.618447961020971e-05, "loss": 0.0357, "step": 16890 }, { "epoch": 6.25694187338023, "grad_norm": 1.811971664428711, "learning_rate": 9.617814195316411e-05, "loss": 0.0285, "step": 16900 }, { "epoch": 6.260644205849685, "grad_norm": 0.41995540261268616, "learning_rate": 9.617179924614631e-05, "loss": 0.0258, "step": 16910 }, { "epoch": 6.264346538319141, "grad_norm": 0.8568551540374756, "learning_rate": 9.616545148984997e-05, "loss": 0.0375, "step": 16920 }, { "epoch": 6.268048870788597, "grad_norm": 0.5983242988586426, "learning_rate": 9.615909868496928e-05, "loss": 0.04, "step": 16930 }, { "epoch": 6.271751203258052, "grad_norm": 0.38276001811027527, "learning_rate": 9.615274083219894e-05, "loss": 0.0244, "step": 16940 }, { "epoch": 6.275453535727508, "grad_norm": 0.5456444025039673, "learning_rate": 9.614637793223425e-05, "loss": 0.0399, "step": 16950 }, { "epoch": 6.279155868196964, "grad_norm": 0.21752169728279114, "learning_rate": 9.614000998577106e-05, "loss": 0.0359, "step": 16960 }, { "epoch": 6.2828582006664195, "grad_norm": 0.3181115984916687, "learning_rate": 9.613363699350575e-05, "loss": 0.0314, "step": 16970 }, { "epoch": 6.286560533135876, "grad_norm": 0.1967797428369522, "learning_rate": 9.612725895613526e-05, "loss": 0.0314, "step": 16980 }, { "epoch": 6.290262865605332, "grad_norm": 0.5341852903366089, "learning_rate": 9.612087587435707e-05, "loss": 0.0313, "step": 16990 }, { "epoch": 6.293965198074787, "grad_norm": 15.825501441955566, "learning_rate": 9.611448774886924e-05, "loss": 0.0379, "step": 17000 }, { "epoch": 6.297667530544243, "grad_norm": 1.4880977869033813, "learning_rate": 9.610809458037037e-05, "loss": 0.0361, "step": 17010 }, { "epoch": 6.301369863013699, "grad_norm": 0.23238743841648102, "learning_rate": 9.610169636955958e-05, "loss": 0.0315, "step": 17020 }, { "epoch": 6.305072195483154, "grad_norm": 0.33040231466293335, "learning_rate": 9.609529311713661e-05, "loss": 0.032, "step": 17030 }, { "epoch": 6.30877452795261, "grad_norm": 0.18484333157539368, "learning_rate": 9.608888482380168e-05, "loss": 0.0345, "step": 17040 }, { "epoch": 6.312476860422066, "grad_norm": 0.2639807164669037, "learning_rate": 9.60824714902556e-05, "loss": 0.0408, "step": 17050 }, { "epoch": 6.316179192891521, "grad_norm": 0.3887314796447754, "learning_rate": 9.607605311719972e-05, "loss": 0.0265, "step": 17060 }, { "epoch": 6.3198815253609775, "grad_norm": 0.5693978071212769, "learning_rate": 9.606962970533595e-05, "loss": 0.052, "step": 17070 }, { "epoch": 6.3235838578304335, "grad_norm": 0.28259143233299255, "learning_rate": 9.606320125536673e-05, "loss": 0.0411, "step": 17080 }, { "epoch": 6.327286190299889, "grad_norm": 0.3018623888492584, "learning_rate": 9.605676776799508e-05, "loss": 0.0437, "step": 17090 }, { "epoch": 6.330988522769345, "grad_norm": 0.5651513338088989, "learning_rate": 9.605032924392457e-05, "loss": 0.0363, "step": 17100 }, { "epoch": 6.334690855238801, "grad_norm": 1.9293724298477173, "learning_rate": 9.604388568385926e-05, "loss": 0.043, "step": 17110 }, { "epoch": 6.338393187708256, "grad_norm": 0.1826363503932953, "learning_rate": 9.603743708850386e-05, "loss": 0.0401, "step": 17120 }, { "epoch": 6.342095520177712, "grad_norm": 0.2400437295436859, "learning_rate": 9.603098345856354e-05, "loss": 0.0272, "step": 17130 }, { "epoch": 6.345797852647168, "grad_norm": 0.3028673231601715, "learning_rate": 9.602452479474408e-05, "loss": 0.0354, "step": 17140 }, { "epoch": 6.349500185116623, "grad_norm": 0.3906211853027344, "learning_rate": 9.601806109775179e-05, "loss": 0.0375, "step": 17150 }, { "epoch": 6.353202517586079, "grad_norm": 0.41113123297691345, "learning_rate": 9.601159236829352e-05, "loss": 0.0312, "step": 17160 }, { "epoch": 6.356904850055535, "grad_norm": 0.18676267564296722, "learning_rate": 9.600511860707669e-05, "loss": 0.0264, "step": 17170 }, { "epoch": 6.360607182524991, "grad_norm": 2.109422445297241, "learning_rate": 9.599863981480926e-05, "loss": 0.0381, "step": 17180 }, { "epoch": 6.364309514994447, "grad_norm": 0.23500250279903412, "learning_rate": 9.599215599219973e-05, "loss": 0.0266, "step": 17190 }, { "epoch": 6.368011847463903, "grad_norm": 0.32772669196128845, "learning_rate": 9.598566713995718e-05, "loss": 0.0368, "step": 17200 }, { "epoch": 6.371714179933358, "grad_norm": 1.6876620054244995, "learning_rate": 9.59791732587912e-05, "loss": 0.0329, "step": 17210 }, { "epoch": 6.375416512402814, "grad_norm": 0.3294353485107422, "learning_rate": 9.597267434941196e-05, "loss": 0.0369, "step": 17220 }, { "epoch": 6.37911884487227, "grad_norm": 1.3587802648544312, "learning_rate": 9.596617041253018e-05, "loss": 0.0434, "step": 17230 }, { "epoch": 6.382821177341725, "grad_norm": 0.3247582018375397, "learning_rate": 9.59596614488571e-05, "loss": 0.0427, "step": 17240 }, { "epoch": 6.386523509811181, "grad_norm": 0.4163466691970825, "learning_rate": 9.595314745910456e-05, "loss": 0.0384, "step": 17250 }, { "epoch": 6.390225842280636, "grad_norm": 0.724882185459137, "learning_rate": 9.59466284439849e-05, "loss": 0.0424, "step": 17260 }, { "epoch": 6.3939281747500925, "grad_norm": 0.6465385556221008, "learning_rate": 9.594010440421102e-05, "loss": 0.0375, "step": 17270 }, { "epoch": 6.3976305072195485, "grad_norm": 0.22866930067539215, "learning_rate": 9.59335753404964e-05, "loss": 0.0302, "step": 17280 }, { "epoch": 6.401332839689004, "grad_norm": 0.412992924451828, "learning_rate": 9.592704125355505e-05, "loss": 0.0539, "step": 17290 }, { "epoch": 6.40503517215846, "grad_norm": 0.8773066401481628, "learning_rate": 9.59205021441015e-05, "loss": 0.0412, "step": 17300 }, { "epoch": 6.408737504627916, "grad_norm": 0.7320972084999084, "learning_rate": 9.591395801285091e-05, "loss": 0.0341, "step": 17310 }, { "epoch": 6.412439837097371, "grad_norm": 0.3864886462688446, "learning_rate": 9.590740886051885e-05, "loss": 0.0388, "step": 17320 }, { "epoch": 6.416142169566827, "grad_norm": 0.3122796416282654, "learning_rate": 9.590085468782162e-05, "loss": 0.0335, "step": 17330 }, { "epoch": 6.419844502036283, "grad_norm": 0.29090866446495056, "learning_rate": 9.589429549547592e-05, "loss": 0.0273, "step": 17340 }, { "epoch": 6.423546834505738, "grad_norm": 0.842481255531311, "learning_rate": 9.588773128419906e-05, "loss": 0.057, "step": 17350 }, { "epoch": 6.427249166975194, "grad_norm": 0.3693122863769531, "learning_rate": 9.588116205470891e-05, "loss": 0.0357, "step": 17360 }, { "epoch": 6.43095149944465, "grad_norm": 0.250253826379776, "learning_rate": 9.587458780772385e-05, "loss": 0.0264, "step": 17370 }, { "epoch": 6.434653831914106, "grad_norm": 0.43036016821861267, "learning_rate": 9.586800854396283e-05, "loss": 0.0202, "step": 17380 }, { "epoch": 6.438356164383562, "grad_norm": 0.4271714687347412, "learning_rate": 9.586142426414538e-05, "loss": 0.0333, "step": 17390 }, { "epoch": 6.442058496853018, "grad_norm": 0.19166572391986847, "learning_rate": 9.58548349689915e-05, "loss": 0.0318, "step": 17400 }, { "epoch": 6.445760829322473, "grad_norm": 0.23017355799674988, "learning_rate": 9.584824065922182e-05, "loss": 0.0336, "step": 17410 }, { "epoch": 6.449463161791929, "grad_norm": 0.3874351382255554, "learning_rate": 9.584164133555749e-05, "loss": 0.0222, "step": 17420 }, { "epoch": 6.453165494261385, "grad_norm": 0.3184267580509186, "learning_rate": 9.583503699872016e-05, "loss": 0.0379, "step": 17430 }, { "epoch": 6.45686782673084, "grad_norm": 5.639077186584473, "learning_rate": 9.58284276494321e-05, "loss": 0.0383, "step": 17440 }, { "epoch": 6.460570159200296, "grad_norm": 0.19072987139225006, "learning_rate": 9.582181328841611e-05, "loss": 0.0277, "step": 17450 }, { "epoch": 6.464272491669752, "grad_norm": 0.29667624831199646, "learning_rate": 9.581519391639549e-05, "loss": 0.0345, "step": 17460 }, { "epoch": 6.467974824139207, "grad_norm": 0.21199139952659607, "learning_rate": 9.580856953409416e-05, "loss": 0.0417, "step": 17470 }, { "epoch": 6.4716771566086635, "grad_norm": 0.8528262376785278, "learning_rate": 9.580194014223653e-05, "loss": 0.0391, "step": 17480 }, { "epoch": 6.4753794890781196, "grad_norm": 0.2429375797510147, "learning_rate": 9.57953057415476e-05, "loss": 0.0418, "step": 17490 }, { "epoch": 6.479081821547575, "grad_norm": 0.22531534731388092, "learning_rate": 9.578866633275288e-05, "loss": 0.0446, "step": 17500 }, { "epoch": 6.482784154017031, "grad_norm": 0.40845465660095215, "learning_rate": 9.578202191657844e-05, "loss": 0.0393, "step": 17510 }, { "epoch": 6.486486486486487, "grad_norm": 0.29067522287368774, "learning_rate": 9.577537249375093e-05, "loss": 0.0356, "step": 17520 }, { "epoch": 6.490188818955942, "grad_norm": 0.2506934702396393, "learning_rate": 9.57687180649975e-05, "loss": 0.0228, "step": 17530 }, { "epoch": 6.493891151425398, "grad_norm": 0.5537269115447998, "learning_rate": 9.576205863104588e-05, "loss": 0.0305, "step": 17540 }, { "epoch": 6.497593483894854, "grad_norm": 0.5472648739814758, "learning_rate": 9.575539419262434e-05, "loss": 0.042, "step": 17550 }, { "epoch": 6.501295816364309, "grad_norm": 0.48875290155410767, "learning_rate": 9.574872475046166e-05, "loss": 0.0431, "step": 17560 }, { "epoch": 6.504998148833765, "grad_norm": 0.48074859380722046, "learning_rate": 9.574205030528723e-05, "loss": 0.0349, "step": 17570 }, { "epoch": 6.5087004813032205, "grad_norm": 0.5005581974983215, "learning_rate": 9.573537085783095e-05, "loss": 0.0315, "step": 17580 }, { "epoch": 6.512402813772677, "grad_norm": 0.703551173210144, "learning_rate": 9.572868640882328e-05, "loss": 0.0516, "step": 17590 }, { "epoch": 6.516105146242133, "grad_norm": 0.2450946718454361, "learning_rate": 9.572199695899522e-05, "loss": 0.0325, "step": 17600 }, { "epoch": 6.519807478711588, "grad_norm": 1.2083258628845215, "learning_rate": 9.571530250907832e-05, "loss": 0.0453, "step": 17610 }, { "epoch": 6.523509811181044, "grad_norm": 0.9560840725898743, "learning_rate": 9.570860305980466e-05, "loss": 0.04, "step": 17620 }, { "epoch": 6.5272121436505, "grad_norm": 0.19327126443386078, "learning_rate": 9.570189861190689e-05, "loss": 0.0341, "step": 17630 }, { "epoch": 6.530914476119955, "grad_norm": 0.3366299271583557, "learning_rate": 9.569518916611819e-05, "loss": 0.0452, "step": 17640 }, { "epoch": 6.534616808589411, "grad_norm": 0.8444273471832275, "learning_rate": 9.568847472317232e-05, "loss": 0.0332, "step": 17650 }, { "epoch": 6.538319141058867, "grad_norm": 0.2960146367549896, "learning_rate": 9.568175528380354e-05, "loss": 0.04, "step": 17660 }, { "epoch": 6.542021473528322, "grad_norm": 0.19330039620399475, "learning_rate": 9.567503084874669e-05, "loss": 0.0316, "step": 17670 }, { "epoch": 6.5457238059977785, "grad_norm": 0.5117852091789246, "learning_rate": 9.566830141873714e-05, "loss": 0.0317, "step": 17680 }, { "epoch": 6.5494261384672345, "grad_norm": 0.3125264644622803, "learning_rate": 9.56615669945108e-05, "loss": 0.0312, "step": 17690 }, { "epoch": 6.55312847093669, "grad_norm": 0.3813202381134033, "learning_rate": 9.565482757680415e-05, "loss": 0.0501, "step": 17700 }, { "epoch": 6.556830803406146, "grad_norm": 0.8920242786407471, "learning_rate": 9.564808316635419e-05, "loss": 0.04, "step": 17710 }, { "epoch": 6.560533135875602, "grad_norm": 0.5337396264076233, "learning_rate": 9.564133376389849e-05, "loss": 0.0395, "step": 17720 }, { "epoch": 6.564235468345057, "grad_norm": 0.18919062614440918, "learning_rate": 9.563457937017515e-05, "loss": 0.0373, "step": 17730 }, { "epoch": 6.567937800814513, "grad_norm": 0.5932152271270752, "learning_rate": 9.562781998592281e-05, "loss": 0.0408, "step": 17740 }, { "epoch": 6.571640133283969, "grad_norm": 0.23739691078662872, "learning_rate": 9.562105561188069e-05, "loss": 0.031, "step": 17750 }, { "epoch": 6.575342465753424, "grad_norm": 2.6960842609405518, "learning_rate": 9.561428624878852e-05, "loss": 0.0374, "step": 17760 }, { "epoch": 6.57904479822288, "grad_norm": 0.4722159802913666, "learning_rate": 9.560751189738657e-05, "loss": 0.0345, "step": 17770 }, { "epoch": 6.582747130692336, "grad_norm": 2.889413595199585, "learning_rate": 9.560073255841571e-05, "loss": 0.0322, "step": 17780 }, { "epoch": 6.586449463161792, "grad_norm": 0.2936749756336212, "learning_rate": 9.55939482326173e-05, "loss": 0.029, "step": 17790 }, { "epoch": 6.590151795631248, "grad_norm": 0.4156076908111572, "learning_rate": 9.558715892073323e-05, "loss": 0.0386, "step": 17800 }, { "epoch": 6.593854128100704, "grad_norm": 0.356571763753891, "learning_rate": 9.558036462350602e-05, "loss": 0.0318, "step": 17810 }, { "epoch": 6.597556460570159, "grad_norm": 0.32922565937042236, "learning_rate": 9.557356534167866e-05, "loss": 0.0277, "step": 17820 }, { "epoch": 6.601258793039615, "grad_norm": 0.2690236270427704, "learning_rate": 9.556676107599472e-05, "loss": 0.03, "step": 17830 }, { "epoch": 6.604961125509071, "grad_norm": 0.9357364773750305, "learning_rate": 9.555995182719827e-05, "loss": 0.0267, "step": 17840 }, { "epoch": 6.608663457978526, "grad_norm": 0.8764751553535461, "learning_rate": 9.555313759603402e-05, "loss": 0.0344, "step": 17850 }, { "epoch": 6.612365790447982, "grad_norm": 0.2879052758216858, "learning_rate": 9.554631838324713e-05, "loss": 0.0303, "step": 17860 }, { "epoch": 6.616068122917438, "grad_norm": 2.684232711791992, "learning_rate": 9.553949418958332e-05, "loss": 0.0295, "step": 17870 }, { "epoch": 6.6197704553868935, "grad_norm": 0.31763553619384766, "learning_rate": 9.55326650157889e-05, "loss": 0.0254, "step": 17880 }, { "epoch": 6.6234727878563495, "grad_norm": 0.39555761218070984, "learning_rate": 9.552583086261069e-05, "loss": 0.0463, "step": 17890 }, { "epoch": 6.627175120325806, "grad_norm": 0.8678424954414368, "learning_rate": 9.551899173079607e-05, "loss": 0.0336, "step": 17900 }, { "epoch": 6.630877452795261, "grad_norm": 0.3626122772693634, "learning_rate": 9.551214762109295e-05, "loss": 0.0402, "step": 17910 }, { "epoch": 6.634579785264717, "grad_norm": 0.33596566319465637, "learning_rate": 9.550529853424979e-05, "loss": 0.0561, "step": 17920 }, { "epoch": 6.638282117734173, "grad_norm": 0.3032425343990326, "learning_rate": 9.549844447101559e-05, "loss": 0.024, "step": 17930 }, { "epoch": 6.641984450203628, "grad_norm": 0.210702583193779, "learning_rate": 9.549158543213992e-05, "loss": 0.0278, "step": 17940 }, { "epoch": 6.645686782673084, "grad_norm": 1.016567349433899, "learning_rate": 9.548472141837286e-05, "loss": 0.045, "step": 17950 }, { "epoch": 6.64938911514254, "grad_norm": 0.2578694820404053, "learning_rate": 9.547785243046505e-05, "loss": 0.0304, "step": 17960 }, { "epoch": 6.653091447611995, "grad_norm": 0.3059206008911133, "learning_rate": 9.547097846916769e-05, "loss": 0.0365, "step": 17970 }, { "epoch": 6.656793780081451, "grad_norm": 0.24779878556728363, "learning_rate": 9.546409953523247e-05, "loss": 0.0289, "step": 17980 }, { "epoch": 6.6604961125509075, "grad_norm": 0.22249463200569153, "learning_rate": 9.545721562941168e-05, "loss": 0.0334, "step": 17990 }, { "epoch": 6.664198445020363, "grad_norm": 0.7858086824417114, "learning_rate": 9.545032675245813e-05, "loss": 0.0314, "step": 18000 }, { "epoch": 6.667900777489819, "grad_norm": 0.41931232810020447, "learning_rate": 9.544343290512519e-05, "loss": 0.0369, "step": 18010 }, { "epoch": 6.671603109959275, "grad_norm": 0.3093984127044678, "learning_rate": 9.543653408816674e-05, "loss": 0.0423, "step": 18020 }, { "epoch": 6.67530544242873, "grad_norm": 0.6579623818397522, "learning_rate": 9.542963030233724e-05, "loss": 0.039, "step": 18030 }, { "epoch": 6.679007774898186, "grad_norm": 1.2468937635421753, "learning_rate": 9.542272154839167e-05, "loss": 0.0302, "step": 18040 }, { "epoch": 6.682710107367642, "grad_norm": 0.3760744035243988, "learning_rate": 9.541580782708557e-05, "loss": 0.0344, "step": 18050 }, { "epoch": 6.686412439837097, "grad_norm": 0.3937174081802368, "learning_rate": 9.540888913917501e-05, "loss": 0.0338, "step": 18060 }, { "epoch": 6.690114772306553, "grad_norm": 0.31030935049057007, "learning_rate": 9.54019654854166e-05, "loss": 0.0329, "step": 18070 }, { "epoch": 6.693817104776009, "grad_norm": 0.3404783606529236, "learning_rate": 9.53950368665675e-05, "loss": 0.0291, "step": 18080 }, { "epoch": 6.6975194372454645, "grad_norm": 0.2959967255592346, "learning_rate": 9.538810328338543e-05, "loss": 0.0262, "step": 18090 }, { "epoch": 6.701221769714921, "grad_norm": 0.3140711486339569, "learning_rate": 9.538116473662861e-05, "loss": 0.0572, "step": 18100 }, { "epoch": 6.704924102184377, "grad_norm": 0.3773135542869568, "learning_rate": 9.537422122705585e-05, "loss": 0.0328, "step": 18110 }, { "epoch": 6.708626434653832, "grad_norm": 0.319583535194397, "learning_rate": 9.536727275542648e-05, "loss": 0.0364, "step": 18120 }, { "epoch": 6.712328767123288, "grad_norm": 0.5526044368743896, "learning_rate": 9.536031932250036e-05, "loss": 0.0306, "step": 18130 }, { "epoch": 6.716031099592744, "grad_norm": 0.420190691947937, "learning_rate": 9.535336092903793e-05, "loss": 0.0388, "step": 18140 }, { "epoch": 6.719733432062199, "grad_norm": 0.18137121200561523, "learning_rate": 9.534639757580013e-05, "loss": 0.0337, "step": 18150 }, { "epoch": 6.723435764531655, "grad_norm": 0.4472156763076782, "learning_rate": 9.533942926354847e-05, "loss": 0.0375, "step": 18160 }, { "epoch": 6.72713809700111, "grad_norm": 0.2999706566333771, "learning_rate": 9.5332455993045e-05, "loss": 0.0335, "step": 18170 }, { "epoch": 6.730840429470566, "grad_norm": 0.49739089608192444, "learning_rate": 9.532547776505229e-05, "loss": 0.0351, "step": 18180 }, { "epoch": 6.734542761940022, "grad_norm": 0.3553900718688965, "learning_rate": 9.531849458033349e-05, "loss": 0.029, "step": 18190 }, { "epoch": 6.738245094409478, "grad_norm": 0.5915079116821289, "learning_rate": 9.531150643965223e-05, "loss": 0.0378, "step": 18200 }, { "epoch": 6.741947426878934, "grad_norm": 0.4874799847602844, "learning_rate": 9.530451334377277e-05, "loss": 0.0393, "step": 18210 }, { "epoch": 6.74564975934839, "grad_norm": 0.27570387721061707, "learning_rate": 9.529751529345984e-05, "loss": 0.0359, "step": 18220 }, { "epoch": 6.749352091817845, "grad_norm": 0.6256853342056274, "learning_rate": 9.529051228947875e-05, "loss": 0.0304, "step": 18230 }, { "epoch": 6.753054424287301, "grad_norm": 0.4091845452785492, "learning_rate": 9.528350433259531e-05, "loss": 0.0301, "step": 18240 }, { "epoch": 6.756756756756757, "grad_norm": 1.0798511505126953, "learning_rate": 9.527649142357596e-05, "loss": 0.0286, "step": 18250 }, { "epoch": 6.760459089226212, "grad_norm": 0.25857478380203247, "learning_rate": 9.526947356318754e-05, "loss": 0.028, "step": 18260 }, { "epoch": 6.764161421695668, "grad_norm": 0.2803562581539154, "learning_rate": 9.526245075219757e-05, "loss": 0.0357, "step": 18270 }, { "epoch": 6.767863754165124, "grad_norm": 3.1643319129943848, "learning_rate": 9.525542299137402e-05, "loss": 0.0262, "step": 18280 }, { "epoch": 6.7715660866345795, "grad_norm": 0.3033042848110199, "learning_rate": 9.524839028148547e-05, "loss": 0.0338, "step": 18290 }, { "epoch": 6.7752684191040355, "grad_norm": 1.0253231525421143, "learning_rate": 9.524135262330098e-05, "loss": 0.0471, "step": 18300 }, { "epoch": 6.778970751573492, "grad_norm": 0.4890193045139313, "learning_rate": 9.523431001759019e-05, "loss": 0.039, "step": 18310 }, { "epoch": 6.782673084042947, "grad_norm": 1.0785274505615234, "learning_rate": 9.522726246512326e-05, "loss": 0.0364, "step": 18320 }, { "epoch": 6.786375416512403, "grad_norm": 0.21980999410152435, "learning_rate": 9.522020996667092e-05, "loss": 0.0366, "step": 18330 }, { "epoch": 6.790077748981859, "grad_norm": 0.2293756604194641, "learning_rate": 9.521315252300438e-05, "loss": 0.0603, "step": 18340 }, { "epoch": 6.793780081451314, "grad_norm": 0.1973317712545395, "learning_rate": 9.520609013489547e-05, "loss": 0.0309, "step": 18350 }, { "epoch": 6.79748241392077, "grad_norm": 0.3510529100894928, "learning_rate": 9.519902280311653e-05, "loss": 0.0445, "step": 18360 }, { "epoch": 6.801184746390226, "grad_norm": 1.1823757886886597, "learning_rate": 9.519195052844039e-05, "loss": 0.0408, "step": 18370 }, { "epoch": 6.804887078859681, "grad_norm": 0.24163515865802765, "learning_rate": 9.518487331164048e-05, "loss": 0.0353, "step": 18380 }, { "epoch": 6.808589411329137, "grad_norm": 0.2761327028274536, "learning_rate": 9.517779115349077e-05, "loss": 0.0417, "step": 18390 }, { "epoch": 6.8122917437985935, "grad_norm": 0.5271710157394409, "learning_rate": 9.517070405476575e-05, "loss": 0.0518, "step": 18400 }, { "epoch": 6.815994076268049, "grad_norm": 0.3580666780471802, "learning_rate": 9.516361201624043e-05, "loss": 0.0287, "step": 18410 }, { "epoch": 6.819696408737505, "grad_norm": 0.347884863615036, "learning_rate": 9.51565150386904e-05, "loss": 0.0255, "step": 18420 }, { "epoch": 6.823398741206961, "grad_norm": 0.43501436710357666, "learning_rate": 9.51494131228918e-05, "loss": 0.034, "step": 18430 }, { "epoch": 6.827101073676416, "grad_norm": 0.6786378026008606, "learning_rate": 9.514230626962127e-05, "loss": 0.0375, "step": 18440 }, { "epoch": 6.830803406145872, "grad_norm": 0.13378579914569855, "learning_rate": 9.513519447965595e-05, "loss": 0.0289, "step": 18450 }, { "epoch": 6.834505738615327, "grad_norm": 0.3840615153312683, "learning_rate": 9.512807775377366e-05, "loss": 0.0337, "step": 18460 }, { "epoch": 6.838208071084783, "grad_norm": 0.22734865546226501, "learning_rate": 9.512095609275263e-05, "loss": 0.0377, "step": 18470 }, { "epoch": 6.841910403554239, "grad_norm": 0.2885906398296356, "learning_rate": 9.511382949737166e-05, "loss": 0.0364, "step": 18480 }, { "epoch": 6.8456127360236945, "grad_norm": 0.32127442955970764, "learning_rate": 9.510669796841014e-05, "loss": 0.0401, "step": 18490 }, { "epoch": 6.8493150684931505, "grad_norm": 0.38105425238609314, "learning_rate": 9.509956150664796e-05, "loss": 0.0304, "step": 18500 }, { "epoch": 6.853017400962607, "grad_norm": 0.2569659948348999, "learning_rate": 9.509242011286552e-05, "loss": 0.0368, "step": 18510 }, { "epoch": 6.856719733432062, "grad_norm": 0.3877871632575989, "learning_rate": 9.508527378784383e-05, "loss": 0.0359, "step": 18520 }, { "epoch": 6.860422065901518, "grad_norm": 0.11447782069444656, "learning_rate": 9.507812253236435e-05, "loss": 0.0278, "step": 18530 }, { "epoch": 6.864124398370974, "grad_norm": 0.3126146197319031, "learning_rate": 9.507096634720918e-05, "loss": 0.0335, "step": 18540 }, { "epoch": 6.867826730840429, "grad_norm": 0.22635827958583832, "learning_rate": 9.50638052331609e-05, "loss": 0.039, "step": 18550 }, { "epoch": 6.871529063309885, "grad_norm": 0.3325413465499878, "learning_rate": 9.505663919100264e-05, "loss": 0.0377, "step": 18560 }, { "epoch": 6.875231395779341, "grad_norm": 0.16681741178035736, "learning_rate": 9.504946822151804e-05, "loss": 0.0299, "step": 18570 }, { "epoch": 6.878933728248796, "grad_norm": 0.47735363245010376, "learning_rate": 9.504229232549134e-05, "loss": 0.0336, "step": 18580 }, { "epoch": 6.882636060718252, "grad_norm": 0.2898331880569458, "learning_rate": 9.503511150370727e-05, "loss": 0.0362, "step": 18590 }, { "epoch": 6.8863383931877085, "grad_norm": 0.39174285531044006, "learning_rate": 9.502792575695112e-05, "loss": 0.0247, "step": 18600 }, { "epoch": 6.890040725657164, "grad_norm": 0.25644487142562866, "learning_rate": 9.502073508600869e-05, "loss": 0.0247, "step": 18610 }, { "epoch": 6.89374305812662, "grad_norm": 0.48524534702301025, "learning_rate": 9.501353949166637e-05, "loss": 0.0387, "step": 18620 }, { "epoch": 6.897445390596076, "grad_norm": 0.4486519694328308, "learning_rate": 9.500633897471106e-05, "loss": 0.0287, "step": 18630 }, { "epoch": 6.901147723065531, "grad_norm": 0.2110287994146347, "learning_rate": 9.499913353593015e-05, "loss": 0.0469, "step": 18640 }, { "epoch": 6.904850055534987, "grad_norm": 0.3026105761528015, "learning_rate": 9.499192317611167e-05, "loss": 0.0281, "step": 18650 }, { "epoch": 6.908552388004443, "grad_norm": 0.3369300365447998, "learning_rate": 9.498470789604413e-05, "loss": 0.0441, "step": 18660 }, { "epoch": 6.912254720473898, "grad_norm": 0.15630415081977844, "learning_rate": 9.497748769651655e-05, "loss": 0.0453, "step": 18670 }, { "epoch": 6.915957052943354, "grad_norm": 0.3863113820552826, "learning_rate": 9.497026257831855e-05, "loss": 0.023, "step": 18680 }, { "epoch": 6.91965938541281, "grad_norm": 0.563814103603363, "learning_rate": 9.496303254224024e-05, "loss": 0.0354, "step": 18690 }, { "epoch": 6.9233617178822655, "grad_norm": 0.3622777760028839, "learning_rate": 9.49557975890723e-05, "loss": 0.0425, "step": 18700 }, { "epoch": 6.927064050351722, "grad_norm": 0.19050171971321106, "learning_rate": 9.494855771960593e-05, "loss": 0.0291, "step": 18710 }, { "epoch": 6.930766382821178, "grad_norm": 0.23894785344600677, "learning_rate": 9.494131293463285e-05, "loss": 0.0286, "step": 18720 }, { "epoch": 6.934468715290633, "grad_norm": 0.34749525785446167, "learning_rate": 9.493406323494535e-05, "loss": 0.026, "step": 18730 }, { "epoch": 6.938171047760089, "grad_norm": 0.24914944171905518, "learning_rate": 9.492680862133628e-05, "loss": 0.0348, "step": 18740 }, { "epoch": 6.941873380229545, "grad_norm": 0.6983808279037476, "learning_rate": 9.491954909459895e-05, "loss": 0.0359, "step": 18750 }, { "epoch": 6.945575712699, "grad_norm": 0.4660937786102295, "learning_rate": 9.491228465552726e-05, "loss": 0.0336, "step": 18760 }, { "epoch": 6.949278045168456, "grad_norm": 0.3150526285171509, "learning_rate": 9.490501530491566e-05, "loss": 0.0287, "step": 18770 }, { "epoch": 6.952980377637912, "grad_norm": 0.3916177749633789, "learning_rate": 9.489774104355909e-05, "loss": 0.0404, "step": 18780 }, { "epoch": 6.956682710107367, "grad_norm": 0.5799281001091003, "learning_rate": 9.489046187225306e-05, "loss": 0.0382, "step": 18790 }, { "epoch": 6.9603850425768234, "grad_norm": 0.46913567185401917, "learning_rate": 9.488317779179361e-05, "loss": 0.0568, "step": 18800 }, { "epoch": 6.9640873750462795, "grad_norm": 0.5384354591369629, "learning_rate": 9.487588880297733e-05, "loss": 0.0295, "step": 18810 }, { "epoch": 6.967789707515735, "grad_norm": 0.28896111249923706, "learning_rate": 9.486859490660132e-05, "loss": 0.0335, "step": 18820 }, { "epoch": 6.971492039985191, "grad_norm": 0.36311039328575134, "learning_rate": 9.486129610346321e-05, "loss": 0.0398, "step": 18830 }, { "epoch": 6.975194372454647, "grad_norm": 0.39282575249671936, "learning_rate": 9.485399239436122e-05, "loss": 0.0308, "step": 18840 }, { "epoch": 6.978896704924102, "grad_norm": 0.21331708133220673, "learning_rate": 9.484668378009408e-05, "loss": 0.0289, "step": 18850 }, { "epoch": 6.982599037393558, "grad_norm": 0.36408400535583496, "learning_rate": 9.4839370261461e-05, "loss": 0.0391, "step": 18860 }, { "epoch": 6.986301369863014, "grad_norm": 0.23400011658668518, "learning_rate": 9.483205183926181e-05, "loss": 0.0467, "step": 18870 }, { "epoch": 6.990003702332469, "grad_norm": 0.3752362132072449, "learning_rate": 9.482472851429682e-05, "loss": 0.0293, "step": 18880 }, { "epoch": 6.993706034801925, "grad_norm": 0.44533419609069824, "learning_rate": 9.481740028736692e-05, "loss": 0.0364, "step": 18890 }, { "epoch": 6.997408367271381, "grad_norm": 0.3158087432384491, "learning_rate": 9.481006715927351e-05, "loss": 0.0321, "step": 18900 }, { "epoch": 7.0011106997408366, "grad_norm": 0.6746332049369812, "learning_rate": 9.480272913081854e-05, "loss": 0.0305, "step": 18910 }, { "epoch": 7.004813032210293, "grad_norm": 1.3958219289779663, "learning_rate": 9.479538620280445e-05, "loss": 0.0432, "step": 18920 }, { "epoch": 7.008515364679749, "grad_norm": 0.37652045488357544, "learning_rate": 9.47880383760343e-05, "loss": 0.0331, "step": 18930 }, { "epoch": 7.012217697149204, "grad_norm": 0.7753661274909973, "learning_rate": 9.478068565131159e-05, "loss": 0.0221, "step": 18940 }, { "epoch": 7.01592002961866, "grad_norm": 0.8662757277488708, "learning_rate": 9.477332802944044e-05, "loss": 0.0287, "step": 18950 }, { "epoch": 7.019622362088115, "grad_norm": 0.1534758359193802, "learning_rate": 9.476596551122543e-05, "loss": 0.0299, "step": 18960 }, { "epoch": 7.023324694557571, "grad_norm": 0.6986712217330933, "learning_rate": 9.475859809747175e-05, "loss": 0.0345, "step": 18970 }, { "epoch": 7.027027027027027, "grad_norm": 0.5259981751441956, "learning_rate": 9.475122578898507e-05, "loss": 0.0433, "step": 18980 }, { "epoch": 7.030729359496482, "grad_norm": 0.4068642556667328, "learning_rate": 9.474384858657164e-05, "loss": 0.0326, "step": 18990 }, { "epoch": 7.034431691965938, "grad_norm": 1.099714756011963, "learning_rate": 9.473646649103818e-05, "loss": 0.0537, "step": 19000 }, { "epoch": 7.0381340244353945, "grad_norm": 0.5344736576080322, "learning_rate": 9.472907950319201e-05, "loss": 0.0493, "step": 19010 }, { "epoch": 7.04183635690485, "grad_norm": 0.5603709816932678, "learning_rate": 9.472168762384095e-05, "loss": 0.0374, "step": 19020 }, { "epoch": 7.045538689374306, "grad_norm": 0.8446599841117859, "learning_rate": 9.471429085379338e-05, "loss": 0.0387, "step": 19030 }, { "epoch": 7.049241021843762, "grad_norm": 1.1765782833099365, "learning_rate": 9.470688919385818e-05, "loss": 0.0415, "step": 19040 }, { "epoch": 7.052943354313217, "grad_norm": 0.33307895064353943, "learning_rate": 9.46994826448448e-05, "loss": 0.0362, "step": 19050 }, { "epoch": 7.056645686782673, "grad_norm": 0.6239914894104004, "learning_rate": 9.46920712075632e-05, "loss": 0.0489, "step": 19060 }, { "epoch": 7.060348019252129, "grad_norm": 0.5302589535713196, "learning_rate": 9.468465488282388e-05, "loss": 0.0444, "step": 19070 }, { "epoch": 7.064050351721584, "grad_norm": 1.0294079780578613, "learning_rate": 9.467723367143789e-05, "loss": 0.0261, "step": 19080 }, { "epoch": 7.06775268419104, "grad_norm": 0.600630521774292, "learning_rate": 9.466980757421679e-05, "loss": 0.031, "step": 19090 }, { "epoch": 7.071455016660496, "grad_norm": 0.21999573707580566, "learning_rate": 9.46623765919727e-05, "loss": 0.0244, "step": 19100 }, { "epoch": 7.0751573491299515, "grad_norm": 0.26924172043800354, "learning_rate": 9.465494072551825e-05, "loss": 0.0322, "step": 19110 }, { "epoch": 7.078859681599408, "grad_norm": 0.27884694933891296, "learning_rate": 9.464749997566664e-05, "loss": 0.0486, "step": 19120 }, { "epoch": 7.082562014068864, "grad_norm": 0.470651239156723, "learning_rate": 9.464005434323154e-05, "loss": 0.0265, "step": 19130 }, { "epoch": 7.086264346538319, "grad_norm": 0.5595779418945312, "learning_rate": 9.463260382902721e-05, "loss": 0.0358, "step": 19140 }, { "epoch": 7.089966679007775, "grad_norm": 0.34614136815071106, "learning_rate": 9.462514843386845e-05, "loss": 0.0453, "step": 19150 }, { "epoch": 7.093669011477231, "grad_norm": 0.2919809818267822, "learning_rate": 9.461768815857053e-05, "loss": 0.0406, "step": 19160 }, { "epoch": 7.097371343946686, "grad_norm": 0.37942934036254883, "learning_rate": 9.461022300394932e-05, "loss": 0.0444, "step": 19170 }, { "epoch": 7.101073676416142, "grad_norm": 0.25602468848228455, "learning_rate": 9.460275297082119e-05, "loss": 0.0388, "step": 19180 }, { "epoch": 7.104776008885598, "grad_norm": 0.5467396974563599, "learning_rate": 9.459527806000305e-05, "loss": 0.044, "step": 19190 }, { "epoch": 7.108478341355053, "grad_norm": 0.26108744740486145, "learning_rate": 9.458779827231237e-05, "loss": 0.0467, "step": 19200 }, { "epoch": 7.1121806738245095, "grad_norm": 0.2104610651731491, "learning_rate": 9.45803136085671e-05, "loss": 0.0242, "step": 19210 }, { "epoch": 7.1158830062939655, "grad_norm": 0.28878387808799744, "learning_rate": 9.457282406958574e-05, "loss": 0.0288, "step": 19220 }, { "epoch": 7.119585338763421, "grad_norm": 0.3231697678565979, "learning_rate": 9.456532965618737e-05, "loss": 0.0375, "step": 19230 }, { "epoch": 7.123287671232877, "grad_norm": 0.3482407033443451, "learning_rate": 9.455783036919155e-05, "loss": 0.0246, "step": 19240 }, { "epoch": 7.126990003702333, "grad_norm": 0.19860218465328217, "learning_rate": 9.45503262094184e-05, "loss": 0.0351, "step": 19250 }, { "epoch": 7.130692336171788, "grad_norm": 0.2987569570541382, "learning_rate": 9.454281717768854e-05, "loss": 0.0352, "step": 19260 }, { "epoch": 7.134394668641244, "grad_norm": 1.4480801820755005, "learning_rate": 9.453530327482318e-05, "loss": 0.0511, "step": 19270 }, { "epoch": 7.1380970011107, "grad_norm": 0.421322226524353, "learning_rate": 9.4527784501644e-05, "loss": 0.0324, "step": 19280 }, { "epoch": 7.141799333580155, "grad_norm": 1.3228349685668945, "learning_rate": 9.452026085897325e-05, "loss": 0.0384, "step": 19290 }, { "epoch": 7.145501666049611, "grad_norm": 0.28345412015914917, "learning_rate": 9.451273234763371e-05, "loss": 0.0269, "step": 19300 }, { "epoch": 7.149203998519067, "grad_norm": 2.63496470451355, "learning_rate": 9.45051989684487e-05, "loss": 0.031, "step": 19310 }, { "epoch": 7.152906330988523, "grad_norm": 0.2845596373081207, "learning_rate": 9.449766072224203e-05, "loss": 0.0355, "step": 19320 }, { "epoch": 7.156608663457979, "grad_norm": 0.2615615427494049, "learning_rate": 9.449011760983809e-05, "loss": 0.0224, "step": 19330 }, { "epoch": 7.160310995927435, "grad_norm": 0.15179356932640076, "learning_rate": 9.448256963206177e-05, "loss": 0.0283, "step": 19340 }, { "epoch": 7.16401332839689, "grad_norm": 0.272245317697525, "learning_rate": 9.447501678973852e-05, "loss": 0.0258, "step": 19350 }, { "epoch": 7.167715660866346, "grad_norm": 0.3633345365524292, "learning_rate": 9.446745908369429e-05, "loss": 0.038, "step": 19360 }, { "epoch": 7.171417993335801, "grad_norm": 0.5943030714988708, "learning_rate": 9.44598965147556e-05, "loss": 0.0368, "step": 19370 }, { "epoch": 7.175120325805257, "grad_norm": 0.3735424280166626, "learning_rate": 9.445232908374948e-05, "loss": 0.0252, "step": 19380 }, { "epoch": 7.178822658274713, "grad_norm": 0.8405268788337708, "learning_rate": 9.444475679150348e-05, "loss": 0.0314, "step": 19390 }, { "epoch": 7.182524990744168, "grad_norm": 0.125558540225029, "learning_rate": 9.443717963884569e-05, "loss": 0.0316, "step": 19400 }, { "epoch": 7.1862273232136245, "grad_norm": 0.1408279538154602, "learning_rate": 9.442959762660475e-05, "loss": 0.0221, "step": 19410 }, { "epoch": 7.1899296556830805, "grad_norm": 0.5282895565032959, "learning_rate": 9.442201075560981e-05, "loss": 0.038, "step": 19420 }, { "epoch": 7.193631988152536, "grad_norm": 0.7458646893501282, "learning_rate": 9.441441902669056e-05, "loss": 0.0335, "step": 19430 }, { "epoch": 7.197334320621992, "grad_norm": 0.24365656077861786, "learning_rate": 9.440682244067724e-05, "loss": 0.0265, "step": 19440 }, { "epoch": 7.201036653091448, "grad_norm": 0.40293875336647034, "learning_rate": 9.439922099840054e-05, "loss": 0.0306, "step": 19450 }, { "epoch": 7.204738985560903, "grad_norm": 0.24227403104305267, "learning_rate": 9.439161470069184e-05, "loss": 0.049, "step": 19460 }, { "epoch": 7.208441318030359, "grad_norm": 0.4861457943916321, "learning_rate": 9.438400354838286e-05, "loss": 0.0387, "step": 19470 }, { "epoch": 7.212143650499815, "grad_norm": 0.5609987378120422, "learning_rate": 9.4376387542306e-05, "loss": 0.0307, "step": 19480 }, { "epoch": 7.21584598296927, "grad_norm": 0.28701546788215637, "learning_rate": 9.436876668329411e-05, "loss": 0.0322, "step": 19490 }, { "epoch": 7.219548315438726, "grad_norm": 0.93808913230896, "learning_rate": 9.43611409721806e-05, "loss": 0.038, "step": 19500 }, { "epoch": 7.223250647908182, "grad_norm": 0.20078794658184052, "learning_rate": 9.435351040979941e-05, "loss": 0.0274, "step": 19510 }, { "epoch": 7.226952980377638, "grad_norm": 0.3695733845233917, "learning_rate": 9.4345874996985e-05, "loss": 0.0334, "step": 19520 }, { "epoch": 7.230655312847094, "grad_norm": 0.1927928328514099, "learning_rate": 9.43382347345724e-05, "loss": 0.0238, "step": 19530 }, { "epoch": 7.23435764531655, "grad_norm": 0.355118989944458, "learning_rate": 9.433058962339708e-05, "loss": 0.0217, "step": 19540 }, { "epoch": 7.238059977786005, "grad_norm": 0.3600209355354309, "learning_rate": 9.432293966429514e-05, "loss": 0.0247, "step": 19550 }, { "epoch": 7.241762310255461, "grad_norm": 0.3145217001438141, "learning_rate": 9.431528485810316e-05, "loss": 0.0308, "step": 19560 }, { "epoch": 7.245464642724917, "grad_norm": 0.7971790432929993, "learning_rate": 9.430762520565826e-05, "loss": 0.0367, "step": 19570 }, { "epoch": 7.249166975194372, "grad_norm": 0.345550000667572, "learning_rate": 9.429996070779808e-05, "loss": 0.0401, "step": 19580 }, { "epoch": 7.252869307663828, "grad_norm": 0.5253550410270691, "learning_rate": 9.429229136536079e-05, "loss": 0.0463, "step": 19590 }, { "epoch": 7.256571640133284, "grad_norm": 0.20525288581848145, "learning_rate": 9.428461717918511e-05, "loss": 0.0295, "step": 19600 }, { "epoch": 7.260273972602739, "grad_norm": 0.6766973733901978, "learning_rate": 9.427693815011028e-05, "loss": 0.0436, "step": 19610 }, { "epoch": 7.2639763050721955, "grad_norm": 0.22719590365886688, "learning_rate": 9.426925427897608e-05, "loss": 0.0367, "step": 19620 }, { "epoch": 7.2676786375416516, "grad_norm": 0.5431828498840332, "learning_rate": 9.426156556662276e-05, "loss": 0.0267, "step": 19630 }, { "epoch": 7.271380970011107, "grad_norm": 0.8531977534294128, "learning_rate": 9.42538720138912e-05, "loss": 0.0283, "step": 19640 }, { "epoch": 7.275083302480563, "grad_norm": 0.3799513876438141, "learning_rate": 9.424617362162271e-05, "loss": 0.0343, "step": 19650 }, { "epoch": 7.278785634950019, "grad_norm": 0.5024363994598389, "learning_rate": 9.423847039065922e-05, "loss": 0.0286, "step": 19660 }, { "epoch": 7.282487967419474, "grad_norm": 0.7305864691734314, "learning_rate": 9.423076232184311e-05, "loss": 0.0331, "step": 19670 }, { "epoch": 7.28619029988893, "grad_norm": 0.23596738278865814, "learning_rate": 9.422304941601733e-05, "loss": 0.0329, "step": 19680 }, { "epoch": 7.289892632358386, "grad_norm": 0.30287081003189087, "learning_rate": 9.421533167402534e-05, "loss": 0.0335, "step": 19690 }, { "epoch": 7.293594964827841, "grad_norm": 0.24540627002716064, "learning_rate": 9.420760909671118e-05, "loss": 0.0384, "step": 19700 }, { "epoch": 7.297297297297297, "grad_norm": 0.2916576564311981, "learning_rate": 9.419988168491934e-05, "loss": 0.0411, "step": 19710 }, { "epoch": 7.300999629766753, "grad_norm": 0.6737951636314392, "learning_rate": 9.41921494394949e-05, "loss": 0.0409, "step": 19720 }, { "epoch": 7.304701962236209, "grad_norm": 0.3398020267486572, "learning_rate": 9.418441236128343e-05, "loss": 0.0246, "step": 19730 }, { "epoch": 7.308404294705665, "grad_norm": 0.3370512127876282, "learning_rate": 9.417667045113107e-05, "loss": 0.0268, "step": 19740 }, { "epoch": 7.312106627175121, "grad_norm": 0.2830483615398407, "learning_rate": 9.416892370988444e-05, "loss": 0.0421, "step": 19750 }, { "epoch": 7.315808959644576, "grad_norm": 0.97735995054245, "learning_rate": 9.416117213839071e-05, "loss": 0.0324, "step": 19760 }, { "epoch": 7.319511292114032, "grad_norm": 0.2657328248023987, "learning_rate": 9.415341573749761e-05, "loss": 0.0468, "step": 19770 }, { "epoch": 7.323213624583488, "grad_norm": 0.25635525584220886, "learning_rate": 9.414565450805333e-05, "loss": 0.0455, "step": 19780 }, { "epoch": 7.326915957052943, "grad_norm": 1.0264618396759033, "learning_rate": 9.413788845090666e-05, "loss": 0.0475, "step": 19790 }, { "epoch": 7.330618289522399, "grad_norm": 0.24272926151752472, "learning_rate": 9.413011756690685e-05, "loss": 0.0434, "step": 19800 }, { "epoch": 7.334320621991855, "grad_norm": 0.7051565051078796, "learning_rate": 9.412234185690374e-05, "loss": 0.0385, "step": 19810 }, { "epoch": 7.3380229544613105, "grad_norm": 0.3552158772945404, "learning_rate": 9.411456132174767e-05, "loss": 0.0322, "step": 19820 }, { "epoch": 7.3417252869307665, "grad_norm": 0.23746570944786072, "learning_rate": 9.41067759622895e-05, "loss": 0.0368, "step": 19830 }, { "epoch": 7.345427619400223, "grad_norm": 0.32037651538848877, "learning_rate": 9.409898577938062e-05, "loss": 0.0255, "step": 19840 }, { "epoch": 7.349129951869678, "grad_norm": 0.31287437677383423, "learning_rate": 9.409119077387294e-05, "loss": 0.0402, "step": 19850 }, { "epoch": 7.352832284339134, "grad_norm": 0.40430963039398193, "learning_rate": 9.408339094661895e-05, "loss": 0.0403, "step": 19860 }, { "epoch": 7.356534616808589, "grad_norm": 0.17504091560840607, "learning_rate": 9.40755862984716e-05, "loss": 0.03, "step": 19870 }, { "epoch": 7.360236949278045, "grad_norm": 0.26945483684539795, "learning_rate": 9.406777683028438e-05, "loss": 0.0385, "step": 19880 }, { "epoch": 7.363939281747501, "grad_norm": 0.4258517920970917, "learning_rate": 9.405996254291136e-05, "loss": 0.0331, "step": 19890 }, { "epoch": 7.367641614216956, "grad_norm": 0.19391551613807678, "learning_rate": 9.405214343720707e-05, "loss": 0.0237, "step": 19900 }, { "epoch": 7.371343946686412, "grad_norm": 0.41345930099487305, "learning_rate": 9.404431951402663e-05, "loss": 0.0315, "step": 19910 }, { "epoch": 7.375046279155868, "grad_norm": 0.33584368228912354, "learning_rate": 9.40364907742256e-05, "loss": 0.0246, "step": 19920 }, { "epoch": 7.378748611625324, "grad_norm": 0.22747483849525452, "learning_rate": 9.402865721866015e-05, "loss": 0.0307, "step": 19930 }, { "epoch": 7.38245094409478, "grad_norm": 0.4877525866031647, "learning_rate": 9.402081884818695e-05, "loss": 0.0341, "step": 19940 }, { "epoch": 7.386153276564236, "grad_norm": 0.29096636176109314, "learning_rate": 9.401297566366318e-05, "loss": 0.0363, "step": 19950 }, { "epoch": 7.389855609033691, "grad_norm": 0.3858053982257843, "learning_rate": 9.400512766594659e-05, "loss": 0.0466, "step": 19960 }, { "epoch": 7.393557941503147, "grad_norm": 0.24450914561748505, "learning_rate": 9.399727485589536e-05, "loss": 0.0277, "step": 19970 }, { "epoch": 7.397260273972603, "grad_norm": 0.4248185455799103, "learning_rate": 9.398941723436831e-05, "loss": 0.0322, "step": 19980 }, { "epoch": 7.400962606442058, "grad_norm": 0.7522688508033752, "learning_rate": 9.398155480222474e-05, "loss": 0.0387, "step": 19990 }, { "epoch": 7.404664938911514, "grad_norm": 0.24172647297382355, "learning_rate": 9.397368756032445e-05, "loss": 0.0339, "step": 20000 }, { "epoch": 7.40836727138097, "grad_norm": 0.4754628837108612, "learning_rate": 9.396581550952781e-05, "loss": 0.0424, "step": 20010 }, { "epoch": 7.4120696038504255, "grad_norm": 0.2658296525478363, "learning_rate": 9.395793865069568e-05, "loss": 0.0323, "step": 20020 }, { "epoch": 7.4157719363198815, "grad_norm": 4.10125207901001, "learning_rate": 9.395005698468949e-05, "loss": 0.0384, "step": 20030 }, { "epoch": 7.419474268789338, "grad_norm": 0.39737218618392944, "learning_rate": 9.394217051237113e-05, "loss": 0.023, "step": 20040 }, { "epoch": 7.423176601258793, "grad_norm": 0.3428436517715454, "learning_rate": 9.393427923460308e-05, "loss": 0.035, "step": 20050 }, { "epoch": 7.426878933728249, "grad_norm": 0.17002390325069427, "learning_rate": 9.392638315224829e-05, "loss": 0.0234, "step": 20060 }, { "epoch": 7.430581266197705, "grad_norm": 0.4350597858428955, "learning_rate": 9.391848226617028e-05, "loss": 0.0314, "step": 20070 }, { "epoch": 7.43428359866716, "grad_norm": 0.343028724193573, "learning_rate": 9.39105765772331e-05, "loss": 0.0278, "step": 20080 }, { "epoch": 7.437985931136616, "grad_norm": 0.14311254024505615, "learning_rate": 9.390266608630128e-05, "loss": 0.0341, "step": 20090 }, { "epoch": 7.441688263606072, "grad_norm": 0.13498277962207794, "learning_rate": 9.389475079423988e-05, "loss": 0.035, "step": 20100 }, { "epoch": 7.445390596075527, "grad_norm": 0.33183854818344116, "learning_rate": 9.388683070191455e-05, "loss": 0.0295, "step": 20110 }, { "epoch": 7.449092928544983, "grad_norm": 0.13442687690258026, "learning_rate": 9.387890581019139e-05, "loss": 0.0267, "step": 20120 }, { "epoch": 7.4527952610144395, "grad_norm": 0.3099074959754944, "learning_rate": 9.387097611993707e-05, "loss": 0.0297, "step": 20130 }, { "epoch": 7.456497593483895, "grad_norm": 0.1554974764585495, "learning_rate": 9.386304163201875e-05, "loss": 0.0277, "step": 20140 }, { "epoch": 7.460199925953351, "grad_norm": 0.3707822859287262, "learning_rate": 9.385510234730415e-05, "loss": 0.041, "step": 20150 }, { "epoch": 7.463902258422807, "grad_norm": 0.44758886098861694, "learning_rate": 9.384715826666148e-05, "loss": 0.0176, "step": 20160 }, { "epoch": 7.467604590892262, "grad_norm": 0.1418813169002533, "learning_rate": 9.383920939095952e-05, "loss": 0.0305, "step": 20170 }, { "epoch": 7.471306923361718, "grad_norm": 0.3303236663341522, "learning_rate": 9.383125572106752e-05, "loss": 0.0315, "step": 20180 }, { "epoch": 7.475009255831174, "grad_norm": 0.32399052381515503, "learning_rate": 9.38232972578553e-05, "loss": 0.0344, "step": 20190 }, { "epoch": 7.478711588300629, "grad_norm": 0.20151634514331818, "learning_rate": 9.381533400219318e-05, "loss": 0.0228, "step": 20200 }, { "epoch": 7.482413920770085, "grad_norm": 0.2998364269733429, "learning_rate": 9.3807365954952e-05, "loss": 0.0343, "step": 20210 }, { "epoch": 7.48611625323954, "grad_norm": 0.8102840781211853, "learning_rate": 9.379939311700317e-05, "loss": 0.036, "step": 20220 }, { "epoch": 7.4898185857089965, "grad_norm": 0.2670249938964844, "learning_rate": 9.379141548921853e-05, "loss": 0.0217, "step": 20230 }, { "epoch": 7.493520918178453, "grad_norm": 0.3593599796295166, "learning_rate": 9.378343307247055e-05, "loss": 0.0307, "step": 20240 }, { "epoch": 7.497223250647908, "grad_norm": 0.4155714511871338, "learning_rate": 9.377544586763215e-05, "loss": 0.0284, "step": 20250 }, { "epoch": 7.500925583117364, "grad_norm": 0.45991235971450806, "learning_rate": 9.376745387557681e-05, "loss": 0.0354, "step": 20260 }, { "epoch": 7.50462791558682, "grad_norm": 0.19245049357414246, "learning_rate": 9.37594570971785e-05, "loss": 0.0222, "step": 20270 }, { "epoch": 7.508330248056275, "grad_norm": 0.412892609834671, "learning_rate": 9.375145553331177e-05, "loss": 0.026, "step": 20280 }, { "epoch": 7.512032580525731, "grad_norm": 0.8330375552177429, "learning_rate": 9.374344918485164e-05, "loss": 0.0401, "step": 20290 }, { "epoch": 7.515734912995187, "grad_norm": 0.20634329319000244, "learning_rate": 9.373543805267368e-05, "loss": 0.033, "step": 20300 }, { "epoch": 7.519437245464642, "grad_norm": 0.2715367376804352, "learning_rate": 9.372742213765395e-05, "loss": 0.0355, "step": 20310 }, { "epoch": 7.523139577934098, "grad_norm": 0.199245885014534, "learning_rate": 9.371940144066909e-05, "loss": 0.0339, "step": 20320 }, { "epoch": 7.526841910403554, "grad_norm": 0.1691221296787262, "learning_rate": 9.371137596259623e-05, "loss": 0.0274, "step": 20330 }, { "epoch": 7.53054424287301, "grad_norm": 0.23335789144039154, "learning_rate": 9.3703345704313e-05, "loss": 0.0226, "step": 20340 }, { "epoch": 7.534246575342466, "grad_norm": 1.1745249032974243, "learning_rate": 9.369531066669758e-05, "loss": 0.0313, "step": 20350 }, { "epoch": 7.537948907811922, "grad_norm": 0.3588294982910156, "learning_rate": 9.368727085062872e-05, "loss": 0.0385, "step": 20360 }, { "epoch": 7.541651240281377, "grad_norm": 0.40689119696617126, "learning_rate": 9.367922625698558e-05, "loss": 0.0297, "step": 20370 }, { "epoch": 7.545353572750833, "grad_norm": 0.3472216725349426, "learning_rate": 9.367117688664791e-05, "loss": 0.0286, "step": 20380 }, { "epoch": 7.549055905220289, "grad_norm": 0.49883460998535156, "learning_rate": 9.366312274049602e-05, "loss": 0.0297, "step": 20390 }, { "epoch": 7.552758237689744, "grad_norm": 0.12725666165351868, "learning_rate": 9.365506381941066e-05, "loss": 0.0394, "step": 20400 }, { "epoch": 7.5564605701592, "grad_norm": 0.26880937814712524, "learning_rate": 9.364700012427316e-05, "loss": 0.0541, "step": 20410 }, { "epoch": 7.560162902628656, "grad_norm": 0.4830535650253296, "learning_rate": 9.363893165596533e-05, "loss": 0.0286, "step": 20420 }, { "epoch": 7.5638652350981115, "grad_norm": 0.39657720923423767, "learning_rate": 9.363085841536957e-05, "loss": 0.0407, "step": 20430 }, { "epoch": 7.5675675675675675, "grad_norm": 0.31643447279930115, "learning_rate": 9.362278040336872e-05, "loss": 0.0267, "step": 20440 }, { "epoch": 7.571269900037024, "grad_norm": 0.15282364189624786, "learning_rate": 9.36146976208462e-05, "loss": 0.0378, "step": 20450 }, { "epoch": 7.574972232506479, "grad_norm": 0.23860226571559906, "learning_rate": 9.360661006868592e-05, "loss": 0.0243, "step": 20460 }, { "epoch": 7.578674564975935, "grad_norm": 0.19018079340457916, "learning_rate": 9.35985177477723e-05, "loss": 0.0266, "step": 20470 }, { "epoch": 7.582376897445391, "grad_norm": 0.2185067981481552, "learning_rate": 9.359042065899036e-05, "loss": 0.031, "step": 20480 }, { "epoch": 7.586079229914846, "grad_norm": 0.41039735078811646, "learning_rate": 9.358231880322554e-05, "loss": 0.0362, "step": 20490 }, { "epoch": 7.589781562384302, "grad_norm": 0.34255924820899963, "learning_rate": 9.357421218136386e-05, "loss": 0.0304, "step": 20500 }, { "epoch": 7.593483894853758, "grad_norm": 0.5721972584724426, "learning_rate": 9.356610079429186e-05, "loss": 0.032, "step": 20510 }, { "epoch": 7.597186227323213, "grad_norm": 0.22817738354206085, "learning_rate": 9.355798464289658e-05, "loss": 0.0214, "step": 20520 }, { "epoch": 7.600888559792669, "grad_norm": 0.2271643579006195, "learning_rate": 9.354986372806557e-05, "loss": 0.0332, "step": 20530 }, { "epoch": 7.6045908922621255, "grad_norm": 0.2106265425682068, "learning_rate": 9.354173805068695e-05, "loss": 0.0309, "step": 20540 }, { "epoch": 7.608293224731581, "grad_norm": 0.1954166293144226, "learning_rate": 9.353360761164931e-05, "loss": 0.0341, "step": 20550 }, { "epoch": 7.611995557201037, "grad_norm": 0.567064106464386, "learning_rate": 9.352547241184179e-05, "loss": 0.0311, "step": 20560 }, { "epoch": 7.615697889670493, "grad_norm": 0.23700597882270813, "learning_rate": 9.351733245215407e-05, "loss": 0.0188, "step": 20570 }, { "epoch": 7.619400222139948, "grad_norm": 0.42123308777809143, "learning_rate": 9.35091877334763e-05, "loss": 0.044, "step": 20580 }, { "epoch": 7.623102554609404, "grad_norm": 0.36968398094177246, "learning_rate": 9.350103825669916e-05, "loss": 0.0393, "step": 20590 }, { "epoch": 7.62680488707886, "grad_norm": 0.43652546405792236, "learning_rate": 9.349288402271388e-05, "loss": 0.0408, "step": 20600 }, { "epoch": 7.630507219548315, "grad_norm": 0.34492790699005127, "learning_rate": 9.348472503241219e-05, "loss": 0.029, "step": 20610 }, { "epoch": 7.634209552017771, "grad_norm": 0.5338844656944275, "learning_rate": 9.347656128668636e-05, "loss": 0.0301, "step": 20620 }, { "epoch": 7.637911884487227, "grad_norm": 0.3408099114894867, "learning_rate": 9.346839278642913e-05, "loss": 0.0367, "step": 20630 }, { "epoch": 7.6416142169566825, "grad_norm": 0.28750473260879517, "learning_rate": 9.346021953253384e-05, "loss": 0.0251, "step": 20640 }, { "epoch": 7.645316549426139, "grad_norm": 0.8267560005187988, "learning_rate": 9.345204152589428e-05, "loss": 0.0329, "step": 20650 }, { "epoch": 7.649018881895595, "grad_norm": 0.4687594771385193, "learning_rate": 9.34438587674048e-05, "loss": 0.048, "step": 20660 }, { "epoch": 7.65272121436505, "grad_norm": 1.6606333255767822, "learning_rate": 9.343567125796022e-05, "loss": 0.0329, "step": 20670 }, { "epoch": 7.656423546834506, "grad_norm": 0.4146777093410492, "learning_rate": 9.342747899845594e-05, "loss": 0.0349, "step": 20680 }, { "epoch": 7.660125879303962, "grad_norm": 1.0413986444473267, "learning_rate": 9.341928198978787e-05, "loss": 0.052, "step": 20690 }, { "epoch": 7.663828211773417, "grad_norm": 0.23670397698879242, "learning_rate": 9.341108023285238e-05, "loss": 0.0375, "step": 20700 }, { "epoch": 7.667530544242873, "grad_norm": 0.40257880091667175, "learning_rate": 9.340287372854643e-05, "loss": 0.0308, "step": 20710 }, { "epoch": 7.671232876712329, "grad_norm": 0.47566166520118713, "learning_rate": 9.339466247776746e-05, "loss": 0.0317, "step": 20720 }, { "epoch": 7.674935209181784, "grad_norm": 3.485480546951294, "learning_rate": 9.338644648141346e-05, "loss": 0.0313, "step": 20730 }, { "epoch": 7.6786375416512405, "grad_norm": 2.7708773612976074, "learning_rate": 9.337822574038289e-05, "loss": 0.029, "step": 20740 }, { "epoch": 7.6823398741206965, "grad_norm": 0.2776050269603729, "learning_rate": 9.337000025557476e-05, "loss": 0.0319, "step": 20750 }, { "epoch": 7.686042206590152, "grad_norm": 0.2721838653087616, "learning_rate": 9.336177002788862e-05, "loss": 0.0317, "step": 20760 }, { "epoch": 7.689744539059608, "grad_norm": 0.25339096784591675, "learning_rate": 9.33535350582245e-05, "loss": 0.0254, "step": 20770 }, { "epoch": 7.693446871529063, "grad_norm": 0.2580418586730957, "learning_rate": 9.334529534748297e-05, "loss": 0.0201, "step": 20780 }, { "epoch": 7.697149203998519, "grad_norm": 0.1983409821987152, "learning_rate": 9.333705089656512e-05, "loss": 0.0255, "step": 20790 }, { "epoch": 7.700851536467975, "grad_norm": 0.32042476534843445, "learning_rate": 9.332880170637252e-05, "loss": 0.0323, "step": 20800 }, { "epoch": 7.70455386893743, "grad_norm": 1.0303195714950562, "learning_rate": 9.332054777780732e-05, "loss": 0.0336, "step": 20810 }, { "epoch": 7.708256201406886, "grad_norm": 0.28821682929992676, "learning_rate": 9.331228911177215e-05, "loss": 0.0415, "step": 20820 }, { "epoch": 7.711958533876342, "grad_norm": 0.7842141389846802, "learning_rate": 9.330402570917016e-05, "loss": 0.0484, "step": 20830 }, { "epoch": 7.7156608663457975, "grad_norm": 0.9024096131324768, "learning_rate": 9.329575757090504e-05, "loss": 0.0356, "step": 20840 }, { "epoch": 7.719363198815254, "grad_norm": 0.37221598625183105, "learning_rate": 9.328748469788093e-05, "loss": 0.0325, "step": 20850 }, { "epoch": 7.72306553128471, "grad_norm": 0.2511886656284332, "learning_rate": 9.327920709100259e-05, "loss": 0.0347, "step": 20860 }, { "epoch": 7.726767863754165, "grad_norm": 0.5381499528884888, "learning_rate": 9.327092475117525e-05, "loss": 0.0357, "step": 20870 }, { "epoch": 7.730470196223621, "grad_norm": 1.457426905632019, "learning_rate": 9.326263767930463e-05, "loss": 0.0293, "step": 20880 }, { "epoch": 7.734172528693077, "grad_norm": 0.19604390859603882, "learning_rate": 9.325434587629698e-05, "loss": 0.0324, "step": 20890 }, { "epoch": 7.737874861162532, "grad_norm": 0.32901573181152344, "learning_rate": 9.32460493430591e-05, "loss": 0.0392, "step": 20900 }, { "epoch": 7.741577193631988, "grad_norm": 0.3207454979419708, "learning_rate": 9.32377480804983e-05, "loss": 0.0301, "step": 20910 }, { "epoch": 7.745279526101444, "grad_norm": 0.30969902873039246, "learning_rate": 9.322944208952235e-05, "loss": 0.0305, "step": 20920 }, { "epoch": 7.748981858570899, "grad_norm": 0.26648738980293274, "learning_rate": 9.322113137103964e-05, "loss": 0.0328, "step": 20930 }, { "epoch": 7.752684191040355, "grad_norm": 0.5955448150634766, "learning_rate": 9.321281592595896e-05, "loss": 0.0227, "step": 20940 }, { "epoch": 7.7563865235098115, "grad_norm": 0.7501233816146851, "learning_rate": 9.320449575518972e-05, "loss": 0.0422, "step": 20950 }, { "epoch": 7.760088855979267, "grad_norm": 0.2694724202156067, "learning_rate": 9.319617085964176e-05, "loss": 0.03, "step": 20960 }, { "epoch": 7.763791188448723, "grad_norm": 0.25935518741607666, "learning_rate": 9.318784124022552e-05, "loss": 0.0277, "step": 20970 }, { "epoch": 7.767493520918179, "grad_norm": 0.837621808052063, "learning_rate": 9.317950689785188e-05, "loss": 0.0329, "step": 20980 }, { "epoch": 7.771195853387634, "grad_norm": 0.33323490619659424, "learning_rate": 9.31711678334323e-05, "loss": 0.0394, "step": 20990 }, { "epoch": 7.77489818585709, "grad_norm": 0.4656914174556732, "learning_rate": 9.316282404787871e-05, "loss": 0.0361, "step": 21000 }, { "epoch": 7.778600518326546, "grad_norm": 0.1826416403055191, "learning_rate": 9.315447554210356e-05, "loss": 0.0336, "step": 21010 }, { "epoch": 7.782302850796001, "grad_norm": 0.2705000638961792, "learning_rate": 9.314612231701984e-05, "loss": 0.0437, "step": 21020 }, { "epoch": 7.786005183265457, "grad_norm": 0.13609470427036285, "learning_rate": 9.313776437354108e-05, "loss": 0.0426, "step": 21030 }, { "epoch": 7.789707515734913, "grad_norm": 0.2924700379371643, "learning_rate": 9.312940171258125e-05, "loss": 0.0391, "step": 21040 }, { "epoch": 7.7934098482043686, "grad_norm": 0.3234819769859314, "learning_rate": 9.31210343350549e-05, "loss": 0.0341, "step": 21050 }, { "epoch": 7.797112180673825, "grad_norm": 0.23555099964141846, "learning_rate": 9.311266224187706e-05, "loss": 0.0327, "step": 21060 }, { "epoch": 7.80081451314328, "grad_norm": 0.4553994834423065, "learning_rate": 9.31042854339633e-05, "loss": 0.0436, "step": 21070 }, { "epoch": 7.804516845612736, "grad_norm": 0.8272779583930969, "learning_rate": 9.309590391222968e-05, "loss": 0.0472, "step": 21080 }, { "epoch": 7.808219178082192, "grad_norm": 0.9853896498680115, "learning_rate": 9.308751767759282e-05, "loss": 0.0347, "step": 21090 }, { "epoch": 7.811921510551647, "grad_norm": 0.26174789667129517, "learning_rate": 9.30791267309698e-05, "loss": 0.0294, "step": 21100 }, { "epoch": 7.815623843021103, "grad_norm": 5.031636714935303, "learning_rate": 9.307073107327824e-05, "loss": 0.0368, "step": 21110 }, { "epoch": 7.819326175490559, "grad_norm": 0.21117693185806274, "learning_rate": 9.306233070543631e-05, "loss": 0.0333, "step": 21120 }, { "epoch": 7.823028507960014, "grad_norm": 1.1765223741531372, "learning_rate": 9.305392562836262e-05, "loss": 0.0273, "step": 21130 }, { "epoch": 7.82673084042947, "grad_norm": 0.8158743381500244, "learning_rate": 9.304551584297634e-05, "loss": 0.0321, "step": 21140 }, { "epoch": 7.8304331728989265, "grad_norm": 2.535811424255371, "learning_rate": 9.30371013501972e-05, "loss": 0.023, "step": 21150 }, { "epoch": 7.834135505368382, "grad_norm": 0.5861503481864929, "learning_rate": 9.302868215094534e-05, "loss": 0.0517, "step": 21160 }, { "epoch": 7.837837837837838, "grad_norm": 0.5780404210090637, "learning_rate": 9.302025824614149e-05, "loss": 0.0349, "step": 21170 }, { "epoch": 7.841540170307294, "grad_norm": 0.2831591069698334, "learning_rate": 9.301182963670688e-05, "loss": 0.0309, "step": 21180 }, { "epoch": 7.845242502776749, "grad_norm": 0.7823783159255981, "learning_rate": 9.300339632356325e-05, "loss": 0.0458, "step": 21190 }, { "epoch": 7.848944835246205, "grad_norm": 0.33640384674072266, "learning_rate": 9.299495830763286e-05, "loss": 0.0331, "step": 21200 }, { "epoch": 7.852647167715661, "grad_norm": 0.23992566764354706, "learning_rate": 9.298651558983846e-05, "loss": 0.0426, "step": 21210 }, { "epoch": 7.856349500185116, "grad_norm": 0.4061243534088135, "learning_rate": 9.297806817110335e-05, "loss": 0.0409, "step": 21220 }, { "epoch": 7.860051832654572, "grad_norm": 0.3777242600917816, "learning_rate": 9.296961605235133e-05, "loss": 0.0392, "step": 21230 }, { "epoch": 7.863754165124028, "grad_norm": 0.22679005563259125, "learning_rate": 9.296115923450668e-05, "loss": 0.022, "step": 21240 }, { "epoch": 7.8674564975934835, "grad_norm": 0.1536603569984436, "learning_rate": 9.295269771849427e-05, "loss": 0.0326, "step": 21250 }, { "epoch": 7.87115883006294, "grad_norm": 0.1739882379770279, "learning_rate": 9.29442315052394e-05, "loss": 0.0255, "step": 21260 }, { "epoch": 7.874861162532396, "grad_norm": 0.5768941044807434, "learning_rate": 9.293576059566795e-05, "loss": 0.0362, "step": 21270 }, { "epoch": 7.878563495001851, "grad_norm": 0.2523496747016907, "learning_rate": 9.292728499070626e-05, "loss": 0.0242, "step": 21280 }, { "epoch": 7.882265827471307, "grad_norm": 0.3791975975036621, "learning_rate": 9.291880469128124e-05, "loss": 0.0262, "step": 21290 }, { "epoch": 7.885968159940763, "grad_norm": 0.41857677698135376, "learning_rate": 9.291031969832026e-05, "loss": 0.0265, "step": 21300 }, { "epoch": 7.889670492410218, "grad_norm": 0.4145677089691162, "learning_rate": 9.290183001275123e-05, "loss": 0.0362, "step": 21310 }, { "epoch": 7.893372824879674, "grad_norm": 0.28306668996810913, "learning_rate": 9.289333563550258e-05, "loss": 0.04, "step": 21320 }, { "epoch": 7.89707515734913, "grad_norm": 0.5742865204811096, "learning_rate": 9.288483656750322e-05, "loss": 0.0297, "step": 21330 }, { "epoch": 7.900777489818585, "grad_norm": 0.49633845686912537, "learning_rate": 9.287633280968261e-05, "loss": 0.0319, "step": 21340 }, { "epoch": 7.9044798222880415, "grad_norm": 0.6552034616470337, "learning_rate": 9.286782436297073e-05, "loss": 0.0388, "step": 21350 }, { "epoch": 7.9081821547574975, "grad_norm": 0.46808797121047974, "learning_rate": 9.2859311228298e-05, "loss": 0.0516, "step": 21360 }, { "epoch": 7.911884487226953, "grad_norm": 0.3723561465740204, "learning_rate": 9.285079340659545e-05, "loss": 0.0342, "step": 21370 }, { "epoch": 7.915586819696409, "grad_norm": 1.089861512184143, "learning_rate": 9.284227089879456e-05, "loss": 0.0304, "step": 21380 }, { "epoch": 7.919289152165865, "grad_norm": 0.2580506205558777, "learning_rate": 9.283374370582732e-05, "loss": 0.0362, "step": 21390 }, { "epoch": 7.92299148463532, "grad_norm": 0.30127236247062683, "learning_rate": 9.282521182862629e-05, "loss": 0.0282, "step": 21400 }, { "epoch": 7.926693817104776, "grad_norm": 0.25791460275650024, "learning_rate": 9.281667526812446e-05, "loss": 0.0306, "step": 21410 }, { "epoch": 7.930396149574232, "grad_norm": 0.42048391699790955, "learning_rate": 9.280813402525541e-05, "loss": 0.038, "step": 21420 }, { "epoch": 7.934098482043687, "grad_norm": 0.4555668532848358, "learning_rate": 9.279958810095317e-05, "loss": 0.0385, "step": 21430 }, { "epoch": 7.937800814513143, "grad_norm": 0.43698006868362427, "learning_rate": 9.279103749615234e-05, "loss": 0.0338, "step": 21440 }, { "epoch": 7.941503146982599, "grad_norm": 0.2344343513250351, "learning_rate": 9.278248221178798e-05, "loss": 0.0373, "step": 21450 }, { "epoch": 7.945205479452055, "grad_norm": 0.25977036356925964, "learning_rate": 9.277392224879568e-05, "loss": 0.037, "step": 21460 }, { "epoch": 7.948907811921511, "grad_norm": 0.1816616803407669, "learning_rate": 9.276535760811156e-05, "loss": 0.0408, "step": 21470 }, { "epoch": 7.952610144390967, "grad_norm": 0.2528235912322998, "learning_rate": 9.275678829067223e-05, "loss": 0.0305, "step": 21480 }, { "epoch": 7.956312476860422, "grad_norm": 0.32074055075645447, "learning_rate": 9.274821429741482e-05, "loss": 0.0368, "step": 21490 }, { "epoch": 7.960014809329878, "grad_norm": 0.5694864988327026, "learning_rate": 9.273963562927695e-05, "loss": 0.0325, "step": 21500 }, { "epoch": 7.963717141799334, "grad_norm": 0.1999489665031433, "learning_rate": 9.27310522871968e-05, "loss": 0.0257, "step": 21510 }, { "epoch": 7.967419474268789, "grad_norm": 0.2180819809436798, "learning_rate": 9.272246427211302e-05, "loss": 0.0323, "step": 21520 }, { "epoch": 7.971121806738245, "grad_norm": 0.22506406903266907, "learning_rate": 9.271387158496476e-05, "loss": 0.0451, "step": 21530 }, { "epoch": 7.974824139207701, "grad_norm": 0.3879120945930481, "learning_rate": 9.270527422669173e-05, "loss": 0.0377, "step": 21540 }, { "epoch": 7.9785264716771565, "grad_norm": 0.3043726980686188, "learning_rate": 9.269667219823412e-05, "loss": 0.03, "step": 21550 }, { "epoch": 7.9822288041466125, "grad_norm": 0.3067872226238251, "learning_rate": 9.268806550053264e-05, "loss": 0.0421, "step": 21560 }, { "epoch": 7.985931136616069, "grad_norm": 0.4311786890029907, "learning_rate": 9.26794541345285e-05, "loss": 0.0427, "step": 21570 }, { "epoch": 7.989633469085524, "grad_norm": 0.36543673276901245, "learning_rate": 9.26708381011634e-05, "loss": 0.0341, "step": 21580 }, { "epoch": 7.99333580155498, "grad_norm": 0.4176437556743622, "learning_rate": 9.266221740137961e-05, "loss": 0.0398, "step": 21590 }, { "epoch": 7.997038134024436, "grad_norm": 1.169721007347107, "learning_rate": 9.265359203611987e-05, "loss": 0.0461, "step": 21600 }, { "epoch": 8.000740466493891, "grad_norm": 0.5372480750083923, "learning_rate": 9.264496200632744e-05, "loss": 0.0229, "step": 21610 }, { "epoch": 8.004442798963346, "grad_norm": 0.22830787301063538, "learning_rate": 9.263632731294609e-05, "loss": 0.0307, "step": 21620 }, { "epoch": 8.008145131432803, "grad_norm": 0.6420533657073975, "learning_rate": 9.262768795692006e-05, "loss": 0.0346, "step": 21630 }, { "epoch": 8.011847463902258, "grad_norm": 0.3090662658214569, "learning_rate": 9.261904393919418e-05, "loss": 0.0333, "step": 21640 }, { "epoch": 8.015549796371713, "grad_norm": 0.2855483591556549, "learning_rate": 9.261039526071374e-05, "loss": 0.0288, "step": 21650 }, { "epoch": 8.01925212884117, "grad_norm": 0.41389867663383484, "learning_rate": 9.260174192242453e-05, "loss": 0.0289, "step": 21660 }, { "epoch": 8.022954461310626, "grad_norm": 0.36661240458488464, "learning_rate": 9.259308392527286e-05, "loss": 0.0308, "step": 21670 }, { "epoch": 8.02665679378008, "grad_norm": 0.2565365731716156, "learning_rate": 9.258442127020558e-05, "loss": 0.0391, "step": 21680 }, { "epoch": 8.030359126249538, "grad_norm": 0.4881899654865265, "learning_rate": 9.257575395817001e-05, "loss": 0.0363, "step": 21690 }, { "epoch": 8.034061458718993, "grad_norm": 0.413076251745224, "learning_rate": 9.256708199011401e-05, "loss": 0.0319, "step": 21700 }, { "epoch": 8.037763791188448, "grad_norm": 0.38978517055511475, "learning_rate": 9.255840536698593e-05, "loss": 0.0365, "step": 21710 }, { "epoch": 8.041466123657905, "grad_norm": 0.638915479183197, "learning_rate": 9.254972408973461e-05, "loss": 0.0305, "step": 21720 }, { "epoch": 8.04516845612736, "grad_norm": 0.27733251452445984, "learning_rate": 9.254103815930943e-05, "loss": 0.0337, "step": 21730 }, { "epoch": 8.048870788596815, "grad_norm": 0.5574961304664612, "learning_rate": 9.253234757666027e-05, "loss": 0.032, "step": 21740 }, { "epoch": 8.052573121066272, "grad_norm": 0.30966252088546753, "learning_rate": 9.252365234273755e-05, "loss": 0.034, "step": 21750 }, { "epoch": 8.056275453535727, "grad_norm": 0.25040102005004883, "learning_rate": 9.251495245849214e-05, "loss": 0.0323, "step": 21760 }, { "epoch": 8.059977786005183, "grad_norm": 0.8938419818878174, "learning_rate": 9.250624792487544e-05, "loss": 0.0557, "step": 21770 }, { "epoch": 8.06368011847464, "grad_norm": 0.27632731199264526, "learning_rate": 9.249753874283937e-05, "loss": 0.0311, "step": 21780 }, { "epoch": 8.067382450944095, "grad_norm": 0.3055751621723175, "learning_rate": 9.248882491333637e-05, "loss": 0.0313, "step": 21790 }, { "epoch": 8.07108478341355, "grad_norm": 0.7817893028259277, "learning_rate": 9.248010643731935e-05, "loss": 0.0239, "step": 21800 }, { "epoch": 8.074787115883007, "grad_norm": 0.3535875380039215, "learning_rate": 9.247138331574175e-05, "loss": 0.0431, "step": 21810 }, { "epoch": 8.078489448352462, "grad_norm": 0.31983482837677, "learning_rate": 9.246265554955753e-05, "loss": 0.0312, "step": 21820 }, { "epoch": 8.082191780821917, "grad_norm": 0.13796237111091614, "learning_rate": 9.245392313972115e-05, "loss": 0.0301, "step": 21830 }, { "epoch": 8.085894113291374, "grad_norm": 0.4763690233230591, "learning_rate": 9.244518608718756e-05, "loss": 0.0317, "step": 21840 }, { "epoch": 8.08959644576083, "grad_norm": 0.28905972838401794, "learning_rate": 9.243644439291223e-05, "loss": 0.0444, "step": 21850 }, { "epoch": 8.093298778230285, "grad_norm": 0.31419146060943604, "learning_rate": 9.242769805785115e-05, "loss": 0.0292, "step": 21860 }, { "epoch": 8.097001110699741, "grad_norm": 0.15050038695335388, "learning_rate": 9.24189470829608e-05, "loss": 0.0341, "step": 21870 }, { "epoch": 8.100703443169197, "grad_norm": 0.2644211947917938, "learning_rate": 9.241019146919818e-05, "loss": 0.0433, "step": 21880 }, { "epoch": 8.104405775638652, "grad_norm": 0.18313628435134888, "learning_rate": 9.240143121752076e-05, "loss": 0.037, "step": 21890 }, { "epoch": 8.108108108108109, "grad_norm": 0.41386207938194275, "learning_rate": 9.239266632888659e-05, "loss": 0.0271, "step": 21900 }, { "epoch": 8.111810440577564, "grad_norm": 0.3247825503349304, "learning_rate": 9.238389680425416e-05, "loss": 0.0311, "step": 21910 }, { "epoch": 8.11551277304702, "grad_norm": 0.2690303325653076, "learning_rate": 9.23751226445825e-05, "loss": 0.0327, "step": 21920 }, { "epoch": 8.119215105516476, "grad_norm": 0.3716367781162262, "learning_rate": 9.236634385083114e-05, "loss": 0.0321, "step": 21930 }, { "epoch": 8.122917437985931, "grad_norm": 0.23809349536895752, "learning_rate": 9.235756042396012e-05, "loss": 0.027, "step": 21940 }, { "epoch": 8.126619770455386, "grad_norm": 0.4002937376499176, "learning_rate": 9.234877236492997e-05, "loss": 0.0242, "step": 21950 }, { "epoch": 8.130322102924843, "grad_norm": 0.561455249786377, "learning_rate": 9.233997967470174e-05, "loss": 0.0323, "step": 21960 }, { "epoch": 8.134024435394299, "grad_norm": 0.184918612241745, "learning_rate": 9.233118235423698e-05, "loss": 0.0337, "step": 21970 }, { "epoch": 8.137726767863754, "grad_norm": 0.9409051537513733, "learning_rate": 9.232238040449779e-05, "loss": 0.0449, "step": 21980 }, { "epoch": 8.14142910033321, "grad_norm": 3.7039599418640137, "learning_rate": 9.23135738264467e-05, "loss": 0.0326, "step": 21990 }, { "epoch": 8.145131432802666, "grad_norm": 0.7149300575256348, "learning_rate": 9.230476262104677e-05, "loss": 0.0323, "step": 22000 }, { "epoch": 8.148833765272121, "grad_norm": 0.31035247445106506, "learning_rate": 9.229594678926164e-05, "loss": 0.0304, "step": 22010 }, { "epoch": 8.152536097741578, "grad_norm": 0.7443815469741821, "learning_rate": 9.228712633205532e-05, "loss": 0.0432, "step": 22020 }, { "epoch": 8.156238430211033, "grad_norm": 0.4301444888114929, "learning_rate": 9.227830125039247e-05, "loss": 0.0269, "step": 22030 }, { "epoch": 8.159940762680488, "grad_norm": 0.927799642086029, "learning_rate": 9.226947154523817e-05, "loss": 0.0311, "step": 22040 }, { "epoch": 8.163643095149945, "grad_norm": 0.23769158124923706, "learning_rate": 9.226063721755799e-05, "loss": 0.038, "step": 22050 }, { "epoch": 8.1673454276194, "grad_norm": 0.9791789650917053, "learning_rate": 9.225179826831807e-05, "loss": 0.0403, "step": 22060 }, { "epoch": 8.171047760088856, "grad_norm": 0.28332430124282837, "learning_rate": 9.224295469848501e-05, "loss": 0.0346, "step": 22070 }, { "epoch": 8.174750092558313, "grad_norm": 0.39735230803489685, "learning_rate": 9.223410650902594e-05, "loss": 0.0488, "step": 22080 }, { "epoch": 8.178452425027768, "grad_norm": 0.4009358286857605, "learning_rate": 9.222525370090849e-05, "loss": 0.0394, "step": 22090 }, { "epoch": 8.182154757497223, "grad_norm": 0.19883567094802856, "learning_rate": 9.221639627510076e-05, "loss": 0.0326, "step": 22100 }, { "epoch": 8.18585708996668, "grad_norm": 0.37365248799324036, "learning_rate": 9.220753423257141e-05, "loss": 0.0264, "step": 22110 }, { "epoch": 8.189559422436135, "grad_norm": 0.4520326554775238, "learning_rate": 9.219866757428959e-05, "loss": 0.0381, "step": 22120 }, { "epoch": 8.19326175490559, "grad_norm": 0.3200320303440094, "learning_rate": 9.21897963012249e-05, "loss": 0.0232, "step": 22130 }, { "epoch": 8.196964087375047, "grad_norm": 0.6240250468254089, "learning_rate": 9.218092041434755e-05, "loss": 0.0395, "step": 22140 }, { "epoch": 8.200666419844502, "grad_norm": 0.1980578452348709, "learning_rate": 9.217203991462815e-05, "loss": 0.033, "step": 22150 }, { "epoch": 8.204368752313957, "grad_norm": 0.3646887242794037, "learning_rate": 9.216315480303787e-05, "loss": 0.0321, "step": 22160 }, { "epoch": 8.208071084783414, "grad_norm": 0.24337555468082428, "learning_rate": 9.215426508054836e-05, "loss": 0.0384, "step": 22170 }, { "epoch": 8.21177341725287, "grad_norm": 0.23082241415977478, "learning_rate": 9.214537074813181e-05, "loss": 0.0364, "step": 22180 }, { "epoch": 8.215475749722325, "grad_norm": 0.28537946939468384, "learning_rate": 9.213647180676088e-05, "loss": 0.0237, "step": 22190 }, { "epoch": 8.219178082191782, "grad_norm": 0.2457101047039032, "learning_rate": 9.212756825740873e-05, "loss": 0.0411, "step": 22200 }, { "epoch": 8.222880414661237, "grad_norm": 0.41481682658195496, "learning_rate": 9.211866010104909e-05, "loss": 0.0395, "step": 22210 }, { "epoch": 8.226582747130692, "grad_norm": 0.4638250172138214, "learning_rate": 9.210974733865607e-05, "loss": 0.027, "step": 22220 }, { "epoch": 8.230285079600147, "grad_norm": 0.22013503313064575, "learning_rate": 9.21008299712044e-05, "loss": 0.0341, "step": 22230 }, { "epoch": 8.233987412069604, "grad_norm": 0.8725369572639465, "learning_rate": 9.209190799966926e-05, "loss": 0.0306, "step": 22240 }, { "epoch": 8.23768974453906, "grad_norm": 0.22953006625175476, "learning_rate": 9.208298142502636e-05, "loss": 0.0217, "step": 22250 }, { "epoch": 8.241392077008515, "grad_norm": 0.1655818372964859, "learning_rate": 9.207405024825186e-05, "loss": 0.0309, "step": 22260 }, { "epoch": 8.245094409477971, "grad_norm": 1.3671681880950928, "learning_rate": 9.206511447032251e-05, "loss": 0.0404, "step": 22270 }, { "epoch": 8.248796741947427, "grad_norm": 0.22153957188129425, "learning_rate": 9.205617409221546e-05, "loss": 0.0283, "step": 22280 }, { "epoch": 8.252499074416882, "grad_norm": 0.309308260679245, "learning_rate": 9.204722911490846e-05, "loss": 0.0387, "step": 22290 }, { "epoch": 8.256201406886339, "grad_norm": 0.3558448255062103, "learning_rate": 9.20382795393797e-05, "loss": 0.027, "step": 22300 }, { "epoch": 8.259903739355794, "grad_norm": 0.2863099277019501, "learning_rate": 9.202932536660789e-05, "loss": 0.0331, "step": 22310 }, { "epoch": 8.263606071825249, "grad_norm": 0.3504077196121216, "learning_rate": 9.202036659757222e-05, "loss": 0.033, "step": 22320 }, { "epoch": 8.267308404294706, "grad_norm": 0.2638484537601471, "learning_rate": 9.201140323325248e-05, "loss": 0.0302, "step": 22330 }, { "epoch": 8.271010736764161, "grad_norm": 0.21793389320373535, "learning_rate": 9.200243527462882e-05, "loss": 0.0215, "step": 22340 }, { "epoch": 8.274713069233616, "grad_norm": 0.2342817634344101, "learning_rate": 9.199346272268199e-05, "loss": 0.0369, "step": 22350 }, { "epoch": 8.278415401703073, "grad_norm": 0.5444105267524719, "learning_rate": 9.198448557839321e-05, "loss": 0.0374, "step": 22360 }, { "epoch": 8.282117734172529, "grad_norm": 0.232513889670372, "learning_rate": 9.197550384274423e-05, "loss": 0.0361, "step": 22370 }, { "epoch": 8.285820066641984, "grad_norm": 0.3089351952075958, "learning_rate": 9.196651751671724e-05, "loss": 0.0243, "step": 22380 }, { "epoch": 8.28952239911144, "grad_norm": 1.1993464231491089, "learning_rate": 9.1957526601295e-05, "loss": 0.0385, "step": 22390 }, { "epoch": 8.293224731580896, "grad_norm": 0.38935211300849915, "learning_rate": 9.194853109746074e-05, "loss": 0.0468, "step": 22400 }, { "epoch": 8.296927064050351, "grad_norm": 0.4584597945213318, "learning_rate": 9.193953100619816e-05, "loss": 0.0373, "step": 22410 }, { "epoch": 8.300629396519808, "grad_norm": 0.2835073471069336, "learning_rate": 9.193052632849156e-05, "loss": 0.032, "step": 22420 }, { "epoch": 8.304331728989263, "grad_norm": 0.16846267879009247, "learning_rate": 9.192151706532562e-05, "loss": 0.0318, "step": 22430 }, { "epoch": 8.308034061458718, "grad_norm": 0.2171611487865448, "learning_rate": 9.19125032176856e-05, "loss": 0.0274, "step": 22440 }, { "epoch": 8.311736393928175, "grad_norm": 0.31165727972984314, "learning_rate": 9.190348478655724e-05, "loss": 0.0338, "step": 22450 }, { "epoch": 8.31543872639763, "grad_norm": 0.3592532277107239, "learning_rate": 9.189446177292679e-05, "loss": 0.0347, "step": 22460 }, { "epoch": 8.319141058867086, "grad_norm": 0.5086525082588196, "learning_rate": 9.1885434177781e-05, "loss": 0.0518, "step": 22470 }, { "epoch": 8.322843391336542, "grad_norm": 0.7011920809745789, "learning_rate": 9.18764020021071e-05, "loss": 0.0292, "step": 22480 }, { "epoch": 8.326545723805998, "grad_norm": 0.43971797823905945, "learning_rate": 9.186736524689281e-05, "loss": 0.0396, "step": 22490 }, { "epoch": 8.330248056275453, "grad_norm": 0.23679295182228088, "learning_rate": 9.185832391312644e-05, "loss": 0.0325, "step": 22500 }, { "epoch": 8.33395038874491, "grad_norm": 0.4411209225654602, "learning_rate": 9.184927800179668e-05, "loss": 0.0255, "step": 22510 }, { "epoch": 8.337652721214365, "grad_norm": 0.2736719846725464, "learning_rate": 9.184022751389279e-05, "loss": 0.0222, "step": 22520 }, { "epoch": 8.34135505368382, "grad_norm": 0.5404953360557556, "learning_rate": 9.183117245040455e-05, "loss": 0.0356, "step": 22530 }, { "epoch": 8.345057386153277, "grad_norm": 0.2778756022453308, "learning_rate": 9.182211281232216e-05, "loss": 0.0269, "step": 22540 }, { "epoch": 8.348759718622732, "grad_norm": 0.2907482087612152, "learning_rate": 9.18130486006364e-05, "loss": 0.0202, "step": 22550 }, { "epoch": 8.352462051092187, "grad_norm": 2.1091504096984863, "learning_rate": 9.180397981633851e-05, "loss": 0.0331, "step": 22560 }, { "epoch": 8.356164383561644, "grad_norm": 0.5251054167747498, "learning_rate": 9.179490646042024e-05, "loss": 0.0321, "step": 22570 }, { "epoch": 8.3598667160311, "grad_norm": 0.9965072274208069, "learning_rate": 9.178582853387384e-05, "loss": 0.0414, "step": 22580 }, { "epoch": 8.363569048500555, "grad_norm": 0.31721362471580505, "learning_rate": 9.177674603769204e-05, "loss": 0.0335, "step": 22590 }, { "epoch": 8.367271380970012, "grad_norm": 1.2248307466506958, "learning_rate": 9.176765897286813e-05, "loss": 0.0352, "step": 22600 }, { "epoch": 8.370973713439467, "grad_norm": 0.4006265103816986, "learning_rate": 9.175856734039581e-05, "loss": 0.0229, "step": 22610 }, { "epoch": 8.374676045908922, "grad_norm": 0.18789374828338623, "learning_rate": 9.174947114126935e-05, "loss": 0.0282, "step": 22620 }, { "epoch": 8.378378378378379, "grad_norm": 0.24458827078342438, "learning_rate": 9.174037037648351e-05, "loss": 0.0334, "step": 22630 }, { "epoch": 8.382080710847834, "grad_norm": 0.9596851468086243, "learning_rate": 9.173126504703351e-05, "loss": 0.0366, "step": 22640 }, { "epoch": 8.38578304331729, "grad_norm": 0.6001914739608765, "learning_rate": 9.17221551539151e-05, "loss": 0.043, "step": 22650 }, { "epoch": 8.389485375786746, "grad_norm": 0.2609383165836334, "learning_rate": 9.171304069812454e-05, "loss": 0.0363, "step": 22660 }, { "epoch": 8.393187708256201, "grad_norm": 0.22647204995155334, "learning_rate": 9.170392168065857e-05, "loss": 0.0306, "step": 22670 }, { "epoch": 8.396890040725657, "grad_norm": 0.9034143686294556, "learning_rate": 9.169479810251441e-05, "loss": 0.0388, "step": 22680 }, { "epoch": 8.400592373195114, "grad_norm": 0.5653461217880249, "learning_rate": 9.168566996468983e-05, "loss": 0.0406, "step": 22690 }, { "epoch": 8.404294705664569, "grad_norm": 2.0563786029815674, "learning_rate": 9.167653726818305e-05, "loss": 0.0451, "step": 22700 }, { "epoch": 8.407997038134024, "grad_norm": 0.1639474630355835, "learning_rate": 9.166740001399281e-05, "loss": 0.0369, "step": 22710 }, { "epoch": 8.41169937060348, "grad_norm": 0.8688313961029053, "learning_rate": 9.165825820311836e-05, "loss": 0.0375, "step": 22720 }, { "epoch": 8.415401703072936, "grad_norm": 2.1174073219299316, "learning_rate": 9.164911183655943e-05, "loss": 0.0372, "step": 22730 }, { "epoch": 8.419104035542391, "grad_norm": 0.4625963866710663, "learning_rate": 9.163996091531624e-05, "loss": 0.0249, "step": 22740 }, { "epoch": 8.422806368011848, "grad_norm": 1.055357575416565, "learning_rate": 9.163080544038952e-05, "loss": 0.0333, "step": 22750 }, { "epoch": 8.426508700481303, "grad_norm": 0.6800915598869324, "learning_rate": 9.162164541278051e-05, "loss": 0.0341, "step": 22760 }, { "epoch": 8.430211032950758, "grad_norm": 0.3469577133655548, "learning_rate": 9.161248083349095e-05, "loss": 0.0254, "step": 22770 }, { "epoch": 8.433913365420215, "grad_norm": 0.6409804821014404, "learning_rate": 9.160331170352304e-05, "loss": 0.0213, "step": 22780 }, { "epoch": 8.43761569788967, "grad_norm": 0.2665550410747528, "learning_rate": 9.159413802387951e-05, "loss": 0.0319, "step": 22790 }, { "epoch": 8.441318030359126, "grad_norm": 0.5554882287979126, "learning_rate": 9.158495979556358e-05, "loss": 0.04, "step": 22800 }, { "epoch": 8.445020362828583, "grad_norm": 0.3927288055419922, "learning_rate": 9.157577701957897e-05, "loss": 0.0315, "step": 22810 }, { "epoch": 8.448722695298038, "grad_norm": 0.3242253065109253, "learning_rate": 9.15665896969299e-05, "loss": 0.0304, "step": 22820 }, { "epoch": 8.452425027767493, "grad_norm": 0.2741468548774719, "learning_rate": 9.155739782862107e-05, "loss": 0.0261, "step": 22830 }, { "epoch": 8.45612736023695, "grad_norm": 1.1476237773895264, "learning_rate": 9.15482014156577e-05, "loss": 0.0408, "step": 22840 }, { "epoch": 8.459829692706405, "grad_norm": 0.19851362705230713, "learning_rate": 9.153900045904549e-05, "loss": 0.024, "step": 22850 }, { "epoch": 8.46353202517586, "grad_norm": 0.2032901793718338, "learning_rate": 9.152979495979063e-05, "loss": 0.025, "step": 22860 }, { "epoch": 8.467234357645317, "grad_norm": 0.38603654503822327, "learning_rate": 9.152058491889986e-05, "loss": 0.0349, "step": 22870 }, { "epoch": 8.470936690114772, "grad_norm": 0.31118378043174744, "learning_rate": 9.151137033738032e-05, "loss": 0.0372, "step": 22880 }, { "epoch": 8.474639022584228, "grad_norm": 0.2788589298725128, "learning_rate": 9.150215121623974e-05, "loss": 0.029, "step": 22890 }, { "epoch": 8.478341355053685, "grad_norm": 0.3312917649745941, "learning_rate": 9.14929275564863e-05, "loss": 0.0311, "step": 22900 }, { "epoch": 8.48204368752314, "grad_norm": 0.3778038024902344, "learning_rate": 9.148369935912868e-05, "loss": 0.027, "step": 22910 }, { "epoch": 8.485746019992595, "grad_norm": 0.2019740343093872, "learning_rate": 9.147446662517608e-05, "loss": 0.0338, "step": 22920 }, { "epoch": 8.489448352462052, "grad_norm": 0.22572240233421326, "learning_rate": 9.146522935563817e-05, "loss": 0.0271, "step": 22930 }, { "epoch": 8.493150684931507, "grad_norm": 0.38380706310272217, "learning_rate": 9.14559875515251e-05, "loss": 0.0338, "step": 22940 }, { "epoch": 8.496853017400962, "grad_norm": 0.19934023916721344, "learning_rate": 9.144674121384757e-05, "loss": 0.0432, "step": 22950 }, { "epoch": 8.50055534987042, "grad_norm": 0.34431177377700806, "learning_rate": 9.143749034361674e-05, "loss": 0.0401, "step": 22960 }, { "epoch": 8.504257682339874, "grad_norm": 0.15661367774009705, "learning_rate": 9.142823494184427e-05, "loss": 0.0306, "step": 22970 }, { "epoch": 8.50796001480933, "grad_norm": 0.3418791592121124, "learning_rate": 9.141897500954229e-05, "loss": 0.0219, "step": 22980 }, { "epoch": 8.511662347278786, "grad_norm": 0.20477847754955292, "learning_rate": 9.140971054772349e-05, "loss": 0.0338, "step": 22990 }, { "epoch": 8.515364679748242, "grad_norm": 0.37735408544540405, "learning_rate": 9.140044155740101e-05, "loss": 0.038, "step": 23000 }, { "epoch": 8.519067012217697, "grad_norm": 0.19743646681308746, "learning_rate": 9.139116803958848e-05, "loss": 0.0366, "step": 23010 }, { "epoch": 8.522769344687152, "grad_norm": 0.23394976556301117, "learning_rate": 9.138188999530004e-05, "loss": 0.0382, "step": 23020 }, { "epoch": 8.526471677156609, "grad_norm": 0.1983332633972168, "learning_rate": 9.137260742555032e-05, "loss": 0.0344, "step": 23030 }, { "epoch": 8.530174009626064, "grad_norm": 0.25630083680152893, "learning_rate": 9.136332033135447e-05, "loss": 0.0266, "step": 23040 }, { "epoch": 8.533876342095521, "grad_norm": 0.2769022285938263, "learning_rate": 9.135402871372808e-05, "loss": 0.0406, "step": 23050 }, { "epoch": 8.537578674564976, "grad_norm": 0.5461496710777283, "learning_rate": 9.134473257368732e-05, "loss": 0.0413, "step": 23060 }, { "epoch": 8.541281007034431, "grad_norm": 0.32607948780059814, "learning_rate": 9.133543191224874e-05, "loss": 0.0335, "step": 23070 }, { "epoch": 8.544983339503887, "grad_norm": 0.4340973198413849, "learning_rate": 9.132612673042947e-05, "loss": 0.0243, "step": 23080 }, { "epoch": 8.548685671973344, "grad_norm": 0.26438024640083313, "learning_rate": 9.131681702924713e-05, "loss": 0.0296, "step": 23090 }, { "epoch": 8.552388004442799, "grad_norm": 0.25791746377944946, "learning_rate": 9.130750280971978e-05, "loss": 0.0289, "step": 23100 }, { "epoch": 8.556090336912256, "grad_norm": 0.3714396357536316, "learning_rate": 9.129818407286603e-05, "loss": 0.038, "step": 23110 }, { "epoch": 8.55979266938171, "grad_norm": 1.8211156129837036, "learning_rate": 9.128886081970498e-05, "loss": 0.0389, "step": 23120 }, { "epoch": 8.563495001851166, "grad_norm": 0.373198539018631, "learning_rate": 9.127953305125618e-05, "loss": 0.0413, "step": 23130 }, { "epoch": 8.567197334320621, "grad_norm": 1.1726850271224976, "learning_rate": 9.127020076853969e-05, "loss": 0.0318, "step": 23140 }, { "epoch": 8.570899666790078, "grad_norm": 0.2644473910331726, "learning_rate": 9.126086397257612e-05, "loss": 0.0349, "step": 23150 }, { "epoch": 8.574601999259533, "grad_norm": 0.2495058923959732, "learning_rate": 9.125152266438649e-05, "loss": 0.0333, "step": 23160 }, { "epoch": 8.57830433172899, "grad_norm": 0.18385110795497894, "learning_rate": 9.124217684499235e-05, "loss": 0.0378, "step": 23170 }, { "epoch": 8.582006664198445, "grad_norm": 0.4205147624015808, "learning_rate": 9.123282651541576e-05, "loss": 0.0439, "step": 23180 }, { "epoch": 8.5857089966679, "grad_norm": 0.20334628224372864, "learning_rate": 9.122347167667926e-05, "loss": 0.0254, "step": 23190 }, { "epoch": 8.589411329137356, "grad_norm": 0.331400603055954, "learning_rate": 9.121411232980588e-05, "loss": 0.0274, "step": 23200 }, { "epoch": 8.593113661606813, "grad_norm": 0.9770618677139282, "learning_rate": 9.120474847581913e-05, "loss": 0.0264, "step": 23210 }, { "epoch": 8.596815994076268, "grad_norm": 0.21317461133003235, "learning_rate": 9.119538011574305e-05, "loss": 0.0227, "step": 23220 }, { "epoch": 8.600518326545723, "grad_norm": 0.8288081884384155, "learning_rate": 9.118600725060214e-05, "loss": 0.0318, "step": 23230 }, { "epoch": 8.60422065901518, "grad_norm": 0.4497022330760956, "learning_rate": 9.117662988142138e-05, "loss": 0.0308, "step": 23240 }, { "epoch": 8.607922991484635, "grad_norm": 0.17117121815681458, "learning_rate": 9.116724800922629e-05, "loss": 0.0353, "step": 23250 }, { "epoch": 8.61162532395409, "grad_norm": 0.32410314679145813, "learning_rate": 9.115786163504285e-05, "loss": 0.0387, "step": 23260 }, { "epoch": 8.615327656423547, "grad_norm": 0.3385207951068878, "learning_rate": 9.114847075989755e-05, "loss": 0.0275, "step": 23270 }, { "epoch": 8.619029988893002, "grad_norm": 0.15356914699077606, "learning_rate": 9.113907538481736e-05, "loss": 0.0221, "step": 23280 }, { "epoch": 8.622732321362458, "grad_norm": 0.5583391189575195, "learning_rate": 9.112967551082973e-05, "loss": 0.0318, "step": 23290 }, { "epoch": 8.626434653831915, "grad_norm": 0.16559246182441711, "learning_rate": 9.112027113896262e-05, "loss": 0.0351, "step": 23300 }, { "epoch": 8.63013698630137, "grad_norm": 0.2382386028766632, "learning_rate": 9.111086227024448e-05, "loss": 0.0331, "step": 23310 }, { "epoch": 8.633839318770825, "grad_norm": 0.2589449882507324, "learning_rate": 9.110144890570426e-05, "loss": 0.0296, "step": 23320 }, { "epoch": 8.637541651240282, "grad_norm": 0.2535060942173004, "learning_rate": 9.109203104637137e-05, "loss": 0.0263, "step": 23330 }, { "epoch": 8.641243983709737, "grad_norm": 0.22738096117973328, "learning_rate": 9.108260869327576e-05, "loss": 0.0299, "step": 23340 }, { "epoch": 8.644946316179192, "grad_norm": 0.24309879541397095, "learning_rate": 9.107318184744781e-05, "loss": 0.0343, "step": 23350 }, { "epoch": 8.64864864864865, "grad_norm": 0.5707695484161377, "learning_rate": 9.106375050991847e-05, "loss": 0.0237, "step": 23360 }, { "epoch": 8.652350981118104, "grad_norm": 0.6524394154548645, "learning_rate": 9.10543146817191e-05, "loss": 0.034, "step": 23370 }, { "epoch": 8.65605331358756, "grad_norm": 0.5128520727157593, "learning_rate": 9.104487436388161e-05, "loss": 0.0287, "step": 23380 }, { "epoch": 8.659755646057016, "grad_norm": 0.22947891056537628, "learning_rate": 9.103542955743835e-05, "loss": 0.0361, "step": 23390 }, { "epoch": 8.663457978526472, "grad_norm": 0.2125868946313858, "learning_rate": 9.102598026342222e-05, "loss": 0.0322, "step": 23400 }, { "epoch": 8.667160310995927, "grad_norm": 0.2843421399593353, "learning_rate": 9.101652648286658e-05, "loss": 0.0283, "step": 23410 }, { "epoch": 8.670862643465384, "grad_norm": 0.5942489504814148, "learning_rate": 9.100706821680527e-05, "loss": 0.0403, "step": 23420 }, { "epoch": 8.674564975934839, "grad_norm": 0.5042659640312195, "learning_rate": 9.099760546627261e-05, "loss": 0.0372, "step": 23430 }, { "epoch": 8.678267308404294, "grad_norm": 0.2762080430984497, "learning_rate": 9.098813823230348e-05, "loss": 0.0198, "step": 23440 }, { "epoch": 8.681969640873751, "grad_norm": 0.3533563017845154, "learning_rate": 9.097866651593317e-05, "loss": 0.038, "step": 23450 }, { "epoch": 8.685671973343206, "grad_norm": 0.5923087000846863, "learning_rate": 9.096919031819751e-05, "loss": 0.0238, "step": 23460 }, { "epoch": 8.689374305812661, "grad_norm": 0.1819404810667038, "learning_rate": 9.095970964013277e-05, "loss": 0.04, "step": 23470 }, { "epoch": 8.693076638282118, "grad_norm": 0.682144820690155, "learning_rate": 9.095022448277578e-05, "loss": 0.0228, "step": 23480 }, { "epoch": 8.696778970751573, "grad_norm": 3.315314292907715, "learning_rate": 9.094073484716381e-05, "loss": 0.0268, "step": 23490 }, { "epoch": 8.700481303221029, "grad_norm": 0.13231217861175537, "learning_rate": 9.093124073433463e-05, "loss": 0.0226, "step": 23500 }, { "epoch": 8.704183635690486, "grad_norm": 0.2111813873052597, "learning_rate": 9.09217421453265e-05, "loss": 0.0202, "step": 23510 }, { "epoch": 8.70788596815994, "grad_norm": 0.18266290426254272, "learning_rate": 9.091223908117818e-05, "loss": 0.0255, "step": 23520 }, { "epoch": 8.711588300629396, "grad_norm": 0.19091007113456726, "learning_rate": 9.090273154292889e-05, "loss": 0.0294, "step": 23530 }, { "epoch": 8.715290633098853, "grad_norm": 0.8561403751373291, "learning_rate": 9.089321953161837e-05, "loss": 0.027, "step": 23540 }, { "epoch": 8.718992965568308, "grad_norm": 0.18997476994991302, "learning_rate": 9.088370304828685e-05, "loss": 0.0368, "step": 23550 }, { "epoch": 8.722695298037763, "grad_norm": 0.22287946939468384, "learning_rate": 9.087418209397506e-05, "loss": 0.0402, "step": 23560 }, { "epoch": 8.72639763050722, "grad_norm": 0.2789328098297119, "learning_rate": 9.086465666972415e-05, "loss": 0.0273, "step": 23570 }, { "epoch": 8.730099962976675, "grad_norm": 0.27261587977409363, "learning_rate": 9.085512677657582e-05, "loss": 0.0253, "step": 23580 }, { "epoch": 8.73380229544613, "grad_norm": 0.30152401328086853, "learning_rate": 9.084559241557226e-05, "loss": 0.0366, "step": 23590 }, { "epoch": 8.737504627915587, "grad_norm": 0.2072356641292572, "learning_rate": 9.083605358775612e-05, "loss": 0.0238, "step": 23600 }, { "epoch": 8.741206960385043, "grad_norm": 2.762490749359131, "learning_rate": 9.082651029417055e-05, "loss": 0.042, "step": 23610 }, { "epoch": 8.744909292854498, "grad_norm": 0.6103137731552124, "learning_rate": 9.081696253585921e-05, "loss": 0.0421, "step": 23620 }, { "epoch": 8.748611625323955, "grad_norm": 0.8626862168312073, "learning_rate": 9.080741031386619e-05, "loss": 0.0309, "step": 23630 }, { "epoch": 8.75231395779341, "grad_norm": 0.2989400625228882, "learning_rate": 9.079785362923616e-05, "loss": 0.0265, "step": 23640 }, { "epoch": 8.756016290262865, "grad_norm": 0.31491199135780334, "learning_rate": 9.078829248301417e-05, "loss": 0.027, "step": 23650 }, { "epoch": 8.759718622732322, "grad_norm": 0.38209268450737, "learning_rate": 9.077872687624586e-05, "loss": 0.0327, "step": 23660 }, { "epoch": 8.763420955201777, "grad_norm": 0.16645902395248413, "learning_rate": 9.076915680997727e-05, "loss": 0.0287, "step": 23670 }, { "epoch": 8.767123287671232, "grad_norm": 0.4434318244457245, "learning_rate": 9.075958228525501e-05, "loss": 0.0287, "step": 23680 }, { "epoch": 8.77082562014069, "grad_norm": 0.2240263819694519, "learning_rate": 9.075000330312608e-05, "loss": 0.0458, "step": 23690 }, { "epoch": 8.774527952610145, "grad_norm": 0.2766158878803253, "learning_rate": 9.074041986463808e-05, "loss": 0.0391, "step": 23700 }, { "epoch": 8.7782302850796, "grad_norm": 0.25390365719795227, "learning_rate": 9.073083197083902e-05, "loss": 0.0327, "step": 23710 }, { "epoch": 8.781932617549057, "grad_norm": 0.3133545219898224, "learning_rate": 9.07212396227774e-05, "loss": 0.0313, "step": 23720 }, { "epoch": 8.785634950018512, "grad_norm": 0.39427807927131653, "learning_rate": 9.071164282150224e-05, "loss": 0.0372, "step": 23730 }, { "epoch": 8.789337282487967, "grad_norm": 0.1894243210554123, "learning_rate": 9.070204156806304e-05, "loss": 0.029, "step": 23740 }, { "epoch": 8.793039614957424, "grad_norm": 0.21332189440727234, "learning_rate": 9.069243586350975e-05, "loss": 0.0283, "step": 23750 }, { "epoch": 8.796741947426879, "grad_norm": 0.28858304023742676, "learning_rate": 9.068282570889286e-05, "loss": 0.0366, "step": 23760 }, { "epoch": 8.800444279896334, "grad_norm": 0.29308900237083435, "learning_rate": 9.067321110526332e-05, "loss": 0.035, "step": 23770 }, { "epoch": 8.804146612365791, "grad_norm": 0.294575959444046, "learning_rate": 9.066359205367258e-05, "loss": 0.0312, "step": 23780 }, { "epoch": 8.807848944835246, "grad_norm": 0.27075520157814026, "learning_rate": 9.065396855517253e-05, "loss": 0.0412, "step": 23790 }, { "epoch": 8.811551277304702, "grad_norm": 0.6826772093772888, "learning_rate": 9.064434061081562e-05, "loss": 0.0406, "step": 23800 }, { "epoch": 8.815253609774159, "grad_norm": 0.41142335534095764, "learning_rate": 9.06347082216547e-05, "loss": 0.0239, "step": 23810 }, { "epoch": 8.818955942243614, "grad_norm": 0.28007516264915466, "learning_rate": 9.06250713887432e-05, "loss": 0.0392, "step": 23820 }, { "epoch": 8.822658274713069, "grad_norm": 0.3074372410774231, "learning_rate": 9.061543011313497e-05, "loss": 0.0249, "step": 23830 }, { "epoch": 8.826360607182526, "grad_norm": 0.33433324098587036, "learning_rate": 9.060578439588436e-05, "loss": 0.0285, "step": 23840 }, { "epoch": 8.830062939651981, "grad_norm": 0.42047929763793945, "learning_rate": 9.059613423804623e-05, "loss": 0.0261, "step": 23850 }, { "epoch": 8.833765272121436, "grad_norm": 0.21404942870140076, "learning_rate": 9.05864796406759e-05, "loss": 0.0362, "step": 23860 }, { "epoch": 8.837467604590893, "grad_norm": 0.2987099885940552, "learning_rate": 9.057682060482916e-05, "loss": 0.0392, "step": 23870 }, { "epoch": 8.841169937060348, "grad_norm": 0.7668009996414185, "learning_rate": 9.056715713156233e-05, "loss": 0.0233, "step": 23880 }, { "epoch": 8.844872269529803, "grad_norm": 0.5080088973045349, "learning_rate": 9.055748922193219e-05, "loss": 0.0291, "step": 23890 }, { "epoch": 8.84857460199926, "grad_norm": 0.16458377242088318, "learning_rate": 9.0547816876996e-05, "loss": 0.0271, "step": 23900 }, { "epoch": 8.852276934468716, "grad_norm": 0.8046079277992249, "learning_rate": 9.053814009781153e-05, "loss": 0.0298, "step": 23910 }, { "epoch": 8.85597926693817, "grad_norm": 0.15510320663452148, "learning_rate": 9.0528458885437e-05, "loss": 0.0231, "step": 23920 }, { "epoch": 8.859681599407626, "grad_norm": 0.25145038962364197, "learning_rate": 9.051877324093114e-05, "loss": 0.0365, "step": 23930 }, { "epoch": 8.863383931877083, "grad_norm": 0.21760736405849457, "learning_rate": 9.050908316535315e-05, "loss": 0.0279, "step": 23940 }, { "epoch": 8.867086264346538, "grad_norm": 3.5649924278259277, "learning_rate": 9.049938865976275e-05, "loss": 0.0298, "step": 23950 }, { "epoch": 8.870788596815995, "grad_norm": 2.0594427585601807, "learning_rate": 9.04896897252201e-05, "loss": 0.029, "step": 23960 }, { "epoch": 8.87449092928545, "grad_norm": 0.30276811122894287, "learning_rate": 9.047998636278583e-05, "loss": 0.0348, "step": 23970 }, { "epoch": 8.878193261754905, "grad_norm": 0.21340477466583252, "learning_rate": 9.047027857352112e-05, "loss": 0.0229, "step": 23980 }, { "epoch": 8.88189559422436, "grad_norm": 0.3014797568321228, "learning_rate": 9.046056635848761e-05, "loss": 0.0303, "step": 23990 }, { "epoch": 8.885597926693817, "grad_norm": 0.4489670395851135, "learning_rate": 9.045084971874738e-05, "loss": 0.0236, "step": 24000 }, { "epoch": 8.889300259163273, "grad_norm": 0.2704315483570099, "learning_rate": 9.044112865536304e-05, "loss": 0.0269, "step": 24010 }, { "epoch": 8.89300259163273, "grad_norm": 0.2835577130317688, "learning_rate": 9.043140316939766e-05, "loss": 0.0326, "step": 24020 }, { "epoch": 8.896704924102185, "grad_norm": 0.4077593982219696, "learning_rate": 9.042167326191484e-05, "loss": 0.0329, "step": 24030 }, { "epoch": 8.90040725657164, "grad_norm": 0.25975775718688965, "learning_rate": 9.041193893397861e-05, "loss": 0.0339, "step": 24040 }, { "epoch": 8.904109589041095, "grad_norm": 0.34463030099868774, "learning_rate": 9.040220018665347e-05, "loss": 0.0381, "step": 24050 }, { "epoch": 8.907811921510552, "grad_norm": 0.7592114806175232, "learning_rate": 9.039245702100448e-05, "loss": 0.0377, "step": 24060 }, { "epoch": 8.911514253980007, "grad_norm": 0.28603827953338623, "learning_rate": 9.038270943809711e-05, "loss": 0.0369, "step": 24070 }, { "epoch": 8.915216586449462, "grad_norm": 0.216044619679451, "learning_rate": 9.037295743899737e-05, "loss": 0.0336, "step": 24080 }, { "epoch": 8.91891891891892, "grad_norm": 0.3527561128139496, "learning_rate": 9.036320102477169e-05, "loss": 0.0316, "step": 24090 }, { "epoch": 8.922621251388374, "grad_norm": 0.2646198868751526, "learning_rate": 9.035344019648702e-05, "loss": 0.0287, "step": 24100 }, { "epoch": 8.92632358385783, "grad_norm": 0.20706290006637573, "learning_rate": 9.034367495521081e-05, "loss": 0.0424, "step": 24110 }, { "epoch": 8.930025916327287, "grad_norm": 0.30312493443489075, "learning_rate": 9.033390530201097e-05, "loss": 0.0278, "step": 24120 }, { "epoch": 8.933728248796742, "grad_norm": 0.3507518768310547, "learning_rate": 9.032413123795588e-05, "loss": 0.0367, "step": 24130 }, { "epoch": 8.937430581266197, "grad_norm": 0.46034201979637146, "learning_rate": 9.031435276411442e-05, "loss": 0.035, "step": 24140 }, { "epoch": 8.941132913735654, "grad_norm": 0.2172558605670929, "learning_rate": 9.030456988155596e-05, "loss": 0.0323, "step": 24150 }, { "epoch": 8.944835246205109, "grad_norm": 0.27602630853652954, "learning_rate": 9.029478259135034e-05, "loss": 0.0394, "step": 24160 }, { "epoch": 8.948537578674564, "grad_norm": 0.22097347676753998, "learning_rate": 9.028499089456786e-05, "loss": 0.0213, "step": 24170 }, { "epoch": 8.952239911144021, "grad_norm": 0.34377777576446533, "learning_rate": 9.027519479227935e-05, "loss": 0.0349, "step": 24180 }, { "epoch": 8.955942243613476, "grad_norm": 0.19139006733894348, "learning_rate": 9.02653942855561e-05, "loss": 0.0257, "step": 24190 }, { "epoch": 8.959644576082932, "grad_norm": 0.28643810749053955, "learning_rate": 9.025558937546988e-05, "loss": 0.0287, "step": 24200 }, { "epoch": 8.963346908552388, "grad_norm": 0.6297005414962769, "learning_rate": 9.02457800630929e-05, "loss": 0.0262, "step": 24210 }, { "epoch": 8.967049241021844, "grad_norm": 0.2522977590560913, "learning_rate": 9.023596634949793e-05, "loss": 0.0303, "step": 24220 }, { "epoch": 8.970751573491299, "grad_norm": 0.35847339034080505, "learning_rate": 9.02261482357582e-05, "loss": 0.0278, "step": 24230 }, { "epoch": 8.974453905960756, "grad_norm": 0.7458994388580322, "learning_rate": 9.021632572294733e-05, "loss": 0.0396, "step": 24240 }, { "epoch": 8.978156238430211, "grad_norm": 0.24815931916236877, "learning_rate": 9.020649881213958e-05, "loss": 0.027, "step": 24250 }, { "epoch": 8.981858570899666, "grad_norm": 0.7424755096435547, "learning_rate": 9.019666750440956e-05, "loss": 0.0341, "step": 24260 }, { "epoch": 8.985560903369123, "grad_norm": 0.40585580468177795, "learning_rate": 9.01868318008324e-05, "loss": 0.0329, "step": 24270 }, { "epoch": 8.989263235838578, "grad_norm": 0.2278011292219162, "learning_rate": 9.017699170248378e-05, "loss": 0.031, "step": 24280 }, { "epoch": 8.992965568308033, "grad_norm": 0.37971338629722595, "learning_rate": 9.016714721043971e-05, "loss": 0.0396, "step": 24290 }, { "epoch": 8.99666790077749, "grad_norm": 0.19432899355888367, "learning_rate": 9.015729832577681e-05, "loss": 0.0319, "step": 24300 }, { "epoch": 9.000370233246946, "grad_norm": 0.28191420435905457, "learning_rate": 9.014744504957216e-05, "loss": 0.0327, "step": 24310 }, { "epoch": 9.0040725657164, "grad_norm": 0.4273037910461426, "learning_rate": 9.013758738290327e-05, "loss": 0.0199, "step": 24320 }, { "epoch": 9.007774898185858, "grad_norm": 0.21724866330623627, "learning_rate": 9.012772532684818e-05, "loss": 0.0252, "step": 24330 }, { "epoch": 9.011477230655313, "grad_norm": 0.1970595419406891, "learning_rate": 9.011785888248539e-05, "loss": 0.0306, "step": 24340 }, { "epoch": 9.015179563124768, "grad_norm": 0.5354965329170227, "learning_rate": 9.010798805089384e-05, "loss": 0.0297, "step": 24350 }, { "epoch": 9.018881895594225, "grad_norm": 0.11109251528978348, "learning_rate": 9.009811283315304e-05, "loss": 0.027, "step": 24360 }, { "epoch": 9.02258422806368, "grad_norm": 0.21148814260959625, "learning_rate": 9.008823323034288e-05, "loss": 0.0293, "step": 24370 }, { "epoch": 9.026286560533135, "grad_norm": 0.886106550693512, "learning_rate": 9.007834924354383e-05, "loss": 0.0475, "step": 24380 }, { "epoch": 9.029988893002592, "grad_norm": 0.24304446578025818, "learning_rate": 9.006846087383675e-05, "loss": 0.0389, "step": 24390 }, { "epoch": 9.033691225472047, "grad_norm": 0.47413554787635803, "learning_rate": 9.005856812230304e-05, "loss": 0.0373, "step": 24400 }, { "epoch": 9.037393557941503, "grad_norm": 0.35063034296035767, "learning_rate": 9.004867099002456e-05, "loss": 0.0327, "step": 24410 }, { "epoch": 9.04109589041096, "grad_norm": 0.2298070192337036, "learning_rate": 9.003876947808361e-05, "loss": 0.0315, "step": 24420 }, { "epoch": 9.044798222880415, "grad_norm": 0.39779388904571533, "learning_rate": 9.002886358756305e-05, "loss": 0.0345, "step": 24430 }, { "epoch": 9.04850055534987, "grad_norm": 0.8253679275512695, "learning_rate": 9.001895331954612e-05, "loss": 0.0329, "step": 24440 }, { "epoch": 9.052202887819327, "grad_norm": 0.2298089861869812, "learning_rate": 9.000903867511666e-05, "loss": 0.0275, "step": 24450 }, { "epoch": 9.055905220288782, "grad_norm": 0.6516557931900024, "learning_rate": 8.999911965535885e-05, "loss": 0.0362, "step": 24460 }, { "epoch": 9.059607552758237, "grad_norm": 0.13773946464061737, "learning_rate": 8.998919626135746e-05, "loss": 0.0292, "step": 24470 }, { "epoch": 9.063309885227694, "grad_norm": 0.323689728975296, "learning_rate": 8.997926849419769e-05, "loss": 0.0284, "step": 24480 }, { "epoch": 9.06701221769715, "grad_norm": 0.9212979078292847, "learning_rate": 8.996933635496523e-05, "loss": 0.0332, "step": 24490 }, { "epoch": 9.070714550166604, "grad_norm": 0.4996623992919922, "learning_rate": 8.995939984474624e-05, "loss": 0.0235, "step": 24500 }, { "epoch": 9.074416882636061, "grad_norm": 0.1905829757452011, "learning_rate": 8.994945896462736e-05, "loss": 0.0334, "step": 24510 }, { "epoch": 9.078119215105517, "grad_norm": 0.31676197052001953, "learning_rate": 8.993951371569571e-05, "loss": 0.0351, "step": 24520 }, { "epoch": 9.081821547574972, "grad_norm": 0.26493456959724426, "learning_rate": 8.99295640990389e-05, "loss": 0.0227, "step": 24530 }, { "epoch": 9.085523880044429, "grad_norm": 0.24721921980381012, "learning_rate": 8.991961011574497e-05, "loss": 0.0283, "step": 24540 }, { "epoch": 9.089226212513884, "grad_norm": 0.1962004005908966, "learning_rate": 8.990965176690252e-05, "loss": 0.037, "step": 24550 }, { "epoch": 9.092928544983339, "grad_norm": 0.29673779010772705, "learning_rate": 8.989968905360053e-05, "loss": 0.0321, "step": 24560 }, { "epoch": 9.096630877452796, "grad_norm": 2.040775775909424, "learning_rate": 8.988972197692855e-05, "loss": 0.0377, "step": 24570 }, { "epoch": 9.100333209922251, "grad_norm": 0.3482931852340698, "learning_rate": 8.987975053797655e-05, "loss": 0.0252, "step": 24580 }, { "epoch": 9.104035542391706, "grad_norm": 0.8182295560836792, "learning_rate": 8.986977473783498e-05, "loss": 0.0421, "step": 24590 }, { "epoch": 9.107737874861163, "grad_norm": 0.7116475105285645, "learning_rate": 8.98597945775948e-05, "loss": 0.0246, "step": 24600 }, { "epoch": 9.111440207330618, "grad_norm": 0.4277704954147339, "learning_rate": 8.98498100583474e-05, "loss": 0.0259, "step": 24610 }, { "epoch": 9.115142539800074, "grad_norm": 0.26846256852149963, "learning_rate": 8.98398211811847e-05, "loss": 0.0268, "step": 24620 }, { "epoch": 9.11884487226953, "grad_norm": 0.4480280876159668, "learning_rate": 8.982982794719904e-05, "loss": 0.0209, "step": 24630 }, { "epoch": 9.122547204738986, "grad_norm": 0.14189834892749786, "learning_rate": 8.98198303574833e-05, "loss": 0.0248, "step": 24640 }, { "epoch": 9.126249537208441, "grad_norm": 0.23037004470825195, "learning_rate": 8.980982841313074e-05, "loss": 0.0281, "step": 24650 }, { "epoch": 9.129951869677898, "grad_norm": 1.8823635578155518, "learning_rate": 8.979982211523523e-05, "loss": 0.0219, "step": 24660 }, { "epoch": 9.133654202147353, "grad_norm": 0.19812868535518646, "learning_rate": 8.9789811464891e-05, "loss": 0.0327, "step": 24670 }, { "epoch": 9.137356534616808, "grad_norm": 0.25671353936195374, "learning_rate": 8.977979646319282e-05, "loss": 0.0289, "step": 24680 }, { "epoch": 9.141058867086265, "grad_norm": 0.26875829696655273, "learning_rate": 8.97697771112359e-05, "loss": 0.0299, "step": 24690 }, { "epoch": 9.14476119955572, "grad_norm": 0.5028879046440125, "learning_rate": 8.975975341011596e-05, "loss": 0.0279, "step": 24700 }, { "epoch": 9.148463532025175, "grad_norm": 0.538116991519928, "learning_rate": 8.974972536092916e-05, "loss": 0.0236, "step": 24710 }, { "epoch": 9.152165864494632, "grad_norm": 0.48458072543144226, "learning_rate": 8.973969296477214e-05, "loss": 0.028, "step": 24720 }, { "epoch": 9.155868196964088, "grad_norm": 0.31679362058639526, "learning_rate": 8.972965622274205e-05, "loss": 0.0272, "step": 24730 }, { "epoch": 9.159570529433543, "grad_norm": 0.30895718932151794, "learning_rate": 8.97196151359365e-05, "loss": 0.0241, "step": 24740 }, { "epoch": 9.163272861903, "grad_norm": 0.3319845199584961, "learning_rate": 8.970956970545355e-05, "loss": 0.0459, "step": 24750 }, { "epoch": 9.166975194372455, "grad_norm": 0.20248448848724365, "learning_rate": 8.969951993239177e-05, "loss": 0.0341, "step": 24760 }, { "epoch": 9.17067752684191, "grad_norm": 0.39383599162101746, "learning_rate": 8.968946581785017e-05, "loss": 0.029, "step": 24770 }, { "epoch": 9.174379859311367, "grad_norm": 0.312148779630661, "learning_rate": 8.967940736292825e-05, "loss": 0.0304, "step": 24780 }, { "epoch": 9.178082191780822, "grad_norm": 0.19070817530155182, "learning_rate": 8.966934456872602e-05, "loss": 0.0263, "step": 24790 }, { "epoch": 9.181784524250277, "grad_norm": 0.3798842132091522, "learning_rate": 8.965927743634391e-05, "loss": 0.0324, "step": 24800 }, { "epoch": 9.185486856719734, "grad_norm": 0.308402955532074, "learning_rate": 8.964920596688283e-05, "loss": 0.031, "step": 24810 }, { "epoch": 9.18918918918919, "grad_norm": 0.35077524185180664, "learning_rate": 8.963913016144419e-05, "loss": 0.0281, "step": 24820 }, { "epoch": 9.192891521658645, "grad_norm": 0.5011463165283203, "learning_rate": 8.962905002112989e-05, "loss": 0.0353, "step": 24830 }, { "epoch": 9.1965938541281, "grad_norm": 0.5116323828697205, "learning_rate": 8.961896554704226e-05, "loss": 0.0284, "step": 24840 }, { "epoch": 9.200296186597557, "grad_norm": 0.41191577911376953, "learning_rate": 8.96088767402841e-05, "loss": 0.0308, "step": 24850 }, { "epoch": 9.203998519067012, "grad_norm": 0.2469080537557602, "learning_rate": 8.959878360195876e-05, "loss": 0.0527, "step": 24860 }, { "epoch": 9.207700851536467, "grad_norm": 0.3565555214881897, "learning_rate": 8.958868613316996e-05, "loss": 0.0326, "step": 24870 }, { "epoch": 9.211403184005924, "grad_norm": 0.18954095244407654, "learning_rate": 8.957858433502198e-05, "loss": 0.0323, "step": 24880 }, { "epoch": 9.21510551647538, "grad_norm": 0.48901498317718506, "learning_rate": 8.95684782086195e-05, "loss": 0.0194, "step": 24890 }, { "epoch": 9.218807848944834, "grad_norm": 0.5206404328346252, "learning_rate": 8.955836775506776e-05, "loss": 0.0389, "step": 24900 }, { "epoch": 9.222510181414291, "grad_norm": 0.31905823945999146, "learning_rate": 8.95482529754724e-05, "loss": 0.0299, "step": 24910 }, { "epoch": 9.226212513883747, "grad_norm": 0.38112130761146545, "learning_rate": 8.953813387093954e-05, "loss": 0.0262, "step": 24920 }, { "epoch": 9.229914846353202, "grad_norm": 0.17233192920684814, "learning_rate": 8.952801044257581e-05, "loss": 0.0291, "step": 24930 }, { "epoch": 9.233617178822659, "grad_norm": 0.26632118225097656, "learning_rate": 8.951788269148829e-05, "loss": 0.0215, "step": 24940 }, { "epoch": 9.237319511292114, "grad_norm": 0.19808803498744965, "learning_rate": 8.950775061878453e-05, "loss": 0.0342, "step": 24950 }, { "epoch": 9.241021843761569, "grad_norm": 0.2652507722377777, "learning_rate": 8.949761422557256e-05, "loss": 0.0215, "step": 24960 }, { "epoch": 9.244724176231026, "grad_norm": 0.237218976020813, "learning_rate": 8.948747351296088e-05, "loss": 0.0356, "step": 24970 }, { "epoch": 9.248426508700481, "grad_norm": 0.29601922631263733, "learning_rate": 8.947732848205846e-05, "loss": 0.0343, "step": 24980 }, { "epoch": 9.252128841169936, "grad_norm": 0.5698176622390747, "learning_rate": 8.946717913397476e-05, "loss": 0.0285, "step": 24990 }, { "epoch": 9.255831173639393, "grad_norm": 0.19499696791172028, "learning_rate": 8.945702546981969e-05, "loss": 0.0323, "step": 25000 }, { "epoch": 9.259533506108848, "grad_norm": 0.42465388774871826, "learning_rate": 8.944686749070363e-05, "loss": 0.0218, "step": 25010 }, { "epoch": 9.263235838578304, "grad_norm": 1.6243574619293213, "learning_rate": 8.943670519773745e-05, "loss": 0.04, "step": 25020 }, { "epoch": 9.26693817104776, "grad_norm": 0.5917499661445618, "learning_rate": 8.942653859203248e-05, "loss": 0.0332, "step": 25030 }, { "epoch": 9.270640503517216, "grad_norm": 0.2547871768474579, "learning_rate": 8.941636767470054e-05, "loss": 0.0161, "step": 25040 }, { "epoch": 9.27434283598667, "grad_norm": 0.18831048905849457, "learning_rate": 8.940619244685388e-05, "loss": 0.0266, "step": 25050 }, { "epoch": 9.278045168456128, "grad_norm": 0.3110092580318451, "learning_rate": 8.939601290960527e-05, "loss": 0.0244, "step": 25060 }, { "epoch": 9.281747500925583, "grad_norm": 0.2985449433326721, "learning_rate": 8.93858290640679e-05, "loss": 0.0303, "step": 25070 }, { "epoch": 9.285449833395038, "grad_norm": 0.2886282801628113, "learning_rate": 8.93756409113555e-05, "loss": 0.029, "step": 25080 }, { "epoch": 9.289152165864495, "grad_norm": 0.39362233877182007, "learning_rate": 8.93654484525822e-05, "loss": 0.0313, "step": 25090 }, { "epoch": 9.29285449833395, "grad_norm": 0.4400630593299866, "learning_rate": 8.935525168886262e-05, "loss": 0.0465, "step": 25100 }, { "epoch": 9.296556830803405, "grad_norm": 0.4675986170768738, "learning_rate": 8.934505062131193e-05, "loss": 0.0341, "step": 25110 }, { "epoch": 9.300259163272862, "grad_norm": 0.3943565785884857, "learning_rate": 8.933484525104562e-05, "loss": 0.0335, "step": 25120 }, { "epoch": 9.303961495742318, "grad_norm": 0.3100379407405853, "learning_rate": 8.932463557917981e-05, "loss": 0.0279, "step": 25130 }, { "epoch": 9.307663828211773, "grad_norm": 1.5554450750350952, "learning_rate": 8.931442160683094e-05, "loss": 0.0321, "step": 25140 }, { "epoch": 9.31136616068123, "grad_norm": 0.7834059000015259, "learning_rate": 8.930420333511606e-05, "loss": 0.0395, "step": 25150 }, { "epoch": 9.315068493150685, "grad_norm": 0.4740864634513855, "learning_rate": 8.929398076515259e-05, "loss": 0.0308, "step": 25160 }, { "epoch": 9.31877082562014, "grad_norm": 0.3996902406215668, "learning_rate": 8.928375389805845e-05, "loss": 0.0258, "step": 25170 }, { "epoch": 9.322473158089597, "grad_norm": 0.8528814911842346, "learning_rate": 8.927352273495204e-05, "loss": 0.034, "step": 25180 }, { "epoch": 9.326175490559052, "grad_norm": 0.26463082432746887, "learning_rate": 8.926328727695226e-05, "loss": 0.0362, "step": 25190 }, { "epoch": 9.329877823028507, "grad_norm": 0.31055939197540283, "learning_rate": 8.92530475251784e-05, "loss": 0.0344, "step": 25200 }, { "epoch": 9.333580155497964, "grad_norm": 0.1768639236688614, "learning_rate": 8.924280348075027e-05, "loss": 0.0266, "step": 25210 }, { "epoch": 9.33728248796742, "grad_norm": 1.7587443590164185, "learning_rate": 8.923255514478815e-05, "loss": 0.0434, "step": 25220 }, { "epoch": 9.340984820436875, "grad_norm": 0.301939994096756, "learning_rate": 8.92223025184128e-05, "loss": 0.0322, "step": 25230 }, { "epoch": 9.344687152906332, "grad_norm": 1.695224642753601, "learning_rate": 8.921204560274542e-05, "loss": 0.0246, "step": 25240 }, { "epoch": 9.348389485375787, "grad_norm": 0.33882567286491394, "learning_rate": 8.920178439890765e-05, "loss": 0.036, "step": 25250 }, { "epoch": 9.352091817845242, "grad_norm": 0.228139266371727, "learning_rate": 8.919151890802172e-05, "loss": 0.0366, "step": 25260 }, { "epoch": 9.355794150314699, "grad_norm": 0.44885900616645813, "learning_rate": 8.918124913121018e-05, "loss": 0.0257, "step": 25270 }, { "epoch": 9.359496482784154, "grad_norm": 1.1848152875900269, "learning_rate": 8.917097506959615e-05, "loss": 0.0216, "step": 25280 }, { "epoch": 9.36319881525361, "grad_norm": 0.2569606602191925, "learning_rate": 8.916069672430319e-05, "loss": 0.0389, "step": 25290 }, { "epoch": 9.366901147723066, "grad_norm": 0.5761136412620544, "learning_rate": 8.91504140964553e-05, "loss": 0.0397, "step": 25300 }, { "epoch": 9.370603480192521, "grad_norm": 0.24656738340854645, "learning_rate": 8.914012718717699e-05, "loss": 0.0282, "step": 25310 }, { "epoch": 9.374305812661976, "grad_norm": 0.13472333550453186, "learning_rate": 8.912983599759322e-05, "loss": 0.0285, "step": 25320 }, { "epoch": 9.378008145131433, "grad_norm": 0.29426318407058716, "learning_rate": 8.91195405288294e-05, "loss": 0.0268, "step": 25330 }, { "epoch": 9.381710477600889, "grad_norm": 0.24804368615150452, "learning_rate": 8.910924078201147e-05, "loss": 0.0361, "step": 25340 }, { "epoch": 9.385412810070344, "grad_norm": 0.35954228043556213, "learning_rate": 8.909893675826574e-05, "loss": 0.0304, "step": 25350 }, { "epoch": 9.3891151425398, "grad_norm": 0.2113642394542694, "learning_rate": 8.90886284587191e-05, "loss": 0.025, "step": 25360 }, { "epoch": 9.392817475009256, "grad_norm": 0.4025888741016388, "learning_rate": 8.907831588449879e-05, "loss": 0.0352, "step": 25370 }, { "epoch": 9.396519807478711, "grad_norm": 0.5868949890136719, "learning_rate": 8.906799903673265e-05, "loss": 0.0345, "step": 25380 }, { "epoch": 9.400222139948168, "grad_norm": 0.2777172029018402, "learning_rate": 8.905767791654884e-05, "loss": 0.03, "step": 25390 }, { "epoch": 9.403924472417623, "grad_norm": 0.18211698532104492, "learning_rate": 8.90473525250761e-05, "loss": 0.044, "step": 25400 }, { "epoch": 9.407626804887078, "grad_norm": 0.40290966629981995, "learning_rate": 8.90370228634436e-05, "loss": 0.0228, "step": 25410 }, { "epoch": 9.411329137356535, "grad_norm": 0.7133321762084961, "learning_rate": 8.902668893278097e-05, "loss": 0.0263, "step": 25420 }, { "epoch": 9.41503146982599, "grad_norm": 0.346899151802063, "learning_rate": 8.901635073421831e-05, "loss": 0.0265, "step": 25430 }, { "epoch": 9.418733802295446, "grad_norm": 0.27920353412628174, "learning_rate": 8.90060082688862e-05, "loss": 0.0218, "step": 25440 }, { "epoch": 9.422436134764903, "grad_norm": 0.20238998532295227, "learning_rate": 8.899566153791566e-05, "loss": 0.0386, "step": 25450 }, { "epoch": 9.426138467234358, "grad_norm": 0.6469835638999939, "learning_rate": 8.898531054243822e-05, "loss": 0.0341, "step": 25460 }, { "epoch": 9.429840799703813, "grad_norm": 0.45382994413375854, "learning_rate": 8.897495528358582e-05, "loss": 0.0394, "step": 25470 }, { "epoch": 9.43354313217327, "grad_norm": 0.13598434627056122, "learning_rate": 8.89645957624909e-05, "loss": 0.0364, "step": 25480 }, { "epoch": 9.437245464642725, "grad_norm": 0.2050965577363968, "learning_rate": 8.895423198028638e-05, "loss": 0.0271, "step": 25490 }, { "epoch": 9.44094779711218, "grad_norm": 0.44031304121017456, "learning_rate": 8.894386393810563e-05, "loss": 0.0272, "step": 25500 }, { "epoch": 9.444650129581637, "grad_norm": 0.20059175789356232, "learning_rate": 8.893349163708246e-05, "loss": 0.0227, "step": 25510 }, { "epoch": 9.448352462051092, "grad_norm": 0.4251457750797272, "learning_rate": 8.892311507835119e-05, "loss": 0.0275, "step": 25520 }, { "epoch": 9.452054794520548, "grad_norm": 0.1482428014278412, "learning_rate": 8.891273426304654e-05, "loss": 0.0265, "step": 25530 }, { "epoch": 9.455757126990004, "grad_norm": 0.21169020235538483, "learning_rate": 8.890234919230381e-05, "loss": 0.0348, "step": 25540 }, { "epoch": 9.45945945945946, "grad_norm": 0.3256126046180725, "learning_rate": 8.889195986725865e-05, "loss": 0.0363, "step": 25550 }, { "epoch": 9.463161791928915, "grad_norm": 0.8998126983642578, "learning_rate": 8.888156628904724e-05, "loss": 0.0403, "step": 25560 }, { "epoch": 9.466864124398372, "grad_norm": 0.7262536287307739, "learning_rate": 8.887116845880619e-05, "loss": 0.0252, "step": 25570 }, { "epoch": 9.470566456867827, "grad_norm": 0.18595921993255615, "learning_rate": 8.88607663776726e-05, "loss": 0.0317, "step": 25580 }, { "epoch": 9.474268789337282, "grad_norm": 0.22713248431682587, "learning_rate": 8.885036004678402e-05, "loss": 0.0368, "step": 25590 }, { "epoch": 9.477971121806739, "grad_norm": 0.36565059423446655, "learning_rate": 8.883994946727849e-05, "loss": 0.0321, "step": 25600 }, { "epoch": 9.481673454276194, "grad_norm": 0.3685920536518097, "learning_rate": 8.882953464029447e-05, "loss": 0.0398, "step": 25610 }, { "epoch": 9.48537578674565, "grad_norm": 0.3020392954349518, "learning_rate": 8.881911556697093e-05, "loss": 0.0454, "step": 25620 }, { "epoch": 9.489078119215106, "grad_norm": 0.25137171149253845, "learning_rate": 8.880869224844725e-05, "loss": 0.0318, "step": 25630 }, { "epoch": 9.492780451684562, "grad_norm": 0.20157316327095032, "learning_rate": 8.879826468586337e-05, "loss": 0.024, "step": 25640 }, { "epoch": 9.496482784154017, "grad_norm": 0.24656172096729279, "learning_rate": 8.878783288035957e-05, "loss": 0.0255, "step": 25650 }, { "epoch": 9.500185116623474, "grad_norm": 0.2247859686613083, "learning_rate": 8.87773968330767e-05, "loss": 0.0355, "step": 25660 }, { "epoch": 9.503887449092929, "grad_norm": 0.6459104418754578, "learning_rate": 8.876695654515601e-05, "loss": 0.0405, "step": 25670 }, { "epoch": 9.507589781562384, "grad_norm": 1.24984872341156, "learning_rate": 8.875651201773923e-05, "loss": 0.0233, "step": 25680 }, { "epoch": 9.51129211403184, "grad_norm": 0.29307958483695984, "learning_rate": 8.874606325196857e-05, "loss": 0.0278, "step": 25690 }, { "epoch": 9.514994446501296, "grad_norm": 0.2683233320713043, "learning_rate": 8.873561024898668e-05, "loss": 0.0282, "step": 25700 }, { "epoch": 9.518696778970751, "grad_norm": 0.6115036010742188, "learning_rate": 8.872515300993669e-05, "loss": 0.0244, "step": 25710 }, { "epoch": 9.522399111440208, "grad_norm": 0.28269365429878235, "learning_rate": 8.87146915359622e-05, "loss": 0.0323, "step": 25720 }, { "epoch": 9.526101443909663, "grad_norm": 0.24421241879463196, "learning_rate": 8.870422582820726e-05, "loss": 0.0273, "step": 25730 }, { "epoch": 9.529803776379119, "grad_norm": 0.3853977620601654, "learning_rate": 8.869375588781634e-05, "loss": 0.0342, "step": 25740 }, { "epoch": 9.533506108848574, "grad_norm": 0.2457202523946762, "learning_rate": 8.868328171593448e-05, "loss": 0.0273, "step": 25750 }, { "epoch": 9.53720844131803, "grad_norm": 0.15810537338256836, "learning_rate": 8.867280331370709e-05, "loss": 0.0379, "step": 25760 }, { "epoch": 9.540910773787486, "grad_norm": 1.3400744199752808, "learning_rate": 8.866232068228006e-05, "loss": 0.034, "step": 25770 }, { "epoch": 9.544613106256943, "grad_norm": 0.31080135703086853, "learning_rate": 8.865183382279978e-05, "loss": 0.0297, "step": 25780 }, { "epoch": 9.548315438726398, "grad_norm": 0.3945791721343994, "learning_rate": 8.864134273641304e-05, "loss": 0.0284, "step": 25790 }, { "epoch": 9.552017771195853, "grad_norm": 0.13812941312789917, "learning_rate": 8.863084742426719e-05, "loss": 0.0248, "step": 25800 }, { "epoch": 9.555720103665308, "grad_norm": 0.39362087845802307, "learning_rate": 8.862034788750993e-05, "loss": 0.0318, "step": 25810 }, { "epoch": 9.559422436134765, "grad_norm": 0.4480312466621399, "learning_rate": 8.860984412728948e-05, "loss": 0.0295, "step": 25820 }, { "epoch": 9.56312476860422, "grad_norm": 0.14622625708580017, "learning_rate": 8.859933614475452e-05, "loss": 0.039, "step": 25830 }, { "epoch": 9.566827101073676, "grad_norm": 0.4429458677768707, "learning_rate": 8.858882394105423e-05, "loss": 0.0318, "step": 25840 }, { "epoch": 9.570529433543133, "grad_norm": 0.3509398102760315, "learning_rate": 8.857830751733815e-05, "loss": 0.0376, "step": 25850 }, { "epoch": 9.574231766012588, "grad_norm": 0.2579801082611084, "learning_rate": 8.856778687475635e-05, "loss": 0.0258, "step": 25860 }, { "epoch": 9.577934098482043, "grad_norm": 0.3729190230369568, "learning_rate": 8.855726201445938e-05, "loss": 0.0248, "step": 25870 }, { "epoch": 9.5816364309515, "grad_norm": 0.34160125255584717, "learning_rate": 8.85467329375982e-05, "loss": 0.0214, "step": 25880 }, { "epoch": 9.585338763420955, "grad_norm": 0.6184436678886414, "learning_rate": 8.853619964532427e-05, "loss": 0.0239, "step": 25890 }, { "epoch": 9.58904109589041, "grad_norm": 0.3815275728702545, "learning_rate": 8.852566213878947e-05, "loss": 0.0401, "step": 25900 }, { "epoch": 9.592743428359867, "grad_norm": 0.3133804202079773, "learning_rate": 8.851512041914617e-05, "loss": 0.0183, "step": 25910 }, { "epoch": 9.596445760829322, "grad_norm": 0.29475635290145874, "learning_rate": 8.850457448754723e-05, "loss": 0.0253, "step": 25920 }, { "epoch": 9.600148093298777, "grad_norm": 0.28637754917144775, "learning_rate": 8.84940243451459e-05, "loss": 0.0384, "step": 25930 }, { "epoch": 9.603850425768234, "grad_norm": 0.20359352231025696, "learning_rate": 8.848346999309596e-05, "loss": 0.0351, "step": 25940 }, { "epoch": 9.60755275823769, "grad_norm": 0.3804088830947876, "learning_rate": 8.84729114325516e-05, "loss": 0.0392, "step": 25950 }, { "epoch": 9.611255090707145, "grad_norm": 1.4874461889266968, "learning_rate": 8.846234866466747e-05, "loss": 0.0288, "step": 25960 }, { "epoch": 9.614957423176602, "grad_norm": 0.1783400923013687, "learning_rate": 8.845178169059874e-05, "loss": 0.0234, "step": 25970 }, { "epoch": 9.618659755646057, "grad_norm": 0.2735777199268341, "learning_rate": 8.844121051150096e-05, "loss": 0.0292, "step": 25980 }, { "epoch": 9.622362088115512, "grad_norm": 0.21552570164203644, "learning_rate": 8.843063512853019e-05, "loss": 0.0191, "step": 25990 }, { "epoch": 9.626064420584969, "grad_norm": 0.34008798003196716, "learning_rate": 8.842005554284296e-05, "loss": 0.0325, "step": 26000 }, { "epoch": 9.629766753054424, "grad_norm": 0.25550809502601624, "learning_rate": 8.84094717555962e-05, "loss": 0.0324, "step": 26010 }, { "epoch": 9.63346908552388, "grad_norm": 0.5097101926803589, "learning_rate": 8.839888376794738e-05, "loss": 0.0395, "step": 26020 }, { "epoch": 9.637171417993336, "grad_norm": 0.5023301839828491, "learning_rate": 8.838829158105434e-05, "loss": 0.0377, "step": 26030 }, { "epoch": 9.640873750462791, "grad_norm": 0.28241121768951416, "learning_rate": 8.837769519607545e-05, "loss": 0.0324, "step": 26040 }, { "epoch": 9.644576082932247, "grad_norm": 0.5420107841491699, "learning_rate": 8.836709461416952e-05, "loss": 0.0269, "step": 26050 }, { "epoch": 9.648278415401704, "grad_norm": 0.2915950119495392, "learning_rate": 8.83564898364958e-05, "loss": 0.0309, "step": 26060 }, { "epoch": 9.651980747871159, "grad_norm": 0.26608964800834656, "learning_rate": 8.834588086421403e-05, "loss": 0.0442, "step": 26070 }, { "epoch": 9.655683080340614, "grad_norm": 0.1440885365009308, "learning_rate": 8.833526769848439e-05, "loss": 0.0231, "step": 26080 }, { "epoch": 9.659385412810071, "grad_norm": 0.4241134822368622, "learning_rate": 8.832465034046749e-05, "loss": 0.0249, "step": 26090 }, { "epoch": 9.663087745279526, "grad_norm": 0.42753103375434875, "learning_rate": 8.831402879132446e-05, "loss": 0.0244, "step": 26100 }, { "epoch": 9.666790077748981, "grad_norm": 0.18042081594467163, "learning_rate": 8.830340305221684e-05, "loss": 0.0256, "step": 26110 }, { "epoch": 9.670492410218438, "grad_norm": 0.23659084737300873, "learning_rate": 8.829277312430665e-05, "loss": 0.0308, "step": 26120 }, { "epoch": 9.674194742687893, "grad_norm": 0.7306292057037354, "learning_rate": 8.828213900875638e-05, "loss": 0.0386, "step": 26130 }, { "epoch": 9.677897075157349, "grad_norm": 0.3124621510505676, "learning_rate": 8.827150070672894e-05, "loss": 0.0183, "step": 26140 }, { "epoch": 9.681599407626805, "grad_norm": 0.62521892786026, "learning_rate": 8.82608582193877e-05, "loss": 0.0277, "step": 26150 }, { "epoch": 9.68530174009626, "grad_norm": 0.2459433674812317, "learning_rate": 8.825021154789655e-05, "loss": 0.0293, "step": 26160 }, { "epoch": 9.689004072565716, "grad_norm": 0.1908925473690033, "learning_rate": 8.823956069341976e-05, "loss": 0.0347, "step": 26170 }, { "epoch": 9.692706405035173, "grad_norm": 0.17918342351913452, "learning_rate": 8.822890565712211e-05, "loss": 0.018, "step": 26180 }, { "epoch": 9.696408737504628, "grad_norm": 0.19189095497131348, "learning_rate": 8.821824644016882e-05, "loss": 0.0267, "step": 26190 }, { "epoch": 9.700111069974083, "grad_norm": 0.5664727687835693, "learning_rate": 8.820758304372557e-05, "loss": 0.0381, "step": 26200 }, { "epoch": 9.70381340244354, "grad_norm": 0.18692511320114136, "learning_rate": 8.819691546895846e-05, "loss": 0.0294, "step": 26210 }, { "epoch": 9.707515734912995, "grad_norm": 0.658602237701416, "learning_rate": 8.818624371703412e-05, "loss": 0.0355, "step": 26220 }, { "epoch": 9.71121806738245, "grad_norm": 0.35286206007003784, "learning_rate": 8.817556778911956e-05, "loss": 0.0298, "step": 26230 }, { "epoch": 9.714920399851907, "grad_norm": 0.21407385170459747, "learning_rate": 8.816488768638232e-05, "loss": 0.0243, "step": 26240 }, { "epoch": 9.718622732321363, "grad_norm": 0.23050515353679657, "learning_rate": 8.815420340999033e-05, "loss": 0.0256, "step": 26250 }, { "epoch": 9.722325064790818, "grad_norm": 0.4093295931816101, "learning_rate": 8.814351496111201e-05, "loss": 0.0256, "step": 26260 }, { "epoch": 9.726027397260275, "grad_norm": 0.27826225757598877, "learning_rate": 8.813282234091627e-05, "loss": 0.0395, "step": 26270 }, { "epoch": 9.72972972972973, "grad_norm": 0.290210098028183, "learning_rate": 8.81221255505724e-05, "loss": 0.0268, "step": 26280 }, { "epoch": 9.733432062199185, "grad_norm": 2.7946903705596924, "learning_rate": 8.811142459125019e-05, "loss": 0.0256, "step": 26290 }, { "epoch": 9.737134394668642, "grad_norm": 0.6198290586471558, "learning_rate": 8.810071946411989e-05, "loss": 0.0389, "step": 26300 }, { "epoch": 9.740836727138097, "grad_norm": 0.35314691066741943, "learning_rate": 8.809001017035218e-05, "loss": 0.0331, "step": 26310 }, { "epoch": 9.744539059607552, "grad_norm": 0.2490670084953308, "learning_rate": 8.807929671111825e-05, "loss": 0.0287, "step": 26320 }, { "epoch": 9.74824139207701, "grad_norm": 0.20908790826797485, "learning_rate": 8.806857908758967e-05, "loss": 0.0322, "step": 26330 }, { "epoch": 9.751943724546464, "grad_norm": 0.2375100553035736, "learning_rate": 8.805785730093852e-05, "loss": 0.0265, "step": 26340 }, { "epoch": 9.75564605701592, "grad_norm": 0.27718451619148254, "learning_rate": 8.804713135233731e-05, "loss": 0.0298, "step": 26350 }, { "epoch": 9.759348389485377, "grad_norm": 0.20116205513477325, "learning_rate": 8.803640124295902e-05, "loss": 0.0293, "step": 26360 }, { "epoch": 9.763050721954832, "grad_norm": 0.28873106837272644, "learning_rate": 8.802566697397708e-05, "loss": 0.024, "step": 26370 }, { "epoch": 9.766753054424287, "grad_norm": 0.23762615025043488, "learning_rate": 8.801492854656536e-05, "loss": 0.0318, "step": 26380 }, { "epoch": 9.770455386893744, "grad_norm": 0.17308761179447174, "learning_rate": 8.800418596189822e-05, "loss": 0.047, "step": 26390 }, { "epoch": 9.774157719363199, "grad_norm": 0.25853776931762695, "learning_rate": 8.799343922115044e-05, "loss": 0.0311, "step": 26400 }, { "epoch": 9.777860051832654, "grad_norm": 0.16187690198421478, "learning_rate": 8.798268832549725e-05, "loss": 0.0307, "step": 26410 }, { "epoch": 9.781562384302111, "grad_norm": 0.539742648601532, "learning_rate": 8.797193327611439e-05, "loss": 0.0322, "step": 26420 }, { "epoch": 9.785264716771566, "grad_norm": 1.6107869148254395, "learning_rate": 8.7961174074178e-05, "loss": 0.0308, "step": 26430 }, { "epoch": 9.788967049241021, "grad_norm": 0.26957064867019653, "learning_rate": 8.795041072086468e-05, "loss": 0.0309, "step": 26440 }, { "epoch": 9.792669381710478, "grad_norm": 0.4108628034591675, "learning_rate": 8.79396432173515e-05, "loss": 0.0334, "step": 26450 }, { "epoch": 9.796371714179934, "grad_norm": 0.3612764775753021, "learning_rate": 8.792887156481598e-05, "loss": 0.0303, "step": 26460 }, { "epoch": 9.800074046649389, "grad_norm": 0.2072455883026123, "learning_rate": 8.79180957644361e-05, "loss": 0.0243, "step": 26470 }, { "epoch": 9.803776379118846, "grad_norm": 1.1120373010635376, "learning_rate": 8.790731581739026e-05, "loss": 0.0272, "step": 26480 }, { "epoch": 9.8074787115883, "grad_norm": 0.2659394145011902, "learning_rate": 8.789653172485737e-05, "loss": 0.0292, "step": 26490 }, { "epoch": 9.811181044057756, "grad_norm": 0.838779091835022, "learning_rate": 8.788574348801675e-05, "loss": 0.0419, "step": 26500 }, { "epoch": 9.814883376527213, "grad_norm": 0.18531928956508636, "learning_rate": 8.787495110804816e-05, "loss": 0.0265, "step": 26510 }, { "epoch": 9.818585708996668, "grad_norm": 0.29290714859962463, "learning_rate": 8.786415458613188e-05, "loss": 0.0224, "step": 26520 }, { "epoch": 9.822288041466123, "grad_norm": 0.2057393193244934, "learning_rate": 8.785335392344857e-05, "loss": 0.021, "step": 26530 }, { "epoch": 9.825990373935579, "grad_norm": 0.2877441942691803, "learning_rate": 8.78425491211794e-05, "loss": 0.0319, "step": 26540 }, { "epoch": 9.829692706405035, "grad_norm": 0.23449747264385223, "learning_rate": 8.783174018050594e-05, "loss": 0.028, "step": 26550 }, { "epoch": 9.83339503887449, "grad_norm": 0.3368900716304779, "learning_rate": 8.782092710261027e-05, "loss": 0.0323, "step": 26560 }, { "epoch": 9.837097371343948, "grad_norm": 0.1627149134874344, "learning_rate": 8.781010988867486e-05, "loss": 0.0184, "step": 26570 }, { "epoch": 9.840799703813403, "grad_norm": 0.70669025182724, "learning_rate": 8.779928853988268e-05, "loss": 0.0419, "step": 26580 }, { "epoch": 9.844502036282858, "grad_norm": 0.40507301688194275, "learning_rate": 8.778846305741715e-05, "loss": 0.0295, "step": 26590 }, { "epoch": 9.848204368752313, "grad_norm": 0.2968425154685974, "learning_rate": 8.77776334424621e-05, "loss": 0.0342, "step": 26600 }, { "epoch": 9.85190670122177, "grad_norm": 0.3312089145183563, "learning_rate": 8.776679969620185e-05, "loss": 0.0234, "step": 26610 }, { "epoch": 9.855609033691225, "grad_norm": 0.2657737135887146, "learning_rate": 8.775596181982118e-05, "loss": 0.0304, "step": 26620 }, { "epoch": 9.859311366160682, "grad_norm": 0.20146159827709198, "learning_rate": 8.774511981450529e-05, "loss": 0.0343, "step": 26630 }, { "epoch": 9.863013698630137, "grad_norm": 0.7853319644927979, "learning_rate": 8.773427368143983e-05, "loss": 0.0242, "step": 26640 }, { "epoch": 9.866716031099592, "grad_norm": 0.5276449918746948, "learning_rate": 8.772342342181095e-05, "loss": 0.023, "step": 26650 }, { "epoch": 9.870418363569048, "grad_norm": 0.8405433297157288, "learning_rate": 8.771256903680519e-05, "loss": 0.0324, "step": 26660 }, { "epoch": 9.874120696038505, "grad_norm": 0.22216564416885376, "learning_rate": 8.770171052760959e-05, "loss": 0.0424, "step": 26670 }, { "epoch": 9.87782302850796, "grad_norm": 0.35038796067237854, "learning_rate": 8.769084789541159e-05, "loss": 0.0394, "step": 26680 }, { "epoch": 9.881525360977415, "grad_norm": 0.26733455061912537, "learning_rate": 8.767998114139918e-05, "loss": 0.0381, "step": 26690 }, { "epoch": 9.885227693446872, "grad_norm": 0.22614552080631256, "learning_rate": 8.766911026676064e-05, "loss": 0.0231, "step": 26700 }, { "epoch": 9.888930025916327, "grad_norm": 0.21020051836967468, "learning_rate": 8.765823527268485e-05, "loss": 0.0275, "step": 26710 }, { "epoch": 9.892632358385782, "grad_norm": 1.014073371887207, "learning_rate": 8.764735616036106e-05, "loss": 0.0281, "step": 26720 }, { "epoch": 9.89633469085524, "grad_norm": 0.24010713398456573, "learning_rate": 8.7636472930979e-05, "loss": 0.0392, "step": 26730 }, { "epoch": 9.900037023324694, "grad_norm": 1.3095881938934326, "learning_rate": 8.762558558572886e-05, "loss": 0.039, "step": 26740 }, { "epoch": 9.90373935579415, "grad_norm": 0.15425895154476166, "learning_rate": 8.761469412580125e-05, "loss": 0.03, "step": 26750 }, { "epoch": 9.907441688263606, "grad_norm": 0.6127280592918396, "learning_rate": 8.760379855238723e-05, "loss": 0.0422, "step": 26760 }, { "epoch": 9.911144020733062, "grad_norm": 0.3493586480617523, "learning_rate": 8.759289886667834e-05, "loss": 0.0331, "step": 26770 }, { "epoch": 9.914846353202517, "grad_norm": 0.36804884672164917, "learning_rate": 8.758199506986655e-05, "loss": 0.0392, "step": 26780 }, { "epoch": 9.918548685671974, "grad_norm": 0.3334479033946991, "learning_rate": 8.757108716314429e-05, "loss": 0.0318, "step": 26790 }, { "epoch": 9.922251018141429, "grad_norm": 0.49711889028549194, "learning_rate": 8.756017514770443e-05, "loss": 0.0263, "step": 26800 }, { "epoch": 9.925953350610884, "grad_norm": 0.3521951735019684, "learning_rate": 8.754925902474027e-05, "loss": 0.0307, "step": 26810 }, { "epoch": 9.929655683080341, "grad_norm": 0.5384432077407837, "learning_rate": 8.753833879544561e-05, "loss": 0.0265, "step": 26820 }, { "epoch": 9.933358015549796, "grad_norm": 0.20933043956756592, "learning_rate": 8.752741446101464e-05, "loss": 0.0293, "step": 26830 }, { "epoch": 9.937060348019251, "grad_norm": 0.2987355887889862, "learning_rate": 8.751648602264206e-05, "loss": 0.024, "step": 26840 }, { "epoch": 9.940762680488708, "grad_norm": 0.23461414873600006, "learning_rate": 8.750555348152298e-05, "loss": 0.0389, "step": 26850 }, { "epoch": 9.944465012958164, "grad_norm": 0.19189175963401794, "learning_rate": 8.749461683885296e-05, "loss": 0.0346, "step": 26860 }, { "epoch": 9.948167345427619, "grad_norm": 0.3499331772327423, "learning_rate": 8.748367609582801e-05, "loss": 0.0308, "step": 26870 }, { "epoch": 9.951869677897076, "grad_norm": 0.20240545272827148, "learning_rate": 8.74727312536446e-05, "loss": 0.0289, "step": 26880 }, { "epoch": 9.95557201036653, "grad_norm": 0.25987353920936584, "learning_rate": 8.746178231349962e-05, "loss": 0.0262, "step": 26890 }, { "epoch": 9.959274342835986, "grad_norm": 0.19737260043621063, "learning_rate": 8.745082927659047e-05, "loss": 0.025, "step": 26900 }, { "epoch": 9.962976675305443, "grad_norm": 0.33272939920425415, "learning_rate": 8.743987214411493e-05, "loss": 0.0264, "step": 26910 }, { "epoch": 9.966679007774898, "grad_norm": 0.3399594724178314, "learning_rate": 8.742891091727125e-05, "loss": 0.0346, "step": 26920 }, { "epoch": 9.970381340244353, "grad_norm": 0.3464869558811188, "learning_rate": 8.741794559725818e-05, "loss": 0.0214, "step": 26930 }, { "epoch": 9.97408367271381, "grad_norm": 0.37642911076545715, "learning_rate": 8.740697618527481e-05, "loss": 0.0378, "step": 26940 }, { "epoch": 9.977786005183265, "grad_norm": 0.26006343960762024, "learning_rate": 8.739600268252078e-05, "loss": 0.0258, "step": 26950 }, { "epoch": 9.98148833765272, "grad_norm": 0.8955889344215393, "learning_rate": 8.73850250901961e-05, "loss": 0.0369, "step": 26960 }, { "epoch": 9.985190670122178, "grad_norm": 0.33052098751068115, "learning_rate": 8.737404340950129e-05, "loss": 0.0359, "step": 26970 }, { "epoch": 9.988893002591633, "grad_norm": 0.17568053305149078, "learning_rate": 8.73630576416373e-05, "loss": 0.0345, "step": 26980 }, { "epoch": 9.992595335061088, "grad_norm": 0.3145943582057953, "learning_rate": 8.735206778780549e-05, "loss": 0.0315, "step": 26990 }, { "epoch": 9.996297667530545, "grad_norm": 0.3274551331996918, "learning_rate": 8.73410738492077e-05, "loss": 0.0223, "step": 27000 }, { "epoch": 10.0, "grad_norm": 0.24786220490932465, "learning_rate": 8.733007582704623e-05, "loss": 0.034, "step": 27010 }, { "epoch": 10.003702332469455, "grad_norm": 0.45796892046928406, "learning_rate": 8.731907372252377e-05, "loss": 0.0359, "step": 27020 }, { "epoch": 10.007404664938912, "grad_norm": 0.21553292870521545, "learning_rate": 8.730806753684353e-05, "loss": 0.0326, "step": 27030 }, { "epoch": 10.011106997408367, "grad_norm": 0.26177865266799927, "learning_rate": 8.729705727120911e-05, "loss": 0.0366, "step": 27040 }, { "epoch": 10.014809329877822, "grad_norm": 0.28030848503112793, "learning_rate": 8.728604292682459e-05, "loss": 0.0302, "step": 27050 }, { "epoch": 10.01851166234728, "grad_norm": 0.5511851906776428, "learning_rate": 8.727502450489446e-05, "loss": 0.0246, "step": 27060 }, { "epoch": 10.022213994816735, "grad_norm": 0.19787460565567017, "learning_rate": 8.726400200662372e-05, "loss": 0.0232, "step": 27070 }, { "epoch": 10.02591632728619, "grad_norm": 0.37942367792129517, "learning_rate": 8.725297543321772e-05, "loss": 0.0444, "step": 27080 }, { "epoch": 10.029618659755647, "grad_norm": 0.5463720560073853, "learning_rate": 8.724194478588234e-05, "loss": 0.0314, "step": 27090 }, { "epoch": 10.033320992225102, "grad_norm": 0.5405006408691406, "learning_rate": 8.723091006582389e-05, "loss": 0.0346, "step": 27100 }, { "epoch": 10.037023324694557, "grad_norm": 0.23694142699241638, "learning_rate": 8.721987127424907e-05, "loss": 0.0357, "step": 27110 }, { "epoch": 10.040725657164014, "grad_norm": 0.43135175108909607, "learning_rate": 8.720882841236508e-05, "loss": 0.0381, "step": 27120 }, { "epoch": 10.04442798963347, "grad_norm": 0.19435857236385345, "learning_rate": 8.719778148137958e-05, "loss": 0.0145, "step": 27130 }, { "epoch": 10.048130322102924, "grad_norm": 0.7035146951675415, "learning_rate": 8.718673048250061e-05, "loss": 0.029, "step": 27140 }, { "epoch": 10.051832654572381, "grad_norm": 0.8847848773002625, "learning_rate": 8.717567541693673e-05, "loss": 0.031, "step": 27150 }, { "epoch": 10.055534987041836, "grad_norm": 0.23475274443626404, "learning_rate": 8.716461628589683e-05, "loss": 0.0246, "step": 27160 }, { "epoch": 10.059237319511292, "grad_norm": 0.3704792857170105, "learning_rate": 8.715355309059041e-05, "loss": 0.0355, "step": 27170 }, { "epoch": 10.062939651980749, "grad_norm": 0.37765923142433167, "learning_rate": 8.714248583222726e-05, "loss": 0.0234, "step": 27180 }, { "epoch": 10.066641984450204, "grad_norm": 0.4658762514591217, "learning_rate": 8.713141451201772e-05, "loss": 0.0261, "step": 27190 }, { "epoch": 10.070344316919659, "grad_norm": 0.26835834980010986, "learning_rate": 8.71203391311725e-05, "loss": 0.0351, "step": 27200 }, { "epoch": 10.074046649389116, "grad_norm": 0.16394121944904327, "learning_rate": 8.710925969090282e-05, "loss": 0.0249, "step": 27210 }, { "epoch": 10.077748981858571, "grad_norm": 0.21175748109817505, "learning_rate": 8.709817619242027e-05, "loss": 0.0354, "step": 27220 }, { "epoch": 10.081451314328026, "grad_norm": 0.329803466796875, "learning_rate": 8.708708863693697e-05, "loss": 0.0343, "step": 27230 }, { "epoch": 10.085153646797483, "grad_norm": 0.33937498927116394, "learning_rate": 8.70759970256654e-05, "loss": 0.0273, "step": 27240 }, { "epoch": 10.088855979266938, "grad_norm": 0.1801651567220688, "learning_rate": 8.706490135981855e-05, "loss": 0.0231, "step": 27250 }, { "epoch": 10.092558311736394, "grad_norm": 0.17591862380504608, "learning_rate": 8.705380164060982e-05, "loss": 0.0222, "step": 27260 }, { "epoch": 10.09626064420585, "grad_norm": 0.3600732088088989, "learning_rate": 8.704269786925302e-05, "loss": 0.0373, "step": 27270 }, { "epoch": 10.099962976675306, "grad_norm": 0.24690213799476624, "learning_rate": 8.70315900469625e-05, "loss": 0.0353, "step": 27280 }, { "epoch": 10.10366530914476, "grad_norm": 0.7395848035812378, "learning_rate": 8.702047817495295e-05, "loss": 0.0293, "step": 27290 }, { "epoch": 10.107367641614218, "grad_norm": 0.18979910016059875, "learning_rate": 8.700936225443959e-05, "loss": 0.0301, "step": 27300 }, { "epoch": 10.111069974083673, "grad_norm": 0.1700434535741806, "learning_rate": 8.6998242286638e-05, "loss": 0.0338, "step": 27310 }, { "epoch": 10.114772306553128, "grad_norm": 1.055220365524292, "learning_rate": 8.698711827276426e-05, "loss": 0.0414, "step": 27320 }, { "epoch": 10.118474639022585, "grad_norm": 0.19691723585128784, "learning_rate": 8.697599021403489e-05, "loss": 0.0237, "step": 27330 }, { "epoch": 10.12217697149204, "grad_norm": 0.2782294750213623, "learning_rate": 8.696485811166681e-05, "loss": 0.0276, "step": 27340 }, { "epoch": 10.125879303961495, "grad_norm": 0.3926069736480713, "learning_rate": 8.695372196687743e-05, "loss": 0.0268, "step": 27350 }, { "epoch": 10.129581636430952, "grad_norm": 0.2463107407093048, "learning_rate": 8.694258178088457e-05, "loss": 0.033, "step": 27360 }, { "epoch": 10.133283968900408, "grad_norm": 0.4179683327674866, "learning_rate": 8.693143755490652e-05, "loss": 0.04, "step": 27370 }, { "epoch": 10.136986301369863, "grad_norm": 0.3466982841491699, "learning_rate": 8.692028929016196e-05, "loss": 0.0287, "step": 27380 }, { "epoch": 10.14068863383932, "grad_norm": 0.41066232323646545, "learning_rate": 8.69091369878701e-05, "loss": 0.027, "step": 27390 }, { "epoch": 10.144390966308775, "grad_norm": 0.5180931091308594, "learning_rate": 8.689798064925049e-05, "loss": 0.0318, "step": 27400 }, { "epoch": 10.14809329877823, "grad_norm": 0.1951165795326233, "learning_rate": 8.68868202755232e-05, "loss": 0.0337, "step": 27410 }, { "epoch": 10.151795631247687, "grad_norm": 0.4046449363231659, "learning_rate": 8.68756558679087e-05, "loss": 0.0312, "step": 27420 }, { "epoch": 10.155497963717142, "grad_norm": 0.32063964009284973, "learning_rate": 8.686448742762791e-05, "loss": 0.0331, "step": 27430 }, { "epoch": 10.159200296186597, "grad_norm": 0.29435840249061584, "learning_rate": 8.685331495590221e-05, "loss": 0.0221, "step": 27440 }, { "epoch": 10.162902628656052, "grad_norm": 0.27984583377838135, "learning_rate": 8.684213845395339e-05, "loss": 0.0424, "step": 27450 }, { "epoch": 10.16660496112551, "grad_norm": 0.45993298292160034, "learning_rate": 8.68309579230037e-05, "loss": 0.0319, "step": 27460 }, { "epoch": 10.170307293594965, "grad_norm": 0.376241534948349, "learning_rate": 8.681977336427584e-05, "loss": 0.0273, "step": 27470 }, { "epoch": 10.17400962606442, "grad_norm": 0.27035418152809143, "learning_rate": 8.680858477899292e-05, "loss": 0.0336, "step": 27480 }, { "epoch": 10.177711958533877, "grad_norm": 0.17410880327224731, "learning_rate": 8.679739216837849e-05, "loss": 0.0332, "step": 27490 }, { "epoch": 10.181414291003332, "grad_norm": 0.23496609926223755, "learning_rate": 8.678619553365659e-05, "loss": 0.0221, "step": 27500 }, { "epoch": 10.185116623472787, "grad_norm": 0.46520209312438965, "learning_rate": 8.677499487605165e-05, "loss": 0.0241, "step": 27510 }, { "epoch": 10.188818955942244, "grad_norm": 0.6499049663543701, "learning_rate": 8.676379019678855e-05, "loss": 0.0218, "step": 27520 }, { "epoch": 10.1925212884117, "grad_norm": 0.3072949945926666, "learning_rate": 8.675258149709265e-05, "loss": 0.0311, "step": 27530 }, { "epoch": 10.196223620881154, "grad_norm": 0.16976842284202576, "learning_rate": 8.674136877818968e-05, "loss": 0.0187, "step": 27540 }, { "epoch": 10.199925953350611, "grad_norm": 0.29261869192123413, "learning_rate": 8.673015204130586e-05, "loss": 0.0285, "step": 27550 }, { "epoch": 10.203628285820066, "grad_norm": 0.39810240268707275, "learning_rate": 8.671893128766784e-05, "loss": 0.0366, "step": 27560 }, { "epoch": 10.207330618289522, "grad_norm": 0.22741961479187012, "learning_rate": 8.67077065185027e-05, "loss": 0.0253, "step": 27570 }, { "epoch": 10.211032950758979, "grad_norm": 0.20650476217269897, "learning_rate": 8.669647773503797e-05, "loss": 0.0363, "step": 27580 }, { "epoch": 10.214735283228434, "grad_norm": 0.5379213094711304, "learning_rate": 8.66852449385016e-05, "loss": 0.0335, "step": 27590 }, { "epoch": 10.218437615697889, "grad_norm": 0.5013992190361023, "learning_rate": 8.6674008130122e-05, "loss": 0.0282, "step": 27600 }, { "epoch": 10.222139948167346, "grad_norm": 0.2118614912033081, "learning_rate": 8.666276731112801e-05, "loss": 0.0276, "step": 27610 }, { "epoch": 10.225842280636801, "grad_norm": 0.24392478168010712, "learning_rate": 8.66515224827489e-05, "loss": 0.0266, "step": 27620 }, { "epoch": 10.229544613106256, "grad_norm": 0.21287083625793457, "learning_rate": 8.664027364621441e-05, "loss": 0.0221, "step": 27630 }, { "epoch": 10.233246945575713, "grad_norm": 0.3059949576854706, "learning_rate": 8.662902080275467e-05, "loss": 0.0258, "step": 27640 }, { "epoch": 10.236949278045168, "grad_norm": 0.3861425220966339, "learning_rate": 8.661776395360029e-05, "loss": 0.0304, "step": 27650 }, { "epoch": 10.240651610514623, "grad_norm": 0.28494706749916077, "learning_rate": 8.66065030999823e-05, "loss": 0.0228, "step": 27660 }, { "epoch": 10.24435394298408, "grad_norm": 0.5477967262268066, "learning_rate": 8.659523824313218e-05, "loss": 0.0244, "step": 27670 }, { "epoch": 10.248056275453536, "grad_norm": 0.2303297519683838, "learning_rate": 8.658396938428181e-05, "loss": 0.0362, "step": 27680 }, { "epoch": 10.25175860792299, "grad_norm": 0.4215902090072632, "learning_rate": 8.657269652466356e-05, "loss": 0.0257, "step": 27690 }, { "epoch": 10.255460940392448, "grad_norm": 0.14683964848518372, "learning_rate": 8.656141966551019e-05, "loss": 0.026, "step": 27700 }, { "epoch": 10.259163272861903, "grad_norm": 0.22769148647785187, "learning_rate": 8.655013880805495e-05, "loss": 0.0297, "step": 27710 }, { "epoch": 10.262865605331358, "grad_norm": 0.2170133888721466, "learning_rate": 8.653885395353147e-05, "loss": 0.0369, "step": 27720 }, { "epoch": 10.266567937800815, "grad_norm": 0.20541144907474518, "learning_rate": 8.652756510317387e-05, "loss": 0.0296, "step": 27730 }, { "epoch": 10.27027027027027, "grad_norm": 0.20184899866580963, "learning_rate": 8.651627225821666e-05, "loss": 0.0205, "step": 27740 }, { "epoch": 10.273972602739725, "grad_norm": 0.32008740305900574, "learning_rate": 8.650497541989482e-05, "loss": 0.023, "step": 27750 }, { "epoch": 10.277674935209182, "grad_norm": 0.14447979629039764, "learning_rate": 8.649367458944375e-05, "loss": 0.0174, "step": 27760 }, { "epoch": 10.281377267678637, "grad_norm": 0.16253961622714996, "learning_rate": 8.64823697680993e-05, "loss": 0.0205, "step": 27770 }, { "epoch": 10.285079600148093, "grad_norm": 1.1346057653427124, "learning_rate": 8.647106095709773e-05, "loss": 0.0167, "step": 27780 }, { "epoch": 10.28878193261755, "grad_norm": 0.21193933486938477, "learning_rate": 8.645974815767577e-05, "loss": 0.0179, "step": 27790 }, { "epoch": 10.292484265087005, "grad_norm": 0.43070659041404724, "learning_rate": 8.644843137107059e-05, "loss": 0.0522, "step": 27800 }, { "epoch": 10.29618659755646, "grad_norm": 0.32053086161613464, "learning_rate": 8.643711059851974e-05, "loss": 0.0304, "step": 27810 }, { "epoch": 10.299888930025917, "grad_norm": 0.34435606002807617, "learning_rate": 8.642578584126125e-05, "loss": 0.0244, "step": 27820 }, { "epoch": 10.303591262495372, "grad_norm": 0.37791118025779724, "learning_rate": 8.64144571005336e-05, "loss": 0.0286, "step": 27830 }, { "epoch": 10.307293594964827, "grad_norm": 0.3275986611843109, "learning_rate": 8.640312437757565e-05, "loss": 0.0295, "step": 27840 }, { "epoch": 10.310995927434284, "grad_norm": 0.929419755935669, "learning_rate": 8.639178767362676e-05, "loss": 0.0359, "step": 27850 }, { "epoch": 10.31469825990374, "grad_norm": 0.6716088652610779, "learning_rate": 8.638044698992669e-05, "loss": 0.031, "step": 27860 }, { "epoch": 10.318400592373195, "grad_norm": 0.2622787356376648, "learning_rate": 8.636910232771562e-05, "loss": 0.0355, "step": 27870 }, { "epoch": 10.322102924842651, "grad_norm": 0.2893676161766052, "learning_rate": 8.63577536882342e-05, "loss": 0.0294, "step": 27880 }, { "epoch": 10.325805257312107, "grad_norm": 0.2579231858253479, "learning_rate": 8.634640107272351e-05, "loss": 0.0346, "step": 27890 }, { "epoch": 10.329507589781562, "grad_norm": 0.20612521469593048, "learning_rate": 8.633504448242505e-05, "loss": 0.0394, "step": 27900 }, { "epoch": 10.333209922251019, "grad_norm": 0.23495639860630035, "learning_rate": 8.632368391858074e-05, "loss": 0.04, "step": 27910 }, { "epoch": 10.336912254720474, "grad_norm": 0.157579705119133, "learning_rate": 8.631231938243299e-05, "loss": 0.0324, "step": 27920 }, { "epoch": 10.340614587189929, "grad_norm": 0.2059934139251709, "learning_rate": 8.630095087522457e-05, "loss": 0.0293, "step": 27930 }, { "epoch": 10.344316919659386, "grad_norm": 0.38455453515052795, "learning_rate": 8.628957839819875e-05, "loss": 0.0267, "step": 27940 }, { "epoch": 10.348019252128841, "grad_norm": 0.35254818201065063, "learning_rate": 8.627820195259918e-05, "loss": 0.0299, "step": 27950 }, { "epoch": 10.351721584598296, "grad_norm": 0.1405503898859024, "learning_rate": 8.626682153967001e-05, "loss": 0.0228, "step": 27960 }, { "epoch": 10.355423917067753, "grad_norm": 0.30765700340270996, "learning_rate": 8.625543716065575e-05, "loss": 0.0382, "step": 27970 }, { "epoch": 10.359126249537209, "grad_norm": 0.15358351171016693, "learning_rate": 8.624404881680139e-05, "loss": 0.0256, "step": 27980 }, { "epoch": 10.362828582006664, "grad_norm": 0.5298340320587158, "learning_rate": 8.623265650935234e-05, "loss": 0.0326, "step": 27990 }, { "epoch": 10.36653091447612, "grad_norm": 0.276313841342926, "learning_rate": 8.622126023955446e-05, "loss": 0.0272, "step": 28000 }, { "epoch": 10.370233246945576, "grad_norm": 1.1474666595458984, "learning_rate": 8.620986000865401e-05, "loss": 0.0338, "step": 28010 }, { "epoch": 10.373935579415031, "grad_norm": 0.4653327167034149, "learning_rate": 8.61984558178977e-05, "loss": 0.0287, "step": 28020 }, { "epoch": 10.377637911884488, "grad_norm": 0.27686211466789246, "learning_rate": 8.61870476685327e-05, "loss": 0.0338, "step": 28030 }, { "epoch": 10.381340244353943, "grad_norm": 0.26411938667297363, "learning_rate": 8.617563556180657e-05, "loss": 0.0372, "step": 28040 }, { "epoch": 10.385042576823398, "grad_norm": 0.38735732436180115, "learning_rate": 8.616421949896734e-05, "loss": 0.0346, "step": 28050 }, { "epoch": 10.388744909292855, "grad_norm": 0.5837514996528625, "learning_rate": 8.615279948126343e-05, "loss": 0.0247, "step": 28060 }, { "epoch": 10.39244724176231, "grad_norm": 0.38135480880737305, "learning_rate": 8.61413755099437e-05, "loss": 0.0272, "step": 28070 }, { "epoch": 10.396149574231766, "grad_norm": 0.26121827960014343, "learning_rate": 8.61299475862575e-05, "loss": 0.0301, "step": 28080 }, { "epoch": 10.399851906701223, "grad_norm": 0.17857395112514496, "learning_rate": 8.611851571145456e-05, "loss": 0.0423, "step": 28090 }, { "epoch": 10.403554239170678, "grad_norm": 0.19356417655944824, "learning_rate": 8.610707988678503e-05, "loss": 0.0377, "step": 28100 }, { "epoch": 10.407256571640133, "grad_norm": 0.16035640239715576, "learning_rate": 8.609564011349953e-05, "loss": 0.0232, "step": 28110 }, { "epoch": 10.41095890410959, "grad_norm": 0.27276334166526794, "learning_rate": 8.60841963928491e-05, "loss": 0.0226, "step": 28120 }, { "epoch": 10.414661236579045, "grad_norm": 0.32047736644744873, "learning_rate": 8.607274872608521e-05, "loss": 0.0243, "step": 28130 }, { "epoch": 10.4183635690485, "grad_norm": 0.18440087139606476, "learning_rate": 8.606129711445976e-05, "loss": 0.0237, "step": 28140 }, { "epoch": 10.422065901517957, "grad_norm": 0.1933998465538025, "learning_rate": 8.604984155922506e-05, "loss": 0.0265, "step": 28150 }, { "epoch": 10.425768233987412, "grad_norm": 0.27275097370147705, "learning_rate": 8.603838206163391e-05, "loss": 0.0331, "step": 28160 }, { "epoch": 10.429470566456867, "grad_norm": 0.48218807578086853, "learning_rate": 8.602691862293945e-05, "loss": 0.0277, "step": 28170 }, { "epoch": 10.433172898926324, "grad_norm": 0.9424244165420532, "learning_rate": 8.601545124439535e-05, "loss": 0.0319, "step": 28180 }, { "epoch": 10.43687523139578, "grad_norm": 0.20708782970905304, "learning_rate": 8.600397992725566e-05, "loss": 0.0222, "step": 28190 }, { "epoch": 10.440577563865235, "grad_norm": 0.5346727967262268, "learning_rate": 8.599250467277483e-05, "loss": 0.0304, "step": 28200 }, { "epoch": 10.444279896334692, "grad_norm": 0.24533125758171082, "learning_rate": 8.598102548220782e-05, "loss": 0.0314, "step": 28210 }, { "epoch": 10.447982228804147, "grad_norm": 0.30344298481941223, "learning_rate": 8.596954235680996e-05, "loss": 0.0265, "step": 28220 }, { "epoch": 10.451684561273602, "grad_norm": 0.5883030295372009, "learning_rate": 8.595805529783702e-05, "loss": 0.0261, "step": 28230 }, { "epoch": 10.455386893743059, "grad_norm": 0.2060461938381195, "learning_rate": 8.594656430654522e-05, "loss": 0.0398, "step": 28240 }, { "epoch": 10.459089226212514, "grad_norm": 0.2655350863933563, "learning_rate": 8.59350693841912e-05, "loss": 0.0255, "step": 28250 }, { "epoch": 10.46279155868197, "grad_norm": 0.8654329180717468, "learning_rate": 8.592357053203202e-05, "loss": 0.0308, "step": 28260 }, { "epoch": 10.466493891151426, "grad_norm": 0.42008501291275024, "learning_rate": 8.591206775132517e-05, "loss": 0.0312, "step": 28270 }, { "epoch": 10.470196223620881, "grad_norm": 0.4148138761520386, "learning_rate": 8.590056104332858e-05, "loss": 0.0282, "step": 28280 }, { "epoch": 10.473898556090337, "grad_norm": 0.493208646774292, "learning_rate": 8.588905040930061e-05, "loss": 0.031, "step": 28290 }, { "epoch": 10.477600888559792, "grad_norm": 0.2510531544685364, "learning_rate": 8.587753585050004e-05, "loss": 0.0204, "step": 28300 }, { "epoch": 10.481303221029249, "grad_norm": 0.3129069209098816, "learning_rate": 8.586601736818611e-05, "loss": 0.0222, "step": 28310 }, { "epoch": 10.485005553498704, "grad_norm": 1.0415507555007935, "learning_rate": 8.585449496361843e-05, "loss": 0.0396, "step": 28320 }, { "epoch": 10.48870788596816, "grad_norm": 0.6141084432601929, "learning_rate": 8.584296863805709e-05, "loss": 0.0285, "step": 28330 }, { "epoch": 10.492410218437616, "grad_norm": 0.392575740814209, "learning_rate": 8.583143839276259e-05, "loss": 0.0274, "step": 28340 }, { "epoch": 10.496112550907071, "grad_norm": 0.24887505173683167, "learning_rate": 8.581990422899585e-05, "loss": 0.0218, "step": 28350 }, { "epoch": 10.499814883376526, "grad_norm": 0.2662474513053894, "learning_rate": 8.580836614801827e-05, "loss": 0.048, "step": 28360 }, { "epoch": 10.503517215845983, "grad_norm": 0.2722242474555969, "learning_rate": 8.579682415109156e-05, "loss": 0.0311, "step": 28370 }, { "epoch": 10.507219548315438, "grad_norm": 0.2854287922382355, "learning_rate": 8.5785278239478e-05, "loss": 0.0379, "step": 28380 }, { "epoch": 10.510921880784895, "grad_norm": 0.21032610535621643, "learning_rate": 8.577372841444022e-05, "loss": 0.0304, "step": 28390 }, { "epoch": 10.51462421325435, "grad_norm": 0.3548720180988312, "learning_rate": 8.576217467724128e-05, "loss": 0.0324, "step": 28400 }, { "epoch": 10.518326545723806, "grad_norm": 1.1200134754180908, "learning_rate": 8.575061702914468e-05, "loss": 0.0231, "step": 28410 }, { "epoch": 10.522028878193261, "grad_norm": 0.2425791472196579, "learning_rate": 8.573905547141437e-05, "loss": 0.0323, "step": 28420 }, { "epoch": 10.525731210662718, "grad_norm": 0.1790415197610855, "learning_rate": 8.572749000531469e-05, "loss": 0.0374, "step": 28430 }, { "epoch": 10.529433543132173, "grad_norm": 0.35140126943588257, "learning_rate": 8.571592063211038e-05, "loss": 0.0286, "step": 28440 }, { "epoch": 10.533135875601628, "grad_norm": 0.5933624505996704, "learning_rate": 8.570434735306671e-05, "loss": 0.0218, "step": 28450 }, { "epoch": 10.536838208071085, "grad_norm": 0.3296497166156769, "learning_rate": 8.56927701694493e-05, "loss": 0.0303, "step": 28460 }, { "epoch": 10.54054054054054, "grad_norm": 0.10815049707889557, "learning_rate": 8.568118908252421e-05, "loss": 0.0346, "step": 28470 }, { "epoch": 10.544242873009996, "grad_norm": 0.5005599856376648, "learning_rate": 8.566960409355791e-05, "loss": 0.0352, "step": 28480 }, { "epoch": 10.547945205479452, "grad_norm": 0.37193769216537476, "learning_rate": 8.565801520381736e-05, "loss": 0.0314, "step": 28490 }, { "epoch": 10.551647537948908, "grad_norm": 0.41920268535614014, "learning_rate": 8.564642241456986e-05, "loss": 0.029, "step": 28500 }, { "epoch": 10.555349870418363, "grad_norm": 0.4338608980178833, "learning_rate": 8.56348257270832e-05, "loss": 0.039, "step": 28510 }, { "epoch": 10.55905220288782, "grad_norm": 0.27209919691085815, "learning_rate": 8.562322514262557e-05, "loss": 0.0226, "step": 28520 }, { "epoch": 10.562754535357275, "grad_norm": 0.4456934928894043, "learning_rate": 8.561162066246561e-05, "loss": 0.0354, "step": 28530 }, { "epoch": 10.56645686782673, "grad_norm": 0.23947136104106903, "learning_rate": 8.560001228787236e-05, "loss": 0.0293, "step": 28540 }, { "epoch": 10.570159200296187, "grad_norm": 0.3426128327846527, "learning_rate": 8.558840002011528e-05, "loss": 0.0385, "step": 28550 }, { "epoch": 10.573861532765642, "grad_norm": 0.5553749203681946, "learning_rate": 8.557678386046428e-05, "loss": 0.0284, "step": 28560 }, { "epoch": 10.577563865235097, "grad_norm": 0.30822524428367615, "learning_rate": 8.556516381018968e-05, "loss": 0.025, "step": 28570 }, { "epoch": 10.581266197704554, "grad_norm": 0.2567939758300781, "learning_rate": 8.555353987056224e-05, "loss": 0.0235, "step": 28580 }, { "epoch": 10.58496853017401, "grad_norm": 0.1905071884393692, "learning_rate": 8.554191204285313e-05, "loss": 0.0248, "step": 28590 }, { "epoch": 10.588670862643465, "grad_norm": 0.1955389529466629, "learning_rate": 8.553028032833397e-05, "loss": 0.0412, "step": 28600 }, { "epoch": 10.592373195112922, "grad_norm": 0.20961248874664307, "learning_rate": 8.551864472827676e-05, "loss": 0.0184, "step": 28610 }, { "epoch": 10.596075527582377, "grad_norm": 0.2143004685640335, "learning_rate": 8.550700524395397e-05, "loss": 0.038, "step": 28620 }, { "epoch": 10.599777860051832, "grad_norm": 0.24427233636379242, "learning_rate": 8.549536187663847e-05, "loss": 0.0227, "step": 28630 }, { "epoch": 10.603480192521289, "grad_norm": 0.542316198348999, "learning_rate": 8.548371462760356e-05, "loss": 0.0306, "step": 28640 }, { "epoch": 10.607182524990744, "grad_norm": 0.34835344552993774, "learning_rate": 8.547206349812298e-05, "loss": 0.0261, "step": 28650 }, { "epoch": 10.6108848574602, "grad_norm": 0.6338566541671753, "learning_rate": 8.546040848947086e-05, "loss": 0.025, "step": 28660 }, { "epoch": 10.614587189929656, "grad_norm": 0.3036603033542633, "learning_rate": 8.544874960292177e-05, "loss": 0.0215, "step": 28670 }, { "epoch": 10.618289522399111, "grad_norm": 0.3447099030017853, "learning_rate": 8.543708683975071e-05, "loss": 0.036, "step": 28680 }, { "epoch": 10.621991854868567, "grad_norm": 0.24247416853904724, "learning_rate": 8.542542020123315e-05, "loss": 0.0278, "step": 28690 }, { "epoch": 10.625694187338024, "grad_norm": 0.5287677049636841, "learning_rate": 8.541374968864487e-05, "loss": 0.0419, "step": 28700 }, { "epoch": 10.629396519807479, "grad_norm": 0.2279222458600998, "learning_rate": 8.540207530326217e-05, "loss": 0.0282, "step": 28710 }, { "epoch": 10.633098852276934, "grad_norm": 1.2603188753128052, "learning_rate": 8.539039704636175e-05, "loss": 0.038, "step": 28720 }, { "epoch": 10.63680118474639, "grad_norm": 0.2084239274263382, "learning_rate": 8.537871491922071e-05, "loss": 0.0259, "step": 28730 }, { "epoch": 10.640503517215846, "grad_norm": 0.8985902667045593, "learning_rate": 8.53670289231166e-05, "loss": 0.0301, "step": 28740 }, { "epoch": 10.644205849685301, "grad_norm": 0.7058355808258057, "learning_rate": 8.535533905932738e-05, "loss": 0.0346, "step": 28750 }, { "epoch": 10.647908182154758, "grad_norm": 0.28405264019966125, "learning_rate": 8.534364532913144e-05, "loss": 0.0414, "step": 28760 }, { "epoch": 10.651610514624213, "grad_norm": 0.46831753849983215, "learning_rate": 8.533194773380758e-05, "loss": 0.0315, "step": 28770 }, { "epoch": 10.655312847093668, "grad_norm": 0.4085065722465515, "learning_rate": 8.532024627463505e-05, "loss": 0.0229, "step": 28780 }, { "epoch": 10.659015179563125, "grad_norm": 0.26143139600753784, "learning_rate": 8.530854095289347e-05, "loss": 0.0151, "step": 28790 }, { "epoch": 10.66271751203258, "grad_norm": 0.2922780215740204, "learning_rate": 8.529683176986295e-05, "loss": 0.0295, "step": 28800 }, { "epoch": 10.666419844502036, "grad_norm": 0.8892046213150024, "learning_rate": 8.528511872682398e-05, "loss": 0.0256, "step": 28810 }, { "epoch": 10.670122176971493, "grad_norm": 0.33060383796691895, "learning_rate": 8.527340182505746e-05, "loss": 0.0353, "step": 28820 }, { "epoch": 10.673824509440948, "grad_norm": 0.24505303800106049, "learning_rate": 8.526168106584476e-05, "loss": 0.0343, "step": 28830 }, { "epoch": 10.677526841910403, "grad_norm": 0.16276372969150543, "learning_rate": 8.524995645046762e-05, "loss": 0.0274, "step": 28840 }, { "epoch": 10.68122917437986, "grad_norm": 0.3391731083393097, "learning_rate": 8.523822798020827e-05, "loss": 0.0257, "step": 28850 }, { "epoch": 10.684931506849315, "grad_norm": 0.42327964305877686, "learning_rate": 8.522649565634927e-05, "loss": 0.025, "step": 28860 }, { "epoch": 10.68863383931877, "grad_norm": 0.23214368522167206, "learning_rate": 8.521475948017366e-05, "loss": 0.0304, "step": 28870 }, { "epoch": 10.692336171788227, "grad_norm": 0.14105428755283356, "learning_rate": 8.520301945296492e-05, "loss": 0.0297, "step": 28880 }, { "epoch": 10.696038504257682, "grad_norm": 0.32742634415626526, "learning_rate": 8.519127557600688e-05, "loss": 0.0228, "step": 28890 }, { "epoch": 10.699740836727138, "grad_norm": 0.5821499824523926, "learning_rate": 8.517952785058385e-05, "loss": 0.0406, "step": 28900 }, { "epoch": 10.703443169196595, "grad_norm": 0.39257803559303284, "learning_rate": 8.516777627798056e-05, "loss": 0.0178, "step": 28910 }, { "epoch": 10.70714550166605, "grad_norm": 0.33539050817489624, "learning_rate": 8.51560208594821e-05, "loss": 0.0224, "step": 28920 }, { "epoch": 10.710847834135505, "grad_norm": 0.19200286269187927, "learning_rate": 8.51442615963741e-05, "loss": 0.0256, "step": 28930 }, { "epoch": 10.714550166604962, "grad_norm": 0.30583709478378296, "learning_rate": 8.513249848994246e-05, "loss": 0.03, "step": 28940 }, { "epoch": 10.718252499074417, "grad_norm": 0.37575405836105347, "learning_rate": 8.512073154147362e-05, "loss": 0.0268, "step": 28950 }, { "epoch": 10.721954831543872, "grad_norm": 0.3106543719768524, "learning_rate": 8.510896075225438e-05, "loss": 0.0355, "step": 28960 }, { "epoch": 10.72565716401333, "grad_norm": 0.17579995095729828, "learning_rate": 8.509718612357197e-05, "loss": 0.0269, "step": 28970 }, { "epoch": 10.729359496482784, "grad_norm": 1.4117323160171509, "learning_rate": 8.508540765671407e-05, "loss": 0.0422, "step": 28980 }, { "epoch": 10.73306182895224, "grad_norm": 0.1709749847650528, "learning_rate": 8.507362535296871e-05, "loss": 0.0283, "step": 28990 }, { "epoch": 10.736764161421696, "grad_norm": 0.20813776552677155, "learning_rate": 8.506183921362443e-05, "loss": 0.0337, "step": 29000 }, { "epoch": 10.740466493891152, "grad_norm": 0.24693073332309723, "learning_rate": 8.505004923997013e-05, "loss": 0.0342, "step": 29010 }, { "epoch": 10.744168826360607, "grad_norm": 0.215192049741745, "learning_rate": 8.503825543329516e-05, "loss": 0.0377, "step": 29020 }, { "epoch": 10.747871158830064, "grad_norm": 0.39415010809898376, "learning_rate": 8.502645779488922e-05, "loss": 0.0223, "step": 29030 }, { "epoch": 10.751573491299519, "grad_norm": 0.1856951117515564, "learning_rate": 8.501465632604254e-05, "loss": 0.022, "step": 29040 }, { "epoch": 10.755275823768974, "grad_norm": 0.17251311242580414, "learning_rate": 8.500285102804568e-05, "loss": 0.0352, "step": 29050 }, { "epoch": 10.758978156238431, "grad_norm": 0.5094027519226074, "learning_rate": 8.499104190218964e-05, "loss": 0.031, "step": 29060 }, { "epoch": 10.762680488707886, "grad_norm": 0.22512230277061462, "learning_rate": 8.49792289497659e-05, "loss": 0.0234, "step": 29070 }, { "epoch": 10.766382821177341, "grad_norm": 0.26606813073158264, "learning_rate": 8.496741217206625e-05, "loss": 0.0292, "step": 29080 }, { "epoch": 10.770085153646798, "grad_norm": 0.4519916772842407, "learning_rate": 8.495559157038299e-05, "loss": 0.0298, "step": 29090 }, { "epoch": 10.773787486116253, "grad_norm": 0.1998250037431717, "learning_rate": 8.494376714600878e-05, "loss": 0.0309, "step": 29100 }, { "epoch": 10.777489818585709, "grad_norm": 0.3180837333202362, "learning_rate": 8.493193890023674e-05, "loss": 0.0266, "step": 29110 }, { "epoch": 10.781192151055166, "grad_norm": 0.13337095081806183, "learning_rate": 8.492010683436038e-05, "loss": 0.0214, "step": 29120 }, { "epoch": 10.78489448352462, "grad_norm": 0.5832461714744568, "learning_rate": 8.490827094967363e-05, "loss": 0.0339, "step": 29130 }, { "epoch": 10.788596815994076, "grad_norm": 0.2911337614059448, "learning_rate": 8.489643124747086e-05, "loss": 0.03, "step": 29140 }, { "epoch": 10.792299148463531, "grad_norm": 0.14615468680858612, "learning_rate": 8.488458772904684e-05, "loss": 0.0399, "step": 29150 }, { "epoch": 10.796001480932988, "grad_norm": 0.23583769798278809, "learning_rate": 8.487274039569675e-05, "loss": 0.0332, "step": 29160 }, { "epoch": 10.799703813402443, "grad_norm": 1.0312553644180298, "learning_rate": 8.48608892487162e-05, "loss": 0.0584, "step": 29170 }, { "epoch": 10.8034061458719, "grad_norm": 0.19187329709529877, "learning_rate": 8.484903428940121e-05, "loss": 0.0343, "step": 29180 }, { "epoch": 10.807108478341355, "grad_norm": 0.2964034676551819, "learning_rate": 8.483717551904823e-05, "loss": 0.0285, "step": 29190 }, { "epoch": 10.81081081081081, "grad_norm": 0.18932576477527618, "learning_rate": 8.482531293895412e-05, "loss": 0.0303, "step": 29200 }, { "epoch": 10.814513143280266, "grad_norm": 0.42880117893218994, "learning_rate": 8.481344655041613e-05, "loss": 0.0221, "step": 29210 }, { "epoch": 10.818215475749723, "grad_norm": 0.40366998314857483, "learning_rate": 8.480157635473197e-05, "loss": 0.0394, "step": 29220 }, { "epoch": 10.821917808219178, "grad_norm": 0.5094749927520752, "learning_rate": 8.478970235319975e-05, "loss": 0.0285, "step": 29230 }, { "epoch": 10.825620140688635, "grad_norm": 0.3116646409034729, "learning_rate": 8.477782454711798e-05, "loss": 0.0229, "step": 29240 }, { "epoch": 10.82932247315809, "grad_norm": 0.2652375102043152, "learning_rate": 8.476594293778561e-05, "loss": 0.0284, "step": 29250 }, { "epoch": 10.833024805627545, "grad_norm": 0.3177090287208557, "learning_rate": 8.475405752650199e-05, "loss": 0.0284, "step": 29260 }, { "epoch": 10.836727138097, "grad_norm": 0.2326488047838211, "learning_rate": 8.47421683145669e-05, "loss": 0.0306, "step": 29270 }, { "epoch": 10.840429470566457, "grad_norm": 0.23659375309944153, "learning_rate": 8.47302753032805e-05, "loss": 0.0232, "step": 29280 }, { "epoch": 10.844131803035912, "grad_norm": 0.5831325650215149, "learning_rate": 8.47183784939434e-05, "loss": 0.0369, "step": 29290 }, { "epoch": 10.847834135505368, "grad_norm": 0.4033372402191162, "learning_rate": 8.470647788785665e-05, "loss": 0.0264, "step": 29300 }, { "epoch": 10.851536467974825, "grad_norm": 0.16071806848049164, "learning_rate": 8.469457348632165e-05, "loss": 0.031, "step": 29310 }, { "epoch": 10.85523880044428, "grad_norm": 0.5016673803329468, "learning_rate": 8.468266529064025e-05, "loss": 0.0251, "step": 29320 }, { "epoch": 10.858941132913735, "grad_norm": 0.1924857497215271, "learning_rate": 8.467075330211474e-05, "loss": 0.0232, "step": 29330 }, { "epoch": 10.862643465383192, "grad_norm": 0.19765597581863403, "learning_rate": 8.465883752204776e-05, "loss": 0.0151, "step": 29340 }, { "epoch": 10.866345797852647, "grad_norm": 0.27385735511779785, "learning_rate": 8.46469179517424e-05, "loss": 0.0248, "step": 29350 }, { "epoch": 10.870048130322102, "grad_norm": 0.2220546454191208, "learning_rate": 8.463499459250222e-05, "loss": 0.0251, "step": 29360 }, { "epoch": 10.873750462791559, "grad_norm": 0.407850980758667, "learning_rate": 8.462306744563108e-05, "loss": 0.0305, "step": 29370 }, { "epoch": 10.877452795261014, "grad_norm": 0.20705419778823853, "learning_rate": 8.461113651243334e-05, "loss": 0.0361, "step": 29380 }, { "epoch": 10.88115512773047, "grad_norm": 0.8776096701622009, "learning_rate": 8.459920179421374e-05, "loss": 0.0325, "step": 29390 }, { "epoch": 10.884857460199926, "grad_norm": 0.4648410677909851, "learning_rate": 8.458726329227747e-05, "loss": 0.0232, "step": 29400 }, { "epoch": 10.888559792669382, "grad_norm": 0.24078422784805298, "learning_rate": 8.457532100793009e-05, "loss": 0.0475, "step": 29410 }, { "epoch": 10.892262125138837, "grad_norm": 1.1345010995864868, "learning_rate": 8.456337494247757e-05, "loss": 0.0356, "step": 29420 }, { "epoch": 10.895964457608294, "grad_norm": 0.5557284951210022, "learning_rate": 8.455142509722634e-05, "loss": 0.0269, "step": 29430 }, { "epoch": 10.899666790077749, "grad_norm": 0.3181266188621521, "learning_rate": 8.45394714734832e-05, "loss": 0.0398, "step": 29440 }, { "epoch": 10.903369122547204, "grad_norm": 0.7453750967979431, "learning_rate": 8.452751407255541e-05, "loss": 0.0371, "step": 29450 }, { "epoch": 10.907071455016661, "grad_norm": 0.26580676436424255, "learning_rate": 8.451555289575057e-05, "loss": 0.0324, "step": 29460 }, { "epoch": 10.910773787486116, "grad_norm": 0.3636733889579773, "learning_rate": 8.450358794437678e-05, "loss": 0.0406, "step": 29470 }, { "epoch": 10.914476119955571, "grad_norm": 0.24441790580749512, "learning_rate": 8.449161921974247e-05, "loss": 0.0299, "step": 29480 }, { "epoch": 10.918178452425028, "grad_norm": 0.19323194026947021, "learning_rate": 8.447964672315656e-05, "loss": 0.0261, "step": 29490 }, { "epoch": 10.921880784894483, "grad_norm": 0.21451838314533234, "learning_rate": 8.44676704559283e-05, "loss": 0.0219, "step": 29500 }, { "epoch": 10.925583117363939, "grad_norm": 0.2770748436450958, "learning_rate": 8.445569041936743e-05, "loss": 0.029, "step": 29510 }, { "epoch": 10.929285449833396, "grad_norm": 0.3063189685344696, "learning_rate": 8.444370661478406e-05, "loss": 0.0335, "step": 29520 }, { "epoch": 10.93298778230285, "grad_norm": 0.25440123677253723, "learning_rate": 8.443171904348872e-05, "loss": 0.0189, "step": 29530 }, { "epoch": 10.936690114772306, "grad_norm": 0.15801963210105896, "learning_rate": 8.441972770679235e-05, "loss": 0.0371, "step": 29540 }, { "epoch": 10.940392447241763, "grad_norm": 0.3212015926837921, "learning_rate": 8.44077326060063e-05, "loss": 0.0288, "step": 29550 }, { "epoch": 10.944094779711218, "grad_norm": 0.15347173810005188, "learning_rate": 8.439573374244237e-05, "loss": 0.0254, "step": 29560 }, { "epoch": 10.947797112180673, "grad_norm": 0.3558841347694397, "learning_rate": 8.438373111741269e-05, "loss": 0.0284, "step": 29570 }, { "epoch": 10.95149944465013, "grad_norm": 0.5713353157043457, "learning_rate": 8.437172473222987e-05, "loss": 0.0292, "step": 29580 }, { "epoch": 10.955201777119585, "grad_norm": 0.4432487487792969, "learning_rate": 8.435971458820692e-05, "loss": 0.0239, "step": 29590 }, { "epoch": 10.95890410958904, "grad_norm": 0.22206106781959534, "learning_rate": 8.434770068665723e-05, "loss": 0.0341, "step": 29600 }, { "epoch": 10.962606442058497, "grad_norm": 0.8259255886077881, "learning_rate": 8.433568302889464e-05, "loss": 0.0338, "step": 29610 }, { "epoch": 10.966308774527953, "grad_norm": 0.45786893367767334, "learning_rate": 8.432366161623337e-05, "loss": 0.0259, "step": 29620 }, { "epoch": 10.970011106997408, "grad_norm": 0.3784538507461548, "learning_rate": 8.431163644998808e-05, "loss": 0.0285, "step": 29630 }, { "epoch": 10.973713439466865, "grad_norm": 0.401563823223114, "learning_rate": 8.429960753147382e-05, "loss": 0.0348, "step": 29640 }, { "epoch": 10.97741577193632, "grad_norm": 0.16036464273929596, "learning_rate": 8.428757486200603e-05, "loss": 0.0223, "step": 29650 }, { "epoch": 10.981118104405775, "grad_norm": 0.8935896158218384, "learning_rate": 8.427553844290062e-05, "loss": 0.0359, "step": 29660 }, { "epoch": 10.984820436875232, "grad_norm": 0.353196382522583, "learning_rate": 8.426349827547385e-05, "loss": 0.0299, "step": 29670 }, { "epoch": 10.988522769344687, "grad_norm": 0.2543407082557678, "learning_rate": 8.425145436104242e-05, "loss": 0.024, "step": 29680 }, { "epoch": 10.992225101814142, "grad_norm": 1.0121549367904663, "learning_rate": 8.423940670092345e-05, "loss": 0.0339, "step": 29690 }, { "epoch": 10.9959274342836, "grad_norm": 0.346768319606781, "learning_rate": 8.422735529643444e-05, "loss": 0.0225, "step": 29700 }, { "epoch": 10.999629766753054, "grad_norm": 0.22145695984363556, "learning_rate": 8.421530014889332e-05, "loss": 0.0332, "step": 29710 }, { "epoch": 11.00333209922251, "grad_norm": 0.1354627013206482, "learning_rate": 8.420324125961841e-05, "loss": 0.0295, "step": 29720 }, { "epoch": 11.007034431691967, "grad_norm": 0.19332832098007202, "learning_rate": 8.419117862992844e-05, "loss": 0.0361, "step": 29730 }, { "epoch": 11.010736764161422, "grad_norm": 0.3811863660812378, "learning_rate": 8.41791122611426e-05, "loss": 0.0254, "step": 29740 }, { "epoch": 11.014439096630877, "grad_norm": 0.16096413135528564, "learning_rate": 8.416704215458043e-05, "loss": 0.0252, "step": 29750 }, { "epoch": 11.018141429100334, "grad_norm": 0.19968050718307495, "learning_rate": 8.415496831156188e-05, "loss": 0.0263, "step": 29760 }, { "epoch": 11.021843761569789, "grad_norm": 0.14566539227962494, "learning_rate": 8.414289073340737e-05, "loss": 0.0351, "step": 29770 }, { "epoch": 11.025546094039244, "grad_norm": 0.12056952714920044, "learning_rate": 8.413080942143767e-05, "loss": 0.0253, "step": 29780 }, { "epoch": 11.029248426508701, "grad_norm": 0.2932264506816864, "learning_rate": 8.411872437697394e-05, "loss": 0.0377, "step": 29790 }, { "epoch": 11.032950758978156, "grad_norm": 0.2747802734375, "learning_rate": 8.410663560133784e-05, "loss": 0.0375, "step": 29800 }, { "epoch": 11.036653091447612, "grad_norm": 0.3117488622665405, "learning_rate": 8.409454309585132e-05, "loss": 0.0287, "step": 29810 }, { "epoch": 11.040355423917068, "grad_norm": 0.2332390397787094, "learning_rate": 8.408244686183684e-05, "loss": 0.026, "step": 29820 }, { "epoch": 11.044057756386524, "grad_norm": 0.30800849199295044, "learning_rate": 8.407034690061721e-05, "loss": 0.0276, "step": 29830 }, { "epoch": 11.047760088855979, "grad_norm": 0.3406161367893219, "learning_rate": 8.405824321351568e-05, "loss": 0.0385, "step": 29840 }, { "epoch": 11.051462421325436, "grad_norm": 0.2835850417613983, "learning_rate": 8.404613580185585e-05, "loss": 0.0253, "step": 29850 }, { "epoch": 11.055164753794891, "grad_norm": 0.23068737983703613, "learning_rate": 8.403402466696182e-05, "loss": 0.0313, "step": 29860 }, { "epoch": 11.058867086264346, "grad_norm": 0.22635871171951294, "learning_rate": 8.402190981015803e-05, "loss": 0.0249, "step": 29870 }, { "epoch": 11.062569418733803, "grad_norm": 0.2761586010456085, "learning_rate": 8.400979123276931e-05, "loss": 0.0335, "step": 29880 }, { "epoch": 11.066271751203258, "grad_norm": 0.3154078722000122, "learning_rate": 8.399766893612096e-05, "loss": 0.0283, "step": 29890 }, { "epoch": 11.069974083672713, "grad_norm": 0.17439141869544983, "learning_rate": 8.398554292153866e-05, "loss": 0.0208, "step": 29900 }, { "epoch": 11.07367641614217, "grad_norm": 0.14429043233394623, "learning_rate": 8.397341319034848e-05, "loss": 0.0219, "step": 29910 }, { "epoch": 11.077378748611626, "grad_norm": 0.6935598254203796, "learning_rate": 8.39612797438769e-05, "loss": 0.0319, "step": 29920 }, { "epoch": 11.08108108108108, "grad_norm": 0.3279391825199127, "learning_rate": 8.394914258345083e-05, "loss": 0.0382, "step": 29930 }, { "epoch": 11.084783413550538, "grad_norm": 0.2779184877872467, "learning_rate": 8.393700171039758e-05, "loss": 0.0232, "step": 29940 }, { "epoch": 11.088485746019993, "grad_norm": 0.29593223333358765, "learning_rate": 8.392485712604483e-05, "loss": 0.0329, "step": 29950 }, { "epoch": 11.092188078489448, "grad_norm": 0.29042303562164307, "learning_rate": 8.391270883172073e-05, "loss": 0.0307, "step": 29960 }, { "epoch": 11.095890410958905, "grad_norm": 0.25288674235343933, "learning_rate": 8.390055682875377e-05, "loss": 0.0274, "step": 29970 }, { "epoch": 11.09959274342836, "grad_norm": 0.5800740122795105, "learning_rate": 8.388840111847288e-05, "loss": 0.0445, "step": 29980 }, { "epoch": 11.103295075897815, "grad_norm": 0.14256910979747772, "learning_rate": 8.38762417022074e-05, "loss": 0.0261, "step": 29990 }, { "epoch": 11.106997408367272, "grad_norm": 0.32230645418167114, "learning_rate": 8.386407858128706e-05, "loss": 0.024, "step": 30000 }, { "epoch": 11.110699740836727, "grad_norm": 1.3144515752792358, "learning_rate": 8.385191175704198e-05, "loss": 0.0279, "step": 30010 }, { "epoch": 11.114402073306183, "grad_norm": 0.18968015909194946, "learning_rate": 8.383974123080276e-05, "loss": 0.0358, "step": 30020 }, { "epoch": 11.11810440577564, "grad_norm": 0.2738577723503113, "learning_rate": 8.38275670039003e-05, "loss": 0.0224, "step": 30030 }, { "epoch": 11.121806738245095, "grad_norm": 0.5202693343162537, "learning_rate": 8.381538907766596e-05, "loss": 0.0372, "step": 30040 }, { "epoch": 11.12550907071455, "grad_norm": 0.4534394443035126, "learning_rate": 8.380320745343153e-05, "loss": 0.0302, "step": 30050 }, { "epoch": 11.129211403184005, "grad_norm": 0.19380533695220947, "learning_rate": 8.379102213252915e-05, "loss": 0.0295, "step": 30060 }, { "epoch": 11.132913735653462, "grad_norm": 0.3472882807254791, "learning_rate": 8.37788331162914e-05, "loss": 0.0274, "step": 30070 }, { "epoch": 11.136616068122917, "grad_norm": 0.17630070447921753, "learning_rate": 8.376664040605122e-05, "loss": 0.0337, "step": 30080 }, { "epoch": 11.140318400592372, "grad_norm": 0.3980371356010437, "learning_rate": 8.375444400314204e-05, "loss": 0.0208, "step": 30090 }, { "epoch": 11.14402073306183, "grad_norm": 0.39208072423934937, "learning_rate": 8.37422439088976e-05, "loss": 0.0345, "step": 30100 }, { "epoch": 11.147723065531284, "grad_norm": 0.2516510784626007, "learning_rate": 8.37300401246521e-05, "loss": 0.0206, "step": 30110 }, { "epoch": 11.15142539800074, "grad_norm": 0.16424565017223358, "learning_rate": 8.371783265174013e-05, "loss": 0.0275, "step": 30120 }, { "epoch": 11.155127730470197, "grad_norm": 0.22378571331501007, "learning_rate": 8.370562149149665e-05, "loss": 0.0301, "step": 30130 }, { "epoch": 11.158830062939652, "grad_norm": 0.42480212450027466, "learning_rate": 8.369340664525711e-05, "loss": 0.0249, "step": 30140 }, { "epoch": 11.162532395409107, "grad_norm": 0.3897313177585602, "learning_rate": 8.368118811435726e-05, "loss": 0.0281, "step": 30150 }, { "epoch": 11.166234727878564, "grad_norm": 0.15031468868255615, "learning_rate": 8.366896590013334e-05, "loss": 0.0365, "step": 30160 }, { "epoch": 11.169937060348019, "grad_norm": 0.30995601415634155, "learning_rate": 8.36567400039219e-05, "loss": 0.0284, "step": 30170 }, { "epoch": 11.173639392817474, "grad_norm": 0.2344636172056198, "learning_rate": 8.364451042705998e-05, "loss": 0.0234, "step": 30180 }, { "epoch": 11.177341725286931, "grad_norm": 0.30608269572257996, "learning_rate": 8.3632277170885e-05, "loss": 0.0251, "step": 30190 }, { "epoch": 11.181044057756386, "grad_norm": 0.36792585253715515, "learning_rate": 8.362004023673474e-05, "loss": 0.0219, "step": 30200 }, { "epoch": 11.184746390225841, "grad_norm": 0.910166323184967, "learning_rate": 8.360779962594742e-05, "loss": 0.0271, "step": 30210 }, { "epoch": 11.188448722695298, "grad_norm": 0.20273315906524658, "learning_rate": 8.359555533986168e-05, "loss": 0.0266, "step": 30220 }, { "epoch": 11.192151055164754, "grad_norm": 0.2389521300792694, "learning_rate": 8.358330737981651e-05, "loss": 0.0359, "step": 30230 }, { "epoch": 11.195853387634209, "grad_norm": 0.5744994878768921, "learning_rate": 8.357105574715134e-05, "loss": 0.0224, "step": 30240 }, { "epoch": 11.199555720103666, "grad_norm": 0.23474803566932678, "learning_rate": 8.355880044320598e-05, "loss": 0.025, "step": 30250 }, { "epoch": 11.203258052573121, "grad_norm": 0.3570406436920166, "learning_rate": 8.354654146932066e-05, "loss": 0.0307, "step": 30260 }, { "epoch": 11.206960385042576, "grad_norm": 0.1565113067626953, "learning_rate": 8.3534278826836e-05, "loss": 0.0295, "step": 30270 }, { "epoch": 11.210662717512033, "grad_norm": 0.21021431684494019, "learning_rate": 8.352201251709304e-05, "loss": 0.0255, "step": 30280 }, { "epoch": 11.214365049981488, "grad_norm": 0.12724654376506805, "learning_rate": 8.350974254143318e-05, "loss": 0.0173, "step": 30290 }, { "epoch": 11.218067382450943, "grad_norm": 0.19375839829444885, "learning_rate": 8.349746890119826e-05, "loss": 0.028, "step": 30300 }, { "epoch": 11.2217697149204, "grad_norm": 0.3377917408943176, "learning_rate": 8.34851915977305e-05, "loss": 0.0348, "step": 30310 }, { "epoch": 11.225472047389855, "grad_norm": 0.24721159040927887, "learning_rate": 8.347291063237253e-05, "loss": 0.0206, "step": 30320 }, { "epoch": 11.22917437985931, "grad_norm": 1.1380724906921387, "learning_rate": 8.346062600646738e-05, "loss": 0.0383, "step": 30330 }, { "epoch": 11.232876712328768, "grad_norm": 0.15289220213890076, "learning_rate": 8.344833772135847e-05, "loss": 0.0266, "step": 30340 }, { "epoch": 11.236579044798223, "grad_norm": 0.4033289849758148, "learning_rate": 8.343604577838964e-05, "loss": 0.0306, "step": 30350 }, { "epoch": 11.240281377267678, "grad_norm": 0.1997271627187729, "learning_rate": 8.342375017890512e-05, "loss": 0.0302, "step": 30360 }, { "epoch": 11.243983709737135, "grad_norm": 0.5971822738647461, "learning_rate": 8.341145092424954e-05, "loss": 0.0219, "step": 30370 }, { "epoch": 11.24768604220659, "grad_norm": 0.1525510996580124, "learning_rate": 8.33991480157679e-05, "loss": 0.0417, "step": 30380 }, { "epoch": 11.251388374676045, "grad_norm": 0.29334554076194763, "learning_rate": 8.338684145480566e-05, "loss": 0.0325, "step": 30390 }, { "epoch": 11.255090707145502, "grad_norm": 0.26801955699920654, "learning_rate": 8.337453124270863e-05, "loss": 0.0381, "step": 30400 }, { "epoch": 11.258793039614957, "grad_norm": 0.27343642711639404, "learning_rate": 8.336221738082305e-05, "loss": 0.0187, "step": 30410 }, { "epoch": 11.262495372084413, "grad_norm": 0.23018202185630798, "learning_rate": 8.334989987049553e-05, "loss": 0.0254, "step": 30420 }, { "epoch": 11.26619770455387, "grad_norm": 0.3049810528755188, "learning_rate": 8.33375787130731e-05, "loss": 0.021, "step": 30430 }, { "epoch": 11.269900037023325, "grad_norm": 0.1783679574728012, "learning_rate": 8.332525390990319e-05, "loss": 0.0266, "step": 30440 }, { "epoch": 11.27360236949278, "grad_norm": 1.0119750499725342, "learning_rate": 8.331292546233362e-05, "loss": 0.0237, "step": 30450 }, { "epoch": 11.277304701962237, "grad_norm": 0.17481666803359985, "learning_rate": 8.330059337171258e-05, "loss": 0.0211, "step": 30460 }, { "epoch": 11.281007034431692, "grad_norm": 0.20429065823554993, "learning_rate": 8.328825763938874e-05, "loss": 0.0268, "step": 30470 }, { "epoch": 11.284709366901147, "grad_norm": 0.35825029015541077, "learning_rate": 8.327591826671109e-05, "loss": 0.0255, "step": 30480 }, { "epoch": 11.288411699370604, "grad_norm": 0.36250555515289307, "learning_rate": 8.326357525502904e-05, "loss": 0.0274, "step": 30490 }, { "epoch": 11.29211403184006, "grad_norm": 0.2834232449531555, "learning_rate": 8.32512286056924e-05, "loss": 0.0354, "step": 30500 }, { "epoch": 11.295816364309514, "grad_norm": 0.27141109108924866, "learning_rate": 8.323887832005143e-05, "loss": 0.0235, "step": 30510 }, { "epoch": 11.299518696778971, "grad_norm": 0.20518234372138977, "learning_rate": 8.322652439945666e-05, "loss": 0.027, "step": 30520 }, { "epoch": 11.303221029248427, "grad_norm": 0.3380086421966553, "learning_rate": 8.321416684525917e-05, "loss": 0.0235, "step": 30530 }, { "epoch": 11.306923361717882, "grad_norm": 0.29346898198127747, "learning_rate": 8.320180565881031e-05, "loss": 0.034, "step": 30540 }, { "epoch": 11.310625694187339, "grad_norm": 0.43952974677085876, "learning_rate": 8.318944084146192e-05, "loss": 0.0258, "step": 30550 }, { "epoch": 11.314328026656794, "grad_norm": 0.24924534559249878, "learning_rate": 8.317707239456615e-05, "loss": 0.0282, "step": 30560 }, { "epoch": 11.318030359126249, "grad_norm": 1.4419608116149902, "learning_rate": 8.316470031947565e-05, "loss": 0.0325, "step": 30570 }, { "epoch": 11.321732691595706, "grad_norm": 0.16027800738811493, "learning_rate": 8.315232461754338e-05, "loss": 0.0235, "step": 30580 }, { "epoch": 11.325435024065161, "grad_norm": 0.3860771059989929, "learning_rate": 8.313994529012273e-05, "loss": 0.04, "step": 30590 }, { "epoch": 11.329137356534616, "grad_norm": 0.584922194480896, "learning_rate": 8.31275623385675e-05, "loss": 0.0256, "step": 30600 }, { "epoch": 11.332839689004073, "grad_norm": 0.2262783944606781, "learning_rate": 8.311517576423183e-05, "loss": 0.0339, "step": 30610 }, { "epoch": 11.336542021473528, "grad_norm": 0.5123595595359802, "learning_rate": 8.310278556847036e-05, "loss": 0.0249, "step": 30620 }, { "epoch": 11.340244353942984, "grad_norm": 0.20329609513282776, "learning_rate": 8.3090391752638e-05, "loss": 0.0238, "step": 30630 }, { "epoch": 11.34394668641244, "grad_norm": 0.13769294321537018, "learning_rate": 8.307799431809016e-05, "loss": 0.0226, "step": 30640 }, { "epoch": 11.347649018881896, "grad_norm": 0.38210755586624146, "learning_rate": 8.306559326618259e-05, "loss": 0.0332, "step": 30650 }, { "epoch": 11.35135135135135, "grad_norm": 0.2635597288608551, "learning_rate": 8.305318859827147e-05, "loss": 0.0205, "step": 30660 }, { "epoch": 11.355053683820808, "grad_norm": 0.23761260509490967, "learning_rate": 8.304078031571333e-05, "loss": 0.0234, "step": 30670 }, { "epoch": 11.358756016290263, "grad_norm": 0.17810720205307007, "learning_rate": 8.302836841986512e-05, "loss": 0.0314, "step": 30680 }, { "epoch": 11.362458348759718, "grad_norm": 0.12080077081918716, "learning_rate": 8.301595291208422e-05, "loss": 0.0263, "step": 30690 }, { "epoch": 11.366160681229175, "grad_norm": 1.1530241966247559, "learning_rate": 8.300353379372834e-05, "loss": 0.0395, "step": 30700 }, { "epoch": 11.36986301369863, "grad_norm": 0.4431559145450592, "learning_rate": 8.29911110661556e-05, "loss": 0.0306, "step": 30710 }, { "epoch": 11.373565346168085, "grad_norm": 0.145516037940979, "learning_rate": 8.297868473072459e-05, "loss": 0.0184, "step": 30720 }, { "epoch": 11.377267678637542, "grad_norm": 1.0143382549285889, "learning_rate": 8.296625478879417e-05, "loss": 0.0317, "step": 30730 }, { "epoch": 11.380970011106998, "grad_norm": 0.3376535475254059, "learning_rate": 8.29538212417237e-05, "loss": 0.0363, "step": 30740 }, { "epoch": 11.384672343576453, "grad_norm": 0.5857792496681213, "learning_rate": 8.29413840908729e-05, "loss": 0.0495, "step": 30750 }, { "epoch": 11.38837467604591, "grad_norm": 0.12671957910060883, "learning_rate": 8.292894333760186e-05, "loss": 0.0276, "step": 30760 }, { "epoch": 11.392077008515365, "grad_norm": 0.31855306029319763, "learning_rate": 8.291649898327107e-05, "loss": 0.0282, "step": 30770 }, { "epoch": 11.39577934098482, "grad_norm": 0.42316922545433044, "learning_rate": 8.290405102924144e-05, "loss": 0.0304, "step": 30780 }, { "epoch": 11.399481673454277, "grad_norm": 0.9146117568016052, "learning_rate": 8.289159947687427e-05, "loss": 0.0331, "step": 30790 }, { "epoch": 11.403184005923732, "grad_norm": 0.2214156985282898, "learning_rate": 8.287914432753123e-05, "loss": 0.0306, "step": 30800 }, { "epoch": 11.406886338393187, "grad_norm": 0.2576940655708313, "learning_rate": 8.28666855825744e-05, "loss": 0.018, "step": 30810 }, { "epoch": 11.410588670862644, "grad_norm": 0.26773160696029663, "learning_rate": 8.285422324336625e-05, "loss": 0.0301, "step": 30820 }, { "epoch": 11.4142910033321, "grad_norm": 2.200399398803711, "learning_rate": 8.284175731126965e-05, "loss": 0.0358, "step": 30830 }, { "epoch": 11.417993335801555, "grad_norm": 0.1829148530960083, "learning_rate": 8.282928778764783e-05, "loss": 0.0285, "step": 30840 }, { "epoch": 11.421695668271012, "grad_norm": 0.25220921635627747, "learning_rate": 8.281681467386446e-05, "loss": 0.0192, "step": 30850 }, { "epoch": 11.425398000740467, "grad_norm": 0.9825263023376465, "learning_rate": 8.280433797128357e-05, "loss": 0.0186, "step": 30860 }, { "epoch": 11.429100333209922, "grad_norm": 0.2323087602853775, "learning_rate": 8.279185768126962e-05, "loss": 0.0236, "step": 30870 }, { "epoch": 11.432802665679379, "grad_norm": 0.1100786030292511, "learning_rate": 8.277937380518741e-05, "loss": 0.0194, "step": 30880 }, { "epoch": 11.436504998148834, "grad_norm": 0.4088656008243561, "learning_rate": 8.276688634440216e-05, "loss": 0.0312, "step": 30890 }, { "epoch": 11.44020733061829, "grad_norm": 0.24606186151504517, "learning_rate": 8.275439530027948e-05, "loss": 0.0183, "step": 30900 }, { "epoch": 11.443909663087744, "grad_norm": 0.17736516892910004, "learning_rate": 8.274190067418537e-05, "loss": 0.0209, "step": 30910 }, { "epoch": 11.447611995557201, "grad_norm": 0.2899320125579834, "learning_rate": 8.272940246748625e-05, "loss": 0.0239, "step": 30920 }, { "epoch": 11.451314328026656, "grad_norm": 2.2066128253936768, "learning_rate": 8.271690068154887e-05, "loss": 0.0283, "step": 30930 }, { "epoch": 11.455016660496113, "grad_norm": 0.11990685015916824, "learning_rate": 8.270439531774042e-05, "loss": 0.0266, "step": 30940 }, { "epoch": 11.458718992965569, "grad_norm": 0.2750300168991089, "learning_rate": 8.269188637742846e-05, "loss": 0.017, "step": 30950 }, { "epoch": 11.462421325435024, "grad_norm": 0.2265913486480713, "learning_rate": 8.267937386198096e-05, "loss": 0.021, "step": 30960 }, { "epoch": 11.466123657904479, "grad_norm": 0.27636662125587463, "learning_rate": 8.266685777276628e-05, "loss": 0.0252, "step": 30970 }, { "epoch": 11.469825990373936, "grad_norm": 0.29003700613975525, "learning_rate": 8.265433811115316e-05, "loss": 0.0327, "step": 30980 }, { "epoch": 11.473528322843391, "grad_norm": 0.28280866146087646, "learning_rate": 8.26418148785107e-05, "loss": 0.026, "step": 30990 }, { "epoch": 11.477230655312846, "grad_norm": 0.2372298687696457, "learning_rate": 8.262928807620843e-05, "loss": 0.0318, "step": 31000 }, { "epoch": 11.480932987782303, "grad_norm": 0.07921764254570007, "learning_rate": 8.261675770561629e-05, "loss": 0.0308, "step": 31010 }, { "epoch": 11.484635320251758, "grad_norm": 0.6016937494277954, "learning_rate": 8.260422376810457e-05, "loss": 0.0267, "step": 31020 }, { "epoch": 11.488337652721214, "grad_norm": 0.1675977110862732, "learning_rate": 8.259168626504395e-05, "loss": 0.0354, "step": 31030 }, { "epoch": 11.49203998519067, "grad_norm": 0.5270063281059265, "learning_rate": 8.257914519780552e-05, "loss": 0.0271, "step": 31040 }, { "epoch": 11.495742317660126, "grad_norm": 0.2964983582496643, "learning_rate": 8.256660056776076e-05, "loss": 0.0393, "step": 31050 }, { "epoch": 11.49944465012958, "grad_norm": 0.19311246275901794, "learning_rate": 8.25540523762815e-05, "loss": 0.023, "step": 31060 }, { "epoch": 11.503146982599038, "grad_norm": 0.5025165677070618, "learning_rate": 8.254150062474005e-05, "loss": 0.022, "step": 31070 }, { "epoch": 11.506849315068493, "grad_norm": 0.3146786093711853, "learning_rate": 8.2528945314509e-05, "loss": 0.0338, "step": 31080 }, { "epoch": 11.510551647537948, "grad_norm": 0.2556847333908081, "learning_rate": 8.251638644696141e-05, "loss": 0.0269, "step": 31090 }, { "epoch": 11.514253980007405, "grad_norm": 0.790172278881073, "learning_rate": 8.250382402347065e-05, "loss": 0.0323, "step": 31100 }, { "epoch": 11.51795631247686, "grad_norm": 0.1406317949295044, "learning_rate": 8.249125804541061e-05, "loss": 0.0238, "step": 31110 }, { "epoch": 11.521658644946315, "grad_norm": 0.41723957657814026, "learning_rate": 8.247868851415542e-05, "loss": 0.0225, "step": 31120 }, { "epoch": 11.525360977415772, "grad_norm": 0.4500787556171417, "learning_rate": 8.246611543107967e-05, "loss": 0.0314, "step": 31130 }, { "epoch": 11.529063309885228, "grad_norm": 0.9450197219848633, "learning_rate": 8.245353879755837e-05, "loss": 0.0261, "step": 31140 }, { "epoch": 11.532765642354683, "grad_norm": 0.35752761363983154, "learning_rate": 8.244095861496686e-05, "loss": 0.0231, "step": 31150 }, { "epoch": 11.53646797482414, "grad_norm": 0.1923142373561859, "learning_rate": 8.242837488468087e-05, "loss": 0.025, "step": 31160 }, { "epoch": 11.540170307293595, "grad_norm": 0.3975507318973541, "learning_rate": 8.241578760807658e-05, "loss": 0.0342, "step": 31170 }, { "epoch": 11.54387263976305, "grad_norm": 0.2434675395488739, "learning_rate": 8.240319678653049e-05, "loss": 0.0249, "step": 31180 }, { "epoch": 11.547574972232507, "grad_norm": 0.44380316138267517, "learning_rate": 8.23906024214195e-05, "loss": 0.0311, "step": 31190 }, { "epoch": 11.551277304701962, "grad_norm": 0.2421714961528778, "learning_rate": 8.237800451412095e-05, "loss": 0.0267, "step": 31200 }, { "epoch": 11.554979637171417, "grad_norm": 0.19038993120193481, "learning_rate": 8.23654030660125e-05, "loss": 0.0232, "step": 31210 }, { "epoch": 11.558681969640874, "grad_norm": 0.361616313457489, "learning_rate": 8.235279807847223e-05, "loss": 0.0374, "step": 31220 }, { "epoch": 11.56238430211033, "grad_norm": 0.20530162751674652, "learning_rate": 8.234018955287859e-05, "loss": 0.0242, "step": 31230 }, { "epoch": 11.566086634579785, "grad_norm": 0.245712012052536, "learning_rate": 8.232757749061047e-05, "loss": 0.0222, "step": 31240 }, { "epoch": 11.569788967049242, "grad_norm": 0.47818952798843384, "learning_rate": 8.231496189304704e-05, "loss": 0.0361, "step": 31250 }, { "epoch": 11.573491299518697, "grad_norm": 0.2966049015522003, "learning_rate": 8.2302342761568e-05, "loss": 0.0454, "step": 31260 }, { "epoch": 11.577193631988152, "grad_norm": 0.4301688075065613, "learning_rate": 8.228972009755331e-05, "loss": 0.0362, "step": 31270 }, { "epoch": 11.580895964457609, "grad_norm": 0.23521827161312103, "learning_rate": 8.227709390238337e-05, "loss": 0.0402, "step": 31280 }, { "epoch": 11.584598296927064, "grad_norm": 0.8410806655883789, "learning_rate": 8.226446417743897e-05, "loss": 0.045, "step": 31290 }, { "epoch": 11.58830062939652, "grad_norm": 0.3963782489299774, "learning_rate": 8.225183092410128e-05, "loss": 0.0285, "step": 31300 }, { "epoch": 11.592002961865976, "grad_norm": 0.1644727885723114, "learning_rate": 8.223919414375185e-05, "loss": 0.0315, "step": 31310 }, { "epoch": 11.595705294335431, "grad_norm": 0.21119324862957, "learning_rate": 8.222655383777262e-05, "loss": 0.0226, "step": 31320 }, { "epoch": 11.599407626804886, "grad_norm": 0.1633717566728592, "learning_rate": 8.22139100075459e-05, "loss": 0.0181, "step": 31330 }, { "epoch": 11.603109959274343, "grad_norm": 0.3100634515285492, "learning_rate": 8.220126265445444e-05, "loss": 0.027, "step": 31340 }, { "epoch": 11.606812291743799, "grad_norm": 0.18064755201339722, "learning_rate": 8.218861177988129e-05, "loss": 0.0267, "step": 31350 }, { "epoch": 11.610514624213254, "grad_norm": 0.32431983947753906, "learning_rate": 8.217595738520996e-05, "loss": 0.0243, "step": 31360 }, { "epoch": 11.61421695668271, "grad_norm": 0.19519539177417755, "learning_rate": 8.216329947182431e-05, "loss": 0.0179, "step": 31370 }, { "epoch": 11.617919289152166, "grad_norm": 0.16370755434036255, "learning_rate": 8.215063804110857e-05, "loss": 0.0216, "step": 31380 }, { "epoch": 11.621621621621621, "grad_norm": 1.4456546306610107, "learning_rate": 8.213797309444742e-05, "loss": 0.0273, "step": 31390 }, { "epoch": 11.625323954091078, "grad_norm": 0.29262349009513855, "learning_rate": 8.212530463322583e-05, "loss": 0.036, "step": 31400 }, { "epoch": 11.629026286560533, "grad_norm": 0.7047381401062012, "learning_rate": 8.211263265882923e-05, "loss": 0.0317, "step": 31410 }, { "epoch": 11.632728619029988, "grad_norm": 0.3495204448699951, "learning_rate": 8.209995717264339e-05, "loss": 0.0292, "step": 31420 }, { "epoch": 11.636430951499445, "grad_norm": 0.22910678386688232, "learning_rate": 8.208727817605451e-05, "loss": 0.0275, "step": 31430 }, { "epoch": 11.6401332839689, "grad_norm": 0.11849433928728104, "learning_rate": 8.207459567044912e-05, "loss": 0.0231, "step": 31440 }, { "epoch": 11.643835616438356, "grad_norm": 0.3786710500717163, "learning_rate": 8.206190965721419e-05, "loss": 0.0267, "step": 31450 }, { "epoch": 11.647537948907813, "grad_norm": 0.2154490202665329, "learning_rate": 8.204922013773702e-05, "loss": 0.0214, "step": 31460 }, { "epoch": 11.651240281377268, "grad_norm": 0.23659810423851013, "learning_rate": 8.20365271134053e-05, "loss": 0.0359, "step": 31470 }, { "epoch": 11.654942613846723, "grad_norm": 0.6474195718765259, "learning_rate": 8.202383058560717e-05, "loss": 0.0285, "step": 31480 }, { "epoch": 11.65864494631618, "grad_norm": 0.25239327549934387, "learning_rate": 8.201113055573105e-05, "loss": 0.0329, "step": 31490 }, { "epoch": 11.662347278785635, "grad_norm": 0.22800277173519135, "learning_rate": 8.199842702516583e-05, "loss": 0.0485, "step": 31500 }, { "epoch": 11.66604961125509, "grad_norm": 0.2700081467628479, "learning_rate": 8.198571999530073e-05, "loss": 0.0299, "step": 31510 }, { "epoch": 11.669751943724547, "grad_norm": 0.24178527295589447, "learning_rate": 8.197300946752539e-05, "loss": 0.0211, "step": 31520 }, { "epoch": 11.673454276194002, "grad_norm": 0.2959147095680237, "learning_rate": 8.196029544322981e-05, "loss": 0.0285, "step": 31530 }, { "epoch": 11.677156608663458, "grad_norm": 0.3280770182609558, "learning_rate": 8.194757792380437e-05, "loss": 0.0257, "step": 31540 }, { "epoch": 11.680858941132914, "grad_norm": 0.09834953397512436, "learning_rate": 8.193485691063985e-05, "loss": 0.0466, "step": 31550 }, { "epoch": 11.68456127360237, "grad_norm": 0.17871327698230743, "learning_rate": 8.192213240512737e-05, "loss": 0.0219, "step": 31560 }, { "epoch": 11.688263606071825, "grad_norm": 0.32050350308418274, "learning_rate": 8.19094044086585e-05, "loss": 0.0326, "step": 31570 }, { "epoch": 11.691965938541282, "grad_norm": 0.11522864550352097, "learning_rate": 8.189667292262512e-05, "loss": 0.0244, "step": 31580 }, { "epoch": 11.695668271010737, "grad_norm": 0.23571425676345825, "learning_rate": 8.188393794841958e-05, "loss": 0.022, "step": 31590 }, { "epoch": 11.699370603480192, "grad_norm": 0.28782910108566284, "learning_rate": 8.18711994874345e-05, "loss": 0.0316, "step": 31600 }, { "epoch": 11.703072935949649, "grad_norm": 0.23006999492645264, "learning_rate": 8.185845754106295e-05, "loss": 0.0265, "step": 31610 }, { "epoch": 11.706775268419104, "grad_norm": 0.23639002442359924, "learning_rate": 8.18457121106984e-05, "loss": 0.0188, "step": 31620 }, { "epoch": 11.71047760088856, "grad_norm": 0.25372469425201416, "learning_rate": 8.183296319773466e-05, "loss": 0.034, "step": 31630 }, { "epoch": 11.714179933358016, "grad_norm": 0.2791052460670471, "learning_rate": 8.18202108035659e-05, "loss": 0.0287, "step": 31640 }, { "epoch": 11.717882265827471, "grad_norm": 0.38792791962623596, "learning_rate": 8.180745492958674e-05, "loss": 0.0323, "step": 31650 }, { "epoch": 11.721584598296927, "grad_norm": 0.21384207904338837, "learning_rate": 8.179469557719213e-05, "loss": 0.0235, "step": 31660 }, { "epoch": 11.725286930766384, "grad_norm": 1.291329026222229, "learning_rate": 8.178193274777741e-05, "loss": 0.0227, "step": 31670 }, { "epoch": 11.728989263235839, "grad_norm": 0.5152744054794312, "learning_rate": 8.176916644273832e-05, "loss": 0.0369, "step": 31680 }, { "epoch": 11.732691595705294, "grad_norm": 0.25602078437805176, "learning_rate": 8.175639666347094e-05, "loss": 0.0253, "step": 31690 }, { "epoch": 11.736393928174751, "grad_norm": 0.18253934383392334, "learning_rate": 8.174362341137177e-05, "loss": 0.0145, "step": 31700 }, { "epoch": 11.740096260644206, "grad_norm": 0.27784234285354614, "learning_rate": 8.173084668783767e-05, "loss": 0.023, "step": 31710 }, { "epoch": 11.743798593113661, "grad_norm": 0.3709719181060791, "learning_rate": 8.171806649426588e-05, "loss": 0.0229, "step": 31720 }, { "epoch": 11.747500925583118, "grad_norm": 0.2799723148345947, "learning_rate": 8.170528283205403e-05, "loss": 0.0162, "step": 31730 }, { "epoch": 11.751203258052573, "grad_norm": 0.2571316063404083, "learning_rate": 8.169249570260012e-05, "loss": 0.0317, "step": 31740 }, { "epoch": 11.754905590522029, "grad_norm": 0.31272879242897034, "learning_rate": 8.167970510730253e-05, "loss": 0.0401, "step": 31750 }, { "epoch": 11.758607922991484, "grad_norm": 0.11563744395971298, "learning_rate": 8.166691104756001e-05, "loss": 0.027, "step": 31760 }, { "epoch": 11.76231025546094, "grad_norm": 0.5162038207054138, "learning_rate": 8.165411352477171e-05, "loss": 0.0309, "step": 31770 }, { "epoch": 11.766012587930396, "grad_norm": 0.10044251382350922, "learning_rate": 8.164131254033716e-05, "loss": 0.0243, "step": 31780 }, { "epoch": 11.769714920399853, "grad_norm": 4.351011276245117, "learning_rate": 8.162850809565623e-05, "loss": 0.0367, "step": 31790 }, { "epoch": 11.773417252869308, "grad_norm": 0.18530824780464172, "learning_rate": 8.161570019212921e-05, "loss": 0.0481, "step": 31800 }, { "epoch": 11.777119585338763, "grad_norm": 0.35151931643486023, "learning_rate": 8.160288883115674e-05, "loss": 0.0298, "step": 31810 }, { "epoch": 11.780821917808218, "grad_norm": 0.29617658257484436, "learning_rate": 8.159007401413988e-05, "loss": 0.0281, "step": 31820 }, { "epoch": 11.784524250277675, "grad_norm": 0.22554519772529602, "learning_rate": 8.157725574248e-05, "loss": 0.0316, "step": 31830 }, { "epoch": 11.78822658274713, "grad_norm": 0.13379156589508057, "learning_rate": 8.156443401757892e-05, "loss": 0.0186, "step": 31840 }, { "epoch": 11.791928915216587, "grad_norm": 0.3120715618133545, "learning_rate": 8.155160884083881e-05, "loss": 0.0287, "step": 31850 }, { "epoch": 11.795631247686043, "grad_norm": 0.16332095861434937, "learning_rate": 8.153878021366217e-05, "loss": 0.034, "step": 31860 }, { "epoch": 11.799333580155498, "grad_norm": 0.44210657477378845, "learning_rate": 8.152594813745196e-05, "loss": 0.0393, "step": 31870 }, { "epoch": 11.803035912624953, "grad_norm": 0.28553643822669983, "learning_rate": 8.151311261361145e-05, "loss": 0.0255, "step": 31880 }, { "epoch": 11.80673824509441, "grad_norm": 0.2913154363632202, "learning_rate": 8.150027364354431e-05, "loss": 0.027, "step": 31890 }, { "epoch": 11.810440577563865, "grad_norm": 0.2506089508533478, "learning_rate": 8.148743122865463e-05, "loss": 0.0235, "step": 31900 }, { "epoch": 11.81414291003332, "grad_norm": 0.178458109498024, "learning_rate": 8.14745853703468e-05, "loss": 0.0231, "step": 31910 }, { "epoch": 11.817845242502777, "grad_norm": 0.2593863904476166, "learning_rate": 8.146173607002563e-05, "loss": 0.0237, "step": 31920 }, { "epoch": 11.821547574972232, "grad_norm": 0.17709387838840485, "learning_rate": 8.144888332909631e-05, "loss": 0.037, "step": 31930 }, { "epoch": 11.825249907441687, "grad_norm": 0.318706750869751, "learning_rate": 8.143602714896439e-05, "loss": 0.0218, "step": 31940 }, { "epoch": 11.828952239911144, "grad_norm": 0.3138180375099182, "learning_rate": 8.14231675310358e-05, "loss": 0.0262, "step": 31950 }, { "epoch": 11.8326545723806, "grad_norm": 0.2708602547645569, "learning_rate": 8.141030447671686e-05, "loss": 0.0416, "step": 31960 }, { "epoch": 11.836356904850055, "grad_norm": 0.1655137836933136, "learning_rate": 8.139743798741426e-05, "loss": 0.0178, "step": 31970 }, { "epoch": 11.840059237319512, "grad_norm": 0.2069305181503296, "learning_rate": 8.138456806453503e-05, "loss": 0.0257, "step": 31980 }, { "epoch": 11.843761569788967, "grad_norm": 0.23606760799884796, "learning_rate": 8.137169470948662e-05, "loss": 0.0231, "step": 31990 }, { "epoch": 11.847463902258422, "grad_norm": 0.17730413377285004, "learning_rate": 8.135881792367686e-05, "loss": 0.0199, "step": 32000 }, { "epoch": 11.851166234727879, "grad_norm": 0.17645864188671112, "learning_rate": 8.13459377085139e-05, "loss": 0.0266, "step": 32010 }, { "epoch": 11.854868567197334, "grad_norm": 0.3234141170978546, "learning_rate": 8.133305406540633e-05, "loss": 0.0169, "step": 32020 }, { "epoch": 11.85857089966679, "grad_norm": 0.41097280383110046, "learning_rate": 8.132016699576308e-05, "loss": 0.0135, "step": 32030 }, { "epoch": 11.862273232136246, "grad_norm": 0.9044532179832458, "learning_rate": 8.130727650099346e-05, "loss": 0.0192, "step": 32040 }, { "epoch": 11.865975564605701, "grad_norm": 0.12676697969436646, "learning_rate": 8.129438258250712e-05, "loss": 0.0162, "step": 32050 }, { "epoch": 11.869677897075157, "grad_norm": 0.20830181241035461, "learning_rate": 8.128148524171418e-05, "loss": 0.0276, "step": 32060 }, { "epoch": 11.873380229544614, "grad_norm": 0.6578395366668701, "learning_rate": 8.126858448002504e-05, "loss": 0.0328, "step": 32070 }, { "epoch": 11.877082562014069, "grad_norm": 0.2441943883895874, "learning_rate": 8.125568029885052e-05, "loss": 0.0307, "step": 32080 }, { "epoch": 11.880784894483524, "grad_norm": 0.6109654307365417, "learning_rate": 8.124277269960179e-05, "loss": 0.0252, "step": 32090 }, { "epoch": 11.88448722695298, "grad_norm": 0.21971745789051056, "learning_rate": 8.12298616836904e-05, "loss": 0.026, "step": 32100 }, { "epoch": 11.888189559422436, "grad_norm": 0.15867865085601807, "learning_rate": 8.121694725252829e-05, "loss": 0.0199, "step": 32110 }, { "epoch": 11.891891891891891, "grad_norm": 0.3146575391292572, "learning_rate": 8.120402940752778e-05, "loss": 0.0231, "step": 32120 }, { "epoch": 11.895594224361348, "grad_norm": 0.26947396993637085, "learning_rate": 8.119110815010151e-05, "loss": 0.0248, "step": 32130 }, { "epoch": 11.899296556830803, "grad_norm": 0.3725127577781677, "learning_rate": 8.117818348166258e-05, "loss": 0.0281, "step": 32140 }, { "epoch": 11.902998889300259, "grad_norm": 0.34458133578300476, "learning_rate": 8.116525540362434e-05, "loss": 0.0317, "step": 32150 }, { "epoch": 11.906701221769715, "grad_norm": 0.36567366123199463, "learning_rate": 8.115232391740064e-05, "loss": 0.0294, "step": 32160 }, { "epoch": 11.91040355423917, "grad_norm": 0.19311630725860596, "learning_rate": 8.113938902440564e-05, "loss": 0.0218, "step": 32170 }, { "epoch": 11.914105886708626, "grad_norm": 0.22947734594345093, "learning_rate": 8.112645072605386e-05, "loss": 0.0254, "step": 32180 }, { "epoch": 11.917808219178083, "grad_norm": 1.776878833770752, "learning_rate": 8.111350902376023e-05, "loss": 0.029, "step": 32190 }, { "epoch": 11.921510551647538, "grad_norm": 0.2464345246553421, "learning_rate": 8.110056391894005e-05, "loss": 0.027, "step": 32200 }, { "epoch": 11.925212884116993, "grad_norm": 0.2141982913017273, "learning_rate": 8.108761541300893e-05, "loss": 0.0246, "step": 32210 }, { "epoch": 11.92891521658645, "grad_norm": 0.158762127161026, "learning_rate": 8.107466350738295e-05, "loss": 0.029, "step": 32220 }, { "epoch": 11.932617549055905, "grad_norm": 0.241086944937706, "learning_rate": 8.106170820347847e-05, "loss": 0.0222, "step": 32230 }, { "epoch": 11.93631988152536, "grad_norm": 0.5680615901947021, "learning_rate": 8.104874950271231e-05, "loss": 0.029, "step": 32240 }, { "epoch": 11.940022213994817, "grad_norm": 0.2578682005405426, "learning_rate": 8.103578740650156e-05, "loss": 0.0237, "step": 32250 }, { "epoch": 11.943724546464273, "grad_norm": 0.327697217464447, "learning_rate": 8.102282191626378e-05, "loss": 0.025, "step": 32260 }, { "epoch": 11.947426878933728, "grad_norm": 0.23699717223644257, "learning_rate": 8.100985303341682e-05, "loss": 0.0262, "step": 32270 }, { "epoch": 11.951129211403185, "grad_norm": 0.2798881530761719, "learning_rate": 8.099688075937896e-05, "loss": 0.024, "step": 32280 }, { "epoch": 11.95483154387264, "grad_norm": 0.23650163412094116, "learning_rate": 8.098390509556883e-05, "loss": 0.0247, "step": 32290 }, { "epoch": 11.958533876342095, "grad_norm": 0.1462518721818924, "learning_rate": 8.097092604340542e-05, "loss": 0.0226, "step": 32300 }, { "epoch": 11.962236208811552, "grad_norm": 0.16482987999916077, "learning_rate": 8.095794360430808e-05, "loss": 0.0286, "step": 32310 }, { "epoch": 11.965938541281007, "grad_norm": 0.17068955302238464, "learning_rate": 8.09449577796966e-05, "loss": 0.0304, "step": 32320 }, { "epoch": 11.969640873750462, "grad_norm": 0.2160273939371109, "learning_rate": 8.093196857099106e-05, "loss": 0.0261, "step": 32330 }, { "epoch": 11.97334320621992, "grad_norm": 0.17681017518043518, "learning_rate": 8.091897597961193e-05, "loss": 0.0217, "step": 32340 }, { "epoch": 11.977045538689374, "grad_norm": 0.4275113046169281, "learning_rate": 8.090598000698009e-05, "loss": 0.025, "step": 32350 }, { "epoch": 11.98074787115883, "grad_norm": 0.20170603692531586, "learning_rate": 8.089298065451672e-05, "loss": 0.029, "step": 32360 }, { "epoch": 11.984450203628287, "grad_norm": 0.2478588968515396, "learning_rate": 8.087997792364344e-05, "loss": 0.0266, "step": 32370 }, { "epoch": 11.988152536097742, "grad_norm": 0.3200121223926544, "learning_rate": 8.086697181578222e-05, "loss": 0.0272, "step": 32380 }, { "epoch": 11.991854868567197, "grad_norm": 0.4847109913825989, "learning_rate": 8.085396233235536e-05, "loss": 0.0343, "step": 32390 }, { "epoch": 11.995557201036654, "grad_norm": 0.19808825850486755, "learning_rate": 8.084094947478556e-05, "loss": 0.0418, "step": 32400 }, { "epoch": 11.999259533506109, "grad_norm": 0.23063945770263672, "learning_rate": 8.082793324449589e-05, "loss": 0.0286, "step": 32410 }, { "epoch": 12.002961865975564, "grad_norm": 0.2851874530315399, "learning_rate": 8.081491364290981e-05, "loss": 0.0191, "step": 32420 }, { "epoch": 12.006664198445021, "grad_norm": 0.2307232916355133, "learning_rate": 8.080189067145108e-05, "loss": 0.0187, "step": 32430 }, { "epoch": 12.010366530914476, "grad_norm": 0.1893407106399536, "learning_rate": 8.078886433154392e-05, "loss": 0.0195, "step": 32440 }, { "epoch": 12.014068863383931, "grad_norm": 1.1233500242233276, "learning_rate": 8.077583462461283e-05, "loss": 0.0282, "step": 32450 }, { "epoch": 12.017771195853388, "grad_norm": 0.4325343370437622, "learning_rate": 8.076280155208273e-05, "loss": 0.0299, "step": 32460 }, { "epoch": 12.021473528322844, "grad_norm": 0.3079652190208435, "learning_rate": 8.07497651153789e-05, "loss": 0.037, "step": 32470 }, { "epoch": 12.025175860792299, "grad_norm": 0.35667741298675537, "learning_rate": 8.073672531592702e-05, "loss": 0.0275, "step": 32480 }, { "epoch": 12.028878193261756, "grad_norm": 0.18185046315193176, "learning_rate": 8.072368215515306e-05, "loss": 0.0196, "step": 32490 }, { "epoch": 12.03258052573121, "grad_norm": 0.7187784314155579, "learning_rate": 8.07106356344834e-05, "loss": 0.0223, "step": 32500 }, { "epoch": 12.036282858200666, "grad_norm": 0.19709505140781403, "learning_rate": 8.069758575534481e-05, "loss": 0.0297, "step": 32510 }, { "epoch": 12.039985190670123, "grad_norm": 0.17763978242874146, "learning_rate": 8.068453251916439e-05, "loss": 0.0221, "step": 32520 }, { "epoch": 12.043687523139578, "grad_norm": 1.4504276514053345, "learning_rate": 8.067147592736962e-05, "loss": 0.024, "step": 32530 }, { "epoch": 12.047389855609033, "grad_norm": 0.40944650769233704, "learning_rate": 8.065841598138837e-05, "loss": 0.028, "step": 32540 }, { "epoch": 12.05109218807849, "grad_norm": 0.21590635180473328, "learning_rate": 8.064535268264883e-05, "loss": 0.0232, "step": 32550 }, { "epoch": 12.054794520547945, "grad_norm": 0.21571436524391174, "learning_rate": 8.063228603257959e-05, "loss": 0.0366, "step": 32560 }, { "epoch": 12.0584968530174, "grad_norm": 0.15168297290802002, "learning_rate": 8.061921603260963e-05, "loss": 0.0402, "step": 32570 }, { "epoch": 12.062199185486858, "grad_norm": 0.20008157193660736, "learning_rate": 8.060614268416823e-05, "loss": 0.0289, "step": 32580 }, { "epoch": 12.065901517956313, "grad_norm": 0.2891072928905487, "learning_rate": 8.059306598868506e-05, "loss": 0.0257, "step": 32590 }, { "epoch": 12.069603850425768, "grad_norm": 0.15895836055278778, "learning_rate": 8.057998594759022e-05, "loss": 0.024, "step": 32600 }, { "epoch": 12.073306182895225, "grad_norm": 0.28515681624412537, "learning_rate": 8.056690256231409e-05, "loss": 0.0273, "step": 32610 }, { "epoch": 12.07700851536468, "grad_norm": 0.8465591669082642, "learning_rate": 8.055381583428743e-05, "loss": 0.0266, "step": 32620 }, { "epoch": 12.080710847834135, "grad_norm": 0.308326780796051, "learning_rate": 8.054072576494143e-05, "loss": 0.0439, "step": 32630 }, { "epoch": 12.084413180303592, "grad_norm": 0.2783707082271576, "learning_rate": 8.052763235570756e-05, "loss": 0.0189, "step": 32640 }, { "epoch": 12.088115512773047, "grad_norm": 1.3265289068222046, "learning_rate": 8.051453560801772e-05, "loss": 0.0268, "step": 32650 }, { "epoch": 12.091817845242502, "grad_norm": 0.19415195286273956, "learning_rate": 8.050143552330414e-05, "loss": 0.0279, "step": 32660 }, { "epoch": 12.095520177711958, "grad_norm": 0.410737007856369, "learning_rate": 8.048833210299944e-05, "loss": 0.0287, "step": 32670 }, { "epoch": 12.099222510181415, "grad_norm": 0.2884805202484131, "learning_rate": 8.047522534853657e-05, "loss": 0.0273, "step": 32680 }, { "epoch": 12.10292484265087, "grad_norm": 0.3733695149421692, "learning_rate": 8.046211526134888e-05, "loss": 0.0269, "step": 32690 }, { "epoch": 12.106627175120325, "grad_norm": 0.23450838029384613, "learning_rate": 8.044900184287007e-05, "loss": 0.0306, "step": 32700 }, { "epoch": 12.110329507589782, "grad_norm": 0.23336844146251678, "learning_rate": 8.043588509453419e-05, "loss": 0.0311, "step": 32710 }, { "epoch": 12.114031840059237, "grad_norm": 0.17481443285942078, "learning_rate": 8.042276501777567e-05, "loss": 0.0167, "step": 32720 }, { "epoch": 12.117734172528692, "grad_norm": 0.4924207627773285, "learning_rate": 8.040964161402932e-05, "loss": 0.03, "step": 32730 }, { "epoch": 12.12143650499815, "grad_norm": 0.37697896361351013, "learning_rate": 8.039651488473028e-05, "loss": 0.0207, "step": 32740 }, { "epoch": 12.125138837467604, "grad_norm": 0.21843743324279785, "learning_rate": 8.038338483131407e-05, "loss": 0.0327, "step": 32750 }, { "epoch": 12.12884116993706, "grad_norm": 0.35338759422302246, "learning_rate": 8.037025145521657e-05, "loss": 0.0266, "step": 32760 }, { "epoch": 12.132543502406516, "grad_norm": 0.21736197173595428, "learning_rate": 8.035711475787404e-05, "loss": 0.0285, "step": 32770 }, { "epoch": 12.136245834875972, "grad_norm": 0.37665560841560364, "learning_rate": 8.034397474072309e-05, "loss": 0.0372, "step": 32780 }, { "epoch": 12.139948167345427, "grad_norm": 0.17498522996902466, "learning_rate": 8.033083140520065e-05, "loss": 0.027, "step": 32790 }, { "epoch": 12.143650499814884, "grad_norm": 0.13839183747768402, "learning_rate": 8.031768475274413e-05, "loss": 0.0328, "step": 32800 }, { "epoch": 12.147352832284339, "grad_norm": 5.4089837074279785, "learning_rate": 8.030453478479117e-05, "loss": 0.03, "step": 32810 }, { "epoch": 12.151055164753794, "grad_norm": 1.3846913576126099, "learning_rate": 8.029138150277984e-05, "loss": 0.0459, "step": 32820 }, { "epoch": 12.154757497223251, "grad_norm": 0.9765980243682861, "learning_rate": 8.027822490814858e-05, "loss": 0.0412, "step": 32830 }, { "epoch": 12.158459829692706, "grad_norm": 1.9716805219650269, "learning_rate": 8.026506500233617e-05, "loss": 0.0297, "step": 32840 }, { "epoch": 12.162162162162161, "grad_norm": 0.37011873722076416, "learning_rate": 8.025190178678175e-05, "loss": 0.0374, "step": 32850 }, { "epoch": 12.165864494631618, "grad_norm": 1.6550343036651611, "learning_rate": 8.023873526292483e-05, "loss": 0.0242, "step": 32860 }, { "epoch": 12.169566827101074, "grad_norm": 0.1674708127975464, "learning_rate": 8.022556543220529e-05, "loss": 0.0227, "step": 32870 }, { "epoch": 12.173269159570529, "grad_norm": 0.2510000169277191, "learning_rate": 8.021239229606335e-05, "loss": 0.0188, "step": 32880 }, { "epoch": 12.176971492039986, "grad_norm": 0.40687763690948486, "learning_rate": 8.019921585593962e-05, "loss": 0.0314, "step": 32890 }, { "epoch": 12.18067382450944, "grad_norm": 0.2048557847738266, "learning_rate": 8.018603611327504e-05, "loss": 0.0171, "step": 32900 }, { "epoch": 12.184376156978896, "grad_norm": 0.2373085618019104, "learning_rate": 8.017285306951095e-05, "loss": 0.0213, "step": 32910 }, { "epoch": 12.188078489448353, "grad_norm": 0.21340546011924744, "learning_rate": 8.015966672608899e-05, "loss": 0.0324, "step": 32920 }, { "epoch": 12.191780821917808, "grad_norm": 0.12853890657424927, "learning_rate": 8.014647708445124e-05, "loss": 0.0239, "step": 32930 }, { "epoch": 12.195483154387263, "grad_norm": 0.5414863228797913, "learning_rate": 8.013328414604007e-05, "loss": 0.0276, "step": 32940 }, { "epoch": 12.19918548685672, "grad_norm": 0.28523436188697815, "learning_rate": 8.012008791229826e-05, "loss": 0.0358, "step": 32950 }, { "epoch": 12.202887819326175, "grad_norm": 0.3965228497982025, "learning_rate": 8.010688838466892e-05, "loss": 0.033, "step": 32960 }, { "epoch": 12.20659015179563, "grad_norm": 0.23208782076835632, "learning_rate": 8.009368556459552e-05, "loss": 0.0324, "step": 32970 }, { "epoch": 12.210292484265088, "grad_norm": 0.33905383944511414, "learning_rate": 8.008047945352193e-05, "loss": 0.0288, "step": 32980 }, { "epoch": 12.213994816734543, "grad_norm": 0.2686653733253479, "learning_rate": 8.006727005289232e-05, "loss": 0.0361, "step": 32990 }, { "epoch": 12.217697149203998, "grad_norm": 0.2560071349143982, "learning_rate": 8.005405736415126e-05, "loss": 0.0297, "step": 33000 }, { "epoch": 12.221399481673455, "grad_norm": 0.30037277936935425, "learning_rate": 8.004084138874368e-05, "loss": 0.0221, "step": 33010 }, { "epoch": 12.22510181414291, "grad_norm": 0.28897082805633545, "learning_rate": 8.002762212811484e-05, "loss": 0.0292, "step": 33020 }, { "epoch": 12.228804146612365, "grad_norm": 0.2113751471042633, "learning_rate": 8.00143995837104e-05, "loss": 0.039, "step": 33030 }, { "epoch": 12.232506479081822, "grad_norm": 0.40188831090927124, "learning_rate": 8.000117375697635e-05, "loss": 0.0253, "step": 33040 }, { "epoch": 12.236208811551277, "grad_norm": 0.3148209750652313, "learning_rate": 7.998794464935904e-05, "loss": 0.0215, "step": 33050 }, { "epoch": 12.239911144020732, "grad_norm": 0.1647680699825287, "learning_rate": 7.99747122623052e-05, "loss": 0.0224, "step": 33060 }, { "epoch": 12.24361347649019, "grad_norm": 0.3681179881095886, "learning_rate": 7.996147659726186e-05, "loss": 0.0226, "step": 33070 }, { "epoch": 12.247315808959645, "grad_norm": 0.7435990571975708, "learning_rate": 7.994823765567651e-05, "loss": 0.0328, "step": 33080 }, { "epoch": 12.2510181414291, "grad_norm": 0.3105887472629547, "learning_rate": 7.993499543899692e-05, "loss": 0.0262, "step": 33090 }, { "epoch": 12.254720473898557, "grad_norm": 0.2423926293849945, "learning_rate": 7.992174994867123e-05, "loss": 0.0318, "step": 33100 }, { "epoch": 12.258422806368012, "grad_norm": 0.5107476115226746, "learning_rate": 7.990850118614794e-05, "loss": 0.0215, "step": 33110 }, { "epoch": 12.262125138837467, "grad_norm": 0.3596755862236023, "learning_rate": 7.989524915287595e-05, "loss": 0.0217, "step": 33120 }, { "epoch": 12.265827471306924, "grad_norm": 0.313357949256897, "learning_rate": 7.988199385030445e-05, "loss": 0.0207, "step": 33130 }, { "epoch": 12.26952980377638, "grad_norm": 0.11275412887334824, "learning_rate": 7.986873527988303e-05, "loss": 0.0259, "step": 33140 }, { "epoch": 12.273232136245834, "grad_norm": 0.2340901643037796, "learning_rate": 7.985547344306161e-05, "loss": 0.0245, "step": 33150 }, { "epoch": 12.276934468715291, "grad_norm": 2.547515392303467, "learning_rate": 7.984220834129052e-05, "loss": 0.0308, "step": 33160 }, { "epoch": 12.280636801184746, "grad_norm": 0.28428345918655396, "learning_rate": 7.98289399760204e-05, "loss": 0.032, "step": 33170 }, { "epoch": 12.284339133654202, "grad_norm": 0.3925229609012604, "learning_rate": 7.981566834870225e-05, "loss": 0.0224, "step": 33180 }, { "epoch": 12.288041466123659, "grad_norm": 0.24882203340530396, "learning_rate": 7.980239346078742e-05, "loss": 0.0312, "step": 33190 }, { "epoch": 12.291743798593114, "grad_norm": 0.9981838464736938, "learning_rate": 7.978911531372765e-05, "loss": 0.0357, "step": 33200 }, { "epoch": 12.295446131062569, "grad_norm": 0.6315208673477173, "learning_rate": 7.977583390897502e-05, "loss": 0.0249, "step": 33210 }, { "epoch": 12.299148463532026, "grad_norm": 0.4278005361557007, "learning_rate": 7.976254924798196e-05, "loss": 0.0386, "step": 33220 }, { "epoch": 12.302850796001481, "grad_norm": 0.3500930368900299, "learning_rate": 7.974926133220127e-05, "loss": 0.0277, "step": 33230 }, { "epoch": 12.306553128470936, "grad_norm": 0.303894966840744, "learning_rate": 7.973597016308607e-05, "loss": 0.0221, "step": 33240 }, { "epoch": 12.310255460940393, "grad_norm": 0.37348809838294983, "learning_rate": 7.972267574208991e-05, "loss": 0.0299, "step": 33250 }, { "epoch": 12.313957793409848, "grad_norm": 0.15301276743412018, "learning_rate": 7.970937807066659e-05, "loss": 0.0292, "step": 33260 }, { "epoch": 12.317660125879303, "grad_norm": 0.3978094458580017, "learning_rate": 7.969607715027036e-05, "loss": 0.0381, "step": 33270 }, { "epoch": 12.32136245834876, "grad_norm": 0.22483015060424805, "learning_rate": 7.968277298235578e-05, "loss": 0.0276, "step": 33280 }, { "epoch": 12.325064790818216, "grad_norm": 0.4008134603500366, "learning_rate": 7.966946556837778e-05, "loss": 0.0278, "step": 33290 }, { "epoch": 12.32876712328767, "grad_norm": 0.38837578892707825, "learning_rate": 7.965615490979163e-05, "loss": 0.038, "step": 33300 }, { "epoch": 12.332469455757128, "grad_norm": 0.22388696670532227, "learning_rate": 7.964284100805297e-05, "loss": 0.0317, "step": 33310 }, { "epoch": 12.336171788226583, "grad_norm": 0.2623952031135559, "learning_rate": 7.962952386461777e-05, "loss": 0.0244, "step": 33320 }, { "epoch": 12.339874120696038, "grad_norm": 0.36843693256378174, "learning_rate": 7.961620348094241e-05, "loss": 0.0317, "step": 33330 }, { "epoch": 12.343576453165495, "grad_norm": 0.22944098711013794, "learning_rate": 7.960287985848356e-05, "loss": 0.0195, "step": 33340 }, { "epoch": 12.34727878563495, "grad_norm": 0.32681891322135925, "learning_rate": 7.958955299869825e-05, "loss": 0.0299, "step": 33350 }, { "epoch": 12.350981118104405, "grad_norm": 0.35598504543304443, "learning_rate": 7.957622290304394e-05, "loss": 0.0212, "step": 33360 }, { "epoch": 12.354683450573862, "grad_norm": 0.24768316745758057, "learning_rate": 7.956288957297834e-05, "loss": 0.0239, "step": 33370 }, { "epoch": 12.358385783043317, "grad_norm": 0.2233337163925171, "learning_rate": 7.954955300995961e-05, "loss": 0.0336, "step": 33380 }, { "epoch": 12.362088115512773, "grad_norm": 0.4381248354911804, "learning_rate": 7.953621321544616e-05, "loss": 0.0392, "step": 33390 }, { "epoch": 12.36579044798223, "grad_norm": 0.29236024618148804, "learning_rate": 7.952287019089685e-05, "loss": 0.0437, "step": 33400 }, { "epoch": 12.369492780451685, "grad_norm": 0.7049488425254822, "learning_rate": 7.950952393777085e-05, "loss": 0.0259, "step": 33410 }, { "epoch": 12.37319511292114, "grad_norm": 0.16665217280387878, "learning_rate": 7.949617445752769e-05, "loss": 0.0297, "step": 33420 }, { "epoch": 12.376897445390597, "grad_norm": 0.28699591755867004, "learning_rate": 7.948282175162722e-05, "loss": 0.0227, "step": 33430 }, { "epoch": 12.380599777860052, "grad_norm": 0.537385106086731, "learning_rate": 7.94694658215297e-05, "loss": 0.0345, "step": 33440 }, { "epoch": 12.384302110329507, "grad_norm": 0.36315593123435974, "learning_rate": 7.945610666869568e-05, "loss": 0.0211, "step": 33450 }, { "epoch": 12.388004442798964, "grad_norm": 0.2529697120189667, "learning_rate": 7.944274429458614e-05, "loss": 0.03, "step": 33460 }, { "epoch": 12.39170677526842, "grad_norm": 0.19728560745716095, "learning_rate": 7.942937870066236e-05, "loss": 0.0243, "step": 33470 }, { "epoch": 12.395409107737875, "grad_norm": 0.19504864513874054, "learning_rate": 7.941600988838595e-05, "loss": 0.0294, "step": 33480 }, { "epoch": 12.399111440207331, "grad_norm": 0.5015263557434082, "learning_rate": 7.940263785921896e-05, "loss": 0.0391, "step": 33490 }, { "epoch": 12.402813772676787, "grad_norm": 0.19380082190036774, "learning_rate": 7.938926261462366e-05, "loss": 0.0331, "step": 33500 }, { "epoch": 12.406516105146242, "grad_norm": 0.3013417720794678, "learning_rate": 7.937588415606281e-05, "loss": 0.0284, "step": 33510 }, { "epoch": 12.410218437615697, "grad_norm": 0.41979146003723145, "learning_rate": 7.936250248499941e-05, "loss": 0.0234, "step": 33520 }, { "epoch": 12.413920770085154, "grad_norm": 0.6533253192901611, "learning_rate": 7.934911760289692e-05, "loss": 0.0282, "step": 33530 }, { "epoch": 12.417623102554609, "grad_norm": 0.2617838382720947, "learning_rate": 7.933572951121904e-05, "loss": 0.0174, "step": 33540 }, { "epoch": 12.421325435024066, "grad_norm": 0.14585600793361664, "learning_rate": 7.932233821142987e-05, "loss": 0.0214, "step": 33550 }, { "epoch": 12.425027767493521, "grad_norm": 0.2576983869075775, "learning_rate": 7.93089437049939e-05, "loss": 0.0281, "step": 33560 }, { "epoch": 12.428730099962976, "grad_norm": 0.2058185487985611, "learning_rate": 7.92955459933759e-05, "loss": 0.0296, "step": 33570 }, { "epoch": 12.432432432432432, "grad_norm": 0.23529668152332306, "learning_rate": 7.928214507804104e-05, "loss": 0.0382, "step": 33580 }, { "epoch": 12.436134764901889, "grad_norm": 0.43218564987182617, "learning_rate": 7.926874096045482e-05, "loss": 0.0364, "step": 33590 }, { "epoch": 12.439837097371344, "grad_norm": 0.17559286952018738, "learning_rate": 7.925533364208309e-05, "loss": 0.0226, "step": 33600 }, { "epoch": 12.443539429840799, "grad_norm": 0.2257041186094284, "learning_rate": 7.924192312439205e-05, "loss": 0.0261, "step": 33610 }, { "epoch": 12.447241762310256, "grad_norm": 0.31212833523750305, "learning_rate": 7.922850940884827e-05, "loss": 0.0388, "step": 33620 }, { "epoch": 12.450944094779711, "grad_norm": 0.32136014103889465, "learning_rate": 7.921509249691865e-05, "loss": 0.0295, "step": 33630 }, { "epoch": 12.454646427249166, "grad_norm": 1.0519635677337646, "learning_rate": 7.920167239007044e-05, "loss": 0.0377, "step": 33640 }, { "epoch": 12.458348759718623, "grad_norm": 0.34975993633270264, "learning_rate": 7.918824908977123e-05, "loss": 0.0183, "step": 33650 }, { "epoch": 12.462051092188078, "grad_norm": 0.30031251907348633, "learning_rate": 7.9174822597489e-05, "loss": 0.0178, "step": 33660 }, { "epoch": 12.465753424657533, "grad_norm": 0.45426279306411743, "learning_rate": 7.916139291469202e-05, "loss": 0.0281, "step": 33670 }, { "epoch": 12.46945575712699, "grad_norm": 0.3792702853679657, "learning_rate": 7.914796004284896e-05, "loss": 0.0212, "step": 33680 }, { "epoch": 12.473158089596446, "grad_norm": 0.28180214762687683, "learning_rate": 7.913452398342881e-05, "loss": 0.0225, "step": 33690 }, { "epoch": 12.4768604220659, "grad_norm": 0.8005360960960388, "learning_rate": 7.912108473790092e-05, "loss": 0.0317, "step": 33700 }, { "epoch": 12.480562754535358, "grad_norm": 0.19738593697547913, "learning_rate": 7.910764230773498e-05, "loss": 0.0251, "step": 33710 }, { "epoch": 12.484265087004813, "grad_norm": 0.7180500030517578, "learning_rate": 7.909419669440105e-05, "loss": 0.0265, "step": 33720 }, { "epoch": 12.487967419474268, "grad_norm": 0.20186367630958557, "learning_rate": 7.908074789936952e-05, "loss": 0.0219, "step": 33730 }, { "epoch": 12.491669751943725, "grad_norm": 0.334566593170166, "learning_rate": 7.906729592411111e-05, "loss": 0.0215, "step": 33740 }, { "epoch": 12.49537208441318, "grad_norm": 0.26609429717063904, "learning_rate": 7.905384077009693e-05, "loss": 0.029, "step": 33750 }, { "epoch": 12.499074416882635, "grad_norm": 0.25326859951019287, "learning_rate": 7.904038243879839e-05, "loss": 0.0286, "step": 33760 }, { "epoch": 12.502776749352092, "grad_norm": 0.27423933148384094, "learning_rate": 7.902692093168729e-05, "loss": 0.0328, "step": 33770 }, { "epoch": 12.506479081821547, "grad_norm": 0.27169957756996155, "learning_rate": 7.901345625023576e-05, "loss": 0.0275, "step": 33780 }, { "epoch": 12.510181414291003, "grad_norm": 0.2884468138217926, "learning_rate": 7.89999883959163e-05, "loss": 0.0292, "step": 33790 }, { "epoch": 12.51388374676046, "grad_norm": 1.3010145425796509, "learning_rate": 7.898651737020166e-05, "loss": 0.029, "step": 33800 }, { "epoch": 12.517586079229915, "grad_norm": 0.16429737210273743, "learning_rate": 7.89730431745651e-05, "loss": 0.0232, "step": 33810 }, { "epoch": 12.52128841169937, "grad_norm": 0.31184110045433044, "learning_rate": 7.89595658104801e-05, "loss": 0.0264, "step": 33820 }, { "epoch": 12.524990744168827, "grad_norm": 0.6572989821434021, "learning_rate": 7.894608527942049e-05, "loss": 0.033, "step": 33830 }, { "epoch": 12.528693076638282, "grad_norm": 0.0706087201833725, "learning_rate": 7.893260158286055e-05, "loss": 0.0256, "step": 33840 }, { "epoch": 12.532395409107737, "grad_norm": 0.21408075094223022, "learning_rate": 7.891911472227478e-05, "loss": 0.0178, "step": 33850 }, { "epoch": 12.536097741577194, "grad_norm": 0.1969415545463562, "learning_rate": 7.890562469913811e-05, "loss": 0.0253, "step": 33860 }, { "epoch": 12.53980007404665, "grad_norm": 0.16210241615772247, "learning_rate": 7.889213151492578e-05, "loss": 0.0364, "step": 33870 }, { "epoch": 12.543502406516104, "grad_norm": 0.47271928191185, "learning_rate": 7.887863517111338e-05, "loss": 0.0265, "step": 33880 }, { "epoch": 12.547204738985561, "grad_norm": 0.35445600748062134, "learning_rate": 7.886513566917687e-05, "loss": 0.019, "step": 33890 }, { "epoch": 12.550907071455017, "grad_norm": 0.39033836126327515, "learning_rate": 7.88516330105925e-05, "loss": 0.0392, "step": 33900 }, { "epoch": 12.554609403924472, "grad_norm": 0.38193878531455994, "learning_rate": 7.883812719683695e-05, "loss": 0.0321, "step": 33910 }, { "epoch": 12.558311736393929, "grad_norm": 0.22704507410526276, "learning_rate": 7.882461822938716e-05, "loss": 0.031, "step": 33920 }, { "epoch": 12.562014068863384, "grad_norm": 0.48436427116394043, "learning_rate": 7.881110610972044e-05, "loss": 0.0266, "step": 33930 }, { "epoch": 12.565716401332839, "grad_norm": 0.2684752345085144, "learning_rate": 7.879759083931448e-05, "loss": 0.0214, "step": 33940 }, { "epoch": 12.569418733802296, "grad_norm": 0.44494950771331787, "learning_rate": 7.878407241964729e-05, "loss": 0.0324, "step": 33950 }, { "epoch": 12.573121066271751, "grad_norm": 0.6160573363304138, "learning_rate": 7.877055085219721e-05, "loss": 0.0211, "step": 33960 }, { "epoch": 12.576823398741206, "grad_norm": 0.2712053954601288, "learning_rate": 7.875702613844295e-05, "loss": 0.0209, "step": 33970 }, { "epoch": 12.580525731210663, "grad_norm": 0.4041667878627777, "learning_rate": 7.874349827986354e-05, "loss": 0.0218, "step": 33980 }, { "epoch": 12.584228063680118, "grad_norm": 0.43120822310447693, "learning_rate": 7.872996727793838e-05, "loss": 0.0252, "step": 33990 }, { "epoch": 12.587930396149574, "grad_norm": 0.23919565975666046, "learning_rate": 7.871643313414718e-05, "loss": 0.0346, "step": 34000 }, { "epoch": 12.59163272861903, "grad_norm": 1.293407917022705, "learning_rate": 7.870289584997005e-05, "loss": 0.0219, "step": 34010 }, { "epoch": 12.595335061088486, "grad_norm": 0.24785931408405304, "learning_rate": 7.868935542688736e-05, "loss": 0.0199, "step": 34020 }, { "epoch": 12.599037393557941, "grad_norm": 0.14952369034290314, "learning_rate": 7.867581186637991e-05, "loss": 0.0234, "step": 34030 }, { "epoch": 12.602739726027398, "grad_norm": 0.16783450543880463, "learning_rate": 7.866226516992878e-05, "loss": 0.0213, "step": 34040 }, { "epoch": 12.606442058496853, "grad_norm": 0.22410660982131958, "learning_rate": 7.864871533901544e-05, "loss": 0.024, "step": 34050 }, { "epoch": 12.610144390966308, "grad_norm": 1.5080219507217407, "learning_rate": 7.863516237512164e-05, "loss": 0.0187, "step": 34060 }, { "epoch": 12.613846723435765, "grad_norm": 0.18721669912338257, "learning_rate": 7.862160627972955e-05, "loss": 0.019, "step": 34070 }, { "epoch": 12.61754905590522, "grad_norm": 1.6539610624313354, "learning_rate": 7.860804705432164e-05, "loss": 0.0227, "step": 34080 }, { "epoch": 12.621251388374676, "grad_norm": 0.19110240042209625, "learning_rate": 7.859448470038069e-05, "loss": 0.0273, "step": 34090 }, { "epoch": 12.624953720844132, "grad_norm": 0.2565475404262543, "learning_rate": 7.858091921938988e-05, "loss": 0.0271, "step": 34100 }, { "epoch": 12.628656053313588, "grad_norm": 0.3748067617416382, "learning_rate": 7.856735061283273e-05, "loss": 0.0282, "step": 34110 }, { "epoch": 12.632358385783043, "grad_norm": 0.24924442172050476, "learning_rate": 7.855377888219307e-05, "loss": 0.023, "step": 34120 }, { "epoch": 12.6360607182525, "grad_norm": 0.27111881971359253, "learning_rate": 7.854020402895508e-05, "loss": 0.0208, "step": 34130 }, { "epoch": 12.639763050721955, "grad_norm": 0.13940538465976715, "learning_rate": 7.852662605460329e-05, "loss": 0.0285, "step": 34140 }, { "epoch": 12.64346538319141, "grad_norm": 0.5064489841461182, "learning_rate": 7.851304496062254e-05, "loss": 0.0297, "step": 34150 }, { "epoch": 12.647167715660867, "grad_norm": 0.17004713416099548, "learning_rate": 7.84994607484981e-05, "loss": 0.0257, "step": 34160 }, { "epoch": 12.650870048130322, "grad_norm": 0.2996779680252075, "learning_rate": 7.848587341971548e-05, "loss": 0.0268, "step": 34170 }, { "epoch": 12.654572380599777, "grad_norm": 0.3174663484096527, "learning_rate": 7.847228297576053e-05, "loss": 0.0221, "step": 34180 }, { "epoch": 12.658274713069234, "grad_norm": 0.26309630274772644, "learning_rate": 7.845868941811956e-05, "loss": 0.0332, "step": 34190 }, { "epoch": 12.66197704553869, "grad_norm": 0.14140523970127106, "learning_rate": 7.844509274827907e-05, "loss": 0.0245, "step": 34200 }, { "epoch": 12.665679378008145, "grad_norm": 0.2797076106071472, "learning_rate": 7.843149296772603e-05, "loss": 0.0196, "step": 34210 }, { "epoch": 12.669381710477602, "grad_norm": 0.2698999047279358, "learning_rate": 7.841789007794764e-05, "loss": 0.0314, "step": 34220 }, { "epoch": 12.673084042947057, "grad_norm": 0.3307766318321228, "learning_rate": 7.840428408043155e-05, "loss": 0.0335, "step": 34230 }, { "epoch": 12.676786375416512, "grad_norm": 0.2663612961769104, "learning_rate": 7.839067497666564e-05, "loss": 0.0255, "step": 34240 }, { "epoch": 12.680488707885969, "grad_norm": 0.2036428451538086, "learning_rate": 7.837706276813819e-05, "loss": 0.0265, "step": 34250 }, { "epoch": 12.684191040355424, "grad_norm": 0.3443094789981842, "learning_rate": 7.836344745633783e-05, "loss": 0.0211, "step": 34260 }, { "epoch": 12.68789337282488, "grad_norm": 0.22784271836280823, "learning_rate": 7.834982904275349e-05, "loss": 0.0231, "step": 34270 }, { "epoch": 12.691595705294336, "grad_norm": 0.27267351746559143, "learning_rate": 7.833620752887449e-05, "loss": 0.0267, "step": 34280 }, { "epoch": 12.695298037763791, "grad_norm": 0.3880615234375, "learning_rate": 7.832258291619043e-05, "loss": 0.0219, "step": 34290 }, { "epoch": 12.699000370233247, "grad_norm": 0.5372822284698486, "learning_rate": 7.830895520619128e-05, "loss": 0.0286, "step": 34300 }, { "epoch": 12.702702702702704, "grad_norm": 0.1932152360677719, "learning_rate": 7.829532440036735e-05, "loss": 0.0388, "step": 34310 }, { "epoch": 12.706405035172159, "grad_norm": 0.5642022490501404, "learning_rate": 7.828169050020928e-05, "loss": 0.0243, "step": 34320 }, { "epoch": 12.710107367641614, "grad_norm": 0.14787879586219788, "learning_rate": 7.826805350720807e-05, "loss": 0.026, "step": 34330 }, { "epoch": 12.71380970011107, "grad_norm": 0.11862362921237946, "learning_rate": 7.825441342285502e-05, "loss": 0.0345, "step": 34340 }, { "epoch": 12.717512032580526, "grad_norm": 0.5805646181106567, "learning_rate": 7.824077024864179e-05, "loss": 0.0302, "step": 34350 }, { "epoch": 12.721214365049981, "grad_norm": 0.17417439818382263, "learning_rate": 7.82271239860604e-05, "loss": 0.0198, "step": 34360 }, { "epoch": 12.724916697519436, "grad_norm": 0.2491118311882019, "learning_rate": 7.821347463660315e-05, "loss": 0.0243, "step": 34370 }, { "epoch": 12.728619029988893, "grad_norm": 0.24534769356250763, "learning_rate": 7.819982220176276e-05, "loss": 0.0164, "step": 34380 }, { "epoch": 12.732321362458348, "grad_norm": 0.18152108788490295, "learning_rate": 7.81861666830322e-05, "loss": 0.0186, "step": 34390 }, { "epoch": 12.736023694927805, "grad_norm": 0.5673219561576843, "learning_rate": 7.817250808190483e-05, "loss": 0.0276, "step": 34400 }, { "epoch": 12.73972602739726, "grad_norm": 0.2217029184103012, "learning_rate": 7.815884639987433e-05, "loss": 0.0272, "step": 34410 }, { "epoch": 12.743428359866716, "grad_norm": 0.3641487956047058, "learning_rate": 7.814518163843473e-05, "loss": 0.0265, "step": 34420 }, { "epoch": 12.747130692336171, "grad_norm": 0.370079904794693, "learning_rate": 7.813151379908036e-05, "loss": 0.0234, "step": 34430 }, { "epoch": 12.750833024805628, "grad_norm": 0.2809566557407379, "learning_rate": 7.811784288330597e-05, "loss": 0.0264, "step": 34440 }, { "epoch": 12.754535357275083, "grad_norm": 0.25348252058029175, "learning_rate": 7.810416889260653e-05, "loss": 0.0287, "step": 34450 }, { "epoch": 12.75823768974454, "grad_norm": 0.1743936687707901, "learning_rate": 7.809049182847745e-05, "loss": 0.028, "step": 34460 }, { "epoch": 12.761940022213995, "grad_norm": 0.2291736602783203, "learning_rate": 7.80768116924144e-05, "loss": 0.0187, "step": 34470 }, { "epoch": 12.76564235468345, "grad_norm": 0.12937627732753754, "learning_rate": 7.806312848591347e-05, "loss": 0.0274, "step": 34480 }, { "epoch": 12.769344687152905, "grad_norm": 0.5148833990097046, "learning_rate": 7.804944221047097e-05, "loss": 0.0343, "step": 34490 }, { "epoch": 12.773047019622362, "grad_norm": 0.144001305103302, "learning_rate": 7.803575286758364e-05, "loss": 0.0316, "step": 34500 }, { "epoch": 12.776749352091818, "grad_norm": 0.15057753026485443, "learning_rate": 7.802206045874854e-05, "loss": 0.0345, "step": 34510 }, { "epoch": 12.780451684561273, "grad_norm": 0.36263495683670044, "learning_rate": 7.800836498546304e-05, "loss": 0.0228, "step": 34520 }, { "epoch": 12.78415401703073, "grad_norm": 0.17328478395938873, "learning_rate": 7.799466644922484e-05, "loss": 0.0277, "step": 34530 }, { "epoch": 12.787856349500185, "grad_norm": 0.3478400707244873, "learning_rate": 7.798096485153201e-05, "loss": 0.0196, "step": 34540 }, { "epoch": 12.79155868196964, "grad_norm": 0.12657395005226135, "learning_rate": 7.796726019388295e-05, "loss": 0.0284, "step": 34550 }, { "epoch": 12.795261014439097, "grad_norm": 0.23354358971118927, "learning_rate": 7.795355247777635e-05, "loss": 0.0295, "step": 34560 }, { "epoch": 12.798963346908552, "grad_norm": 0.18622373044490814, "learning_rate": 7.793984170471129e-05, "loss": 0.0398, "step": 34570 }, { "epoch": 12.802665679378007, "grad_norm": 0.6896623373031616, "learning_rate": 7.792612787618714e-05, "loss": 0.0226, "step": 34580 }, { "epoch": 12.806368011847464, "grad_norm": 0.5054300427436829, "learning_rate": 7.791241099370364e-05, "loss": 0.0321, "step": 34590 }, { "epoch": 12.81007034431692, "grad_norm": 0.22038686275482178, "learning_rate": 7.789869105876083e-05, "loss": 0.0335, "step": 34600 }, { "epoch": 12.813772676786375, "grad_norm": 0.1786588430404663, "learning_rate": 7.78849680728591e-05, "loss": 0.0202, "step": 34610 }, { "epoch": 12.817475009255832, "grad_norm": 0.27252376079559326, "learning_rate": 7.78712420374992e-05, "loss": 0.0258, "step": 34620 }, { "epoch": 12.821177341725287, "grad_norm": 0.2954288721084595, "learning_rate": 7.785751295418217e-05, "loss": 0.0308, "step": 34630 }, { "epoch": 12.824879674194742, "grad_norm": 0.36640509963035583, "learning_rate": 7.784378082440941e-05, "loss": 0.0196, "step": 34640 }, { "epoch": 12.828582006664199, "grad_norm": 0.22057166695594788, "learning_rate": 7.783004564968263e-05, "loss": 0.0293, "step": 34650 }, { "epoch": 12.832284339133654, "grad_norm": 0.18659788370132446, "learning_rate": 7.781630743150392e-05, "loss": 0.0355, "step": 34660 }, { "epoch": 12.83598667160311, "grad_norm": 0.44356316328048706, "learning_rate": 7.780256617137564e-05, "loss": 0.033, "step": 34670 }, { "epoch": 12.839689004072566, "grad_norm": 0.3346021771430969, "learning_rate": 7.778882187080052e-05, "loss": 0.0193, "step": 34680 }, { "epoch": 12.843391336542021, "grad_norm": 0.2620854675769806, "learning_rate": 7.777507453128163e-05, "loss": 0.0366, "step": 34690 }, { "epoch": 12.847093669011477, "grad_norm": 1.0063756704330444, "learning_rate": 7.776132415432234e-05, "loss": 0.0234, "step": 34700 }, { "epoch": 12.850796001480933, "grad_norm": 0.23966912925243378, "learning_rate": 7.774757074142638e-05, "loss": 0.0251, "step": 34710 }, { "epoch": 12.854498333950389, "grad_norm": 0.18678152561187744, "learning_rate": 7.773381429409779e-05, "loss": 0.0229, "step": 34720 }, { "epoch": 12.858200666419844, "grad_norm": 0.28656715154647827, "learning_rate": 7.772005481384099e-05, "loss": 0.0266, "step": 34730 }, { "epoch": 12.8619029988893, "grad_norm": 0.21932387351989746, "learning_rate": 7.770629230216067e-05, "loss": 0.0172, "step": 34740 }, { "epoch": 12.865605331358756, "grad_norm": 0.4263312518596649, "learning_rate": 7.769252676056187e-05, "loss": 0.0213, "step": 34750 }, { "epoch": 12.869307663828211, "grad_norm": 0.14918839931488037, "learning_rate": 7.767875819054997e-05, "loss": 0.0174, "step": 34760 }, { "epoch": 12.873009996297668, "grad_norm": 0.39145663380622864, "learning_rate": 7.76649865936307e-05, "loss": 0.031, "step": 34770 }, { "epoch": 12.876712328767123, "grad_norm": 0.5883037447929382, "learning_rate": 7.765121197131009e-05, "loss": 0.0296, "step": 34780 }, { "epoch": 12.880414661236578, "grad_norm": 0.29562100768089294, "learning_rate": 7.763743432509451e-05, "loss": 0.0262, "step": 34790 }, { "epoch": 12.884116993706035, "grad_norm": 0.33599168062210083, "learning_rate": 7.762365365649067e-05, "loss": 0.0209, "step": 34800 }, { "epoch": 12.88781932617549, "grad_norm": 0.19972053170204163, "learning_rate": 7.760986996700559e-05, "loss": 0.0214, "step": 34810 }, { "epoch": 12.891521658644946, "grad_norm": 2.6229684352874756, "learning_rate": 7.759608325814664e-05, "loss": 0.0331, "step": 34820 }, { "epoch": 12.895223991114403, "grad_norm": 0.36718037724494934, "learning_rate": 7.758229353142152e-05, "loss": 0.0307, "step": 34830 }, { "epoch": 12.898926323583858, "grad_norm": 0.2976806163787842, "learning_rate": 7.756850078833823e-05, "loss": 0.032, "step": 34840 }, { "epoch": 12.902628656053313, "grad_norm": 0.20655496418476105, "learning_rate": 7.755470503040516e-05, "loss": 0.0219, "step": 34850 }, { "epoch": 12.90633098852277, "grad_norm": 0.4261629283428192, "learning_rate": 7.754090625913099e-05, "loss": 0.0365, "step": 34860 }, { "epoch": 12.910033320992225, "grad_norm": 0.3007451593875885, "learning_rate": 7.75271044760247e-05, "loss": 0.0305, "step": 34870 }, { "epoch": 12.91373565346168, "grad_norm": 0.23876158893108368, "learning_rate": 7.751329968259565e-05, "loss": 0.0218, "step": 34880 }, { "epoch": 12.917437985931137, "grad_norm": 0.3136571943759918, "learning_rate": 7.749949188035353e-05, "loss": 0.0223, "step": 34890 }, { "epoch": 12.921140318400592, "grad_norm": 0.22598044574260712, "learning_rate": 7.748568107080832e-05, "loss": 0.0199, "step": 34900 }, { "epoch": 12.924842650870048, "grad_norm": 0.3237540125846863, "learning_rate": 7.747186725547034e-05, "loss": 0.0375, "step": 34910 }, { "epoch": 12.928544983339505, "grad_norm": 0.11107441037893295, "learning_rate": 7.745805043585026e-05, "loss": 0.0258, "step": 34920 }, { "epoch": 12.93224731580896, "grad_norm": 0.12830005586147308, "learning_rate": 7.744423061345906e-05, "loss": 0.0266, "step": 34930 }, { "epoch": 12.935949648278415, "grad_norm": 0.17496268451213837, "learning_rate": 7.743040778980807e-05, "loss": 0.0194, "step": 34940 }, { "epoch": 12.939651980747872, "grad_norm": 0.4563919007778168, "learning_rate": 7.741658196640892e-05, "loss": 0.0213, "step": 34950 }, { "epoch": 12.943354313217327, "grad_norm": 0.49550294876098633, "learning_rate": 7.74027531447736e-05, "loss": 0.029, "step": 34960 }, { "epoch": 12.947056645686782, "grad_norm": 0.2873334586620331, "learning_rate": 7.738892132641438e-05, "loss": 0.0227, "step": 34970 }, { "epoch": 12.950758978156239, "grad_norm": 0.27080854773521423, "learning_rate": 7.737508651284391e-05, "loss": 0.032, "step": 34980 }, { "epoch": 12.954461310625694, "grad_norm": 0.2489413172006607, "learning_rate": 7.736124870557516e-05, "loss": 0.0196, "step": 34990 }, { "epoch": 12.95816364309515, "grad_norm": 0.6408553719520569, "learning_rate": 7.734740790612136e-05, "loss": 0.0384, "step": 35000 }, { "epoch": 12.961865975564606, "grad_norm": 0.19122298061847687, "learning_rate": 7.733356411599614e-05, "loss": 0.0274, "step": 35010 }, { "epoch": 12.965568308034062, "grad_norm": 0.22778570652008057, "learning_rate": 7.731971733671346e-05, "loss": 0.0223, "step": 35020 }, { "epoch": 12.969270640503517, "grad_norm": 0.13890910148620605, "learning_rate": 7.730586756978757e-05, "loss": 0.0233, "step": 35030 }, { "epoch": 12.972972972972974, "grad_norm": 0.762989342212677, "learning_rate": 7.729201481673305e-05, "loss": 0.0311, "step": 35040 }, { "epoch": 12.976675305442429, "grad_norm": 0.2563908100128174, "learning_rate": 7.727815907906481e-05, "loss": 0.0263, "step": 35050 }, { "epoch": 12.980377637911884, "grad_norm": 0.381351113319397, "learning_rate": 7.726430035829813e-05, "loss": 0.0179, "step": 35060 }, { "epoch": 12.984079970381341, "grad_norm": 0.2764284312725067, "learning_rate": 7.725043865594854e-05, "loss": 0.0249, "step": 35070 }, { "epoch": 12.987782302850796, "grad_norm": 0.4106925129890442, "learning_rate": 7.723657397353194e-05, "loss": 0.0243, "step": 35080 }, { "epoch": 12.991484635320251, "grad_norm": 0.17833997309207916, "learning_rate": 7.722270631256459e-05, "loss": 0.0224, "step": 35090 }, { "epoch": 12.995186967789708, "grad_norm": 0.2456929087638855, "learning_rate": 7.720883567456298e-05, "loss": 0.0219, "step": 35100 }, { "epoch": 12.998889300259163, "grad_norm": 0.30988043546676636, "learning_rate": 7.719496206104401e-05, "loss": 0.0222, "step": 35110 }, { "epoch": 13.002591632728619, "grad_norm": 0.33368274569511414, "learning_rate": 7.718108547352488e-05, "loss": 0.0262, "step": 35120 }, { "epoch": 13.006293965198076, "grad_norm": 0.34789004921913147, "learning_rate": 7.71672059135231e-05, "loss": 0.0311, "step": 35130 }, { "epoch": 13.00999629766753, "grad_norm": 0.19698558747768402, "learning_rate": 7.715332338255654e-05, "loss": 0.0155, "step": 35140 }, { "epoch": 13.013698630136986, "grad_norm": 0.39488276839256287, "learning_rate": 7.713943788214337e-05, "loss": 0.0316, "step": 35150 }, { "epoch": 13.017400962606443, "grad_norm": 0.4860442578792572, "learning_rate": 7.712554941380206e-05, "loss": 0.0281, "step": 35160 }, { "epoch": 13.021103295075898, "grad_norm": 0.16031597554683685, "learning_rate": 7.711165797905144e-05, "loss": 0.0244, "step": 35170 }, { "epoch": 13.024805627545353, "grad_norm": 0.4269806444644928, "learning_rate": 7.709776357941069e-05, "loss": 0.0235, "step": 35180 }, { "epoch": 13.02850796001481, "grad_norm": 0.17884524166584015, "learning_rate": 7.708386621639925e-05, "loss": 0.0271, "step": 35190 }, { "epoch": 13.032210292484265, "grad_norm": 0.26171180605888367, "learning_rate": 7.70699658915369e-05, "loss": 0.0202, "step": 35200 }, { "epoch": 13.03591262495372, "grad_norm": 0.22897835075855255, "learning_rate": 7.705606260634379e-05, "loss": 0.0333, "step": 35210 }, { "epoch": 13.039614957423177, "grad_norm": 0.18917666375637054, "learning_rate": 7.704215636234035e-05, "loss": 0.024, "step": 35220 }, { "epoch": 13.043317289892633, "grad_norm": 0.20276065170764923, "learning_rate": 7.702824716104735e-05, "loss": 0.0154, "step": 35230 }, { "epoch": 13.047019622362088, "grad_norm": 0.4722110629081726, "learning_rate": 7.701433500398589e-05, "loss": 0.0309, "step": 35240 }, { "epoch": 13.050721954831545, "grad_norm": 0.1852586567401886, "learning_rate": 7.700041989267736e-05, "loss": 0.0213, "step": 35250 }, { "epoch": 13.054424287301, "grad_norm": 0.325426310300827, "learning_rate": 7.698650182864351e-05, "loss": 0.0373, "step": 35260 }, { "epoch": 13.058126619770455, "grad_norm": 0.4035528600215912, "learning_rate": 7.697258081340638e-05, "loss": 0.026, "step": 35270 }, { "epoch": 13.061828952239912, "grad_norm": 0.2811002731323242, "learning_rate": 7.695865684848838e-05, "loss": 0.024, "step": 35280 }, { "epoch": 13.065531284709367, "grad_norm": 0.13287454843521118, "learning_rate": 7.694472993541219e-05, "loss": 0.0246, "step": 35290 }, { "epoch": 13.069233617178822, "grad_norm": 0.16085900366306305, "learning_rate": 7.693080007570084e-05, "loss": 0.0285, "step": 35300 }, { "epoch": 13.072935949648278, "grad_norm": 0.16320160031318665, "learning_rate": 7.691686727087769e-05, "loss": 0.0216, "step": 35310 }, { "epoch": 13.076638282117734, "grad_norm": 0.2928618788719177, "learning_rate": 7.69029315224664e-05, "loss": 0.0252, "step": 35320 }, { "epoch": 13.08034061458719, "grad_norm": 0.5937813520431519, "learning_rate": 7.688899283199096e-05, "loss": 0.0358, "step": 35330 }, { "epoch": 13.084042947056645, "grad_norm": 0.42630937695503235, "learning_rate": 7.687505120097571e-05, "loss": 0.0341, "step": 35340 }, { "epoch": 13.087745279526102, "grad_norm": 0.19253802299499512, "learning_rate": 7.686110663094525e-05, "loss": 0.016, "step": 35350 }, { "epoch": 13.091447611995557, "grad_norm": 0.19362780451774597, "learning_rate": 7.684715912342458e-05, "loss": 0.0238, "step": 35360 }, { "epoch": 13.095149944465012, "grad_norm": 0.2240161895751953, "learning_rate": 7.683320867993892e-05, "loss": 0.0196, "step": 35370 }, { "epoch": 13.098852276934469, "grad_norm": 0.23305644094944, "learning_rate": 7.681925530201392e-05, "loss": 0.0427, "step": 35380 }, { "epoch": 13.102554609403924, "grad_norm": 0.27562421560287476, "learning_rate": 7.680529899117547e-05, "loss": 0.0249, "step": 35390 }, { "epoch": 13.10625694187338, "grad_norm": 0.17583513259887695, "learning_rate": 7.679133974894983e-05, "loss": 0.0354, "step": 35400 }, { "epoch": 13.109959274342836, "grad_norm": 0.25786834955215454, "learning_rate": 7.677737757686356e-05, "loss": 0.0254, "step": 35410 }, { "epoch": 13.113661606812292, "grad_norm": 0.3290655016899109, "learning_rate": 7.676341247644353e-05, "loss": 0.0264, "step": 35420 }, { "epoch": 13.117363939281747, "grad_norm": 0.16491533815860748, "learning_rate": 7.674944444921695e-05, "loss": 0.0307, "step": 35430 }, { "epoch": 13.121066271751204, "grad_norm": 0.3107728660106659, "learning_rate": 7.673547349671135e-05, "loss": 0.0302, "step": 35440 }, { "epoch": 13.124768604220659, "grad_norm": 0.3059006929397583, "learning_rate": 7.672149962045457e-05, "loss": 0.0234, "step": 35450 }, { "epoch": 13.128470936690114, "grad_norm": 0.19545282423496246, "learning_rate": 7.670752282197476e-05, "loss": 0.0326, "step": 35460 }, { "epoch": 13.132173269159571, "grad_norm": 0.26107627153396606, "learning_rate": 7.669354310280041e-05, "loss": 0.0339, "step": 35470 }, { "epoch": 13.135875601629026, "grad_norm": 0.1812087595462799, "learning_rate": 7.667956046446031e-05, "loss": 0.026, "step": 35480 }, { "epoch": 13.139577934098481, "grad_norm": 0.2704720199108124, "learning_rate": 7.666557490848358e-05, "loss": 0.0333, "step": 35490 }, { "epoch": 13.143280266567938, "grad_norm": 0.4094022214412689, "learning_rate": 7.66515864363997e-05, "loss": 0.025, "step": 35500 }, { "epoch": 13.146982599037393, "grad_norm": 0.3693949282169342, "learning_rate": 7.663759504973837e-05, "loss": 0.0262, "step": 35510 }, { "epoch": 13.150684931506849, "grad_norm": 1.2362279891967773, "learning_rate": 7.662360075002971e-05, "loss": 0.0177, "step": 35520 }, { "epoch": 13.154387263976306, "grad_norm": 0.26176854968070984, "learning_rate": 7.66096035388041e-05, "loss": 0.0392, "step": 35530 }, { "epoch": 13.15808959644576, "grad_norm": 0.1982380896806717, "learning_rate": 7.659560341759224e-05, "loss": 0.0217, "step": 35540 }, { "epoch": 13.161791928915216, "grad_norm": 0.2655305862426758, "learning_rate": 7.658160038792518e-05, "loss": 0.0217, "step": 35550 }, { "epoch": 13.165494261384673, "grad_norm": 0.21108944714069366, "learning_rate": 7.656759445133428e-05, "loss": 0.0321, "step": 35560 }, { "epoch": 13.169196593854128, "grad_norm": 0.43176940083503723, "learning_rate": 7.65535856093512e-05, "loss": 0.0253, "step": 35570 }, { "epoch": 13.172898926323583, "grad_norm": 0.19135937094688416, "learning_rate": 7.65395738635079e-05, "loss": 0.0342, "step": 35580 }, { "epoch": 13.17660125879304, "grad_norm": 0.22240209579467773, "learning_rate": 7.65255592153367e-05, "loss": 0.0295, "step": 35590 }, { "epoch": 13.180303591262495, "grad_norm": 0.7548376321792603, "learning_rate": 7.651154166637025e-05, "loss": 0.0357, "step": 35600 }, { "epoch": 13.18400592373195, "grad_norm": 0.21694813668727875, "learning_rate": 7.649752121814144e-05, "loss": 0.0214, "step": 35610 }, { "epoch": 13.187708256201407, "grad_norm": 0.11801870912313461, "learning_rate": 7.648349787218355e-05, "loss": 0.025, "step": 35620 }, { "epoch": 13.191410588670863, "grad_norm": 0.3476898670196533, "learning_rate": 7.646947163003017e-05, "loss": 0.0231, "step": 35630 }, { "epoch": 13.195112921140318, "grad_norm": 0.8426253795623779, "learning_rate": 7.645544249321515e-05, "loss": 0.0339, "step": 35640 }, { "epoch": 13.198815253609775, "grad_norm": 0.25716671347618103, "learning_rate": 7.644141046327271e-05, "loss": 0.0319, "step": 35650 }, { "epoch": 13.20251758607923, "grad_norm": 0.34096458554267883, "learning_rate": 7.64273755417374e-05, "loss": 0.0252, "step": 35660 }, { "epoch": 13.206219918548685, "grad_norm": 0.5745083093643188, "learning_rate": 7.641333773014402e-05, "loss": 0.0279, "step": 35670 }, { "epoch": 13.209922251018142, "grad_norm": 0.14307764172554016, "learning_rate": 7.639929703002773e-05, "loss": 0.0235, "step": 35680 }, { "epoch": 13.213624583487597, "grad_norm": 0.6744877099990845, "learning_rate": 7.638525344292402e-05, "loss": 0.0315, "step": 35690 }, { "epoch": 13.217326915957052, "grad_norm": 0.20646655559539795, "learning_rate": 7.637120697036866e-05, "loss": 0.0322, "step": 35700 }, { "epoch": 13.22102924842651, "grad_norm": 0.665428638458252, "learning_rate": 7.635715761389775e-05, "loss": 0.0401, "step": 35710 }, { "epoch": 13.224731580895964, "grad_norm": 0.5332648158073425, "learning_rate": 7.63431053750477e-05, "loss": 0.0272, "step": 35720 }, { "epoch": 13.22843391336542, "grad_norm": 0.2978644073009491, "learning_rate": 7.632905025535528e-05, "loss": 0.035, "step": 35730 }, { "epoch": 13.232136245834877, "grad_norm": 0.26283973455429077, "learning_rate": 7.631499225635748e-05, "loss": 0.0143, "step": 35740 }, { "epoch": 13.235838578304332, "grad_norm": 0.2303188592195511, "learning_rate": 7.630093137959171e-05, "loss": 0.0256, "step": 35750 }, { "epoch": 13.239540910773787, "grad_norm": 1.5611599683761597, "learning_rate": 7.628686762659563e-05, "loss": 0.0249, "step": 35760 }, { "epoch": 13.243243243243244, "grad_norm": 0.2969816327095032, "learning_rate": 7.627280099890721e-05, "loss": 0.026, "step": 35770 }, { "epoch": 13.246945575712699, "grad_norm": 0.24797363579273224, "learning_rate": 7.62587314980648e-05, "loss": 0.0199, "step": 35780 }, { "epoch": 13.250647908182154, "grad_norm": 0.28229454159736633, "learning_rate": 7.624465912560697e-05, "loss": 0.0199, "step": 35790 }, { "epoch": 13.254350240651611, "grad_norm": 0.28541111946105957, "learning_rate": 7.623058388307269e-05, "loss": 0.0294, "step": 35800 }, { "epoch": 13.258052573121066, "grad_norm": 0.2548850178718567, "learning_rate": 7.621650577200117e-05, "loss": 0.0123, "step": 35810 }, { "epoch": 13.261754905590522, "grad_norm": 0.2151319682598114, "learning_rate": 7.620242479393203e-05, "loss": 0.0283, "step": 35820 }, { "epoch": 13.265457238059978, "grad_norm": 0.27053940296173096, "learning_rate": 7.618834095040509e-05, "loss": 0.0237, "step": 35830 }, { "epoch": 13.269159570529434, "grad_norm": 0.1399824619293213, "learning_rate": 7.617425424296054e-05, "loss": 0.0281, "step": 35840 }, { "epoch": 13.272861902998889, "grad_norm": 0.17708386480808258, "learning_rate": 7.616016467313891e-05, "loss": 0.0263, "step": 35850 }, { "epoch": 13.276564235468346, "grad_norm": 0.5433328151702881, "learning_rate": 7.614607224248103e-05, "loss": 0.0266, "step": 35860 }, { "epoch": 13.280266567937801, "grad_norm": 0.14090405404567719, "learning_rate": 7.613197695252796e-05, "loss": 0.0185, "step": 35870 }, { "epoch": 13.283968900407256, "grad_norm": 3.9825003147125244, "learning_rate": 7.61178788048212e-05, "loss": 0.0291, "step": 35880 }, { "epoch": 13.287671232876713, "grad_norm": 0.3796515464782715, "learning_rate": 7.610377780090249e-05, "loss": 0.0318, "step": 35890 }, { "epoch": 13.291373565346168, "grad_norm": 0.38988760113716125, "learning_rate": 7.608967394231387e-05, "loss": 0.0253, "step": 35900 }, { "epoch": 13.295075897815623, "grad_norm": 0.16120408475399017, "learning_rate": 7.607556723059773e-05, "loss": 0.0304, "step": 35910 }, { "epoch": 13.29877823028508, "grad_norm": 0.37822264432907104, "learning_rate": 7.606145766729678e-05, "loss": 0.0274, "step": 35920 }, { "epoch": 13.302480562754535, "grad_norm": 0.2923460006713867, "learning_rate": 7.604734525395398e-05, "loss": 0.0261, "step": 35930 }, { "epoch": 13.30618289522399, "grad_norm": 0.5771888494491577, "learning_rate": 7.603322999211268e-05, "loss": 0.0415, "step": 35940 }, { "epoch": 13.309885227693448, "grad_norm": 0.21450383961200714, "learning_rate": 7.60191118833165e-05, "loss": 0.0325, "step": 35950 }, { "epoch": 13.313587560162903, "grad_norm": 0.20444194972515106, "learning_rate": 7.600499092910934e-05, "loss": 0.037, "step": 35960 }, { "epoch": 13.317289892632358, "grad_norm": 0.2348439246416092, "learning_rate": 7.599086713103547e-05, "loss": 0.0457, "step": 35970 }, { "epoch": 13.320992225101815, "grad_norm": 0.4161786437034607, "learning_rate": 7.597674049063947e-05, "loss": 0.0335, "step": 35980 }, { "epoch": 13.32469455757127, "grad_norm": 0.3485853374004364, "learning_rate": 7.596261100946618e-05, "loss": 0.0216, "step": 35990 }, { "epoch": 13.328396890040725, "grad_norm": 0.3018706738948822, "learning_rate": 7.594847868906076e-05, "loss": 0.026, "step": 36000 }, { "epoch": 13.332099222510182, "grad_norm": 0.5842129588127136, "learning_rate": 7.593434353096875e-05, "loss": 0.036, "step": 36010 }, { "epoch": 13.335801554979637, "grad_norm": 1.996142029762268, "learning_rate": 7.592020553673591e-05, "loss": 0.0216, "step": 36020 }, { "epoch": 13.339503887449093, "grad_norm": 0.20909634232521057, "learning_rate": 7.590606470790836e-05, "loss": 0.0259, "step": 36030 }, { "epoch": 13.34320621991855, "grad_norm": 0.1307315230369568, "learning_rate": 7.589192104603253e-05, "loss": 0.0251, "step": 36040 }, { "epoch": 13.346908552388005, "grad_norm": 0.589514970779419, "learning_rate": 7.587777455265515e-05, "loss": 0.0309, "step": 36050 }, { "epoch": 13.35061088485746, "grad_norm": 0.17481546103954315, "learning_rate": 7.586362522932323e-05, "loss": 0.0203, "step": 36060 }, { "epoch": 13.354313217326917, "grad_norm": 0.6189455986022949, "learning_rate": 7.584947307758416e-05, "loss": 0.0271, "step": 36070 }, { "epoch": 13.358015549796372, "grad_norm": 0.23686912655830383, "learning_rate": 7.583531809898557e-05, "loss": 0.0288, "step": 36080 }, { "epoch": 13.361717882265827, "grad_norm": 0.2150978446006775, "learning_rate": 7.582116029507542e-05, "loss": 0.0276, "step": 36090 }, { "epoch": 13.365420214735284, "grad_norm": 0.1935468465089798, "learning_rate": 7.580699966740201e-05, "loss": 0.0244, "step": 36100 }, { "epoch": 13.36912254720474, "grad_norm": 0.20564617216587067, "learning_rate": 7.579283621751394e-05, "loss": 0.0245, "step": 36110 }, { "epoch": 13.372824879674194, "grad_norm": 0.20084987580776215, "learning_rate": 7.577866994696007e-05, "loss": 0.0228, "step": 36120 }, { "epoch": 13.37652721214365, "grad_norm": 0.17015911638736725, "learning_rate": 7.576450085728959e-05, "loss": 0.0217, "step": 36130 }, { "epoch": 13.380229544613107, "grad_norm": 0.28447243571281433, "learning_rate": 7.575032895005205e-05, "loss": 0.026, "step": 36140 }, { "epoch": 13.383931877082562, "grad_norm": 0.3846380412578583, "learning_rate": 7.573615422679726e-05, "loss": 0.0338, "step": 36150 }, { "epoch": 13.387634209552019, "grad_norm": 0.6461687684059143, "learning_rate": 7.572197668907532e-05, "loss": 0.0236, "step": 36160 }, { "epoch": 13.391336542021474, "grad_norm": 0.21233169734477997, "learning_rate": 7.570779633843669e-05, "loss": 0.0269, "step": 36170 }, { "epoch": 13.395038874490929, "grad_norm": 0.27323228120803833, "learning_rate": 7.569361317643211e-05, "loss": 0.027, "step": 36180 }, { "epoch": 13.398741206960384, "grad_norm": 0.38471606373786926, "learning_rate": 7.56794272046126e-05, "loss": 0.0357, "step": 36190 }, { "epoch": 13.402443539429841, "grad_norm": 0.1690789759159088, "learning_rate": 7.566523842452958e-05, "loss": 0.0304, "step": 36200 }, { "epoch": 13.406145871899296, "grad_norm": 0.20453065633773804, "learning_rate": 7.565104683773466e-05, "loss": 0.0223, "step": 36210 }, { "epoch": 13.409848204368751, "grad_norm": 0.12903109192848206, "learning_rate": 7.563685244577978e-05, "loss": 0.0233, "step": 36220 }, { "epoch": 13.413550536838208, "grad_norm": 0.3635326027870178, "learning_rate": 7.56226552502173e-05, "loss": 0.0238, "step": 36230 }, { "epoch": 13.417252869307664, "grad_norm": 0.15268443524837494, "learning_rate": 7.560845525259976e-05, "loss": 0.0212, "step": 36240 }, { "epoch": 13.420955201777119, "grad_norm": 0.4413941204547882, "learning_rate": 7.559425245448006e-05, "loss": 0.022, "step": 36250 }, { "epoch": 13.424657534246576, "grad_norm": 0.20160087943077087, "learning_rate": 7.558004685741137e-05, "loss": 0.0233, "step": 36260 }, { "epoch": 13.42835986671603, "grad_norm": 0.22793477773666382, "learning_rate": 7.556583846294725e-05, "loss": 0.0234, "step": 36270 }, { "epoch": 13.432062199185486, "grad_norm": 0.2903403639793396, "learning_rate": 7.555162727264144e-05, "loss": 0.0314, "step": 36280 }, { "epoch": 13.435764531654943, "grad_norm": 0.4199690818786621, "learning_rate": 7.55374132880481e-05, "loss": 0.0218, "step": 36290 }, { "epoch": 13.439466864124398, "grad_norm": 0.2319525182247162, "learning_rate": 7.552319651072164e-05, "loss": 0.0176, "step": 36300 }, { "epoch": 13.443169196593853, "grad_norm": 0.8879957795143127, "learning_rate": 7.550897694221679e-05, "loss": 0.0272, "step": 36310 }, { "epoch": 13.44687152906331, "grad_norm": 0.17924955487251282, "learning_rate": 7.549475458408855e-05, "loss": 0.0232, "step": 36320 }, { "epoch": 13.450573861532765, "grad_norm": 0.12397658824920654, "learning_rate": 7.54805294378923e-05, "loss": 0.0174, "step": 36330 }, { "epoch": 13.45427619400222, "grad_norm": 0.3594692647457123, "learning_rate": 7.546630150518365e-05, "loss": 0.0252, "step": 36340 }, { "epoch": 13.457978526471678, "grad_norm": 0.37684059143066406, "learning_rate": 7.545207078751857e-05, "loss": 0.0259, "step": 36350 }, { "epoch": 13.461680858941133, "grad_norm": 0.30012908577919006, "learning_rate": 7.543783728645328e-05, "loss": 0.0307, "step": 36360 }, { "epoch": 13.465383191410588, "grad_norm": 0.24480465054512024, "learning_rate": 7.542360100354436e-05, "loss": 0.025, "step": 36370 }, { "epoch": 13.469085523880045, "grad_norm": 0.2449689507484436, "learning_rate": 7.540936194034865e-05, "loss": 0.0262, "step": 36380 }, { "epoch": 13.4727878563495, "grad_norm": 0.1710643321275711, "learning_rate": 7.539512009842333e-05, "loss": 0.0176, "step": 36390 }, { "epoch": 13.476490188818955, "grad_norm": 0.2068617194890976, "learning_rate": 7.538087547932585e-05, "loss": 0.0281, "step": 36400 }, { "epoch": 13.480192521288412, "grad_norm": 0.21288087964057922, "learning_rate": 7.536662808461399e-05, "loss": 0.0214, "step": 36410 }, { "epoch": 13.483894853757867, "grad_norm": 0.4252849221229553, "learning_rate": 7.535237791584581e-05, "loss": 0.028, "step": 36420 }, { "epoch": 13.487597186227323, "grad_norm": 0.19568896293640137, "learning_rate": 7.533812497457972e-05, "loss": 0.0215, "step": 36430 }, { "epoch": 13.49129951869678, "grad_norm": 0.26464134454727173, "learning_rate": 7.532386926237436e-05, "loss": 0.018, "step": 36440 }, { "epoch": 13.495001851166235, "grad_norm": 0.21081918478012085, "learning_rate": 7.530961078078873e-05, "loss": 0.0271, "step": 36450 }, { "epoch": 13.49870418363569, "grad_norm": 0.19872498512268066, "learning_rate": 7.529534953138213e-05, "loss": 0.0291, "step": 36460 }, { "epoch": 13.502406516105147, "grad_norm": 0.32740670442581177, "learning_rate": 7.528108551571414e-05, "loss": 0.0225, "step": 36470 }, { "epoch": 13.506108848574602, "grad_norm": 0.2250063717365265, "learning_rate": 7.526681873534462e-05, "loss": 0.021, "step": 36480 }, { "epoch": 13.509811181044057, "grad_norm": 0.30336886644363403, "learning_rate": 7.525254919183382e-05, "loss": 0.0349, "step": 36490 }, { "epoch": 13.513513513513514, "grad_norm": 0.20013168454170227, "learning_rate": 7.52382768867422e-05, "loss": 0.0307, "step": 36500 }, { "epoch": 13.51721584598297, "grad_norm": 0.34901610016822815, "learning_rate": 7.522400182163056e-05, "loss": 0.0201, "step": 36510 }, { "epoch": 13.520918178452424, "grad_norm": 0.27488505840301514, "learning_rate": 7.520972399806e-05, "loss": 0.0251, "step": 36520 }, { "epoch": 13.524620510921881, "grad_norm": 0.1910986751317978, "learning_rate": 7.519544341759192e-05, "loss": 0.0208, "step": 36530 }, { "epoch": 13.528322843391337, "grad_norm": 0.413250207901001, "learning_rate": 7.518116008178805e-05, "loss": 0.0199, "step": 36540 }, { "epoch": 13.532025175860792, "grad_norm": 0.0968671664595604, "learning_rate": 7.516687399221037e-05, "loss": 0.023, "step": 36550 }, { "epoch": 13.535727508330249, "grad_norm": 0.2181147038936615, "learning_rate": 7.515258515042116e-05, "loss": 0.0181, "step": 36560 }, { "epoch": 13.539429840799704, "grad_norm": 0.19109074771404266, "learning_rate": 7.513829355798307e-05, "loss": 0.0315, "step": 36570 }, { "epoch": 13.543132173269159, "grad_norm": 0.21586892008781433, "learning_rate": 7.512399921645901e-05, "loss": 0.0402, "step": 36580 }, { "epoch": 13.546834505738616, "grad_norm": 0.2952425479888916, "learning_rate": 7.510970212741215e-05, "loss": 0.0251, "step": 36590 }, { "epoch": 13.550536838208071, "grad_norm": 0.28883007168769836, "learning_rate": 7.509540229240601e-05, "loss": 0.0229, "step": 36600 }, { "epoch": 13.554239170677526, "grad_norm": 0.15741001069545746, "learning_rate": 7.508109971300444e-05, "loss": 0.0212, "step": 36610 }, { "epoch": 13.557941503146983, "grad_norm": 0.40981581807136536, "learning_rate": 7.506679439077148e-05, "loss": 0.0324, "step": 36620 }, { "epoch": 13.561643835616438, "grad_norm": 0.2684498131275177, "learning_rate": 7.505248632727158e-05, "loss": 0.0297, "step": 36630 }, { "epoch": 13.565346168085894, "grad_norm": 0.48365285992622375, "learning_rate": 7.503817552406946e-05, "loss": 0.0214, "step": 36640 }, { "epoch": 13.56904850055535, "grad_norm": 0.2570270299911499, "learning_rate": 7.50238619827301e-05, "loss": 0.0325, "step": 36650 }, { "epoch": 13.572750833024806, "grad_norm": 0.278339684009552, "learning_rate": 7.500954570481882e-05, "loss": 0.0288, "step": 36660 }, { "epoch": 13.57645316549426, "grad_norm": 0.17274707555770874, "learning_rate": 7.49952266919012e-05, "loss": 0.0382, "step": 36670 }, { "epoch": 13.580155497963718, "grad_norm": 0.39232492446899414, "learning_rate": 7.498090494554319e-05, "loss": 0.0425, "step": 36680 }, { "epoch": 13.583857830433173, "grad_norm": 0.5446407198905945, "learning_rate": 7.496658046731096e-05, "loss": 0.0225, "step": 36690 }, { "epoch": 13.587560162902628, "grad_norm": 0.25623565912246704, "learning_rate": 7.495225325877103e-05, "loss": 0.0275, "step": 36700 }, { "epoch": 13.591262495372085, "grad_norm": 0.43890196084976196, "learning_rate": 7.49379233214902e-05, "loss": 0.0376, "step": 36710 }, { "epoch": 13.59496482784154, "grad_norm": 0.2194928228855133, "learning_rate": 7.492359065703558e-05, "loss": 0.0192, "step": 36720 }, { "epoch": 13.598667160310995, "grad_norm": 0.9504756331443787, "learning_rate": 7.490925526697455e-05, "loss": 0.025, "step": 36730 }, { "epoch": 13.602369492780452, "grad_norm": 0.48601415753364563, "learning_rate": 7.489491715287481e-05, "loss": 0.0211, "step": 36740 }, { "epoch": 13.606071825249908, "grad_norm": 0.18484356999397278, "learning_rate": 7.488057631630437e-05, "loss": 0.0237, "step": 36750 }, { "epoch": 13.609774157719363, "grad_norm": 0.2967836260795593, "learning_rate": 7.486623275883151e-05, "loss": 0.0233, "step": 36760 }, { "epoch": 13.61347649018882, "grad_norm": 0.1491621881723404, "learning_rate": 7.485188648202481e-05, "loss": 0.0224, "step": 36770 }, { "epoch": 13.617178822658275, "grad_norm": 0.20746032893657684, "learning_rate": 7.483753748745317e-05, "loss": 0.0223, "step": 36780 }, { "epoch": 13.62088115512773, "grad_norm": 0.21786123514175415, "learning_rate": 7.482318577668578e-05, "loss": 0.037, "step": 36790 }, { "epoch": 13.624583487597187, "grad_norm": 1.8234249353408813, "learning_rate": 7.480883135129211e-05, "loss": 0.0258, "step": 36800 }, { "epoch": 13.628285820066642, "grad_norm": 0.26073968410491943, "learning_rate": 7.479447421284193e-05, "loss": 0.0301, "step": 36810 }, { "epoch": 13.631988152536097, "grad_norm": 0.217829167842865, "learning_rate": 7.478011436290535e-05, "loss": 0.02, "step": 36820 }, { "epoch": 13.635690485005554, "grad_norm": 0.42978930473327637, "learning_rate": 7.476575180305271e-05, "loss": 0.0242, "step": 36830 }, { "epoch": 13.63939281747501, "grad_norm": 0.14634227752685547, "learning_rate": 7.475138653485469e-05, "loss": 0.0267, "step": 36840 }, { "epoch": 13.643095149944465, "grad_norm": 0.5155121088027954, "learning_rate": 7.473701855988227e-05, "loss": 0.0276, "step": 36850 }, { "epoch": 13.646797482413922, "grad_norm": 0.31530439853668213, "learning_rate": 7.472264787970666e-05, "loss": 0.0311, "step": 36860 }, { "epoch": 13.650499814883377, "grad_norm": 0.4410141110420227, "learning_rate": 7.470827449589947e-05, "loss": 0.0203, "step": 36870 }, { "epoch": 13.654202147352832, "grad_norm": 0.5506188869476318, "learning_rate": 7.469389841003251e-05, "loss": 0.0263, "step": 36880 }, { "epoch": 13.657904479822289, "grad_norm": 0.301327645778656, "learning_rate": 7.467951962367796e-05, "loss": 0.0393, "step": 36890 }, { "epoch": 13.661606812291744, "grad_norm": 0.15052814781665802, "learning_rate": 7.466513813840825e-05, "loss": 0.0321, "step": 36900 }, { "epoch": 13.6653091447612, "grad_norm": 0.4297565817832947, "learning_rate": 7.465075395579611e-05, "loss": 0.0287, "step": 36910 }, { "epoch": 13.669011477230656, "grad_norm": 1.1782914400100708, "learning_rate": 7.463636707741458e-05, "loss": 0.0387, "step": 36920 }, { "epoch": 13.672713809700111, "grad_norm": 0.30295559763908386, "learning_rate": 7.462197750483699e-05, "loss": 0.0311, "step": 36930 }, { "epoch": 13.676416142169566, "grad_norm": 0.14716513454914093, "learning_rate": 7.460758523963697e-05, "loss": 0.0224, "step": 36940 }, { "epoch": 13.680118474639023, "grad_norm": 0.387215793132782, "learning_rate": 7.45931902833884e-05, "loss": 0.0217, "step": 36950 }, { "epoch": 13.683820807108479, "grad_norm": 0.2775416970252991, "learning_rate": 7.457879263766554e-05, "loss": 0.0315, "step": 36960 }, { "epoch": 13.687523139577934, "grad_norm": 0.6160606741905212, "learning_rate": 7.456439230404286e-05, "loss": 0.0197, "step": 36970 }, { "epoch": 13.691225472047389, "grad_norm": 0.4234013557434082, "learning_rate": 7.454998928409516e-05, "loss": 0.0238, "step": 36980 }, { "epoch": 13.694927804516846, "grad_norm": 0.1491032838821411, "learning_rate": 7.453558357939755e-05, "loss": 0.018, "step": 36990 }, { "epoch": 13.698630136986301, "grad_norm": 0.19285593926906586, "learning_rate": 7.452117519152542e-05, "loss": 0.0416, "step": 37000 }, { "epoch": 13.702332469455758, "grad_norm": 0.8335027098655701, "learning_rate": 7.450676412205442e-05, "loss": 0.0249, "step": 37010 }, { "epoch": 13.706034801925213, "grad_norm": 0.24281617999076843, "learning_rate": 7.449235037256055e-05, "loss": 0.0267, "step": 37020 }, { "epoch": 13.709737134394668, "grad_norm": 0.23035866022109985, "learning_rate": 7.447793394462006e-05, "loss": 0.0385, "step": 37030 }, { "epoch": 13.713439466864124, "grad_norm": 0.20440207421779633, "learning_rate": 7.446351483980952e-05, "loss": 0.0224, "step": 37040 }, { "epoch": 13.71714179933358, "grad_norm": 0.19048845767974854, "learning_rate": 7.444909305970578e-05, "loss": 0.0178, "step": 37050 }, { "epoch": 13.720844131803036, "grad_norm": 0.2250695377588272, "learning_rate": 7.443466860588599e-05, "loss": 0.031, "step": 37060 }, { "epoch": 13.724546464272493, "grad_norm": 0.29606661200523376, "learning_rate": 7.442024147992756e-05, "loss": 0.0307, "step": 37070 }, { "epoch": 13.728248796741948, "grad_norm": 0.25251930952072144, "learning_rate": 7.440581168340825e-05, "loss": 0.0257, "step": 37080 }, { "epoch": 13.731951129211403, "grad_norm": 0.21015450358390808, "learning_rate": 7.439137921790606e-05, "loss": 0.0252, "step": 37090 }, { "epoch": 13.735653461680858, "grad_norm": 0.14164254069328308, "learning_rate": 7.437694408499933e-05, "loss": 0.0286, "step": 37100 }, { "epoch": 13.739355794150315, "grad_norm": 0.7944118976593018, "learning_rate": 7.436250628626662e-05, "loss": 0.0375, "step": 37110 }, { "epoch": 13.74305812661977, "grad_norm": 0.12784713506698608, "learning_rate": 7.434806582328686e-05, "loss": 0.0295, "step": 37120 }, { "epoch": 13.746760459089225, "grad_norm": 0.228728249669075, "learning_rate": 7.433362269763924e-05, "loss": 0.0211, "step": 37130 }, { "epoch": 13.750462791558682, "grad_norm": 0.22589215636253357, "learning_rate": 7.43191769109032e-05, "loss": 0.0231, "step": 37140 }, { "epoch": 13.754165124028138, "grad_norm": 0.2580416798591614, "learning_rate": 7.430472846465856e-05, "loss": 0.0187, "step": 37150 }, { "epoch": 13.757867456497593, "grad_norm": 0.22837655246257782, "learning_rate": 7.429027736048535e-05, "loss": 0.033, "step": 37160 }, { "epoch": 13.76156978896705, "grad_norm": 0.5048578381538391, "learning_rate": 7.427582359996393e-05, "loss": 0.0147, "step": 37170 }, { "epoch": 13.765272121436505, "grad_norm": 0.2577991783618927, "learning_rate": 7.426136718467493e-05, "loss": 0.0337, "step": 37180 }, { "epoch": 13.76897445390596, "grad_norm": 0.16522008180618286, "learning_rate": 7.42469081161993e-05, "loss": 0.0154, "step": 37190 }, { "epoch": 13.772676786375417, "grad_norm": 0.23174713551998138, "learning_rate": 7.423244639611826e-05, "loss": 0.0331, "step": 37200 }, { "epoch": 13.776379118844872, "grad_norm": 0.23864279687404633, "learning_rate": 7.42179820260133e-05, "loss": 0.0235, "step": 37210 }, { "epoch": 13.780081451314327, "grad_norm": 0.46215325593948364, "learning_rate": 7.420351500746625e-05, "loss": 0.0224, "step": 37220 }, { "epoch": 13.783783783783784, "grad_norm": 0.5070525407791138, "learning_rate": 7.418904534205917e-05, "loss": 0.0411, "step": 37230 }, { "epoch": 13.78748611625324, "grad_norm": 0.2818097174167633, "learning_rate": 7.417457303137448e-05, "loss": 0.0235, "step": 37240 }, { "epoch": 13.791188448722695, "grad_norm": 0.38801801204681396, "learning_rate": 7.416009807699482e-05, "loss": 0.0275, "step": 37250 }, { "epoch": 13.794890781192152, "grad_norm": 1.288380742073059, "learning_rate": 7.414562048050315e-05, "loss": 0.0183, "step": 37260 }, { "epoch": 13.798593113661607, "grad_norm": 0.1963326781988144, "learning_rate": 7.413114024348273e-05, "loss": 0.0269, "step": 37270 }, { "epoch": 13.802295446131062, "grad_norm": 0.278742253780365, "learning_rate": 7.411665736751709e-05, "loss": 0.0242, "step": 37280 }, { "epoch": 13.805997778600519, "grad_norm": 0.1881391406059265, "learning_rate": 7.410217185419006e-05, "loss": 0.0222, "step": 37290 }, { "epoch": 13.809700111069974, "grad_norm": 0.7055636644363403, "learning_rate": 7.408768370508576e-05, "loss": 0.0248, "step": 37300 }, { "epoch": 13.81340244353943, "grad_norm": 0.5164554119110107, "learning_rate": 7.407319292178859e-05, "loss": 0.0264, "step": 37310 }, { "epoch": 13.817104776008886, "grad_norm": 0.21341416239738464, "learning_rate": 7.405869950588323e-05, "loss": 0.0204, "step": 37320 }, { "epoch": 13.820807108478341, "grad_norm": 0.1697644293308258, "learning_rate": 7.404420345895467e-05, "loss": 0.024, "step": 37330 }, { "epoch": 13.824509440947796, "grad_norm": 0.3528194725513458, "learning_rate": 7.402970478258816e-05, "loss": 0.0324, "step": 37340 }, { "epoch": 13.828211773417253, "grad_norm": 0.3704000413417816, "learning_rate": 7.401520347836926e-05, "loss": 0.0224, "step": 37350 }, { "epoch": 13.831914105886709, "grad_norm": 0.26017987728118896, "learning_rate": 7.400069954788384e-05, "loss": 0.0338, "step": 37360 }, { "epoch": 13.835616438356164, "grad_norm": 0.46514201164245605, "learning_rate": 7.398619299271798e-05, "loss": 0.03, "step": 37370 }, { "epoch": 13.83931877082562, "grad_norm": 0.20369382202625275, "learning_rate": 7.397168381445812e-05, "loss": 0.015, "step": 37380 }, { "epoch": 13.843021103295076, "grad_norm": 0.1812177300453186, "learning_rate": 7.395717201469095e-05, "loss": 0.0291, "step": 37390 }, { "epoch": 13.846723435764531, "grad_norm": 0.21813355386257172, "learning_rate": 7.394265759500348e-05, "loss": 0.0447, "step": 37400 }, { "epoch": 13.850425768233988, "grad_norm": 0.3708217740058899, "learning_rate": 7.392814055698298e-05, "loss": 0.0328, "step": 37410 }, { "epoch": 13.854128100703443, "grad_norm": 0.2058952897787094, "learning_rate": 7.391362090221698e-05, "loss": 0.0257, "step": 37420 }, { "epoch": 13.857830433172898, "grad_norm": 0.2539026737213135, "learning_rate": 7.389909863229336e-05, "loss": 0.0296, "step": 37430 }, { "epoch": 13.861532765642355, "grad_norm": 0.1749017983675003, "learning_rate": 7.388457374880026e-05, "loss": 0.0232, "step": 37440 }, { "epoch": 13.86523509811181, "grad_norm": 0.16572947800159454, "learning_rate": 7.387004625332608e-05, "loss": 0.0297, "step": 37450 }, { "epoch": 13.868937430581266, "grad_norm": 0.18624339997768402, "learning_rate": 7.385551614745952e-05, "loss": 0.0265, "step": 37460 }, { "epoch": 13.872639763050723, "grad_norm": 0.1597674936056137, "learning_rate": 7.384098343278957e-05, "loss": 0.0207, "step": 37470 }, { "epoch": 13.876342095520178, "grad_norm": 0.24840930104255676, "learning_rate": 7.382644811090555e-05, "loss": 0.0211, "step": 37480 }, { "epoch": 13.880044427989633, "grad_norm": 0.19418954849243164, "learning_rate": 7.381191018339696e-05, "loss": 0.0257, "step": 37490 }, { "epoch": 13.88374676045909, "grad_norm": 0.2520993649959564, "learning_rate": 7.379736965185368e-05, "loss": 0.0229, "step": 37500 }, { "epoch": 13.887449092928545, "grad_norm": 0.2631191909313202, "learning_rate": 7.378282651786583e-05, "loss": 0.0171, "step": 37510 }, { "epoch": 13.891151425398, "grad_norm": 0.4344536066055298, "learning_rate": 7.376828078302384e-05, "loss": 0.0278, "step": 37520 }, { "epoch": 13.894853757867457, "grad_norm": 0.5404663681983948, "learning_rate": 7.37537324489184e-05, "loss": 0.0226, "step": 37530 }, { "epoch": 13.898556090336912, "grad_norm": 0.6680344343185425, "learning_rate": 7.373918151714048e-05, "loss": 0.0303, "step": 37540 }, { "epoch": 13.902258422806367, "grad_norm": 0.28972214460372925, "learning_rate": 7.372462798928137e-05, "loss": 0.0284, "step": 37550 }, { "epoch": 13.905960755275824, "grad_norm": 0.1655026525259018, "learning_rate": 7.37100718669326e-05, "loss": 0.0253, "step": 37560 }, { "epoch": 13.90966308774528, "grad_norm": 0.2415926307439804, "learning_rate": 7.369551315168604e-05, "loss": 0.03, "step": 37570 }, { "epoch": 13.913365420214735, "grad_norm": 0.6050620675086975, "learning_rate": 7.368095184513377e-05, "loss": 0.031, "step": 37580 }, { "epoch": 13.917067752684192, "grad_norm": 0.2972628176212311, "learning_rate": 7.36663879488682e-05, "loss": 0.0251, "step": 37590 }, { "epoch": 13.920770085153647, "grad_norm": 0.22150804102420807, "learning_rate": 7.365182146448205e-05, "loss": 0.0157, "step": 37600 }, { "epoch": 13.924472417623102, "grad_norm": 0.8650900721549988, "learning_rate": 7.363725239356826e-05, "loss": 0.0361, "step": 37610 }, { "epoch": 13.928174750092559, "grad_norm": 0.2635588049888611, "learning_rate": 7.362268073772007e-05, "loss": 0.0265, "step": 37620 }, { "epoch": 13.931877082562014, "grad_norm": 0.1291496455669403, "learning_rate": 7.360810649853105e-05, "loss": 0.0134, "step": 37630 }, { "epoch": 13.93557941503147, "grad_norm": 0.2707374095916748, "learning_rate": 7.359352967759498e-05, "loss": 0.0239, "step": 37640 }, { "epoch": 13.939281747500926, "grad_norm": 0.2148938924074173, "learning_rate": 7.357895027650598e-05, "loss": 0.0201, "step": 37650 }, { "epoch": 13.942984079970381, "grad_norm": 0.1946995109319687, "learning_rate": 7.356436829685844e-05, "loss": 0.0356, "step": 37660 }, { "epoch": 13.946686412439837, "grad_norm": 0.2482210397720337, "learning_rate": 7.354978374024701e-05, "loss": 0.0248, "step": 37670 }, { "epoch": 13.950388744909294, "grad_norm": 0.26872774958610535, "learning_rate": 7.353519660826665e-05, "loss": 0.0191, "step": 37680 }, { "epoch": 13.954091077378749, "grad_norm": 0.21026112139225006, "learning_rate": 7.352060690251254e-05, "loss": 0.0227, "step": 37690 }, { "epoch": 13.957793409848204, "grad_norm": 0.2598751187324524, "learning_rate": 7.350601462458024e-05, "loss": 0.029, "step": 37700 }, { "epoch": 13.96149574231766, "grad_norm": 0.2945255637168884, "learning_rate": 7.349141977606553e-05, "loss": 0.0296, "step": 37710 }, { "epoch": 13.965198074787116, "grad_norm": 0.3192485570907593, "learning_rate": 7.347682235856445e-05, "loss": 0.0127, "step": 37720 }, { "epoch": 13.968900407256571, "grad_norm": 0.21204036474227905, "learning_rate": 7.346222237367339e-05, "loss": 0.0275, "step": 37730 }, { "epoch": 13.972602739726028, "grad_norm": 0.3582506477832794, "learning_rate": 7.344761982298896e-05, "loss": 0.0249, "step": 37740 }, { "epoch": 13.976305072195483, "grad_norm": 0.35535597801208496, "learning_rate": 7.343301470810808e-05, "loss": 0.0196, "step": 37750 }, { "epoch": 13.980007404664939, "grad_norm": 0.2535710334777832, "learning_rate": 7.341840703062793e-05, "loss": 0.0221, "step": 37760 }, { "epoch": 13.983709737134395, "grad_norm": 0.4598102569580078, "learning_rate": 7.340379679214602e-05, "loss": 0.0212, "step": 37770 }, { "epoch": 13.98741206960385, "grad_norm": 0.46413764357566833, "learning_rate": 7.338918399426005e-05, "loss": 0.0325, "step": 37780 }, { "epoch": 13.991114402073306, "grad_norm": 0.2957209050655365, "learning_rate": 7.337456863856811e-05, "loss": 0.0317, "step": 37790 }, { "epoch": 13.994816734542763, "grad_norm": 0.16619907319545746, "learning_rate": 7.335995072666848e-05, "loss": 0.0236, "step": 37800 }, { "epoch": 13.998519067012218, "grad_norm": 0.18876473605632782, "learning_rate": 7.334533026015977e-05, "loss": 0.0354, "step": 37810 }, { "epoch": 14.002221399481673, "grad_norm": 0.17900845408439636, "learning_rate": 7.333070724064083e-05, "loss": 0.0226, "step": 37820 }, { "epoch": 14.00592373195113, "grad_norm": 0.35823705792427063, "learning_rate": 7.331608166971082e-05, "loss": 0.026, "step": 37830 }, { "epoch": 14.009626064420585, "grad_norm": 0.24593234062194824, "learning_rate": 7.330145354896918e-05, "loss": 0.0261, "step": 37840 }, { "epoch": 14.01332839689004, "grad_norm": 0.3180916905403137, "learning_rate": 7.328682288001561e-05, "loss": 0.0296, "step": 37850 }, { "epoch": 14.017030729359497, "grad_norm": 0.1911311149597168, "learning_rate": 7.32721896644501e-05, "loss": 0.0222, "step": 37860 }, { "epoch": 14.020733061828953, "grad_norm": 0.3857167065143585, "learning_rate": 7.325755390387292e-05, "loss": 0.027, "step": 37870 }, { "epoch": 14.024435394298408, "grad_norm": 0.6371869444847107, "learning_rate": 7.324291559988461e-05, "loss": 0.0238, "step": 37880 }, { "epoch": 14.028137726767865, "grad_norm": 0.208541139960289, "learning_rate": 7.3228274754086e-05, "loss": 0.0368, "step": 37890 }, { "epoch": 14.03184005923732, "grad_norm": 1.376240611076355, "learning_rate": 7.32136313680782e-05, "loss": 0.0245, "step": 37900 }, { "epoch": 14.035542391706775, "grad_norm": 0.2038455307483673, "learning_rate": 7.319898544346255e-05, "loss": 0.0294, "step": 37910 }, { "epoch": 14.03924472417623, "grad_norm": 0.38999852538108826, "learning_rate": 7.318433698184073e-05, "loss": 0.0218, "step": 37920 }, { "epoch": 14.042947056645687, "grad_norm": 0.1887597292661667, "learning_rate": 7.316968598481469e-05, "loss": 0.0208, "step": 37930 }, { "epoch": 14.046649389115142, "grad_norm": 0.2944330871105194, "learning_rate": 7.315503245398661e-05, "loss": 0.0303, "step": 37940 }, { "epoch": 14.050351721584597, "grad_norm": 0.2260814756155014, "learning_rate": 7.3140376390959e-05, "loss": 0.0163, "step": 37950 }, { "epoch": 14.054054054054054, "grad_norm": 0.5227804183959961, "learning_rate": 7.312571779733463e-05, "loss": 0.0263, "step": 37960 }, { "epoch": 14.05775638652351, "grad_norm": 0.16829153895378113, "learning_rate": 7.311105667471653e-05, "loss": 0.0237, "step": 37970 }, { "epoch": 14.061458718992965, "grad_norm": 0.5032731294631958, "learning_rate": 7.309639302470801e-05, "loss": 0.0283, "step": 37980 }, { "epoch": 14.065161051462422, "grad_norm": 0.3282682001590729, "learning_rate": 7.308172684891267e-05, "loss": 0.0271, "step": 37990 }, { "epoch": 14.068863383931877, "grad_norm": 0.15725690126419067, "learning_rate": 7.30670581489344e-05, "loss": 0.0128, "step": 38000 }, { "epoch": 14.072565716401332, "grad_norm": 3.899627447128296, "learning_rate": 7.305238692637731e-05, "loss": 0.0216, "step": 38010 }, { "epoch": 14.076268048870789, "grad_norm": 0.18585258722305298, "learning_rate": 7.303771318284587e-05, "loss": 0.0347, "step": 38020 }, { "epoch": 14.079970381340244, "grad_norm": 0.23848769068717957, "learning_rate": 7.302303691994473e-05, "loss": 0.0367, "step": 38030 }, { "epoch": 14.0836727138097, "grad_norm": 0.19448719918727875, "learning_rate": 7.30083581392789e-05, "loss": 0.03, "step": 38040 }, { "epoch": 14.087375046279156, "grad_norm": 0.2783067226409912, "learning_rate": 7.299367684245362e-05, "loss": 0.0265, "step": 38050 }, { "epoch": 14.091077378748611, "grad_norm": 0.4960367977619171, "learning_rate": 7.297899303107441e-05, "loss": 0.0267, "step": 38060 }, { "epoch": 14.094779711218067, "grad_norm": 1.0334937572479248, "learning_rate": 7.296430670674705e-05, "loss": 0.0237, "step": 38070 }, { "epoch": 14.098482043687524, "grad_norm": 0.4239039719104767, "learning_rate": 7.294961787107766e-05, "loss": 0.0268, "step": 38080 }, { "epoch": 14.102184376156979, "grad_norm": 1.6741265058517456, "learning_rate": 7.293492652567255e-05, "loss": 0.0291, "step": 38090 }, { "epoch": 14.105886708626434, "grad_norm": 0.2165290117263794, "learning_rate": 7.292023267213835e-05, "loss": 0.0313, "step": 38100 }, { "epoch": 14.10958904109589, "grad_norm": 0.3683403730392456, "learning_rate": 7.290553631208198e-05, "loss": 0.0201, "step": 38110 }, { "epoch": 14.113291373565346, "grad_norm": 0.7217789888381958, "learning_rate": 7.28908374471106e-05, "loss": 0.02, "step": 38120 }, { "epoch": 14.116993706034801, "grad_norm": 0.25179916620254517, "learning_rate": 7.287613607883163e-05, "loss": 0.0222, "step": 38130 }, { "epoch": 14.120696038504258, "grad_norm": 0.21306803822517395, "learning_rate": 7.286143220885285e-05, "loss": 0.0185, "step": 38140 }, { "epoch": 14.124398370973713, "grad_norm": 0.348922461271286, "learning_rate": 7.284672583878219e-05, "loss": 0.0231, "step": 38150 }, { "epoch": 14.128100703443168, "grad_norm": 1.656486988067627, "learning_rate": 7.283201697022794e-05, "loss": 0.0176, "step": 38160 }, { "epoch": 14.131803035912625, "grad_norm": 0.3089974820613861, "learning_rate": 7.281730560479865e-05, "loss": 0.0227, "step": 38170 }, { "epoch": 14.13550536838208, "grad_norm": 0.752407431602478, "learning_rate": 7.280259174410312e-05, "loss": 0.0308, "step": 38180 }, { "epoch": 14.139207700851536, "grad_norm": 0.3021070957183838, "learning_rate": 7.278787538975043e-05, "loss": 0.0283, "step": 38190 }, { "epoch": 14.142910033320993, "grad_norm": 0.46147215366363525, "learning_rate": 7.277315654334997e-05, "loss": 0.0254, "step": 38200 }, { "epoch": 14.146612365790448, "grad_norm": 0.1564246416091919, "learning_rate": 7.275843520651133e-05, "loss": 0.017, "step": 38210 }, { "epoch": 14.150314698259903, "grad_norm": 0.33907365798950195, "learning_rate": 7.274371138084444e-05, "loss": 0.0226, "step": 38220 }, { "epoch": 14.15401703072936, "grad_norm": 0.5880506634712219, "learning_rate": 7.272898506795948e-05, "loss": 0.0342, "step": 38230 }, { "epoch": 14.157719363198815, "grad_norm": 0.24929234385490417, "learning_rate": 7.271425626946686e-05, "loss": 0.0253, "step": 38240 }, { "epoch": 14.16142169566827, "grad_norm": 0.21508269011974335, "learning_rate": 7.269952498697734e-05, "loss": 0.0218, "step": 38250 }, { "epoch": 14.165124028137727, "grad_norm": 0.6975058913230896, "learning_rate": 7.26847912221019e-05, "loss": 0.0317, "step": 38260 }, { "epoch": 14.168826360607182, "grad_norm": 0.3266236484050751, "learning_rate": 7.267005497645178e-05, "loss": 0.0209, "step": 38270 }, { "epoch": 14.172528693076638, "grad_norm": 0.2761983573436737, "learning_rate": 7.265531625163857e-05, "loss": 0.0262, "step": 38280 }, { "epoch": 14.176231025546095, "grad_norm": 0.24928635358810425, "learning_rate": 7.2640575049274e-05, "loss": 0.033, "step": 38290 }, { "epoch": 14.17993335801555, "grad_norm": 0.2608441710472107, "learning_rate": 7.262583137097018e-05, "loss": 0.0199, "step": 38300 }, { "epoch": 14.183635690485005, "grad_norm": 0.4559570848941803, "learning_rate": 7.261108521833948e-05, "loss": 0.0284, "step": 38310 }, { "epoch": 14.187338022954462, "grad_norm": 0.24571727216243744, "learning_rate": 7.259633659299449e-05, "loss": 0.024, "step": 38320 }, { "epoch": 14.191040355423917, "grad_norm": 0.7165330648422241, "learning_rate": 7.25815854965481e-05, "loss": 0.0271, "step": 38330 }, { "epoch": 14.194742687893372, "grad_norm": 0.28205668926239014, "learning_rate": 7.256683193061348e-05, "loss": 0.0172, "step": 38340 }, { "epoch": 14.19844502036283, "grad_norm": 0.3056183457374573, "learning_rate": 7.255207589680402e-05, "loss": 0.0183, "step": 38350 }, { "epoch": 14.202147352832284, "grad_norm": 0.28051990270614624, "learning_rate": 7.253731739673349e-05, "loss": 0.0222, "step": 38360 }, { "epoch": 14.20584968530174, "grad_norm": 0.20399577915668488, "learning_rate": 7.252255643201579e-05, "loss": 0.0359, "step": 38370 }, { "epoch": 14.209552017771196, "grad_norm": 0.2534639239311218, "learning_rate": 7.250779300426517e-05, "loss": 0.0266, "step": 38380 }, { "epoch": 14.213254350240652, "grad_norm": 0.3130960464477539, "learning_rate": 7.249302711509616e-05, "loss": 0.0236, "step": 38390 }, { "epoch": 14.216956682710107, "grad_norm": 0.15884537994861603, "learning_rate": 7.247825876612353e-05, "loss": 0.0187, "step": 38400 }, { "epoch": 14.220659015179564, "grad_norm": 0.2562457323074341, "learning_rate": 7.246348795896231e-05, "loss": 0.0217, "step": 38410 }, { "epoch": 14.224361347649019, "grad_norm": 1.1463576555252075, "learning_rate": 7.244871469522781e-05, "loss": 0.0218, "step": 38420 }, { "epoch": 14.228063680118474, "grad_norm": 0.4281752109527588, "learning_rate": 7.243393897653565e-05, "loss": 0.0305, "step": 38430 }, { "epoch": 14.231766012587931, "grad_norm": 0.1709677278995514, "learning_rate": 7.241916080450163e-05, "loss": 0.0255, "step": 38440 }, { "epoch": 14.235468345057386, "grad_norm": 0.3797557055950165, "learning_rate": 7.240438018074189e-05, "loss": 0.0283, "step": 38450 }, { "epoch": 14.239170677526841, "grad_norm": 0.21482180058956146, "learning_rate": 7.238959710687282e-05, "loss": 0.018, "step": 38460 }, { "epoch": 14.242873009996298, "grad_norm": 0.3155953884124756, "learning_rate": 7.237481158451107e-05, "loss": 0.0179, "step": 38470 }, { "epoch": 14.246575342465754, "grad_norm": 0.17057420313358307, "learning_rate": 7.236002361527356e-05, "loss": 0.021, "step": 38480 }, { "epoch": 14.250277674935209, "grad_norm": 0.35791853070259094, "learning_rate": 7.23452332007775e-05, "loss": 0.0285, "step": 38490 }, { "epoch": 14.253980007404666, "grad_norm": 0.238340824842453, "learning_rate": 7.233044034264034e-05, "loss": 0.0268, "step": 38500 }, { "epoch": 14.25768233987412, "grad_norm": 0.1627175360918045, "learning_rate": 7.231564504247977e-05, "loss": 0.0211, "step": 38510 }, { "epoch": 14.261384672343576, "grad_norm": 0.5896655321121216, "learning_rate": 7.230084730191383e-05, "loss": 0.0209, "step": 38520 }, { "epoch": 14.265087004813033, "grad_norm": 0.19973105192184448, "learning_rate": 7.228604712256076e-05, "loss": 0.0464, "step": 38530 }, { "epoch": 14.268789337282488, "grad_norm": 0.2067071795463562, "learning_rate": 7.227124450603906e-05, "loss": 0.0297, "step": 38540 }, { "epoch": 14.272491669751943, "grad_norm": 0.2792257070541382, "learning_rate": 7.225643945396757e-05, "loss": 0.0167, "step": 38550 }, { "epoch": 14.2761940022214, "grad_norm": 0.2892460227012634, "learning_rate": 7.224163196796532e-05, "loss": 0.0301, "step": 38560 }, { "epoch": 14.279896334690855, "grad_norm": 0.09697018563747406, "learning_rate": 7.222682204965163e-05, "loss": 0.0213, "step": 38570 }, { "epoch": 14.28359866716031, "grad_norm": 0.24618946015834808, "learning_rate": 7.22120097006461e-05, "loss": 0.017, "step": 38580 }, { "epoch": 14.287300999629768, "grad_norm": 0.20794923603534698, "learning_rate": 7.219719492256858e-05, "loss": 0.0287, "step": 38590 }, { "epoch": 14.291003332099223, "grad_norm": 0.3185634911060333, "learning_rate": 7.218237771703921e-05, "loss": 0.0301, "step": 38600 }, { "epoch": 14.294705664568678, "grad_norm": 0.5242365598678589, "learning_rate": 7.216755808567834e-05, "loss": 0.0199, "step": 38610 }, { "epoch": 14.298407997038135, "grad_norm": 0.24456849694252014, "learning_rate": 7.215273603010668e-05, "loss": 0.0252, "step": 38620 }, { "epoch": 14.30211032950759, "grad_norm": 1.353316068649292, "learning_rate": 7.21379115519451e-05, "loss": 0.0198, "step": 38630 }, { "epoch": 14.305812661977045, "grad_norm": 0.42315736413002014, "learning_rate": 7.212308465281476e-05, "loss": 0.0159, "step": 38640 }, { "epoch": 14.309514994446502, "grad_norm": 0.22538553178310394, "learning_rate": 7.210825533433719e-05, "loss": 0.0233, "step": 38650 }, { "epoch": 14.313217326915957, "grad_norm": 0.280571848154068, "learning_rate": 7.209342359813404e-05, "loss": 0.0287, "step": 38660 }, { "epoch": 14.316919659385412, "grad_norm": 0.23649553954601288, "learning_rate": 7.207858944582731e-05, "loss": 0.0207, "step": 38670 }, { "epoch": 14.32062199185487, "grad_norm": 0.1239091083407402, "learning_rate": 7.206375287903921e-05, "loss": 0.0319, "step": 38680 }, { "epoch": 14.324324324324325, "grad_norm": 0.14244873821735382, "learning_rate": 7.20489138993923e-05, "loss": 0.0237, "step": 38690 }, { "epoch": 14.32802665679378, "grad_norm": 0.20454739034175873, "learning_rate": 7.203407250850928e-05, "loss": 0.0306, "step": 38700 }, { "epoch": 14.331728989263237, "grad_norm": 0.36235612630844116, "learning_rate": 7.201922870801325e-05, "loss": 0.0352, "step": 38710 }, { "epoch": 14.335431321732692, "grad_norm": 0.2898462116718292, "learning_rate": 7.200438249952745e-05, "loss": 0.0228, "step": 38720 }, { "epoch": 14.339133654202147, "grad_norm": 0.2874293029308319, "learning_rate": 7.198953388467549e-05, "loss": 0.0174, "step": 38730 }, { "epoch": 14.342835986671602, "grad_norm": 0.10408803820610046, "learning_rate": 7.197468286508113e-05, "loss": 0.0202, "step": 38740 }, { "epoch": 14.34653831914106, "grad_norm": 0.8150324821472168, "learning_rate": 7.195982944236851e-05, "loss": 0.026, "step": 38750 }, { "epoch": 14.350240651610514, "grad_norm": 0.27113184332847595, "learning_rate": 7.194497361816196e-05, "loss": 0.0213, "step": 38760 }, { "epoch": 14.353942984079971, "grad_norm": 0.16033104062080383, "learning_rate": 7.193011539408609e-05, "loss": 0.0212, "step": 38770 }, { "epoch": 14.357645316549426, "grad_norm": 0.3583742678165436, "learning_rate": 7.191525477176577e-05, "loss": 0.0311, "step": 38780 }, { "epoch": 14.361347649018882, "grad_norm": 0.2345406860113144, "learning_rate": 7.190039175282614e-05, "loss": 0.0216, "step": 38790 }, { "epoch": 14.365049981488337, "grad_norm": 0.3336373269557953, "learning_rate": 7.188552633889259e-05, "loss": 0.0243, "step": 38800 }, { "epoch": 14.368752313957794, "grad_norm": 0.45372945070266724, "learning_rate": 7.187065853159079e-05, "loss": 0.0264, "step": 38810 }, { "epoch": 14.372454646427249, "grad_norm": 0.2796817123889923, "learning_rate": 7.185578833254664e-05, "loss": 0.0224, "step": 38820 }, { "epoch": 14.376156978896704, "grad_norm": 0.17955228686332703, "learning_rate": 7.184091574338636e-05, "loss": 0.0215, "step": 38830 }, { "epoch": 14.379859311366161, "grad_norm": 0.2879847288131714, "learning_rate": 7.182604076573635e-05, "loss": 0.0246, "step": 38840 }, { "epoch": 14.383561643835616, "grad_norm": 0.24278044700622559, "learning_rate": 7.181116340122336e-05, "loss": 0.0288, "step": 38850 }, { "epoch": 14.387263976305071, "grad_norm": 0.2987695038318634, "learning_rate": 7.17962836514743e-05, "loss": 0.0156, "step": 38860 }, { "epoch": 14.390966308774528, "grad_norm": 0.18751777708530426, "learning_rate": 7.178140151811647e-05, "loss": 0.0212, "step": 38870 }, { "epoch": 14.394668641243983, "grad_norm": 0.1858469396829605, "learning_rate": 7.176651700277729e-05, "loss": 0.0284, "step": 38880 }, { "epoch": 14.398370973713439, "grad_norm": 0.2327498495578766, "learning_rate": 7.175163010708455e-05, "loss": 0.0172, "step": 38890 }, { "epoch": 14.402073306182896, "grad_norm": 0.19701087474822998, "learning_rate": 7.173674083266624e-05, "loss": 0.0233, "step": 38900 }, { "epoch": 14.40577563865235, "grad_norm": 0.537003219127655, "learning_rate": 7.172184918115062e-05, "loss": 0.0286, "step": 38910 }, { "epoch": 14.409477971121806, "grad_norm": 0.40075787901878357, "learning_rate": 7.170695515416626e-05, "loss": 0.0186, "step": 38920 }, { "epoch": 14.413180303591263, "grad_norm": 0.3633471131324768, "learning_rate": 7.169205875334189e-05, "loss": 0.0249, "step": 38930 }, { "epoch": 14.416882636060718, "grad_norm": 0.587105393409729, "learning_rate": 7.16771599803066e-05, "loss": 0.0305, "step": 38940 }, { "epoch": 14.420584968530173, "grad_norm": 0.32174578309059143, "learning_rate": 7.166225883668969e-05, "loss": 0.0164, "step": 38950 }, { "epoch": 14.42428730099963, "grad_norm": 0.21021966636180878, "learning_rate": 7.164735532412073e-05, "loss": 0.0297, "step": 38960 }, { "epoch": 14.427989633469085, "grad_norm": 0.2105940580368042, "learning_rate": 7.163244944422951e-05, "loss": 0.0343, "step": 38970 }, { "epoch": 14.43169196593854, "grad_norm": 0.18527083098888397, "learning_rate": 7.161754119864616e-05, "loss": 0.0238, "step": 38980 }, { "epoch": 14.435394298407997, "grad_norm": 0.23513507843017578, "learning_rate": 7.1602630589001e-05, "loss": 0.0203, "step": 38990 }, { "epoch": 14.439096630877453, "grad_norm": 0.4256509840488434, "learning_rate": 7.158771761692464e-05, "loss": 0.024, "step": 39000 }, { "epoch": 14.442798963346908, "grad_norm": 0.2787244915962219, "learning_rate": 7.157280228404795e-05, "loss": 0.0245, "step": 39010 }, { "epoch": 14.446501295816365, "grad_norm": 0.27412524819374084, "learning_rate": 7.155788459200203e-05, "loss": 0.0213, "step": 39020 }, { "epoch": 14.45020362828582, "grad_norm": 0.2005138248205185, "learning_rate": 7.154296454241827e-05, "loss": 0.0235, "step": 39030 }, { "epoch": 14.453905960755275, "grad_norm": 0.4949694871902466, "learning_rate": 7.152804213692829e-05, "loss": 0.0223, "step": 39040 }, { "epoch": 14.457608293224732, "grad_norm": 0.13220706582069397, "learning_rate": 7.151311737716397e-05, "loss": 0.0166, "step": 39050 }, { "epoch": 14.461310625694187, "grad_norm": 0.6228623986244202, "learning_rate": 7.149819026475751e-05, "loss": 0.0315, "step": 39060 }, { "epoch": 14.465012958163642, "grad_norm": 0.30547410249710083, "learning_rate": 7.148326080134126e-05, "loss": 0.0266, "step": 39070 }, { "epoch": 14.4687152906331, "grad_norm": 0.18371883034706116, "learning_rate": 7.146832898854793e-05, "loss": 0.024, "step": 39080 }, { "epoch": 14.472417623102555, "grad_norm": 0.4326559603214264, "learning_rate": 7.14533948280104e-05, "loss": 0.0206, "step": 39090 }, { "epoch": 14.47611995557201, "grad_norm": 0.13394880294799805, "learning_rate": 7.143845832136188e-05, "loss": 0.0294, "step": 39100 }, { "epoch": 14.479822288041467, "grad_norm": 0.2216934859752655, "learning_rate": 7.142351947023577e-05, "loss": 0.0222, "step": 39110 }, { "epoch": 14.483524620510922, "grad_norm": 0.4156900942325592, "learning_rate": 7.14085782762658e-05, "loss": 0.0225, "step": 39120 }, { "epoch": 14.487226952980377, "grad_norm": 0.39731571078300476, "learning_rate": 7.139363474108589e-05, "loss": 0.0267, "step": 39130 }, { "epoch": 14.490929285449834, "grad_norm": 0.3376002609729767, "learning_rate": 7.137868886633025e-05, "loss": 0.025, "step": 39140 }, { "epoch": 14.49463161791929, "grad_norm": 0.4150138199329376, "learning_rate": 7.136374065363334e-05, "loss": 0.0215, "step": 39150 }, { "epoch": 14.498333950388744, "grad_norm": 0.12554144859313965, "learning_rate": 7.134879010462988e-05, "loss": 0.0255, "step": 39160 }, { "epoch": 14.502036282858201, "grad_norm": 0.3470815420150757, "learning_rate": 7.133383722095483e-05, "loss": 0.0274, "step": 39170 }, { "epoch": 14.505738615327656, "grad_norm": 0.16216689348220825, "learning_rate": 7.131888200424339e-05, "loss": 0.0219, "step": 39180 }, { "epoch": 14.509440947797112, "grad_norm": 0.46788302063941956, "learning_rate": 7.130392445613109e-05, "loss": 0.0171, "step": 39190 }, { "epoch": 14.513143280266569, "grad_norm": 0.33012983202934265, "learning_rate": 7.128896457825364e-05, "loss": 0.0254, "step": 39200 }, { "epoch": 14.516845612736024, "grad_norm": 0.2415674924850464, "learning_rate": 7.127400237224702e-05, "loss": 0.0212, "step": 39210 }, { "epoch": 14.520547945205479, "grad_norm": 0.18226554989814758, "learning_rate": 7.12590378397475e-05, "loss": 0.0192, "step": 39220 }, { "epoch": 14.524250277674936, "grad_norm": 0.20825651288032532, "learning_rate": 7.124407098239155e-05, "loss": 0.019, "step": 39230 }, { "epoch": 14.527952610144391, "grad_norm": 0.20234555006027222, "learning_rate": 7.122910180181595e-05, "loss": 0.0228, "step": 39240 }, { "epoch": 14.531654942613846, "grad_norm": 0.14754413068294525, "learning_rate": 7.121413029965769e-05, "loss": 0.0349, "step": 39250 }, { "epoch": 14.535357275083303, "grad_norm": 0.23001037538051605, "learning_rate": 7.119915647755404e-05, "loss": 0.0247, "step": 39260 }, { "epoch": 14.539059607552758, "grad_norm": 0.25633764266967773, "learning_rate": 7.11841803371425e-05, "loss": 0.023, "step": 39270 }, { "epoch": 14.542761940022213, "grad_norm": 0.2257901132106781, "learning_rate": 7.116920188006085e-05, "loss": 0.0154, "step": 39280 }, { "epoch": 14.54646427249167, "grad_norm": 0.2102120965719223, "learning_rate": 7.115422110794711e-05, "loss": 0.019, "step": 39290 }, { "epoch": 14.550166604961126, "grad_norm": 0.2269575595855713, "learning_rate": 7.113923802243957e-05, "loss": 0.0137, "step": 39300 }, { "epoch": 14.55386893743058, "grad_norm": 0.3170243501663208, "learning_rate": 7.112425262517672e-05, "loss": 0.0173, "step": 39310 }, { "epoch": 14.557571269900038, "grad_norm": 0.5675642490386963, "learning_rate": 7.110926491779737e-05, "loss": 0.0245, "step": 39320 }, { "epoch": 14.561273602369493, "grad_norm": 0.2419680953025818, "learning_rate": 7.109427490194056e-05, "loss": 0.0247, "step": 39330 }, { "epoch": 14.564975934838948, "grad_norm": 0.2869776487350464, "learning_rate": 7.107928257924554e-05, "loss": 0.0211, "step": 39340 }, { "epoch": 14.568678267308405, "grad_norm": 0.3150027394294739, "learning_rate": 7.10642879513519e-05, "loss": 0.0286, "step": 39350 }, { "epoch": 14.57238059977786, "grad_norm": 0.3783162236213684, "learning_rate": 7.104929101989937e-05, "loss": 0.0204, "step": 39360 }, { "epoch": 14.576082932247315, "grad_norm": 0.24458755552768707, "learning_rate": 7.103429178652803e-05, "loss": 0.0167, "step": 39370 }, { "epoch": 14.579785264716772, "grad_norm": 0.19790808856487274, "learning_rate": 7.101929025287816e-05, "loss": 0.0217, "step": 39380 }, { "epoch": 14.583487597186227, "grad_norm": 0.14159126579761505, "learning_rate": 7.100428642059033e-05, "loss": 0.0263, "step": 39390 }, { "epoch": 14.587189929655683, "grad_norm": 0.3126460909843445, "learning_rate": 7.09892802913053e-05, "loss": 0.031, "step": 39400 }, { "epoch": 14.59089226212514, "grad_norm": 0.20189467072486877, "learning_rate": 7.097427186666412e-05, "loss": 0.023, "step": 39410 }, { "epoch": 14.594594594594595, "grad_norm": 0.22527115046977997, "learning_rate": 7.095926114830812e-05, "loss": 0.0237, "step": 39420 }, { "epoch": 14.59829692706405, "grad_norm": 0.23687954246997833, "learning_rate": 7.094424813787883e-05, "loss": 0.0194, "step": 39430 }, { "epoch": 14.601999259533507, "grad_norm": 0.21604318916797638, "learning_rate": 7.092923283701806e-05, "loss": 0.0193, "step": 39440 }, { "epoch": 14.605701592002962, "grad_norm": 0.26750287413597107, "learning_rate": 7.091421524736784e-05, "loss": 0.0202, "step": 39450 }, { "epoch": 14.609403924472417, "grad_norm": 0.12504301965236664, "learning_rate": 7.08991953705705e-05, "loss": 0.0225, "step": 39460 }, { "epoch": 14.613106256941874, "grad_norm": 0.13544288277626038, "learning_rate": 7.088417320826856e-05, "loss": 0.0339, "step": 39470 }, { "epoch": 14.61680858941133, "grad_norm": 0.18856507539749146, "learning_rate": 7.086914876210485e-05, "loss": 0.019, "step": 39480 }, { "epoch": 14.620510921880784, "grad_norm": 0.49323537945747375, "learning_rate": 7.08541220337224e-05, "loss": 0.0333, "step": 39490 }, { "epoch": 14.624213254350241, "grad_norm": 0.22448086738586426, "learning_rate": 7.083909302476453e-05, "loss": 0.0168, "step": 39500 }, { "epoch": 14.627915586819697, "grad_norm": 0.16288936138153076, "learning_rate": 7.082406173687475e-05, "loss": 0.0349, "step": 39510 }, { "epoch": 14.631617919289152, "grad_norm": 0.2675670385360718, "learning_rate": 7.080902817169693e-05, "loss": 0.0205, "step": 39520 }, { "epoch": 14.635320251758609, "grad_norm": 0.23992283642292023, "learning_rate": 7.079399233087504e-05, "loss": 0.0233, "step": 39530 }, { "epoch": 14.639022584228064, "grad_norm": 0.27357715368270874, "learning_rate": 7.077895421605343e-05, "loss": 0.0207, "step": 39540 }, { "epoch": 14.642724916697519, "grad_norm": 0.2868454158306122, "learning_rate": 7.076391382887661e-05, "loss": 0.0266, "step": 39550 }, { "epoch": 14.646427249166976, "grad_norm": 0.4121440649032593, "learning_rate": 7.07488711709894e-05, "loss": 0.0282, "step": 39560 }, { "epoch": 14.650129581636431, "grad_norm": 0.5015895962715149, "learning_rate": 7.073382624403684e-05, "loss": 0.032, "step": 39570 }, { "epoch": 14.653831914105886, "grad_norm": 0.23164047300815582, "learning_rate": 7.071877904966423e-05, "loss": 0.0292, "step": 39580 }, { "epoch": 14.657534246575342, "grad_norm": 0.19317717850208282, "learning_rate": 7.070372958951706e-05, "loss": 0.0159, "step": 39590 }, { "epoch": 14.661236579044798, "grad_norm": 0.3257616460323334, "learning_rate": 7.068867786524116e-05, "loss": 0.0239, "step": 39600 }, { "epoch": 14.664938911514254, "grad_norm": 0.21851153671741486, "learning_rate": 7.067362387848256e-05, "loss": 0.0236, "step": 39610 }, { "epoch": 14.66864124398371, "grad_norm": 0.1745627522468567, "learning_rate": 7.065856763088754e-05, "loss": 0.0207, "step": 39620 }, { "epoch": 14.672343576453166, "grad_norm": 0.49360811710357666, "learning_rate": 7.06435091241026e-05, "loss": 0.0256, "step": 39630 }, { "epoch": 14.676045908922621, "grad_norm": 0.26604777574539185, "learning_rate": 7.062844835977456e-05, "loss": 0.0242, "step": 39640 }, { "epoch": 14.679748241392076, "grad_norm": 0.18313910067081451, "learning_rate": 7.061338533955043e-05, "loss": 0.0257, "step": 39650 }, { "epoch": 14.683450573861533, "grad_norm": 1.1811225414276123, "learning_rate": 7.059832006507745e-05, "loss": 0.0196, "step": 39660 }, { "epoch": 14.687152906330988, "grad_norm": 0.09114763885736465, "learning_rate": 7.058325253800315e-05, "loss": 0.0194, "step": 39670 }, { "epoch": 14.690855238800445, "grad_norm": 0.15554045140743256, "learning_rate": 7.056818275997532e-05, "loss": 0.0331, "step": 39680 }, { "epoch": 14.6945575712699, "grad_norm": 0.3679211735725403, "learning_rate": 7.055311073264194e-05, "loss": 0.0193, "step": 39690 }, { "epoch": 14.698259903739356, "grad_norm": 0.46714797616004944, "learning_rate": 7.053803645765128e-05, "loss": 0.0175, "step": 39700 }, { "epoch": 14.70196223620881, "grad_norm": 0.26731473207473755, "learning_rate": 7.052295993665183e-05, "loss": 0.0287, "step": 39710 }, { "epoch": 14.705664568678268, "grad_norm": 0.23171433806419373, "learning_rate": 7.050788117129233e-05, "loss": 0.0212, "step": 39720 }, { "epoch": 14.709366901147723, "grad_norm": 0.5638130903244019, "learning_rate": 7.049280016322178e-05, "loss": 0.0256, "step": 39730 }, { "epoch": 14.713069233617178, "grad_norm": 0.22469477355480194, "learning_rate": 7.047771691408941e-05, "loss": 0.0247, "step": 39740 }, { "epoch": 14.716771566086635, "grad_norm": 0.41287264227867126, "learning_rate": 7.04626314255447e-05, "loss": 0.0204, "step": 39750 }, { "epoch": 14.72047389855609, "grad_norm": 0.3746251165866852, "learning_rate": 7.044754369923739e-05, "loss": 0.0218, "step": 39760 }, { "epoch": 14.724176231025545, "grad_norm": 0.19795380532741547, "learning_rate": 7.043245373681747e-05, "loss": 0.0294, "step": 39770 }, { "epoch": 14.727878563495002, "grad_norm": 0.10924447327852249, "learning_rate": 7.04173615399351e-05, "loss": 0.0299, "step": 39780 }, { "epoch": 14.731580895964457, "grad_norm": 1.0226691961288452, "learning_rate": 7.040226711024077e-05, "loss": 0.031, "step": 39790 }, { "epoch": 14.735283228433913, "grad_norm": 0.2771499454975128, "learning_rate": 7.038717044938519e-05, "loss": 0.026, "step": 39800 }, { "epoch": 14.73898556090337, "grad_norm": 0.2737056016921997, "learning_rate": 7.03720715590193e-05, "loss": 0.0203, "step": 39810 }, { "epoch": 14.742687893372825, "grad_norm": 0.22719737887382507, "learning_rate": 7.035697044079428e-05, "loss": 0.0329, "step": 39820 }, { "epoch": 14.74639022584228, "grad_norm": 0.2386423647403717, "learning_rate": 7.034186709636159e-05, "loss": 0.0192, "step": 39830 }, { "epoch": 14.750092558311737, "grad_norm": 0.1392679065465927, "learning_rate": 7.03267615273729e-05, "loss": 0.0248, "step": 39840 }, { "epoch": 14.753794890781192, "grad_norm": 0.11972888559103012, "learning_rate": 7.031165373548014e-05, "loss": 0.0157, "step": 39850 }, { "epoch": 14.757497223250647, "grad_norm": 0.15914130210876465, "learning_rate": 7.029654372233544e-05, "loss": 0.021, "step": 39860 }, { "epoch": 14.761199555720104, "grad_norm": 0.564649760723114, "learning_rate": 7.028143148959126e-05, "loss": 0.0218, "step": 39870 }, { "epoch": 14.76490188818956, "grad_norm": 0.20976220071315765, "learning_rate": 7.026631703890021e-05, "loss": 0.0235, "step": 39880 }, { "epoch": 14.768604220659014, "grad_norm": 0.17271865904331207, "learning_rate": 7.02512003719152e-05, "loss": 0.0179, "step": 39890 }, { "epoch": 14.772306553128471, "grad_norm": 0.644504725933075, "learning_rate": 7.023608149028937e-05, "loss": 0.0279, "step": 39900 }, { "epoch": 14.776008885597927, "grad_norm": 0.6132540106773376, "learning_rate": 7.022096039567609e-05, "loss": 0.0178, "step": 39910 }, { "epoch": 14.779711218067382, "grad_norm": 0.13296683132648468, "learning_rate": 7.020583708972896e-05, "loss": 0.0263, "step": 39920 }, { "epoch": 14.783413550536839, "grad_norm": 1.0241957902908325, "learning_rate": 7.01907115741019e-05, "loss": 0.0243, "step": 39930 }, { "epoch": 14.787115883006294, "grad_norm": 0.4027920663356781, "learning_rate": 7.017558385044898e-05, "loss": 0.0219, "step": 39940 }, { "epoch": 14.790818215475749, "grad_norm": 2.8698174953460693, "learning_rate": 7.016045392042452e-05, "loss": 0.025, "step": 39950 }, { "epoch": 14.794520547945206, "grad_norm": 0.37399986386299133, "learning_rate": 7.014532178568314e-05, "loss": 0.0206, "step": 39960 }, { "epoch": 14.798222880414661, "grad_norm": 0.18865719437599182, "learning_rate": 7.013018744787965e-05, "loss": 0.0253, "step": 39970 }, { "epoch": 14.801925212884116, "grad_norm": 0.41237449645996094, "learning_rate": 7.011505090866913e-05, "loss": 0.0259, "step": 39980 }, { "epoch": 14.805627545353573, "grad_norm": 0.2543753981590271, "learning_rate": 7.00999121697069e-05, "loss": 0.0255, "step": 39990 }, { "epoch": 14.809329877823028, "grad_norm": 0.3594406545162201, "learning_rate": 7.008477123264848e-05, "loss": 0.022, "step": 40000 }, { "epoch": 14.813032210292484, "grad_norm": 0.3496229350566864, "learning_rate": 7.006962809914967e-05, "loss": 0.0295, "step": 40010 }, { "epoch": 14.81673454276194, "grad_norm": 0.2601601779460907, "learning_rate": 7.005448277086653e-05, "loss": 0.0252, "step": 40020 }, { "epoch": 14.820436875231396, "grad_norm": 0.2333086133003235, "learning_rate": 7.003933524945528e-05, "loss": 0.0188, "step": 40030 }, { "epoch": 14.824139207700851, "grad_norm": 0.15106455981731415, "learning_rate": 7.002418553657247e-05, "loss": 0.0166, "step": 40040 }, { "epoch": 14.827841540170308, "grad_norm": 0.16483594477176666, "learning_rate": 7.000903363387482e-05, "loss": 0.0278, "step": 40050 }, { "epoch": 14.831543872639763, "grad_norm": 0.21354727447032928, "learning_rate": 6.999387954301934e-05, "loss": 0.0165, "step": 40060 }, { "epoch": 14.835246205109218, "grad_norm": 0.43835580348968506, "learning_rate": 6.997872326566326e-05, "loss": 0.0211, "step": 40070 }, { "epoch": 14.838948537578675, "grad_norm": 0.3333128094673157, "learning_rate": 6.996356480346404e-05, "loss": 0.0254, "step": 40080 }, { "epoch": 14.84265087004813, "grad_norm": 0.3959748446941376, "learning_rate": 6.99484041580794e-05, "loss": 0.0202, "step": 40090 }, { "epoch": 14.846353202517586, "grad_norm": 0.20206576585769653, "learning_rate": 6.993324133116726e-05, "loss": 0.0247, "step": 40100 }, { "epoch": 14.850055534987042, "grad_norm": 0.27631470561027527, "learning_rate": 6.991807632438582e-05, "loss": 0.0214, "step": 40110 }, { "epoch": 14.853757867456498, "grad_norm": 0.12643803656101227, "learning_rate": 6.99029091393935e-05, "loss": 0.0228, "step": 40120 }, { "epoch": 14.857460199925953, "grad_norm": 0.3609204888343811, "learning_rate": 6.988773977784895e-05, "loss": 0.0274, "step": 40130 }, { "epoch": 14.86116253239541, "grad_norm": 0.11169011145830154, "learning_rate": 6.987256824141109e-05, "loss": 0.0188, "step": 40140 }, { "epoch": 14.864864864864865, "grad_norm": 0.34942886233329773, "learning_rate": 6.985739453173903e-05, "loss": 0.019, "step": 40150 }, { "epoch": 14.86856719733432, "grad_norm": 0.1718326210975647, "learning_rate": 6.984221865049218e-05, "loss": 0.017, "step": 40160 }, { "epoch": 14.872269529803777, "grad_norm": 0.3012423515319824, "learning_rate": 6.982704059933011e-05, "loss": 0.0168, "step": 40170 }, { "epoch": 14.875971862273232, "grad_norm": 0.40521442890167236, "learning_rate": 6.981186037991271e-05, "loss": 0.0282, "step": 40180 }, { "epoch": 14.879674194742687, "grad_norm": 0.32173529267311096, "learning_rate": 6.979667799390004e-05, "loss": 0.0281, "step": 40190 }, { "epoch": 14.883376527212144, "grad_norm": 0.2488379329442978, "learning_rate": 6.978149344295242e-05, "loss": 0.0171, "step": 40200 }, { "epoch": 14.8870788596816, "grad_norm": 1.0374035835266113, "learning_rate": 6.976630672873042e-05, "loss": 0.0347, "step": 40210 }, { "epoch": 14.890781192151055, "grad_norm": 0.2586594820022583, "learning_rate": 6.975111785289485e-05, "loss": 0.0255, "step": 40220 }, { "epoch": 14.894483524620512, "grad_norm": 0.3458606004714966, "learning_rate": 6.97359268171067e-05, "loss": 0.0213, "step": 40230 }, { "epoch": 14.898185857089967, "grad_norm": 0.41025033593177795, "learning_rate": 6.97207336230273e-05, "loss": 0.0352, "step": 40240 }, { "epoch": 14.901888189559422, "grad_norm": 0.15898874402046204, "learning_rate": 6.97055382723181e-05, "loss": 0.0262, "step": 40250 }, { "epoch": 14.905590522028879, "grad_norm": 0.2963669002056122, "learning_rate": 6.969034076664085e-05, "loss": 0.0361, "step": 40260 }, { "epoch": 14.909292854498334, "grad_norm": 0.16427883505821228, "learning_rate": 6.967514110765755e-05, "loss": 0.0252, "step": 40270 }, { "epoch": 14.91299518696779, "grad_norm": 0.5420126914978027, "learning_rate": 6.965993929703039e-05, "loss": 0.0364, "step": 40280 }, { "epoch": 14.916697519437246, "grad_norm": 0.15442068874835968, "learning_rate": 6.964473533642185e-05, "loss": 0.0189, "step": 40290 }, { "epoch": 14.920399851906701, "grad_norm": 0.3201170861721039, "learning_rate": 6.962952922749457e-05, "loss": 0.016, "step": 40300 }, { "epoch": 14.924102184376157, "grad_norm": 0.22061944007873535, "learning_rate": 6.96143209719115e-05, "loss": 0.0289, "step": 40310 }, { "epoch": 14.927804516845613, "grad_norm": 0.1840021163225174, "learning_rate": 6.959911057133577e-05, "loss": 0.0286, "step": 40320 }, { "epoch": 14.931506849315069, "grad_norm": 0.20284251868724823, "learning_rate": 6.958389802743077e-05, "loss": 0.0177, "step": 40330 }, { "epoch": 14.935209181784524, "grad_norm": 0.17667190730571747, "learning_rate": 6.956868334186013e-05, "loss": 0.0229, "step": 40340 }, { "epoch": 14.93891151425398, "grad_norm": 0.20958876609802246, "learning_rate": 6.955346651628771e-05, "loss": 0.0284, "step": 40350 }, { "epoch": 14.942613846723436, "grad_norm": 0.1810120940208435, "learning_rate": 6.953824755237756e-05, "loss": 0.0326, "step": 40360 }, { "epoch": 14.946316179192891, "grad_norm": 0.18624122440814972, "learning_rate": 6.952302645179408e-05, "loss": 0.0187, "step": 40370 }, { "epoch": 14.950018511662348, "grad_norm": 0.48725590109825134, "learning_rate": 6.950780321620174e-05, "loss": 0.02, "step": 40380 }, { "epoch": 14.953720844131803, "grad_norm": 0.4311034083366394, "learning_rate": 6.949257784726539e-05, "loss": 0.0172, "step": 40390 }, { "epoch": 14.957423176601258, "grad_norm": 0.1625194251537323, "learning_rate": 6.947735034665002e-05, "loss": 0.0333, "step": 40400 }, { "epoch": 14.961125509070715, "grad_norm": 0.3741188943386078, "learning_rate": 6.94621207160209e-05, "loss": 0.0259, "step": 40410 }, { "epoch": 14.96482784154017, "grad_norm": 0.23572590947151184, "learning_rate": 6.944688895704353e-05, "loss": 0.0151, "step": 40420 }, { "epoch": 14.968530174009626, "grad_norm": 0.3628097474575043, "learning_rate": 6.94316550713836e-05, "loss": 0.0276, "step": 40430 }, { "epoch": 14.97223250647908, "grad_norm": 0.24358290433883667, "learning_rate": 6.941641906070712e-05, "loss": 0.0299, "step": 40440 }, { "epoch": 14.975934838948538, "grad_norm": 0.22843994200229645, "learning_rate": 6.940118092668022e-05, "loss": 0.0274, "step": 40450 }, { "epoch": 14.979637171417993, "grad_norm": 0.5988480448722839, "learning_rate": 6.938594067096936e-05, "loss": 0.03, "step": 40460 }, { "epoch": 14.98333950388745, "grad_norm": 0.4198873043060303, "learning_rate": 6.937069829524116e-05, "loss": 0.0331, "step": 40470 }, { "epoch": 14.987041836356905, "grad_norm": 0.1754336804151535, "learning_rate": 6.935545380116253e-05, "loss": 0.0251, "step": 40480 }, { "epoch": 14.99074416882636, "grad_norm": 0.571819007396698, "learning_rate": 6.934020719040056e-05, "loss": 0.0252, "step": 40490 }, { "epoch": 14.994446501295815, "grad_norm": 0.7911757230758667, "learning_rate": 6.932495846462261e-05, "loss": 0.0191, "step": 40500 }, { "epoch": 14.998148833765272, "grad_norm": 0.23711197078227997, "learning_rate": 6.930970762549628e-05, "loss": 0.0371, "step": 40510 }, { "epoch": 15.001851166234728, "grad_norm": 0.15055884420871735, "learning_rate": 6.929445467468933e-05, "loss": 0.0252, "step": 40520 }, { "epoch": 15.005553498704183, "grad_norm": 0.3014999032020569, "learning_rate": 6.927919961386984e-05, "loss": 0.0345, "step": 40530 }, { "epoch": 15.00925583117364, "grad_norm": 0.29056307673454285, "learning_rate": 6.926394244470605e-05, "loss": 0.0178, "step": 40540 }, { "epoch": 15.012958163643095, "grad_norm": 0.4473782479763031, "learning_rate": 6.924868316886649e-05, "loss": 0.0208, "step": 40550 }, { "epoch": 15.01666049611255, "grad_norm": 0.3581819534301758, "learning_rate": 6.923342178801988e-05, "loss": 0.0192, "step": 40560 }, { "epoch": 15.020362828582007, "grad_norm": 0.2266453057527542, "learning_rate": 6.921815830383518e-05, "loss": 0.0377, "step": 40570 }, { "epoch": 15.024065161051462, "grad_norm": 0.6855615973472595, "learning_rate": 6.920289271798157e-05, "loss": 0.0261, "step": 40580 }, { "epoch": 15.027767493520917, "grad_norm": 0.3609178960323334, "learning_rate": 6.918762503212848e-05, "loss": 0.0312, "step": 40590 }, { "epoch": 15.031469825990374, "grad_norm": 0.18669673800468445, "learning_rate": 6.917235524794558e-05, "loss": 0.0304, "step": 40600 }, { "epoch": 15.03517215845983, "grad_norm": 0.3987496495246887, "learning_rate": 6.915708336710273e-05, "loss": 0.027, "step": 40610 }, { "epoch": 15.038874490929285, "grad_norm": 0.32693150639533997, "learning_rate": 6.914180939127005e-05, "loss": 0.0258, "step": 40620 }, { "epoch": 15.042576823398742, "grad_norm": 0.27239733934402466, "learning_rate": 6.912653332211787e-05, "loss": 0.0247, "step": 40630 }, { "epoch": 15.046279155868197, "grad_norm": 0.5557066798210144, "learning_rate": 6.911125516131677e-05, "loss": 0.0272, "step": 40640 }, { "epoch": 15.049981488337652, "grad_norm": 0.2685360014438629, "learning_rate": 6.909597491053751e-05, "loss": 0.0285, "step": 40650 }, { "epoch": 15.053683820807109, "grad_norm": 0.21370412409305573, "learning_rate": 6.908069257145118e-05, "loss": 0.0232, "step": 40660 }, { "epoch": 15.057386153276564, "grad_norm": 0.15095938742160797, "learning_rate": 6.9065408145729e-05, "loss": 0.0344, "step": 40670 }, { "epoch": 15.06108848574602, "grad_norm": 0.25012508034706116, "learning_rate": 6.905012163504241e-05, "loss": 0.0281, "step": 40680 }, { "epoch": 15.064790818215476, "grad_norm": 0.7312056422233582, "learning_rate": 6.903483304106319e-05, "loss": 0.0197, "step": 40690 }, { "epoch": 15.068493150684931, "grad_norm": 0.17698191106319427, "learning_rate": 6.901954236546323e-05, "loss": 0.0232, "step": 40700 }, { "epoch": 15.072195483154387, "grad_norm": 0.18427209556102753, "learning_rate": 6.900424960991473e-05, "loss": 0.024, "step": 40710 }, { "epoch": 15.075897815623843, "grad_norm": 0.4393167793750763, "learning_rate": 6.898895477609007e-05, "loss": 0.0239, "step": 40720 }, { "epoch": 15.079600148093299, "grad_norm": 0.701984703540802, "learning_rate": 6.897365786566184e-05, "loss": 0.0336, "step": 40730 }, { "epoch": 15.083302480562754, "grad_norm": 0.42192524671554565, "learning_rate": 6.895835888030292e-05, "loss": 0.0215, "step": 40740 }, { "epoch": 15.08700481303221, "grad_norm": 0.6490359306335449, "learning_rate": 6.894305782168638e-05, "loss": 0.0213, "step": 40750 }, { "epoch": 15.090707145501666, "grad_norm": 0.3324282765388489, "learning_rate": 6.892775469148553e-05, "loss": 0.0243, "step": 40760 }, { "epoch": 15.094409477971121, "grad_norm": 0.4745471775531769, "learning_rate": 6.891244949137387e-05, "loss": 0.0313, "step": 40770 }, { "epoch": 15.098111810440578, "grad_norm": 0.17786738276481628, "learning_rate": 6.889714222302517e-05, "loss": 0.02, "step": 40780 }, { "epoch": 15.101814142910033, "grad_norm": 0.1414593607187271, "learning_rate": 6.888183288811341e-05, "loss": 0.0242, "step": 40790 }, { "epoch": 15.105516475379488, "grad_norm": 0.3761819899082184, "learning_rate": 6.886652148831279e-05, "loss": 0.026, "step": 40800 }, { "epoch": 15.109218807848945, "grad_norm": 0.3519703149795532, "learning_rate": 6.885120802529775e-05, "loss": 0.0214, "step": 40810 }, { "epoch": 15.1129211403184, "grad_norm": 0.1444690078496933, "learning_rate": 6.883589250074293e-05, "loss": 0.0223, "step": 40820 }, { "epoch": 15.116623472787856, "grad_norm": 0.15675513446331024, "learning_rate": 6.882057491632326e-05, "loss": 0.0177, "step": 40830 }, { "epoch": 15.120325805257313, "grad_norm": 0.3910374343395233, "learning_rate": 6.880525527371378e-05, "loss": 0.0208, "step": 40840 }, { "epoch": 15.124028137726768, "grad_norm": 0.22619715332984924, "learning_rate": 6.878993357458986e-05, "loss": 0.0209, "step": 40850 }, { "epoch": 15.127730470196223, "grad_norm": 0.20761840045452118, "learning_rate": 6.877460982062706e-05, "loss": 0.0203, "step": 40860 }, { "epoch": 15.13143280266568, "grad_norm": 0.2735730707645416, "learning_rate": 6.875928401350116e-05, "loss": 0.0258, "step": 40870 }, { "epoch": 15.135135135135135, "grad_norm": 0.18701955676078796, "learning_rate": 6.874395615488815e-05, "loss": 0.0239, "step": 40880 }, { "epoch": 15.13883746760459, "grad_norm": 0.22837327420711517, "learning_rate": 6.87286262464643e-05, "loss": 0.0218, "step": 40890 }, { "epoch": 15.142539800074047, "grad_norm": 0.1116182878613472, "learning_rate": 6.871329428990602e-05, "loss": 0.0302, "step": 40900 }, { "epoch": 15.146242132543502, "grad_norm": 0.3159089684486389, "learning_rate": 6.869796028689001e-05, "loss": 0.0315, "step": 40910 }, { "epoch": 15.149944465012958, "grad_norm": 0.2604732811450958, "learning_rate": 6.86826242390932e-05, "loss": 0.0278, "step": 40920 }, { "epoch": 15.153646797482414, "grad_norm": 0.35874301195144653, "learning_rate": 6.866728614819268e-05, "loss": 0.042, "step": 40930 }, { "epoch": 15.15734912995187, "grad_norm": 0.24515709280967712, "learning_rate": 6.865194601586581e-05, "loss": 0.0296, "step": 40940 }, { "epoch": 15.161051462421325, "grad_norm": 0.23425614833831787, "learning_rate": 6.863660384379017e-05, "loss": 0.0215, "step": 40950 }, { "epoch": 15.164753794890782, "grad_norm": 0.21714183688163757, "learning_rate": 6.862125963364354e-05, "loss": 0.0219, "step": 40960 }, { "epoch": 15.168456127360237, "grad_norm": 0.18238940834999084, "learning_rate": 6.860591338710397e-05, "loss": 0.0232, "step": 40970 }, { "epoch": 15.172158459829692, "grad_norm": 0.25769951939582825, "learning_rate": 6.85905651058497e-05, "loss": 0.0238, "step": 40980 }, { "epoch": 15.175860792299149, "grad_norm": 0.5633150339126587, "learning_rate": 6.857521479155915e-05, "loss": 0.021, "step": 40990 }, { "epoch": 15.179563124768604, "grad_norm": 0.1870492696762085, "learning_rate": 6.855986244591104e-05, "loss": 0.0174, "step": 41000 }, { "epoch": 15.18326545723806, "grad_norm": 0.4252515137195587, "learning_rate": 6.854450807058429e-05, "loss": 0.0282, "step": 41010 }, { "epoch": 15.186967789707516, "grad_norm": 0.11820712685585022, "learning_rate": 6.852915166725802e-05, "loss": 0.0192, "step": 41020 }, { "epoch": 15.190670122176972, "grad_norm": 0.31447941064834595, "learning_rate": 6.851379323761157e-05, "loss": 0.023, "step": 41030 }, { "epoch": 15.194372454646427, "grad_norm": 0.18834054470062256, "learning_rate": 6.849843278332453e-05, "loss": 0.0269, "step": 41040 }, { "epoch": 15.198074787115884, "grad_norm": 0.40318456292152405, "learning_rate": 6.84830703060767e-05, "loss": 0.0216, "step": 41050 }, { "epoch": 15.201777119585339, "grad_norm": 0.25808340311050415, "learning_rate": 6.846770580754807e-05, "loss": 0.0319, "step": 41060 }, { "epoch": 15.205479452054794, "grad_norm": 0.20109713077545166, "learning_rate": 6.845233928941892e-05, "loss": 0.0197, "step": 41070 }, { "epoch": 15.209181784524251, "grad_norm": 0.19825351238250732, "learning_rate": 6.843697075336969e-05, "loss": 0.0246, "step": 41080 }, { "epoch": 15.212884116993706, "grad_norm": 0.16457852721214294, "learning_rate": 6.842160020108104e-05, "loss": 0.018, "step": 41090 }, { "epoch": 15.216586449463161, "grad_norm": 0.19861875474452972, "learning_rate": 6.840622763423391e-05, "loss": 0.023, "step": 41100 }, { "epoch": 15.220288781932618, "grad_norm": 0.33185702562332153, "learning_rate": 6.839085305450938e-05, "loss": 0.0209, "step": 41110 }, { "epoch": 15.223991114402073, "grad_norm": 0.14957788586616516, "learning_rate": 6.837547646358882e-05, "loss": 0.0285, "step": 41120 }, { "epoch": 15.227693446871529, "grad_norm": 0.2711164951324463, "learning_rate": 6.836009786315377e-05, "loss": 0.0312, "step": 41130 }, { "epoch": 15.231395779340986, "grad_norm": 0.18385683000087738, "learning_rate": 6.834471725488604e-05, "loss": 0.03, "step": 41140 }, { "epoch": 15.23509811181044, "grad_norm": 0.2455144077539444, "learning_rate": 6.83293346404676e-05, "loss": 0.03, "step": 41150 }, { "epoch": 15.238800444279896, "grad_norm": 0.21452850103378296, "learning_rate": 6.831395002158067e-05, "loss": 0.0184, "step": 41160 }, { "epoch": 15.242502776749353, "grad_norm": 0.23084454238414764, "learning_rate": 6.829856339990772e-05, "loss": 0.0208, "step": 41170 }, { "epoch": 15.246205109218808, "grad_norm": 0.31384772062301636, "learning_rate": 6.82831747771314e-05, "loss": 0.0196, "step": 41180 }, { "epoch": 15.249907441688263, "grad_norm": 0.38093292713165283, "learning_rate": 6.826778415493455e-05, "loss": 0.0241, "step": 41190 }, { "epoch": 15.25360977415772, "grad_norm": 0.19891096651554108, "learning_rate": 6.825239153500029e-05, "loss": 0.0182, "step": 41200 }, { "epoch": 15.257312106627175, "grad_norm": 0.15482871234416962, "learning_rate": 6.823699691901196e-05, "loss": 0.0148, "step": 41210 }, { "epoch": 15.26101443909663, "grad_norm": 0.34712639451026917, "learning_rate": 6.822160030865303e-05, "loss": 0.0213, "step": 41220 }, { "epoch": 15.264716771566087, "grad_norm": 0.6000791192054749, "learning_rate": 6.820620170560731e-05, "loss": 0.0244, "step": 41230 }, { "epoch": 15.268419104035543, "grad_norm": 0.19847790896892548, "learning_rate": 6.819080111155873e-05, "loss": 0.0172, "step": 41240 }, { "epoch": 15.272121436504998, "grad_norm": 0.22355090081691742, "learning_rate": 6.817539852819149e-05, "loss": 0.0193, "step": 41250 }, { "epoch": 15.275823768974455, "grad_norm": 0.1967047154903412, "learning_rate": 6.815999395719e-05, "loss": 0.0183, "step": 41260 }, { "epoch": 15.27952610144391, "grad_norm": 0.38176074624061584, "learning_rate": 6.814458740023889e-05, "loss": 0.0143, "step": 41270 }, { "epoch": 15.283228433913365, "grad_norm": 0.0929698497056961, "learning_rate": 6.812917885902295e-05, "loss": 0.0268, "step": 41280 }, { "epoch": 15.286930766382822, "grad_norm": 0.18959970772266388, "learning_rate": 6.811376833522729e-05, "loss": 0.0123, "step": 41290 }, { "epoch": 15.290633098852277, "grad_norm": 0.27335575222969055, "learning_rate": 6.809835583053715e-05, "loss": 0.0264, "step": 41300 }, { "epoch": 15.294335431321732, "grad_norm": 0.8134138584136963, "learning_rate": 6.808294134663803e-05, "loss": 0.0221, "step": 41310 }, { "epoch": 15.29803776379119, "grad_norm": 0.7058496475219727, "learning_rate": 6.806752488521562e-05, "loss": 0.0356, "step": 41320 }, { "epoch": 15.301740096260644, "grad_norm": 0.1715216040611267, "learning_rate": 6.805210644795588e-05, "loss": 0.0202, "step": 41330 }, { "epoch": 15.3054424287301, "grad_norm": 0.4880825877189636, "learning_rate": 6.80366860365449e-05, "loss": 0.022, "step": 41340 }, { "epoch": 15.309144761199555, "grad_norm": 0.20595717430114746, "learning_rate": 6.802126365266905e-05, "loss": 0.0285, "step": 41350 }, { "epoch": 15.312847093669012, "grad_norm": 0.17370112240314484, "learning_rate": 6.80058392980149e-05, "loss": 0.0309, "step": 41360 }, { "epoch": 15.316549426138467, "grad_norm": 0.2041434645652771, "learning_rate": 6.799041297426925e-05, "loss": 0.0146, "step": 41370 }, { "epoch": 15.320251758607924, "grad_norm": 0.23945850133895874, "learning_rate": 6.797498468311907e-05, "loss": 0.0344, "step": 41380 }, { "epoch": 15.323954091077379, "grad_norm": 0.3069162964820862, "learning_rate": 6.795955442625159e-05, "loss": 0.0243, "step": 41390 }, { "epoch": 15.327656423546834, "grad_norm": 1.7264478206634521, "learning_rate": 6.794412220535426e-05, "loss": 0.0218, "step": 41400 }, { "epoch": 15.33135875601629, "grad_norm": 0.32798367738723755, "learning_rate": 6.792868802211468e-05, "loss": 0.0181, "step": 41410 }, { "epoch": 15.335061088485746, "grad_norm": 0.3766164779663086, "learning_rate": 6.791325187822075e-05, "loss": 0.0158, "step": 41420 }, { "epoch": 15.338763420955202, "grad_norm": 0.3174756169319153, "learning_rate": 6.789781377536052e-05, "loss": 0.0259, "step": 41430 }, { "epoch": 15.342465753424657, "grad_norm": 0.3166009485721588, "learning_rate": 6.788237371522229e-05, "loss": 0.0283, "step": 41440 }, { "epoch": 15.346168085894114, "grad_norm": 0.4599932134151459, "learning_rate": 6.786693169949455e-05, "loss": 0.0271, "step": 41450 }, { "epoch": 15.349870418363569, "grad_norm": 0.45090189576148987, "learning_rate": 6.785148772986603e-05, "loss": 0.0266, "step": 41460 }, { "epoch": 15.353572750833024, "grad_norm": 0.16595573723316193, "learning_rate": 6.783604180802563e-05, "loss": 0.0227, "step": 41470 }, { "epoch": 15.357275083302481, "grad_norm": 0.3519645035266876, "learning_rate": 6.782059393566253e-05, "loss": 0.0208, "step": 41480 }, { "epoch": 15.360977415771936, "grad_norm": 0.23988249897956848, "learning_rate": 6.780514411446608e-05, "loss": 0.0168, "step": 41490 }, { "epoch": 15.364679748241391, "grad_norm": 0.28564631938934326, "learning_rate": 6.778969234612584e-05, "loss": 0.021, "step": 41500 }, { "epoch": 15.368382080710848, "grad_norm": 0.43347451090812683, "learning_rate": 6.777423863233157e-05, "loss": 0.0253, "step": 41510 }, { "epoch": 15.372084413180303, "grad_norm": 0.3801391124725342, "learning_rate": 6.77587829747733e-05, "loss": 0.0291, "step": 41520 }, { "epoch": 15.375786745649759, "grad_norm": 0.18927481770515442, "learning_rate": 6.774332537514122e-05, "loss": 0.0195, "step": 41530 }, { "epoch": 15.379489078119216, "grad_norm": 0.3651718199253082, "learning_rate": 6.772786583512575e-05, "loss": 0.0215, "step": 41540 }, { "epoch": 15.38319141058867, "grad_norm": 0.6399316191673279, "learning_rate": 6.771240435641754e-05, "loss": 0.0309, "step": 41550 }, { "epoch": 15.386893743058126, "grad_norm": 0.1776842474937439, "learning_rate": 6.76969409407074e-05, "loss": 0.0167, "step": 41560 }, { "epoch": 15.390596075527583, "grad_norm": 0.24985556304454803, "learning_rate": 6.768147558968641e-05, "loss": 0.019, "step": 41570 }, { "epoch": 15.394298407997038, "grad_norm": 0.2789192795753479, "learning_rate": 6.766600830504585e-05, "loss": 0.0318, "step": 41580 }, { "epoch": 15.398000740466493, "grad_norm": 0.5162339210510254, "learning_rate": 6.765053908847716e-05, "loss": 0.0243, "step": 41590 }, { "epoch": 15.40170307293595, "grad_norm": 0.28814560174942017, "learning_rate": 6.763506794167208e-05, "loss": 0.0377, "step": 41600 }, { "epoch": 15.405405405405405, "grad_norm": 0.29129892587661743, "learning_rate": 6.761959486632244e-05, "loss": 0.0305, "step": 41610 }, { "epoch": 15.40910773787486, "grad_norm": 0.3224940896034241, "learning_rate": 6.760411986412043e-05, "loss": 0.0216, "step": 41620 }, { "epoch": 15.412810070344317, "grad_norm": 0.19361793994903564, "learning_rate": 6.758864293675833e-05, "loss": 0.0339, "step": 41630 }, { "epoch": 15.416512402813773, "grad_norm": 0.1860332190990448, "learning_rate": 6.757316408592868e-05, "loss": 0.0201, "step": 41640 }, { "epoch": 15.420214735283228, "grad_norm": 0.7430813908576965, "learning_rate": 6.755768331332424e-05, "loss": 0.0234, "step": 41650 }, { "epoch": 15.423917067752685, "grad_norm": 0.17155176401138306, "learning_rate": 6.754220062063793e-05, "loss": 0.0217, "step": 41660 }, { "epoch": 15.42761940022214, "grad_norm": 0.7215065360069275, "learning_rate": 6.752671600956295e-05, "loss": 0.0268, "step": 41670 }, { "epoch": 15.431321732691595, "grad_norm": 0.1315428465604782, "learning_rate": 6.751122948179266e-05, "loss": 0.0348, "step": 41680 }, { "epoch": 15.435024065161052, "grad_norm": 0.13680103421211243, "learning_rate": 6.749574103902064e-05, "loss": 0.0148, "step": 41690 }, { "epoch": 15.438726397630507, "grad_norm": 0.1810569316148758, "learning_rate": 6.748025068294067e-05, "loss": 0.0272, "step": 41700 }, { "epoch": 15.442428730099962, "grad_norm": 0.20881006121635437, "learning_rate": 6.746475841524677e-05, "loss": 0.0215, "step": 41710 }, { "epoch": 15.44613106256942, "grad_norm": 0.21061964333057404, "learning_rate": 6.744926423763317e-05, "loss": 0.023, "step": 41720 }, { "epoch": 15.449833395038874, "grad_norm": 0.32841232419013977, "learning_rate": 6.743376815179424e-05, "loss": 0.0163, "step": 41730 }, { "epoch": 15.45353572750833, "grad_norm": 0.2569274306297302, "learning_rate": 6.741827015942465e-05, "loss": 0.015, "step": 41740 }, { "epoch": 15.457238059977787, "grad_norm": 0.2570202052593231, "learning_rate": 6.740277026221923e-05, "loss": 0.0151, "step": 41750 }, { "epoch": 15.460940392447242, "grad_norm": 0.14956162869930267, "learning_rate": 6.7387268461873e-05, "loss": 0.0238, "step": 41760 }, { "epoch": 15.464642724916697, "grad_norm": 0.2450091391801834, "learning_rate": 6.737176476008125e-05, "loss": 0.0237, "step": 41770 }, { "epoch": 15.468345057386154, "grad_norm": 0.2268373817205429, "learning_rate": 6.735625915853942e-05, "loss": 0.0189, "step": 41780 }, { "epoch": 15.472047389855609, "grad_norm": 0.19353851675987244, "learning_rate": 6.734075165894317e-05, "loss": 0.0201, "step": 41790 }, { "epoch": 15.475749722325064, "grad_norm": 0.14143963158130646, "learning_rate": 6.732524226298841e-05, "loss": 0.025, "step": 41800 }, { "epoch": 15.479452054794521, "grad_norm": 0.3313455283641815, "learning_rate": 6.730973097237119e-05, "loss": 0.0209, "step": 41810 }, { "epoch": 15.483154387263976, "grad_norm": 0.17394450306892395, "learning_rate": 6.729421778878781e-05, "loss": 0.0233, "step": 41820 }, { "epoch": 15.486856719733431, "grad_norm": 0.2848356366157532, "learning_rate": 6.72787027139348e-05, "loss": 0.0253, "step": 41830 }, { "epoch": 15.490559052202888, "grad_norm": 0.1648077815771103, "learning_rate": 6.72631857495088e-05, "loss": 0.0184, "step": 41840 }, { "epoch": 15.494261384672344, "grad_norm": 0.2815701961517334, "learning_rate": 6.72476668972068e-05, "loss": 0.032, "step": 41850 }, { "epoch": 15.497963717141799, "grad_norm": 0.288260817527771, "learning_rate": 6.723214615872585e-05, "loss": 0.0258, "step": 41860 }, { "epoch": 15.501666049611256, "grad_norm": 0.15076924860477448, "learning_rate": 6.721662353576332e-05, "loss": 0.0241, "step": 41870 }, { "epoch": 15.50536838208071, "grad_norm": 0.24083437025547028, "learning_rate": 6.720109903001674e-05, "loss": 0.0235, "step": 41880 }, { "epoch": 15.509070714550166, "grad_norm": 0.4405452013015747, "learning_rate": 6.71855726431838e-05, "loss": 0.0183, "step": 41890 }, { "epoch": 15.512773047019623, "grad_norm": 0.1304732859134674, "learning_rate": 6.71700443769625e-05, "loss": 0.0321, "step": 41900 }, { "epoch": 15.516475379489078, "grad_norm": 0.19012312591075897, "learning_rate": 6.715451423305094e-05, "loss": 0.0195, "step": 41910 }, { "epoch": 15.520177711958533, "grad_norm": 0.18425814807415009, "learning_rate": 6.713898221314749e-05, "loss": 0.0243, "step": 41920 }, { "epoch": 15.52388004442799, "grad_norm": 0.2500613331794739, "learning_rate": 6.712344831895074e-05, "loss": 0.0234, "step": 41930 }, { "epoch": 15.527582376897445, "grad_norm": 0.10120757669210434, "learning_rate": 6.710791255215943e-05, "loss": 0.0277, "step": 41940 }, { "epoch": 15.5312847093669, "grad_norm": 0.4970211982727051, "learning_rate": 6.709237491447249e-05, "loss": 0.023, "step": 41950 }, { "epoch": 15.534987041836358, "grad_norm": 0.27100658416748047, "learning_rate": 6.707683540758915e-05, "loss": 0.0273, "step": 41960 }, { "epoch": 15.538689374305813, "grad_norm": 0.18096110224723816, "learning_rate": 6.706129403320876e-05, "loss": 0.0184, "step": 41970 }, { "epoch": 15.542391706775268, "grad_norm": 0.14375914633274078, "learning_rate": 6.70457507930309e-05, "loss": 0.0213, "step": 41980 }, { "epoch": 15.546094039244725, "grad_norm": 0.245061993598938, "learning_rate": 6.703020568875538e-05, "loss": 0.0268, "step": 41990 }, { "epoch": 15.54979637171418, "grad_norm": 0.206811785697937, "learning_rate": 6.701465872208216e-05, "loss": 0.0175, "step": 42000 }, { "epoch": 15.553498704183635, "grad_norm": 0.173117995262146, "learning_rate": 6.699910989471143e-05, "loss": 0.0151, "step": 42010 }, { "epoch": 15.557201036653092, "grad_norm": 0.23862622678279877, "learning_rate": 6.698355920834363e-05, "loss": 0.0295, "step": 42020 }, { "epoch": 15.560903369122547, "grad_norm": 0.14821362495422363, "learning_rate": 6.69680066646793e-05, "loss": 0.0235, "step": 42030 }, { "epoch": 15.564605701592003, "grad_norm": 0.3371911346912384, "learning_rate": 6.69524522654193e-05, "loss": 0.0228, "step": 42040 }, { "epoch": 15.56830803406146, "grad_norm": 0.14362221956253052, "learning_rate": 6.693689601226458e-05, "loss": 0.021, "step": 42050 }, { "epoch": 15.572010366530915, "grad_norm": 0.1536392718553543, "learning_rate": 6.692133790691639e-05, "loss": 0.0214, "step": 42060 }, { "epoch": 15.57571269900037, "grad_norm": 0.21152663230895996, "learning_rate": 6.690577795107611e-05, "loss": 0.0291, "step": 42070 }, { "epoch": 15.579415031469827, "grad_norm": 0.28970420360565186, "learning_rate": 6.689021614644538e-05, "loss": 0.0143, "step": 42080 }, { "epoch": 15.583117363939282, "grad_norm": 0.2992527186870575, "learning_rate": 6.687465249472603e-05, "loss": 0.0139, "step": 42090 }, { "epoch": 15.586819696408737, "grad_norm": 0.7999166250228882, "learning_rate": 6.685908699762002e-05, "loss": 0.0281, "step": 42100 }, { "epoch": 15.590522028878194, "grad_norm": 1.2759608030319214, "learning_rate": 6.684351965682962e-05, "loss": 0.0182, "step": 42110 }, { "epoch": 15.59422436134765, "grad_norm": 0.31281718611717224, "learning_rate": 6.682795047405724e-05, "loss": 0.0258, "step": 42120 }, { "epoch": 15.597926693817104, "grad_norm": 0.38452059030532837, "learning_rate": 6.681237945100548e-05, "loss": 0.0218, "step": 42130 }, { "epoch": 15.601629026286561, "grad_norm": 0.13771966099739075, "learning_rate": 6.679680658937719e-05, "loss": 0.0141, "step": 42140 }, { "epoch": 15.605331358756017, "grad_norm": 0.16483096778392792, "learning_rate": 6.67812318908754e-05, "loss": 0.0178, "step": 42150 }, { "epoch": 15.609033691225472, "grad_norm": 0.12259380519390106, "learning_rate": 6.67656553572033e-05, "loss": 0.0237, "step": 42160 }, { "epoch": 15.612736023694929, "grad_norm": 0.3597683608531952, "learning_rate": 6.675007699006433e-05, "loss": 0.0186, "step": 42170 }, { "epoch": 15.616438356164384, "grad_norm": 0.23175254464149475, "learning_rate": 6.673449679116215e-05, "loss": 0.0151, "step": 42180 }, { "epoch": 15.620140688633839, "grad_norm": 0.20676212012767792, "learning_rate": 6.671891476220055e-05, "loss": 0.0164, "step": 42190 }, { "epoch": 15.623843021103294, "grad_norm": 0.2623026669025421, "learning_rate": 6.670333090488356e-05, "loss": 0.0222, "step": 42200 }, { "epoch": 15.627545353572751, "grad_norm": 0.3259486258029938, "learning_rate": 6.668774522091542e-05, "loss": 0.0293, "step": 42210 }, { "epoch": 15.631247686042206, "grad_norm": 0.16215746104717255, "learning_rate": 6.667215771200055e-05, "loss": 0.0177, "step": 42220 }, { "epoch": 15.634950018511663, "grad_norm": 0.13125400245189667, "learning_rate": 6.665656837984359e-05, "loss": 0.0309, "step": 42230 }, { "epoch": 15.638652350981118, "grad_norm": 0.35921552777290344, "learning_rate": 6.664097722614934e-05, "loss": 0.0241, "step": 42240 }, { "epoch": 15.642354683450574, "grad_norm": 0.27984270453453064, "learning_rate": 6.662538425262285e-05, "loss": 0.0276, "step": 42250 }, { "epoch": 15.646057015920029, "grad_norm": 0.1704311966896057, "learning_rate": 6.660978946096933e-05, "loss": 0.0265, "step": 42260 }, { "epoch": 15.649759348389486, "grad_norm": 0.28983378410339355, "learning_rate": 6.65941928528942e-05, "loss": 0.0356, "step": 42270 }, { "epoch": 15.65346168085894, "grad_norm": 0.27853065729141235, "learning_rate": 6.657859443010311e-05, "loss": 0.0215, "step": 42280 }, { "epoch": 15.657164013328398, "grad_norm": 0.3486945629119873, "learning_rate": 6.656299419430183e-05, "loss": 0.0176, "step": 42290 }, { "epoch": 15.660866345797853, "grad_norm": 0.458509236574173, "learning_rate": 6.654739214719641e-05, "loss": 0.0237, "step": 42300 }, { "epoch": 15.664568678267308, "grad_norm": 0.2976602017879486, "learning_rate": 6.653178829049308e-05, "loss": 0.0191, "step": 42310 }, { "epoch": 15.668271010736763, "grad_norm": 0.3283105194568634, "learning_rate": 6.651618262589823e-05, "loss": 0.0296, "step": 42320 }, { "epoch": 15.67197334320622, "grad_norm": 0.4949652850627899, "learning_rate": 6.650057515511848e-05, "loss": 0.0331, "step": 42330 }, { "epoch": 15.675675675675675, "grad_norm": 0.1982576698064804, "learning_rate": 6.648496587986065e-05, "loss": 0.0246, "step": 42340 }, { "epoch": 15.67937800814513, "grad_norm": 0.19603122770786285, "learning_rate": 6.646935480183173e-05, "loss": 0.0186, "step": 42350 }, { "epoch": 15.683080340614588, "grad_norm": 0.1714480221271515, "learning_rate": 6.645374192273894e-05, "loss": 0.0252, "step": 42360 }, { "epoch": 15.686782673084043, "grad_norm": 0.19285650551319122, "learning_rate": 6.643812724428964e-05, "loss": 0.0211, "step": 42370 }, { "epoch": 15.690485005553498, "grad_norm": 0.13395191729068756, "learning_rate": 6.642251076819148e-05, "loss": 0.0221, "step": 42380 }, { "epoch": 15.694187338022955, "grad_norm": 0.4709298312664032, "learning_rate": 6.640689249615223e-05, "loss": 0.0241, "step": 42390 }, { "epoch": 15.69788967049241, "grad_norm": 0.11415750533342361, "learning_rate": 6.639127242987988e-05, "loss": 0.0217, "step": 42400 }, { "epoch": 15.701592002961865, "grad_norm": 0.12035747617483139, "learning_rate": 6.637565057108264e-05, "loss": 0.0243, "step": 42410 }, { "epoch": 15.705294335431322, "grad_norm": 0.2543218731880188, "learning_rate": 6.636002692146886e-05, "loss": 0.0208, "step": 42420 }, { "epoch": 15.708996667900777, "grad_norm": 0.26168182492256165, "learning_rate": 6.634440148274713e-05, "loss": 0.0247, "step": 42430 }, { "epoch": 15.712699000370232, "grad_norm": 0.14464609324932098, "learning_rate": 6.632877425662623e-05, "loss": 0.0198, "step": 42440 }, { "epoch": 15.71640133283969, "grad_norm": 0.3003924489021301, "learning_rate": 6.631314524481513e-05, "loss": 0.0231, "step": 42450 }, { "epoch": 15.720103665309145, "grad_norm": 0.22691932320594788, "learning_rate": 6.629751444902299e-05, "loss": 0.0216, "step": 42460 }, { "epoch": 15.7238059977786, "grad_norm": 0.5770584344863892, "learning_rate": 6.628188187095918e-05, "loss": 0.0258, "step": 42470 }, { "epoch": 15.727508330248057, "grad_norm": 0.17818541824817657, "learning_rate": 6.626624751233324e-05, "loss": 0.0219, "step": 42480 }, { "epoch": 15.731210662717512, "grad_norm": 0.2645045816898346, "learning_rate": 6.625061137485491e-05, "loss": 0.0224, "step": 42490 }, { "epoch": 15.734912995186967, "grad_norm": 0.37628868222236633, "learning_rate": 6.623497346023418e-05, "loss": 0.0213, "step": 42500 }, { "epoch": 15.738615327656424, "grad_norm": 0.2004934847354889, "learning_rate": 6.621933377018115e-05, "loss": 0.0231, "step": 42510 }, { "epoch": 15.74231766012588, "grad_norm": 0.16402514278888702, "learning_rate": 6.620369230640618e-05, "loss": 0.0314, "step": 42520 }, { "epoch": 15.746019992595334, "grad_norm": 0.2914353609085083, "learning_rate": 6.618804907061976e-05, "loss": 0.0233, "step": 42530 }, { "epoch": 15.749722325064791, "grad_norm": 0.31541821360588074, "learning_rate": 6.617240406453264e-05, "loss": 0.0306, "step": 42540 }, { "epoch": 15.753424657534246, "grad_norm": 0.4075387120246887, "learning_rate": 6.615675728985572e-05, "loss": 0.031, "step": 42550 }, { "epoch": 15.757126990003702, "grad_norm": 0.0935894176363945, "learning_rate": 6.614110874830012e-05, "loss": 0.02, "step": 42560 }, { "epoch": 15.760829322473159, "grad_norm": 0.1804715245962143, "learning_rate": 6.612545844157713e-05, "loss": 0.0165, "step": 42570 }, { "epoch": 15.764531654942614, "grad_norm": 0.5170519351959229, "learning_rate": 6.610980637139827e-05, "loss": 0.0213, "step": 42580 }, { "epoch": 15.768233987412069, "grad_norm": 0.19905219972133636, "learning_rate": 6.609415253947517e-05, "loss": 0.0317, "step": 42590 }, { "epoch": 15.771936319881526, "grad_norm": 0.3681011199951172, "learning_rate": 6.607849694751977e-05, "loss": 0.0311, "step": 42600 }, { "epoch": 15.775638652350981, "grad_norm": 0.1672448068857193, "learning_rate": 6.606283959724412e-05, "loss": 0.0186, "step": 42610 }, { "epoch": 15.779340984820436, "grad_norm": 0.15802903473377228, "learning_rate": 6.604718049036048e-05, "loss": 0.023, "step": 42620 }, { "epoch": 15.783043317289893, "grad_norm": 0.19874149560928345, "learning_rate": 6.60315196285813e-05, "loss": 0.0238, "step": 42630 }, { "epoch": 15.786745649759348, "grad_norm": 0.15553195774555206, "learning_rate": 6.601585701361925e-05, "loss": 0.0226, "step": 42640 }, { "epoch": 15.790447982228804, "grad_norm": 0.14122554659843445, "learning_rate": 6.600019264718713e-05, "loss": 0.0425, "step": 42650 }, { "epoch": 15.79415031469826, "grad_norm": 0.43365854024887085, "learning_rate": 6.598452653099803e-05, "loss": 0.0218, "step": 42660 }, { "epoch": 15.797852647167716, "grad_norm": 0.14381276071071625, "learning_rate": 6.596885866676512e-05, "loss": 0.0249, "step": 42670 }, { "epoch": 15.80155497963717, "grad_norm": 0.18901337683200836, "learning_rate": 6.595318905620184e-05, "loss": 0.0235, "step": 42680 }, { "epoch": 15.805257312106628, "grad_norm": 0.39523592591285706, "learning_rate": 6.593751770102178e-05, "loss": 0.0229, "step": 42690 }, { "epoch": 15.808959644576083, "grad_norm": 0.1904306709766388, "learning_rate": 6.592184460293877e-05, "loss": 0.0208, "step": 42700 }, { "epoch": 15.812661977045538, "grad_norm": 0.275372713804245, "learning_rate": 6.590616976366676e-05, "loss": 0.0299, "step": 42710 }, { "epoch": 15.816364309514995, "grad_norm": 0.20935600996017456, "learning_rate": 6.589049318491993e-05, "loss": 0.0222, "step": 42720 }, { "epoch": 15.82006664198445, "grad_norm": 0.15902644395828247, "learning_rate": 6.587481486841267e-05, "loss": 0.0211, "step": 42730 }, { "epoch": 15.823768974453905, "grad_norm": 1.7766578197479248, "learning_rate": 6.585913481585952e-05, "loss": 0.0199, "step": 42740 }, { "epoch": 15.827471306923362, "grad_norm": 0.24343982338905334, "learning_rate": 6.584345302897523e-05, "loss": 0.0199, "step": 42750 }, { "epoch": 15.831173639392818, "grad_norm": 0.2185053527355194, "learning_rate": 6.582776950947474e-05, "loss": 0.0179, "step": 42760 }, { "epoch": 15.834875971862273, "grad_norm": 0.18387387692928314, "learning_rate": 6.581208425907316e-05, "loss": 0.0237, "step": 42770 }, { "epoch": 15.83857830433173, "grad_norm": 0.18610842525959015, "learning_rate": 6.579639727948583e-05, "loss": 0.0291, "step": 42780 }, { "epoch": 15.842280636801185, "grad_norm": 0.16873405873775482, "learning_rate": 6.578070857242823e-05, "loss": 0.036, "step": 42790 }, { "epoch": 15.84598296927064, "grad_norm": 0.49673688411712646, "learning_rate": 6.576501813961609e-05, "loss": 0.0326, "step": 42800 }, { "epoch": 15.849685301740097, "grad_norm": 0.23251618444919586, "learning_rate": 6.574932598276525e-05, "loss": 0.039, "step": 42810 }, { "epoch": 15.853387634209552, "grad_norm": 0.412950724363327, "learning_rate": 6.573363210359179e-05, "loss": 0.026, "step": 42820 }, { "epoch": 15.857089966679007, "grad_norm": 0.16451367735862732, "learning_rate": 6.571793650381201e-05, "loss": 0.0241, "step": 42830 }, { "epoch": 15.860792299148464, "grad_norm": 0.20014509558677673, "learning_rate": 6.570223918514229e-05, "loss": 0.0157, "step": 42840 }, { "epoch": 15.86449463161792, "grad_norm": 0.4231421649456024, "learning_rate": 6.568654014929932e-05, "loss": 0.0257, "step": 42850 }, { "epoch": 15.868196964087375, "grad_norm": 0.23120884597301483, "learning_rate": 6.567083939799992e-05, "loss": 0.0265, "step": 42860 }, { "epoch": 15.871899296556832, "grad_norm": 0.3219464123249054, "learning_rate": 6.565513693296104e-05, "loss": 0.0187, "step": 42870 }, { "epoch": 15.875601629026287, "grad_norm": 0.18116344511508942, "learning_rate": 6.563943275589996e-05, "loss": 0.0313, "step": 42880 }, { "epoch": 15.879303961495742, "grad_norm": 0.6156719923019409, "learning_rate": 6.562372686853402e-05, "loss": 0.0259, "step": 42890 }, { "epoch": 15.883006293965199, "grad_norm": 0.13254080712795258, "learning_rate": 6.56080192725808e-05, "loss": 0.0191, "step": 42900 }, { "epoch": 15.886708626434654, "grad_norm": 0.11565198004245758, "learning_rate": 6.559230996975805e-05, "loss": 0.0223, "step": 42910 }, { "epoch": 15.89041095890411, "grad_norm": 0.17268118262290955, "learning_rate": 6.557659896178373e-05, "loss": 0.0156, "step": 42920 }, { "epoch": 15.894113291373566, "grad_norm": 0.18112100660800934, "learning_rate": 6.556088625037597e-05, "loss": 0.0263, "step": 42930 }, { "epoch": 15.897815623843021, "grad_norm": 0.19847282767295837, "learning_rate": 6.554517183725307e-05, "loss": 0.0271, "step": 42940 }, { "epoch": 15.901517956312476, "grad_norm": 0.15777209401130676, "learning_rate": 6.552945572413358e-05, "loss": 0.0223, "step": 42950 }, { "epoch": 15.905220288781933, "grad_norm": 0.1684832125902176, "learning_rate": 6.551373791273614e-05, "loss": 0.0264, "step": 42960 }, { "epoch": 15.908922621251389, "grad_norm": 0.2805016338825226, "learning_rate": 6.549801840477963e-05, "loss": 0.0275, "step": 42970 }, { "epoch": 15.912624953720844, "grad_norm": 1.8905800580978394, "learning_rate": 6.548229720198315e-05, "loss": 0.0309, "step": 42980 }, { "epoch": 15.9163272861903, "grad_norm": 0.3580152988433838, "learning_rate": 6.546657430606593e-05, "loss": 0.0267, "step": 42990 }, { "epoch": 15.920029618659756, "grad_norm": 0.2968962490558624, "learning_rate": 6.545084971874738e-05, "loss": 0.0209, "step": 43000 }, { "epoch": 15.923731951129211, "grad_norm": 0.1896810084581375, "learning_rate": 6.543512344174714e-05, "loss": 0.0196, "step": 43010 }, { "epoch": 15.927434283598668, "grad_norm": 0.28909316658973694, "learning_rate": 6.5419395476785e-05, "loss": 0.0224, "step": 43020 }, { "epoch": 15.931136616068123, "grad_norm": 0.17705336213111877, "learning_rate": 6.540366582558097e-05, "loss": 0.0199, "step": 43030 }, { "epoch": 15.934838948537578, "grad_norm": 0.21356269717216492, "learning_rate": 6.53879344898552e-05, "loss": 0.0254, "step": 43040 }, { "epoch": 15.938541281007033, "grad_norm": 0.3014996647834778, "learning_rate": 6.537220147132805e-05, "loss": 0.0183, "step": 43050 }, { "epoch": 15.94224361347649, "grad_norm": 0.1208595559000969, "learning_rate": 6.535646677172005e-05, "loss": 0.0283, "step": 43060 }, { "epoch": 15.945945945945946, "grad_norm": 0.2045363485813141, "learning_rate": 6.534073039275193e-05, "loss": 0.0141, "step": 43070 }, { "epoch": 15.949648278415403, "grad_norm": 0.19324955344200134, "learning_rate": 6.532499233614461e-05, "loss": 0.013, "step": 43080 }, { "epoch": 15.953350610884858, "grad_norm": 0.32846710085868835, "learning_rate": 6.530925260361918e-05, "loss": 0.024, "step": 43090 }, { "epoch": 15.957052943354313, "grad_norm": 1.0705515146255493, "learning_rate": 6.529351119689688e-05, "loss": 0.0248, "step": 43100 }, { "epoch": 15.960755275823768, "grad_norm": 0.2585233449935913, "learning_rate": 6.527776811769918e-05, "loss": 0.0236, "step": 43110 }, { "epoch": 15.964457608293225, "grad_norm": 0.11030624806880951, "learning_rate": 6.526202336774776e-05, "loss": 0.0258, "step": 43120 }, { "epoch": 15.96815994076268, "grad_norm": 0.156953364610672, "learning_rate": 6.52462769487644e-05, "loss": 0.0211, "step": 43130 }, { "epoch": 15.971862273232137, "grad_norm": 0.31908366084098816, "learning_rate": 6.52305288624711e-05, "loss": 0.0209, "step": 43140 }, { "epoch": 15.975564605701592, "grad_norm": 0.18861323595046997, "learning_rate": 6.521477911059008e-05, "loss": 0.0277, "step": 43150 }, { "epoch": 15.979266938171047, "grad_norm": 0.15198948979377747, "learning_rate": 6.519902769484368e-05, "loss": 0.0147, "step": 43160 }, { "epoch": 15.982969270640503, "grad_norm": 0.11318739503622055, "learning_rate": 6.518327461695446e-05, "loss": 0.033, "step": 43170 }, { "epoch": 15.98667160310996, "grad_norm": 0.2702484130859375, "learning_rate": 6.516751987864517e-05, "loss": 0.0265, "step": 43180 }, { "epoch": 15.990373935579415, "grad_norm": 0.17002619802951813, "learning_rate": 6.515176348163871e-05, "loss": 0.0229, "step": 43190 }, { "epoch": 15.994076268048872, "grad_norm": 0.2551537752151489, "learning_rate": 6.513600542765817e-05, "loss": 0.022, "step": 43200 }, { "epoch": 15.997778600518327, "grad_norm": 0.17516237497329712, "learning_rate": 6.512024571842683e-05, "loss": 0.0237, "step": 43210 }, { "epoch": 16.001480932987782, "grad_norm": 0.2616831064224243, "learning_rate": 6.510448435566815e-05, "loss": 0.0175, "step": 43220 }, { "epoch": 16.005183265457237, "grad_norm": 0.26777154207229614, "learning_rate": 6.508872134110577e-05, "loss": 0.0297, "step": 43230 }, { "epoch": 16.008885597926692, "grad_norm": 0.3150446116924286, "learning_rate": 6.507295667646352e-05, "loss": 0.0329, "step": 43240 }, { "epoch": 16.01258793039615, "grad_norm": 0.31635957956314087, "learning_rate": 6.505719036346539e-05, "loss": 0.0336, "step": 43250 }, { "epoch": 16.016290262865606, "grad_norm": 0.16937389969825745, "learning_rate": 6.504142240383555e-05, "loss": 0.0245, "step": 43260 }, { "epoch": 16.01999259533506, "grad_norm": 0.28010833263397217, "learning_rate": 6.502565279929835e-05, "loss": 0.0229, "step": 43270 }, { "epoch": 16.023694927804517, "grad_norm": 0.19999878108501434, "learning_rate": 6.500988155157839e-05, "loss": 0.0175, "step": 43280 }, { "epoch": 16.027397260273972, "grad_norm": 0.21113573014736176, "learning_rate": 6.499410866240032e-05, "loss": 0.0236, "step": 43290 }, { "epoch": 16.031099592743427, "grad_norm": 0.2519457936286926, "learning_rate": 6.497833413348909e-05, "loss": 0.0235, "step": 43300 }, { "epoch": 16.034801925212886, "grad_norm": 0.10876328498125076, "learning_rate": 6.496255796656976e-05, "loss": 0.016, "step": 43310 }, { "epoch": 16.03850425768234, "grad_norm": 0.13135983049869537, "learning_rate": 6.494678016336757e-05, "loss": 0.0229, "step": 43320 }, { "epoch": 16.042206590151796, "grad_norm": 0.3821909427642822, "learning_rate": 6.493100072560799e-05, "loss": 0.0299, "step": 43330 }, { "epoch": 16.04590892262125, "grad_norm": 0.2390604466199875, "learning_rate": 6.491521965501661e-05, "loss": 0.0318, "step": 43340 }, { "epoch": 16.049611255090706, "grad_norm": 0.4180494248867035, "learning_rate": 6.489943695331923e-05, "loss": 0.0188, "step": 43350 }, { "epoch": 16.05331358756016, "grad_norm": 0.22670698165893555, "learning_rate": 6.488365262224185e-05, "loss": 0.0233, "step": 43360 }, { "epoch": 16.05701592002962, "grad_norm": 0.3876887261867523, "learning_rate": 6.486786666351057e-05, "loss": 0.0196, "step": 43370 }, { "epoch": 16.060718252499075, "grad_norm": 0.7445489168167114, "learning_rate": 6.485207907885175e-05, "loss": 0.02, "step": 43380 }, { "epoch": 16.06442058496853, "grad_norm": 0.13855654001235962, "learning_rate": 6.48362898699919e-05, "loss": 0.0333, "step": 43390 }, { "epoch": 16.068122917437986, "grad_norm": 0.28277215361595154, "learning_rate": 6.48204990386577e-05, "loss": 0.0232, "step": 43400 }, { "epoch": 16.07182524990744, "grad_norm": 0.2003927081823349, "learning_rate": 6.480470658657599e-05, "loss": 0.0185, "step": 43410 }, { "epoch": 16.075527582376896, "grad_norm": 0.4136362671852112, "learning_rate": 6.478891251547383e-05, "loss": 0.0211, "step": 43420 }, { "epoch": 16.079229914846355, "grad_norm": 0.2412082850933075, "learning_rate": 6.477311682707845e-05, "loss": 0.0228, "step": 43430 }, { "epoch": 16.08293224731581, "grad_norm": 0.19428209960460663, "learning_rate": 6.47573195231172e-05, "loss": 0.0242, "step": 43440 }, { "epoch": 16.086634579785265, "grad_norm": 0.5416072010993958, "learning_rate": 6.474152060531768e-05, "loss": 0.0332, "step": 43450 }, { "epoch": 16.09033691225472, "grad_norm": 0.3244047462940216, "learning_rate": 6.472572007540764e-05, "loss": 0.0249, "step": 43460 }, { "epoch": 16.094039244724176, "grad_norm": 0.19188565015792847, "learning_rate": 6.4709917935115e-05, "loss": 0.0137, "step": 43470 }, { "epoch": 16.09774157719363, "grad_norm": 0.3446199297904968, "learning_rate": 6.469411418616782e-05, "loss": 0.0269, "step": 43480 }, { "epoch": 16.10144390966309, "grad_norm": 0.24612025916576385, "learning_rate": 6.467830883029443e-05, "loss": 0.0196, "step": 43490 }, { "epoch": 16.105146242132545, "grad_norm": 0.22496779263019562, "learning_rate": 6.466250186922325e-05, "loss": 0.0198, "step": 43500 }, { "epoch": 16.108848574602, "grad_norm": 0.2764008939266205, "learning_rate": 6.46466933046829e-05, "loss": 0.0188, "step": 43510 }, { "epoch": 16.112550907071455, "grad_norm": 0.6265040636062622, "learning_rate": 6.46308831384022e-05, "loss": 0.0155, "step": 43520 }, { "epoch": 16.11625323954091, "grad_norm": 0.25157225131988525, "learning_rate": 6.461507137211012e-05, "loss": 0.0255, "step": 43530 }, { "epoch": 16.119955572010365, "grad_norm": 0.5249143838882446, "learning_rate": 6.45992580075358e-05, "loss": 0.0166, "step": 43540 }, { "epoch": 16.123657904479824, "grad_norm": 0.21915146708488464, "learning_rate": 6.458344304640858e-05, "loss": 0.0198, "step": 43550 }, { "epoch": 16.12736023694928, "grad_norm": 0.4739411473274231, "learning_rate": 6.456762649045796e-05, "loss": 0.0224, "step": 43560 }, { "epoch": 16.131062569418734, "grad_norm": 0.23484832048416138, "learning_rate": 6.455180834141359e-05, "loss": 0.0379, "step": 43570 }, { "epoch": 16.13476490188819, "grad_norm": 0.15714600682258606, "learning_rate": 6.453598860100536e-05, "loss": 0.0227, "step": 43580 }, { "epoch": 16.138467234357645, "grad_norm": 0.2684646248817444, "learning_rate": 6.452016727096326e-05, "loss": 0.0286, "step": 43590 }, { "epoch": 16.1421695668271, "grad_norm": 0.2236841768026352, "learning_rate": 6.450434435301751e-05, "loss": 0.0227, "step": 43600 }, { "epoch": 16.145871899296555, "grad_norm": 0.21963997185230255, "learning_rate": 6.448851984889845e-05, "loss": 0.0231, "step": 43610 }, { "epoch": 16.149574231766014, "grad_norm": 0.2446017563343048, "learning_rate": 6.447269376033664e-05, "loss": 0.0224, "step": 43620 }, { "epoch": 16.15327656423547, "grad_norm": 0.397382527589798, "learning_rate": 6.445686608906283e-05, "loss": 0.0376, "step": 43630 }, { "epoch": 16.156978896704924, "grad_norm": 0.206482395529747, "learning_rate": 6.444103683680784e-05, "loss": 0.0249, "step": 43640 }, { "epoch": 16.16068122917438, "grad_norm": 0.14712320268154144, "learning_rate": 6.44252060053028e-05, "loss": 0.0191, "step": 43650 }, { "epoch": 16.164383561643834, "grad_norm": 0.28140392899513245, "learning_rate": 6.440937359627893e-05, "loss": 0.0274, "step": 43660 }, { "epoch": 16.16808589411329, "grad_norm": 0.1504129022359848, "learning_rate": 6.439353961146759e-05, "loss": 0.0244, "step": 43670 }, { "epoch": 16.17178822658275, "grad_norm": 0.2978205382823944, "learning_rate": 6.437770405260042e-05, "loss": 0.0302, "step": 43680 }, { "epoch": 16.175490559052204, "grad_norm": 0.16693279147148132, "learning_rate": 6.436186692140916e-05, "loss": 0.0232, "step": 43690 }, { "epoch": 16.17919289152166, "grad_norm": 0.27601107954978943, "learning_rate": 6.43460282196257e-05, "loss": 0.0177, "step": 43700 }, { "epoch": 16.182895223991114, "grad_norm": 0.3125764727592468, "learning_rate": 6.433018794898219e-05, "loss": 0.0162, "step": 43710 }, { "epoch": 16.18659755646057, "grad_norm": 0.18515539169311523, "learning_rate": 6.431434611121087e-05, "loss": 0.0217, "step": 43720 }, { "epoch": 16.190299888930024, "grad_norm": 0.1264755129814148, "learning_rate": 6.429850270804416e-05, "loss": 0.0233, "step": 43730 }, { "epoch": 16.194002221399483, "grad_norm": 0.28239959478378296, "learning_rate": 6.428265774121472e-05, "loss": 0.0216, "step": 43740 }, { "epoch": 16.197704553868938, "grad_norm": 0.22464798390865326, "learning_rate": 6.426681121245527e-05, "loss": 0.0169, "step": 43750 }, { "epoch": 16.201406886338393, "grad_norm": 0.2520039677619934, "learning_rate": 6.42509631234988e-05, "loss": 0.017, "step": 43760 }, { "epoch": 16.20510921880785, "grad_norm": 0.1845616102218628, "learning_rate": 6.423511347607844e-05, "loss": 0.0206, "step": 43770 }, { "epoch": 16.208811551277304, "grad_norm": 0.19424425065517426, "learning_rate": 6.421926227192749e-05, "loss": 0.0302, "step": 43780 }, { "epoch": 16.21251388374676, "grad_norm": 0.2872902452945709, "learning_rate": 6.420340951277938e-05, "loss": 0.0249, "step": 43790 }, { "epoch": 16.216216216216218, "grad_norm": 0.5187937021255493, "learning_rate": 6.418755520036775e-05, "loss": 0.0252, "step": 43800 }, { "epoch": 16.219918548685673, "grad_norm": 0.1980913281440735, "learning_rate": 6.417169933642642e-05, "loss": 0.0245, "step": 43810 }, { "epoch": 16.223620881155128, "grad_norm": 0.21754112839698792, "learning_rate": 6.415584192268936e-05, "loss": 0.0163, "step": 43820 }, { "epoch": 16.227323213624583, "grad_norm": 0.15854434669017792, "learning_rate": 6.41399829608907e-05, "loss": 0.0171, "step": 43830 }, { "epoch": 16.23102554609404, "grad_norm": 0.13432328402996063, "learning_rate": 6.412412245276479e-05, "loss": 0.0181, "step": 43840 }, { "epoch": 16.234727878563493, "grad_norm": 0.22761885821819305, "learning_rate": 6.410826040004607e-05, "loss": 0.0152, "step": 43850 }, { "epoch": 16.238430211032952, "grad_norm": 0.4181562066078186, "learning_rate": 6.409239680446919e-05, "loss": 0.0236, "step": 43860 }, { "epoch": 16.242132543502407, "grad_norm": 0.37001362442970276, "learning_rate": 6.4076531667769e-05, "loss": 0.0177, "step": 43870 }, { "epoch": 16.245834875971862, "grad_norm": 0.42973509430885315, "learning_rate": 6.406066499168046e-05, "loss": 0.0269, "step": 43880 }, { "epoch": 16.249537208441318, "grad_norm": 0.21664759516716003, "learning_rate": 6.404479677793874e-05, "loss": 0.0211, "step": 43890 }, { "epoch": 16.253239540910773, "grad_norm": 0.4110162854194641, "learning_rate": 6.402892702827916e-05, "loss": 0.0258, "step": 43900 }, { "epoch": 16.256941873380228, "grad_norm": 0.27579182386398315, "learning_rate": 6.401305574443722e-05, "loss": 0.0255, "step": 43910 }, { "epoch": 16.260644205849687, "grad_norm": 0.27886128425598145, "learning_rate": 6.399718292814859e-05, "loss": 0.0177, "step": 43920 }, { "epoch": 16.264346538319142, "grad_norm": 0.1366386115550995, "learning_rate": 6.398130858114903e-05, "loss": 0.0356, "step": 43930 }, { "epoch": 16.268048870788597, "grad_norm": 0.23162654042243958, "learning_rate": 6.396543270517462e-05, "loss": 0.0191, "step": 43940 }, { "epoch": 16.271751203258052, "grad_norm": 0.4188305735588074, "learning_rate": 6.394955530196147e-05, "loss": 0.018, "step": 43950 }, { "epoch": 16.275453535727507, "grad_norm": 0.349508672952652, "learning_rate": 6.393367637324593e-05, "loss": 0.0296, "step": 43960 }, { "epoch": 16.279155868196963, "grad_norm": 0.19297710061073303, "learning_rate": 6.39177959207645e-05, "loss": 0.0193, "step": 43970 }, { "epoch": 16.28285820066642, "grad_norm": 0.16456939280033112, "learning_rate": 6.390191394625381e-05, "loss": 0.0196, "step": 43980 }, { "epoch": 16.286560533135876, "grad_norm": 0.2553187906742096, "learning_rate": 6.388603045145075e-05, "loss": 0.0169, "step": 43990 }, { "epoch": 16.29026286560533, "grad_norm": 0.2614636719226837, "learning_rate": 6.387014543809223e-05, "loss": 0.0246, "step": 44000 }, { "epoch": 16.293965198074787, "grad_norm": 0.35071036219596863, "learning_rate": 6.385425890791548e-05, "loss": 0.0207, "step": 44010 }, { "epoch": 16.297667530544242, "grad_norm": 0.235344797372818, "learning_rate": 6.38383708626578e-05, "loss": 0.0231, "step": 44020 }, { "epoch": 16.301369863013697, "grad_norm": 0.2996385991573334, "learning_rate": 6.382248130405671e-05, "loss": 0.0253, "step": 44030 }, { "epoch": 16.305072195483156, "grad_norm": 0.15254813432693481, "learning_rate": 6.380659023384982e-05, "loss": 0.0226, "step": 44040 }, { "epoch": 16.30877452795261, "grad_norm": 0.418306827545166, "learning_rate": 6.3790697653775e-05, "loss": 0.0218, "step": 44050 }, { "epoch": 16.312476860422066, "grad_norm": 0.16104094684123993, "learning_rate": 6.377480356557022e-05, "loss": 0.0231, "step": 44060 }, { "epoch": 16.31617919289152, "grad_norm": 0.3393917679786682, "learning_rate": 6.375890797097362e-05, "loss": 0.0214, "step": 44070 }, { "epoch": 16.319881525360977, "grad_norm": 0.30668094754219055, "learning_rate": 6.374301087172351e-05, "loss": 0.0322, "step": 44080 }, { "epoch": 16.32358385783043, "grad_norm": 1.6611177921295166, "learning_rate": 6.372711226955843e-05, "loss": 0.0258, "step": 44090 }, { "epoch": 16.32728619029989, "grad_norm": 0.2379181683063507, "learning_rate": 6.371121216621698e-05, "loss": 0.0222, "step": 44100 }, { "epoch": 16.330988522769346, "grad_norm": 1.7729796171188354, "learning_rate": 6.3695310563438e-05, "loss": 0.0342, "step": 44110 }, { "epoch": 16.3346908552388, "grad_norm": 0.22730298340320587, "learning_rate": 6.367940746296041e-05, "loss": 0.0234, "step": 44120 }, { "epoch": 16.338393187708256, "grad_norm": 0.21892954409122467, "learning_rate": 6.366350286652341e-05, "loss": 0.0323, "step": 44130 }, { "epoch": 16.34209552017771, "grad_norm": 0.13479290902614594, "learning_rate": 6.364759677586627e-05, "loss": 0.0202, "step": 44140 }, { "epoch": 16.345797852647166, "grad_norm": 0.5812020301818848, "learning_rate": 6.363168919272846e-05, "loss": 0.0166, "step": 44150 }, { "epoch": 16.349500185116625, "grad_norm": 0.13109418749809265, "learning_rate": 6.361578011884964e-05, "loss": 0.0205, "step": 44160 }, { "epoch": 16.35320251758608, "grad_norm": 0.22463449835777283, "learning_rate": 6.359986955596955e-05, "loss": 0.0236, "step": 44170 }, { "epoch": 16.356904850055535, "grad_norm": 0.21883419156074524, "learning_rate": 6.358395750582817e-05, "loss": 0.0221, "step": 44180 }, { "epoch": 16.36060718252499, "grad_norm": 0.5314826965332031, "learning_rate": 6.356804397016564e-05, "loss": 0.0251, "step": 44190 }, { "epoch": 16.364309514994446, "grad_norm": 0.26433008909225464, "learning_rate": 6.355212895072223e-05, "loss": 0.0197, "step": 44200 }, { "epoch": 16.3680118474639, "grad_norm": 0.13019360601902008, "learning_rate": 6.353621244923835e-05, "loss": 0.0146, "step": 44210 }, { "epoch": 16.37171417993336, "grad_norm": 0.2646216154098511, "learning_rate": 6.352029446745463e-05, "loss": 0.0194, "step": 44220 }, { "epoch": 16.375416512402815, "grad_norm": 0.4591565728187561, "learning_rate": 6.350437500711184e-05, "loss": 0.0291, "step": 44230 }, { "epoch": 16.37911884487227, "grad_norm": 0.1819741576910019, "learning_rate": 6.34884540699509e-05, "loss": 0.0189, "step": 44240 }, { "epoch": 16.382821177341725, "grad_norm": 0.1748596578836441, "learning_rate": 6.34725316577129e-05, "loss": 0.0244, "step": 44250 }, { "epoch": 16.38652350981118, "grad_norm": 0.21125374734401703, "learning_rate": 6.34566077721391e-05, "loss": 0.027, "step": 44260 }, { "epoch": 16.390225842280636, "grad_norm": 0.2055051177740097, "learning_rate": 6.344068241497092e-05, "loss": 0.0217, "step": 44270 }, { "epoch": 16.393928174750094, "grad_norm": 0.6013686656951904, "learning_rate": 6.342475558794988e-05, "loss": 0.0158, "step": 44280 }, { "epoch": 16.39763050721955, "grad_norm": 0.15085315704345703, "learning_rate": 6.340882729281779e-05, "loss": 0.0209, "step": 44290 }, { "epoch": 16.401332839689005, "grad_norm": 0.25529539585113525, "learning_rate": 6.339289753131649e-05, "loss": 0.0321, "step": 44300 }, { "epoch": 16.40503517215846, "grad_norm": 0.21180744469165802, "learning_rate": 6.337696630518804e-05, "loss": 0.0164, "step": 44310 }, { "epoch": 16.408737504627915, "grad_norm": 0.23402856290340424, "learning_rate": 6.33610336161747e-05, "loss": 0.0209, "step": 44320 }, { "epoch": 16.41243983709737, "grad_norm": 0.2340879738330841, "learning_rate": 6.334509946601879e-05, "loss": 0.0223, "step": 44330 }, { "epoch": 16.41614216956683, "grad_norm": 0.2661411166191101, "learning_rate": 6.332916385646285e-05, "loss": 0.0277, "step": 44340 }, { "epoch": 16.419844502036284, "grad_norm": 1.4284884929656982, "learning_rate": 6.331322678924962e-05, "loss": 0.0151, "step": 44350 }, { "epoch": 16.42354683450574, "grad_norm": 0.18474604189395905, "learning_rate": 6.329728826612192e-05, "loss": 0.0199, "step": 44360 }, { "epoch": 16.427249166975194, "grad_norm": 0.2240343689918518, "learning_rate": 6.328134828882276e-05, "loss": 0.0256, "step": 44370 }, { "epoch": 16.43095149944465, "grad_norm": 0.1448933631181717, "learning_rate": 6.326540685909532e-05, "loss": 0.024, "step": 44380 }, { "epoch": 16.434653831914105, "grad_norm": 0.18430951237678528, "learning_rate": 6.324946397868294e-05, "loss": 0.0182, "step": 44390 }, { "epoch": 16.438356164383563, "grad_norm": 0.2438192367553711, "learning_rate": 6.323351964932908e-05, "loss": 0.0175, "step": 44400 }, { "epoch": 16.44205849685302, "grad_norm": 0.19664542376995087, "learning_rate": 6.321757387277742e-05, "loss": 0.0258, "step": 44410 }, { "epoch": 16.445760829322474, "grad_norm": 0.15796369314193726, "learning_rate": 6.320162665077177e-05, "loss": 0.0213, "step": 44420 }, { "epoch": 16.44946316179193, "grad_norm": 0.23776482045650482, "learning_rate": 6.318567798505605e-05, "loss": 0.0213, "step": 44430 }, { "epoch": 16.453165494261384, "grad_norm": 0.1503564566373825, "learning_rate": 6.316972787737441e-05, "loss": 0.0318, "step": 44440 }, { "epoch": 16.45686782673084, "grad_norm": 0.4358227849006653, "learning_rate": 6.315377632947115e-05, "loss": 0.0181, "step": 44450 }, { "epoch": 16.460570159200294, "grad_norm": 0.30034807324409485, "learning_rate": 6.313782334309066e-05, "loss": 0.0234, "step": 44460 }, { "epoch": 16.464272491669753, "grad_norm": 0.13392214477062225, "learning_rate": 6.312186891997759e-05, "loss": 0.0219, "step": 44470 }, { "epoch": 16.46797482413921, "grad_norm": 0.23247845470905304, "learning_rate": 6.310591306187665e-05, "loss": 0.0267, "step": 44480 }, { "epoch": 16.471677156608663, "grad_norm": 0.129319429397583, "learning_rate": 6.308995577053276e-05, "loss": 0.0208, "step": 44490 }, { "epoch": 16.47537948907812, "grad_norm": 0.2391686588525772, "learning_rate": 6.307399704769099e-05, "loss": 0.014, "step": 44500 }, { "epoch": 16.479081821547574, "grad_norm": 0.12037744373083115, "learning_rate": 6.305803689509656e-05, "loss": 0.0227, "step": 44510 }, { "epoch": 16.48278415401703, "grad_norm": 0.32960623502731323, "learning_rate": 6.304207531449486e-05, "loss": 0.0213, "step": 44520 }, { "epoch": 16.486486486486488, "grad_norm": 0.23991228640079498, "learning_rate": 6.302611230763138e-05, "loss": 0.0209, "step": 44530 }, { "epoch": 16.490188818955943, "grad_norm": 0.14202988147735596, "learning_rate": 6.301014787625187e-05, "loss": 0.0165, "step": 44540 }, { "epoch": 16.493891151425398, "grad_norm": 0.3134271204471588, "learning_rate": 6.299418202210214e-05, "loss": 0.0196, "step": 44550 }, { "epoch": 16.497593483894853, "grad_norm": 0.45097553730010986, "learning_rate": 6.29782147469282e-05, "loss": 0.0201, "step": 44560 }, { "epoch": 16.50129581636431, "grad_norm": 0.30710315704345703, "learning_rate": 6.296224605247622e-05, "loss": 0.0211, "step": 44570 }, { "epoch": 16.504998148833764, "grad_norm": 0.2225707322359085, "learning_rate": 6.294627594049249e-05, "loss": 0.0216, "step": 44580 }, { "epoch": 16.508700481303222, "grad_norm": 0.2993556559085846, "learning_rate": 6.293030441272347e-05, "loss": 0.0152, "step": 44590 }, { "epoch": 16.512402813772677, "grad_norm": 0.18797120451927185, "learning_rate": 6.291433147091583e-05, "loss": 0.0265, "step": 44600 }, { "epoch": 16.516105146242133, "grad_norm": 0.1491594761610031, "learning_rate": 6.289835711681631e-05, "loss": 0.0137, "step": 44610 }, { "epoch": 16.519807478711588, "grad_norm": 0.27091526985168457, "learning_rate": 6.288238135217183e-05, "loss": 0.0318, "step": 44620 }, { "epoch": 16.523509811181043, "grad_norm": 0.27375009655952454, "learning_rate": 6.286640417872951e-05, "loss": 0.0283, "step": 44630 }, { "epoch": 16.527212143650498, "grad_norm": 0.4939243495464325, "learning_rate": 6.285042559823657e-05, "loss": 0.0254, "step": 44640 }, { "epoch": 16.530914476119957, "grad_norm": 0.2654559314250946, "learning_rate": 6.283444561244042e-05, "loss": 0.0162, "step": 44650 }, { "epoch": 16.534616808589412, "grad_norm": 0.2972552478313446, "learning_rate": 6.281846422308857e-05, "loss": 0.0271, "step": 44660 }, { "epoch": 16.538319141058867, "grad_norm": 0.24597981572151184, "learning_rate": 6.280248143192875e-05, "loss": 0.0263, "step": 44670 }, { "epoch": 16.542021473528322, "grad_norm": 0.2649042010307312, "learning_rate": 6.278649724070882e-05, "loss": 0.0216, "step": 44680 }, { "epoch": 16.545723805997778, "grad_norm": 0.1649976670742035, "learning_rate": 6.277051165117677e-05, "loss": 0.0228, "step": 44690 }, { "epoch": 16.549426138467233, "grad_norm": 0.2538447678089142, "learning_rate": 6.275452466508077e-05, "loss": 0.0154, "step": 44700 }, { "epoch": 16.55312847093669, "grad_norm": 0.17283222079277039, "learning_rate": 6.273853628416911e-05, "loss": 0.0169, "step": 44710 }, { "epoch": 16.556830803406147, "grad_norm": 0.8406464457511902, "learning_rate": 6.272254651019029e-05, "loss": 0.0219, "step": 44720 }, { "epoch": 16.560533135875602, "grad_norm": 0.4976959526538849, "learning_rate": 6.270655534489292e-05, "loss": 0.0202, "step": 44730 }, { "epoch": 16.564235468345057, "grad_norm": 0.14426974952220917, "learning_rate": 6.269056279002575e-05, "loss": 0.0215, "step": 44740 }, { "epoch": 16.567937800814512, "grad_norm": 0.17648127675056458, "learning_rate": 6.26745688473377e-05, "loss": 0.0264, "step": 44750 }, { "epoch": 16.571640133283967, "grad_norm": 0.10166089981794357, "learning_rate": 6.265857351857788e-05, "loss": 0.0199, "step": 44760 }, { "epoch": 16.575342465753426, "grad_norm": 0.2624516487121582, "learning_rate": 6.264257680549548e-05, "loss": 0.0238, "step": 44770 }, { "epoch": 16.57904479822288, "grad_norm": 0.21807748079299927, "learning_rate": 6.262657870983989e-05, "loss": 0.029, "step": 44780 }, { "epoch": 16.582747130692336, "grad_norm": 0.41772058606147766, "learning_rate": 6.261057923336064e-05, "loss": 0.0331, "step": 44790 }, { "epoch": 16.58644946316179, "grad_norm": 0.2560140788555145, "learning_rate": 6.259457837780742e-05, "loss": 0.0191, "step": 44800 }, { "epoch": 16.590151795631247, "grad_norm": 0.40265417098999023, "learning_rate": 6.257857614493e-05, "loss": 0.0238, "step": 44810 }, { "epoch": 16.593854128100702, "grad_norm": 0.3356464207172394, "learning_rate": 6.256257253647843e-05, "loss": 0.0306, "step": 44820 }, { "epoch": 16.59755646057016, "grad_norm": 0.2570064067840576, "learning_rate": 6.254656755420283e-05, "loss": 0.0143, "step": 44830 }, { "epoch": 16.601258793039616, "grad_norm": 0.234294593334198, "learning_rate": 6.253056119985346e-05, "loss": 0.0171, "step": 44840 }, { "epoch": 16.60496112550907, "grad_norm": 1.1161671876907349, "learning_rate": 6.251455347518073e-05, "loss": 0.0251, "step": 44850 }, { "epoch": 16.608663457978526, "grad_norm": 0.7162284851074219, "learning_rate": 6.249854438193528e-05, "loss": 0.0322, "step": 44860 }, { "epoch": 16.61236579044798, "grad_norm": 0.212027445435524, "learning_rate": 6.248253392186781e-05, "loss": 0.0138, "step": 44870 }, { "epoch": 16.616068122917437, "grad_norm": 0.23761999607086182, "learning_rate": 6.246652209672917e-05, "loss": 0.0175, "step": 44880 }, { "epoch": 16.619770455386895, "grad_norm": 0.9196450114250183, "learning_rate": 6.245050890827042e-05, "loss": 0.0296, "step": 44890 }, { "epoch": 16.62347278785635, "grad_norm": 0.3803197145462036, "learning_rate": 6.243449435824276e-05, "loss": 0.0259, "step": 44900 }, { "epoch": 16.627175120325806, "grad_norm": 0.5419939756393433, "learning_rate": 6.241847844839745e-05, "loss": 0.0251, "step": 44910 }, { "epoch": 16.63087745279526, "grad_norm": 0.19669003784656525, "learning_rate": 6.240246118048606e-05, "loss": 0.0194, "step": 44920 }, { "epoch": 16.634579785264716, "grad_norm": 0.41102224588394165, "learning_rate": 6.238644255626012e-05, "loss": 0.0315, "step": 44930 }, { "epoch": 16.63828211773417, "grad_norm": 0.3994886577129364, "learning_rate": 6.237042257747146e-05, "loss": 0.012, "step": 44940 }, { "epoch": 16.64198445020363, "grad_norm": 0.11232698708772659, "learning_rate": 6.235440124587198e-05, "loss": 0.0306, "step": 44950 }, { "epoch": 16.645686782673085, "grad_norm": 0.16205136477947235, "learning_rate": 6.233837856321376e-05, "loss": 0.0186, "step": 44960 }, { "epoch": 16.64938911514254, "grad_norm": 0.28617537021636963, "learning_rate": 6.232235453124902e-05, "loss": 0.0209, "step": 44970 }, { "epoch": 16.653091447611995, "grad_norm": 0.287508100271225, "learning_rate": 6.230632915173009e-05, "loss": 0.03, "step": 44980 }, { "epoch": 16.65679378008145, "grad_norm": 0.21101875603199005, "learning_rate": 6.229030242640952e-05, "loss": 0.0196, "step": 44990 }, { "epoch": 16.660496112550906, "grad_norm": 0.552368700504303, "learning_rate": 6.227427435703997e-05, "loss": 0.0198, "step": 45000 }, { "epoch": 16.664198445020364, "grad_norm": 0.4075568616390228, "learning_rate": 6.22582449453742e-05, "loss": 0.023, "step": 45010 }, { "epoch": 16.66790077748982, "grad_norm": 0.23575693368911743, "learning_rate": 6.224221419316522e-05, "loss": 0.0329, "step": 45020 }, { "epoch": 16.671603109959275, "grad_norm": 0.2846919596195221, "learning_rate": 6.222618210216608e-05, "loss": 0.0235, "step": 45030 }, { "epoch": 16.67530544242873, "grad_norm": 0.46092885732650757, "learning_rate": 6.221014867413007e-05, "loss": 0.024, "step": 45040 }, { "epoch": 16.679007774898185, "grad_norm": 0.9261439442634583, "learning_rate": 6.219411391081055e-05, "loss": 0.0245, "step": 45050 }, { "epoch": 16.68271010736764, "grad_norm": 0.41940152645111084, "learning_rate": 6.217807781396106e-05, "loss": 0.0287, "step": 45060 }, { "epoch": 16.6864124398371, "grad_norm": 0.23542945086956024, "learning_rate": 6.216204038533529e-05, "loss": 0.0163, "step": 45070 }, { "epoch": 16.690114772306554, "grad_norm": 0.18963472545146942, "learning_rate": 6.214600162668706e-05, "loss": 0.0174, "step": 45080 }, { "epoch": 16.69381710477601, "grad_norm": 0.09031776338815689, "learning_rate": 6.212996153977037e-05, "loss": 0.0138, "step": 45090 }, { "epoch": 16.697519437245465, "grad_norm": 0.1610305905342102, "learning_rate": 6.211392012633932e-05, "loss": 0.0217, "step": 45100 }, { "epoch": 16.70122176971492, "grad_norm": 0.21979433298110962, "learning_rate": 6.209787738814819e-05, "loss": 0.0233, "step": 45110 }, { "epoch": 16.704924102184375, "grad_norm": 0.2938062250614166, "learning_rate": 6.208183332695135e-05, "loss": 0.0174, "step": 45120 }, { "epoch": 16.708626434653834, "grad_norm": 0.18942023813724518, "learning_rate": 6.20657879445034e-05, "loss": 0.0227, "step": 45130 }, { "epoch": 16.71232876712329, "grad_norm": 0.2930075228214264, "learning_rate": 6.204974124255902e-05, "loss": 0.0233, "step": 45140 }, { "epoch": 16.716031099592744, "grad_norm": 0.22789424657821655, "learning_rate": 6.203369322287306e-05, "loss": 0.0186, "step": 45150 }, { "epoch": 16.7197334320622, "grad_norm": 0.24895219504833221, "learning_rate": 6.201764388720049e-05, "loss": 0.025, "step": 45160 }, { "epoch": 16.723435764531654, "grad_norm": 0.383803129196167, "learning_rate": 6.200159323729645e-05, "loss": 0.0227, "step": 45170 }, { "epoch": 16.72713809700111, "grad_norm": 0.30692338943481445, "learning_rate": 6.198554127491622e-05, "loss": 0.0178, "step": 45180 }, { "epoch": 16.730840429470568, "grad_norm": 0.3753400146961212, "learning_rate": 6.196948800181523e-05, "loss": 0.0269, "step": 45190 }, { "epoch": 16.734542761940023, "grad_norm": 0.28997737169265747, "learning_rate": 6.195343341974899e-05, "loss": 0.0188, "step": 45200 }, { "epoch": 16.73824509440948, "grad_norm": 0.19435559213161469, "learning_rate": 6.193737753047329e-05, "loss": 0.0228, "step": 45210 }, { "epoch": 16.741947426878934, "grad_norm": 0.47129184007644653, "learning_rate": 6.19213203357439e-05, "loss": 0.0353, "step": 45220 }, { "epoch": 16.74564975934839, "grad_norm": 0.40346747636795044, "learning_rate": 6.190526183731685e-05, "loss": 0.0236, "step": 45230 }, { "epoch": 16.749352091817844, "grad_norm": 0.1991809457540512, "learning_rate": 6.188920203694828e-05, "loss": 0.0269, "step": 45240 }, { "epoch": 16.7530544242873, "grad_norm": 0.11237156391143799, "learning_rate": 6.187314093639444e-05, "loss": 0.0251, "step": 45250 }, { "epoch": 16.756756756756758, "grad_norm": 0.13477499783039093, "learning_rate": 6.185707853741175e-05, "loss": 0.0155, "step": 45260 }, { "epoch": 16.760459089226213, "grad_norm": 0.10147509723901749, "learning_rate": 6.184101484175678e-05, "loss": 0.0188, "step": 45270 }, { "epoch": 16.76416142169567, "grad_norm": 0.15588194131851196, "learning_rate": 6.182494985118624e-05, "loss": 0.0125, "step": 45280 }, { "epoch": 16.767863754165123, "grad_norm": 1.457660436630249, "learning_rate": 6.180888356745695e-05, "loss": 0.0288, "step": 45290 }, { "epoch": 16.77156608663458, "grad_norm": 0.3015576899051666, "learning_rate": 6.179281599232591e-05, "loss": 0.0251, "step": 45300 }, { "epoch": 16.775268419104037, "grad_norm": 0.2288232296705246, "learning_rate": 6.177674712755026e-05, "loss": 0.0224, "step": 45310 }, { "epoch": 16.778970751573492, "grad_norm": 0.23951590061187744, "learning_rate": 6.176067697488725e-05, "loss": 0.0228, "step": 45320 }, { "epoch": 16.782673084042948, "grad_norm": 0.5204002857208252, "learning_rate": 6.174460553609426e-05, "loss": 0.0246, "step": 45330 }, { "epoch": 16.786375416512403, "grad_norm": 0.14823967218399048, "learning_rate": 6.172853281292888e-05, "loss": 0.024, "step": 45340 }, { "epoch": 16.790077748981858, "grad_norm": 0.14046542346477509, "learning_rate": 6.17124588071488e-05, "loss": 0.0237, "step": 45350 }, { "epoch": 16.793780081451313, "grad_norm": 0.20596419274806976, "learning_rate": 6.169638352051182e-05, "loss": 0.0249, "step": 45360 }, { "epoch": 16.79748241392077, "grad_norm": 0.348510205745697, "learning_rate": 6.168030695477591e-05, "loss": 0.024, "step": 45370 }, { "epoch": 16.801184746390227, "grad_norm": 0.194993793964386, "learning_rate": 6.166422911169923e-05, "loss": 0.0281, "step": 45380 }, { "epoch": 16.804887078859682, "grad_norm": 0.2157241702079773, "learning_rate": 6.164814999303995e-05, "loss": 0.0208, "step": 45390 }, { "epoch": 16.808589411329137, "grad_norm": 0.1852346807718277, "learning_rate": 6.163206960055651e-05, "loss": 0.0227, "step": 45400 }, { "epoch": 16.812291743798593, "grad_norm": 1.3450661897659302, "learning_rate": 6.161598793600744e-05, "loss": 0.0303, "step": 45410 }, { "epoch": 16.815994076268048, "grad_norm": 0.592548668384552, "learning_rate": 6.159990500115139e-05, "loss": 0.0313, "step": 45420 }, { "epoch": 16.819696408737503, "grad_norm": 0.28341639041900635, "learning_rate": 6.158382079774716e-05, "loss": 0.0183, "step": 45430 }, { "epoch": 16.82339874120696, "grad_norm": 0.5600492358207703, "learning_rate": 6.156773532755372e-05, "loss": 0.0315, "step": 45440 }, { "epoch": 16.827101073676417, "grad_norm": 0.2923092246055603, "learning_rate": 6.155164859233012e-05, "loss": 0.0226, "step": 45450 }, { "epoch": 16.830803406145872, "grad_norm": 0.27772319316864014, "learning_rate": 6.153556059383561e-05, "loss": 0.0172, "step": 45460 }, { "epoch": 16.834505738615327, "grad_norm": 0.4639947712421417, "learning_rate": 6.151947133382954e-05, "loss": 0.0418, "step": 45470 }, { "epoch": 16.838208071084782, "grad_norm": 0.1556515097618103, "learning_rate": 6.15033808140714e-05, "loss": 0.0328, "step": 45480 }, { "epoch": 16.841910403554238, "grad_norm": 0.21795536577701569, "learning_rate": 6.148728903632081e-05, "loss": 0.0164, "step": 45490 }, { "epoch": 16.845612736023696, "grad_norm": 0.2580479085445404, "learning_rate": 6.147119600233758e-05, "loss": 0.0262, "step": 45500 }, { "epoch": 16.84931506849315, "grad_norm": 0.3614847958087921, "learning_rate": 6.145510171388161e-05, "loss": 0.0206, "step": 45510 }, { "epoch": 16.853017400962607, "grad_norm": 0.3556901514530182, "learning_rate": 6.143900617271293e-05, "loss": 0.0333, "step": 45520 }, { "epoch": 16.85671973343206, "grad_norm": 0.3969119191169739, "learning_rate": 6.142290938059173e-05, "loss": 0.0227, "step": 45530 }, { "epoch": 16.860422065901517, "grad_norm": 0.17037607729434967, "learning_rate": 6.140681133927834e-05, "loss": 0.0203, "step": 45540 }, { "epoch": 16.864124398370972, "grad_norm": 0.3074941039085388, "learning_rate": 6.13907120505332e-05, "loss": 0.0202, "step": 45550 }, { "epoch": 16.86782673084043, "grad_norm": 0.237688809633255, "learning_rate": 6.137461151611692e-05, "loss": 0.0243, "step": 45560 }, { "epoch": 16.871529063309886, "grad_norm": 0.21601036190986633, "learning_rate": 6.135850973779024e-05, "loss": 0.0224, "step": 45570 }, { "epoch": 16.87523139577934, "grad_norm": 0.23899884521961212, "learning_rate": 6.1342406717314e-05, "loss": 0.0236, "step": 45580 }, { "epoch": 16.878933728248796, "grad_norm": 0.2620009183883667, "learning_rate": 6.132630245644921e-05, "loss": 0.0267, "step": 45590 }, { "epoch": 16.88263606071825, "grad_norm": 0.1700623780488968, "learning_rate": 6.131019695695702e-05, "loss": 0.026, "step": 45600 }, { "epoch": 16.886338393187707, "grad_norm": 0.22994138300418854, "learning_rate": 6.129409022059869e-05, "loss": 0.0185, "step": 45610 }, { "epoch": 16.890040725657165, "grad_norm": 0.25656574964523315, "learning_rate": 6.127798224913564e-05, "loss": 0.0205, "step": 45620 }, { "epoch": 16.89374305812662, "grad_norm": 0.1446707546710968, "learning_rate": 6.126187304432941e-05, "loss": 0.0196, "step": 45630 }, { "epoch": 16.897445390596076, "grad_norm": 0.25651565194129944, "learning_rate": 6.124576260794167e-05, "loss": 0.0348, "step": 45640 }, { "epoch": 16.90114772306553, "grad_norm": 0.28750258684158325, "learning_rate": 6.122965094173424e-05, "loss": 0.0267, "step": 45650 }, { "epoch": 16.904850055534986, "grad_norm": 0.2854312062263489, "learning_rate": 6.121353804746907e-05, "loss": 0.0333, "step": 45660 }, { "epoch": 16.90855238800444, "grad_norm": 0.35346078872680664, "learning_rate": 6.119742392690823e-05, "loss": 0.021, "step": 45670 }, { "epoch": 16.9122547204739, "grad_norm": 0.1313863843679428, "learning_rate": 6.118130858181395e-05, "loss": 0.0246, "step": 45680 }, { "epoch": 16.915957052943355, "grad_norm": 0.21081578731536865, "learning_rate": 6.116519201394857e-05, "loss": 0.0152, "step": 45690 }, { "epoch": 16.91965938541281, "grad_norm": 0.2548011541366577, "learning_rate": 6.11490742250746e-05, "loss": 0.0141, "step": 45700 }, { "epoch": 16.923361717882266, "grad_norm": 0.36055898666381836, "learning_rate": 6.113295521695459e-05, "loss": 0.0257, "step": 45710 }, { "epoch": 16.92706405035172, "grad_norm": 0.27301251888275146, "learning_rate": 6.111683499135137e-05, "loss": 0.0236, "step": 45720 }, { "epoch": 16.930766382821176, "grad_norm": 0.31447306275367737, "learning_rate": 6.110071355002779e-05, "loss": 0.0157, "step": 45730 }, { "epoch": 16.934468715290635, "grad_norm": 0.4014495015144348, "learning_rate": 6.108459089474686e-05, "loss": 0.016, "step": 45740 }, { "epoch": 16.93817104776009, "grad_norm": 0.351767897605896, "learning_rate": 6.106846702727172e-05, "loss": 0.0212, "step": 45750 }, { "epoch": 16.941873380229545, "grad_norm": 0.9105050563812256, "learning_rate": 6.105234194936568e-05, "loss": 0.0281, "step": 45760 }, { "epoch": 16.945575712699, "grad_norm": 0.8668213486671448, "learning_rate": 6.103621566279214e-05, "loss": 0.0224, "step": 45770 }, { "epoch": 16.949278045168455, "grad_norm": 0.17446565628051758, "learning_rate": 6.102008816931466e-05, "loss": 0.0326, "step": 45780 }, { "epoch": 16.95298037763791, "grad_norm": 0.19729122519493103, "learning_rate": 6.10039594706969e-05, "loss": 0.0256, "step": 45790 }, { "epoch": 16.95668271010737, "grad_norm": 0.34561797976493835, "learning_rate": 6.0987829568702656e-05, "loss": 0.0246, "step": 45800 }, { "epoch": 16.960385042576824, "grad_norm": 0.36469876766204834, "learning_rate": 6.097169846509592e-05, "loss": 0.0233, "step": 45810 }, { "epoch": 16.96408737504628, "grad_norm": 0.2739935517311096, "learning_rate": 6.0955566161640724e-05, "loss": 0.0267, "step": 45820 }, { "epoch": 16.967789707515735, "grad_norm": 0.17727066576480865, "learning_rate": 6.093943266010128e-05, "loss": 0.0163, "step": 45830 }, { "epoch": 16.97149203998519, "grad_norm": 0.5802829265594482, "learning_rate": 6.092329796224192e-05, "loss": 0.0182, "step": 45840 }, { "epoch": 16.975194372454645, "grad_norm": 0.44789955019950867, "learning_rate": 6.090716206982714e-05, "loss": 0.0272, "step": 45850 }, { "epoch": 16.978896704924104, "grad_norm": 0.26386764645576477, "learning_rate": 6.0891024984621506e-05, "loss": 0.0178, "step": 45860 }, { "epoch": 16.98259903739356, "grad_norm": 2.6406736373901367, "learning_rate": 6.0874886708389745e-05, "loss": 0.0184, "step": 45870 }, { "epoch": 16.986301369863014, "grad_norm": 0.12439211457967758, "learning_rate": 6.085874724289673e-05, "loss": 0.0287, "step": 45880 }, { "epoch": 16.99000370233247, "grad_norm": 0.35239341855049133, "learning_rate": 6.084260658990744e-05, "loss": 0.0183, "step": 45890 }, { "epoch": 16.993706034801924, "grad_norm": 0.5906237363815308, "learning_rate": 6.0826464751186994e-05, "loss": 0.0307, "step": 45900 }, { "epoch": 16.99740836727138, "grad_norm": 0.3821828365325928, "learning_rate": 6.081032172850063e-05, "loss": 0.0233, "step": 45910 }, { "epoch": 17.00111069974084, "grad_norm": 0.42454493045806885, "learning_rate": 6.079417752361376e-05, "loss": 0.0213, "step": 45920 }, { "epoch": 17.004813032210293, "grad_norm": 2.3218603134155273, "learning_rate": 6.077803213829184e-05, "loss": 0.023, "step": 45930 }, { "epoch": 17.00851536467975, "grad_norm": 0.22281017899513245, "learning_rate": 6.0761885574300546e-05, "loss": 0.026, "step": 45940 }, { "epoch": 17.012217697149204, "grad_norm": 0.32760247588157654, "learning_rate": 6.074573783340562e-05, "loss": 0.0381, "step": 45950 }, { "epoch": 17.01592002961866, "grad_norm": 0.21340858936309814, "learning_rate": 6.072958891737296e-05, "loss": 0.0179, "step": 45960 }, { "epoch": 17.019622362088114, "grad_norm": 0.11887279152870178, "learning_rate": 6.071343882796859e-05, "loss": 0.0226, "step": 45970 }, { "epoch": 17.023324694557573, "grad_norm": 0.33185911178588867, "learning_rate": 6.069728756695866e-05, "loss": 0.019, "step": 45980 }, { "epoch": 17.027027027027028, "grad_norm": 0.11842501163482666, "learning_rate": 6.068113513610943e-05, "loss": 0.0424, "step": 45990 }, { "epoch": 17.030729359496483, "grad_norm": 0.29360002279281616, "learning_rate": 6.066498153718735e-05, "loss": 0.0299, "step": 46000 }, { "epoch": 17.03443169196594, "grad_norm": 0.5554399490356445, "learning_rate": 6.06488267719589e-05, "loss": 0.0265, "step": 46010 }, { "epoch": 17.038134024435394, "grad_norm": 0.22010543942451477, "learning_rate": 6.0632670842190776e-05, "loss": 0.0245, "step": 46020 }, { "epoch": 17.04183635690485, "grad_norm": 0.22853565216064453, "learning_rate": 6.061651374964974e-05, "loss": 0.0238, "step": 46030 }, { "epoch": 17.045538689374307, "grad_norm": 0.2740406095981598, "learning_rate": 6.0600355496102745e-05, "loss": 0.0231, "step": 46040 }, { "epoch": 17.049241021843763, "grad_norm": 0.25594598054885864, "learning_rate": 6.0584196083316794e-05, "loss": 0.0267, "step": 46050 }, { "epoch": 17.052943354313218, "grad_norm": 0.13316281139850616, "learning_rate": 6.0568035513059073e-05, "loss": 0.0279, "step": 46060 }, { "epoch": 17.056645686782673, "grad_norm": 0.22106505930423737, "learning_rate": 6.055187378709688e-05, "loss": 0.014, "step": 46070 }, { "epoch": 17.060348019252128, "grad_norm": 0.29454919695854187, "learning_rate": 6.053571090719763e-05, "loss": 0.0202, "step": 46080 }, { "epoch": 17.064050351721583, "grad_norm": 0.3746729791164398, "learning_rate": 6.0519546875128876e-05, "loss": 0.0203, "step": 46090 }, { "epoch": 17.067752684191042, "grad_norm": 0.33051300048828125, "learning_rate": 6.05033816926583e-05, "loss": 0.0219, "step": 46100 }, { "epoch": 17.071455016660497, "grad_norm": 0.13212203979492188, "learning_rate": 6.048721536155368e-05, "loss": 0.0149, "step": 46110 }, { "epoch": 17.075157349129952, "grad_norm": 0.23357848823070526, "learning_rate": 6.047104788358297e-05, "loss": 0.0245, "step": 46120 }, { "epoch": 17.078859681599408, "grad_norm": 0.25847533345222473, "learning_rate": 6.04548792605142e-05, "loss": 0.0254, "step": 46130 }, { "epoch": 17.082562014068863, "grad_norm": 0.23819243907928467, "learning_rate": 6.0438709494115544e-05, "loss": 0.0286, "step": 46140 }, { "epoch": 17.086264346538318, "grad_norm": 0.21059560775756836, "learning_rate": 6.042253858615532e-05, "loss": 0.0169, "step": 46150 }, { "epoch": 17.089966679007777, "grad_norm": 0.1475416123867035, "learning_rate": 6.040636653840195e-05, "loss": 0.0257, "step": 46160 }, { "epoch": 17.093669011477232, "grad_norm": 0.23213696479797363, "learning_rate": 6.039019335262398e-05, "loss": 0.0117, "step": 46170 }, { "epoch": 17.097371343946687, "grad_norm": 0.32746607065200806, "learning_rate": 6.037401903059008e-05, "loss": 0.0204, "step": 46180 }, { "epoch": 17.101073676416142, "grad_norm": 0.3473499119281769, "learning_rate": 6.035784357406906e-05, "loss": 0.022, "step": 46190 }, { "epoch": 17.104776008885597, "grad_norm": 0.12329067289829254, "learning_rate": 6.034166698482984e-05, "loss": 0.026, "step": 46200 }, { "epoch": 17.108478341355053, "grad_norm": 0.1851063072681427, "learning_rate": 6.032548926464148e-05, "loss": 0.0215, "step": 46210 }, { "epoch": 17.112180673824508, "grad_norm": 0.09798357635736465, "learning_rate": 6.030931041527311e-05, "loss": 0.0207, "step": 46220 }, { "epoch": 17.115883006293966, "grad_norm": 0.12757836282253265, "learning_rate": 6.029313043849407e-05, "loss": 0.0188, "step": 46230 }, { "epoch": 17.11958533876342, "grad_norm": 3.60551118850708, "learning_rate": 6.0276949336073765e-05, "loss": 0.0231, "step": 46240 }, { "epoch": 17.123287671232877, "grad_norm": 0.09621891379356384, "learning_rate": 6.026076710978171e-05, "loss": 0.0149, "step": 46250 }, { "epoch": 17.126990003702332, "grad_norm": 0.31762930750846863, "learning_rate": 6.024458376138762e-05, "loss": 0.0152, "step": 46260 }, { "epoch": 17.130692336171787, "grad_norm": 0.1526627093553543, "learning_rate": 6.022839929266124e-05, "loss": 0.0252, "step": 46270 }, { "epoch": 17.134394668641242, "grad_norm": 0.22580845654010773, "learning_rate": 6.0212213705372475e-05, "loss": 0.0249, "step": 46280 }, { "epoch": 17.1380970011107, "grad_norm": 0.17065644264221191, "learning_rate": 6.01960270012914e-05, "loss": 0.0294, "step": 46290 }, { "epoch": 17.141799333580156, "grad_norm": 0.3627491295337677, "learning_rate": 6.017983918218812e-05, "loss": 0.0299, "step": 46300 }, { "epoch": 17.14550166604961, "grad_norm": 0.4063742458820343, "learning_rate": 6.016365024983295e-05, "loss": 0.026, "step": 46310 }, { "epoch": 17.149203998519067, "grad_norm": 0.2991331219673157, "learning_rate": 6.014746020599626e-05, "loss": 0.0201, "step": 46320 }, { "epoch": 17.15290633098852, "grad_norm": 0.22082696855068207, "learning_rate": 6.013126905244858e-05, "loss": 0.0343, "step": 46330 }, { "epoch": 17.156608663457977, "grad_norm": 0.24179647862911224, "learning_rate": 6.011507679096054e-05, "loss": 0.0169, "step": 46340 }, { "epoch": 17.160310995927436, "grad_norm": 0.13981954753398895, "learning_rate": 6.009888342330292e-05, "loss": 0.0213, "step": 46350 }, { "epoch": 17.16401332839689, "grad_norm": 0.17811964452266693, "learning_rate": 6.00826889512466e-05, "loss": 0.0265, "step": 46360 }, { "epoch": 17.167715660866346, "grad_norm": 0.10750282555818558, "learning_rate": 6.006649337656256e-05, "loss": 0.0254, "step": 46370 }, { "epoch": 17.1714179933358, "grad_norm": 0.25514572858810425, "learning_rate": 6.005029670102195e-05, "loss": 0.0229, "step": 46380 }, { "epoch": 17.175120325805256, "grad_norm": 0.24772600829601288, "learning_rate": 6.003409892639599e-05, "loss": 0.0332, "step": 46390 }, { "epoch": 17.17882265827471, "grad_norm": 0.12544959783554077, "learning_rate": 6.001790005445607e-05, "loss": 0.0188, "step": 46400 }, { "epoch": 17.18252499074417, "grad_norm": 0.20342688262462616, "learning_rate": 6.0001700086973646e-05, "loss": 0.0182, "step": 46410 }, { "epoch": 17.186227323213625, "grad_norm": 0.15387924015522003, "learning_rate": 5.9985499025720346e-05, "loss": 0.026, "step": 46420 }, { "epoch": 17.18992965568308, "grad_norm": 0.3323458731174469, "learning_rate": 5.9969296872467894e-05, "loss": 0.0221, "step": 46430 }, { "epoch": 17.193631988152536, "grad_norm": 0.4084928333759308, "learning_rate": 5.995309362898812e-05, "loss": 0.0282, "step": 46440 }, { "epoch": 17.19733432062199, "grad_norm": 0.1428767740726471, "learning_rate": 5.9936889297052986e-05, "loss": 0.0315, "step": 46450 }, { "epoch": 17.201036653091446, "grad_norm": 0.2568940818309784, "learning_rate": 5.992068387843459e-05, "loss": 0.0235, "step": 46460 }, { "epoch": 17.204738985560905, "grad_norm": 0.2525489032268524, "learning_rate": 5.990447737490511e-05, "loss": 0.0361, "step": 46470 }, { "epoch": 17.20844131803036, "grad_norm": 0.26124751567840576, "learning_rate": 5.988826978823688e-05, "loss": 0.024, "step": 46480 }, { "epoch": 17.212143650499815, "grad_norm": 0.14414097368717194, "learning_rate": 5.9872061120202336e-05, "loss": 0.023, "step": 46490 }, { "epoch": 17.21584598296927, "grad_norm": 0.1782371997833252, "learning_rate": 5.985585137257401e-05, "loss": 0.0326, "step": 46500 }, { "epoch": 17.219548315438725, "grad_norm": 0.27488237619400024, "learning_rate": 5.983964054712462e-05, "loss": 0.0276, "step": 46510 }, { "epoch": 17.22325064790818, "grad_norm": 0.2327020764350891, "learning_rate": 5.9823428645626934e-05, "loss": 0.025, "step": 46520 }, { "epoch": 17.22695298037764, "grad_norm": 0.2780875861644745, "learning_rate": 5.9807215669853855e-05, "loss": 0.0365, "step": 46530 }, { "epoch": 17.230655312847095, "grad_norm": 0.321809321641922, "learning_rate": 5.9791001621578405e-05, "loss": 0.0169, "step": 46540 }, { "epoch": 17.23435764531655, "grad_norm": 0.31140410900115967, "learning_rate": 5.977478650257374e-05, "loss": 0.0154, "step": 46550 }, { "epoch": 17.238059977786005, "grad_norm": 1.3207316398620605, "learning_rate": 5.975857031461314e-05, "loss": 0.0196, "step": 46560 }, { "epoch": 17.24176231025546, "grad_norm": 0.27289506793022156, "learning_rate": 5.9742353059469936e-05, "loss": 0.0231, "step": 46570 }, { "epoch": 17.245464642724915, "grad_norm": 0.8505443334579468, "learning_rate": 5.972613473891766e-05, "loss": 0.0244, "step": 46580 }, { "epoch": 17.249166975194374, "grad_norm": 0.34571343660354614, "learning_rate": 5.9709915354729914e-05, "loss": 0.0179, "step": 46590 }, { "epoch": 17.25286930766383, "grad_norm": 0.18994653224945068, "learning_rate": 5.969369490868042e-05, "loss": 0.0232, "step": 46600 }, { "epoch": 17.256571640133284, "grad_norm": 0.2280251383781433, "learning_rate": 5.967747340254303e-05, "loss": 0.0148, "step": 46610 }, { "epoch": 17.26027397260274, "grad_norm": 0.18546365201473236, "learning_rate": 5.9661250838091675e-05, "loss": 0.0213, "step": 46620 }, { "epoch": 17.263976305072195, "grad_norm": 0.2710462510585785, "learning_rate": 5.9645027217100477e-05, "loss": 0.0219, "step": 46630 }, { "epoch": 17.26767863754165, "grad_norm": 0.2242615967988968, "learning_rate": 5.96288025413436e-05, "loss": 0.0148, "step": 46640 }, { "epoch": 17.27138097001111, "grad_norm": 0.3493739664554596, "learning_rate": 5.961257681259535e-05, "loss": 0.0162, "step": 46650 }, { "epoch": 17.275083302480564, "grad_norm": 0.23101772367954254, "learning_rate": 5.9596350032630156e-05, "loss": 0.0209, "step": 46660 }, { "epoch": 17.27878563495002, "grad_norm": 0.1592600792646408, "learning_rate": 5.958012220322255e-05, "loss": 0.021, "step": 46670 }, { "epoch": 17.282487967419474, "grad_norm": 0.1634080410003662, "learning_rate": 5.956389332614719e-05, "loss": 0.0191, "step": 46680 }, { "epoch": 17.28619029988893, "grad_norm": 0.13664491474628448, "learning_rate": 5.9547663403178824e-05, "loss": 0.0161, "step": 46690 }, { "epoch": 17.289892632358384, "grad_norm": 2.756075620651245, "learning_rate": 5.953143243609235e-05, "loss": 0.0137, "step": 46700 }, { "epoch": 17.293594964827843, "grad_norm": 0.13500036299228668, "learning_rate": 5.9515200426662756e-05, "loss": 0.0237, "step": 46710 }, { "epoch": 17.2972972972973, "grad_norm": 0.16139544546604156, "learning_rate": 5.949896737666515e-05, "loss": 0.0242, "step": 46720 }, { "epoch": 17.300999629766753, "grad_norm": 0.4325815439224243, "learning_rate": 5.9482733287874734e-05, "loss": 0.0267, "step": 46730 }, { "epoch": 17.30470196223621, "grad_norm": 0.25133582949638367, "learning_rate": 5.9466498162066885e-05, "loss": 0.0177, "step": 46740 }, { "epoch": 17.308404294705664, "grad_norm": 0.1919722855091095, "learning_rate": 5.945026200101702e-05, "loss": 0.019, "step": 46750 }, { "epoch": 17.31210662717512, "grad_norm": 0.3204641342163086, "learning_rate": 5.943402480650071e-05, "loss": 0.0259, "step": 46760 }, { "epoch": 17.315808959644578, "grad_norm": 0.3653683066368103, "learning_rate": 5.9417786580293647e-05, "loss": 0.0211, "step": 46770 }, { "epoch": 17.319511292114033, "grad_norm": 0.12774375081062317, "learning_rate": 5.940154732417158e-05, "loss": 0.0263, "step": 46780 }, { "epoch": 17.323213624583488, "grad_norm": 0.25588202476501465, "learning_rate": 5.9385307039910445e-05, "loss": 0.0222, "step": 46790 }, { "epoch": 17.326915957052943, "grad_norm": 0.36796221137046814, "learning_rate": 5.9369065729286245e-05, "loss": 0.0218, "step": 46800 }, { "epoch": 17.3306182895224, "grad_norm": 0.5570741295814514, "learning_rate": 5.9352823394075094e-05, "loss": 0.0201, "step": 46810 }, { "epoch": 17.334320621991854, "grad_norm": 0.31140685081481934, "learning_rate": 5.9336580036053235e-05, "loss": 0.0214, "step": 46820 }, { "epoch": 17.338022954461312, "grad_norm": 0.2608770430088043, "learning_rate": 5.932033565699704e-05, "loss": 0.021, "step": 46830 }, { "epoch": 17.341725286930767, "grad_norm": 0.2411666065454483, "learning_rate": 5.930409025868293e-05, "loss": 0.0214, "step": 46840 }, { "epoch": 17.345427619400223, "grad_norm": 0.2920004427433014, "learning_rate": 5.92878438428875e-05, "loss": 0.0168, "step": 46850 }, { "epoch": 17.349129951869678, "grad_norm": 0.28480324149131775, "learning_rate": 5.927159641138744e-05, "loss": 0.017, "step": 46860 }, { "epoch": 17.352832284339133, "grad_norm": 0.18151985108852386, "learning_rate": 5.925534796595953e-05, "loss": 0.0164, "step": 46870 }, { "epoch": 17.356534616808588, "grad_norm": 0.2394505739212036, "learning_rate": 5.92390985083807e-05, "loss": 0.0217, "step": 46880 }, { "epoch": 17.360236949278047, "grad_norm": 0.17645986378192902, "learning_rate": 5.922284804042792e-05, "loss": 0.0341, "step": 46890 }, { "epoch": 17.363939281747502, "grad_norm": 0.2653754651546478, "learning_rate": 5.9206596563878357e-05, "loss": 0.0201, "step": 46900 }, { "epoch": 17.367641614216957, "grad_norm": 0.1836954951286316, "learning_rate": 5.919034408050925e-05, "loss": 0.0173, "step": 46910 }, { "epoch": 17.371343946686412, "grad_norm": 0.140668123960495, "learning_rate": 5.917409059209791e-05, "loss": 0.0126, "step": 46920 }, { "epoch": 17.375046279155868, "grad_norm": 0.18482089042663574, "learning_rate": 5.9157836100421824e-05, "loss": 0.0159, "step": 46930 }, { "epoch": 17.378748611625323, "grad_norm": 0.21937042474746704, "learning_rate": 5.914158060725856e-05, "loss": 0.0215, "step": 46940 }, { "epoch": 17.38245094409478, "grad_norm": 0.15800343453884125, "learning_rate": 5.912532411438576e-05, "loss": 0.0194, "step": 46950 }, { "epoch": 17.386153276564237, "grad_norm": 0.20964941382408142, "learning_rate": 5.9109066623581265e-05, "loss": 0.0239, "step": 46960 }, { "epoch": 17.38985560903369, "grad_norm": 0.24199189245700836, "learning_rate": 5.9092808136622924e-05, "loss": 0.0237, "step": 46970 }, { "epoch": 17.393557941503147, "grad_norm": 0.26963356137275696, "learning_rate": 5.907654865528876e-05, "loss": 0.0205, "step": 46980 }, { "epoch": 17.397260273972602, "grad_norm": 0.15849268436431885, "learning_rate": 5.906028818135687e-05, "loss": 0.0233, "step": 46990 }, { "epoch": 17.400962606442057, "grad_norm": 0.905052661895752, "learning_rate": 5.90440267166055e-05, "loss": 0.0256, "step": 47000 }, { "epoch": 17.404664938911516, "grad_norm": 0.15227337181568146, "learning_rate": 5.902776426281296e-05, "loss": 0.0229, "step": 47010 }, { "epoch": 17.40836727138097, "grad_norm": 0.22108429670333862, "learning_rate": 5.901150082175769e-05, "loss": 0.0199, "step": 47020 }, { "epoch": 17.412069603850426, "grad_norm": 0.49388423562049866, "learning_rate": 5.899523639521825e-05, "loss": 0.0144, "step": 47030 }, { "epoch": 17.41577193631988, "grad_norm": 0.32495036721229553, "learning_rate": 5.897897098497328e-05, "loss": 0.0242, "step": 47040 }, { "epoch": 17.419474268789337, "grad_norm": 0.14176414906978607, "learning_rate": 5.896270459280153e-05, "loss": 0.0158, "step": 47050 }, { "epoch": 17.423176601258792, "grad_norm": 0.2319020926952362, "learning_rate": 5.8946437220481887e-05, "loss": 0.0272, "step": 47060 }, { "epoch": 17.426878933728247, "grad_norm": 0.13385862112045288, "learning_rate": 5.8930168869793325e-05, "loss": 0.0212, "step": 47070 }, { "epoch": 17.430581266197706, "grad_norm": 0.26427116990089417, "learning_rate": 5.8913899542514915e-05, "loss": 0.0194, "step": 47080 }, { "epoch": 17.43428359866716, "grad_norm": 0.253036230802536, "learning_rate": 5.889762924042585e-05, "loss": 0.0174, "step": 47090 }, { "epoch": 17.437985931136616, "grad_norm": 0.17136189341545105, "learning_rate": 5.888135796530544e-05, "loss": 0.0241, "step": 47100 }, { "epoch": 17.44168826360607, "grad_norm": 0.1276928335428238, "learning_rate": 5.886508571893308e-05, "loss": 0.0222, "step": 47110 }, { "epoch": 17.445390596075526, "grad_norm": 0.27242186665534973, "learning_rate": 5.8848812503088244e-05, "loss": 0.0128, "step": 47120 }, { "epoch": 17.44909292854498, "grad_norm": 0.1515224277973175, "learning_rate": 5.883253831955061e-05, "loss": 0.0192, "step": 47130 }, { "epoch": 17.45279526101444, "grad_norm": 0.1432582288980484, "learning_rate": 5.881626317009984e-05, "loss": 0.0276, "step": 47140 }, { "epoch": 17.456497593483896, "grad_norm": 0.12617473304271698, "learning_rate": 5.8799987056515804e-05, "loss": 0.0178, "step": 47150 }, { "epoch": 17.46019992595335, "grad_norm": 0.13440841436386108, "learning_rate": 5.8783709980578414e-05, "loss": 0.0163, "step": 47160 }, { "epoch": 17.463902258422806, "grad_norm": 0.3041628897190094, "learning_rate": 5.876743194406769e-05, "loss": 0.0202, "step": 47170 }, { "epoch": 17.46760459089226, "grad_norm": 0.19863814115524292, "learning_rate": 5.875115294876381e-05, "loss": 0.0235, "step": 47180 }, { "epoch": 17.471306923361716, "grad_norm": 0.14939895272254944, "learning_rate": 5.873487299644699e-05, "loss": 0.0348, "step": 47190 }, { "epoch": 17.475009255831175, "grad_norm": 0.2715637981891632, "learning_rate": 5.871859208889759e-05, "loss": 0.0285, "step": 47200 }, { "epoch": 17.47871158830063, "grad_norm": 1.3329397439956665, "learning_rate": 5.8702310227896074e-05, "loss": 0.0273, "step": 47210 }, { "epoch": 17.482413920770085, "grad_norm": 0.3520972728729248, "learning_rate": 5.8686027415222985e-05, "loss": 0.028, "step": 47220 }, { "epoch": 17.48611625323954, "grad_norm": 0.3631680905818939, "learning_rate": 5.8669743652659014e-05, "loss": 0.0235, "step": 47230 }, { "epoch": 17.489818585708996, "grad_norm": 0.2762584984302521, "learning_rate": 5.8653458941984895e-05, "loss": 0.0232, "step": 47240 }, { "epoch": 17.49352091817845, "grad_norm": 0.1319303959608078, "learning_rate": 5.8637173284981526e-05, "loss": 0.0129, "step": 47250 }, { "epoch": 17.49722325064791, "grad_norm": 0.6726883053779602, "learning_rate": 5.862088668342986e-05, "loss": 0.0281, "step": 47260 }, { "epoch": 17.500925583117365, "grad_norm": 0.34091249108314514, "learning_rate": 5.860459913911101e-05, "loss": 0.024, "step": 47270 }, { "epoch": 17.50462791558682, "grad_norm": 0.18205446004867554, "learning_rate": 5.858831065380612e-05, "loss": 0.0339, "step": 47280 }, { "epoch": 17.508330248056275, "grad_norm": 0.2642830014228821, "learning_rate": 5.857202122929649e-05, "loss": 0.0221, "step": 47290 }, { "epoch": 17.51203258052573, "grad_norm": 0.18177363276481628, "learning_rate": 5.85557308673635e-05, "loss": 0.0245, "step": 47300 }, { "epoch": 17.515734912995185, "grad_norm": 0.28783634305000305, "learning_rate": 5.853943956978866e-05, "loss": 0.0273, "step": 47310 }, { "epoch": 17.519437245464644, "grad_norm": 0.26540714502334595, "learning_rate": 5.852314733835354e-05, "loss": 0.0213, "step": 47320 }, { "epoch": 17.5231395779341, "grad_norm": 0.1755690574645996, "learning_rate": 5.850685417483983e-05, "loss": 0.0198, "step": 47330 }, { "epoch": 17.526841910403554, "grad_norm": 0.1699727475643158, "learning_rate": 5.8490560081029356e-05, "loss": 0.0298, "step": 47340 }, { "epoch": 17.53054424287301, "grad_norm": 0.1905122548341751, "learning_rate": 5.847426505870399e-05, "loss": 0.0204, "step": 47350 }, { "epoch": 17.534246575342465, "grad_norm": 0.18685047328472137, "learning_rate": 5.8457969109645735e-05, "loss": 0.0184, "step": 47360 }, { "epoch": 17.53794890781192, "grad_norm": 0.5901883244514465, "learning_rate": 5.844167223563669e-05, "loss": 0.0241, "step": 47370 }, { "epoch": 17.54165124028138, "grad_norm": 0.2516111731529236, "learning_rate": 5.842537443845908e-05, "loss": 0.0215, "step": 47380 }, { "epoch": 17.545353572750834, "grad_norm": 0.1744791567325592, "learning_rate": 5.840907571989518e-05, "loss": 0.0216, "step": 47390 }, { "epoch": 17.54905590522029, "grad_norm": 0.19900676608085632, "learning_rate": 5.8392776081727385e-05, "loss": 0.0157, "step": 47400 }, { "epoch": 17.552758237689744, "grad_norm": 0.304278701543808, "learning_rate": 5.837647552573824e-05, "loss": 0.0117, "step": 47410 }, { "epoch": 17.5564605701592, "grad_norm": 0.455169677734375, "learning_rate": 5.836017405371032e-05, "loss": 0.019, "step": 47420 }, { "epoch": 17.560162902628655, "grad_norm": 0.3160056471824646, "learning_rate": 5.834387166742632e-05, "loss": 0.024, "step": 47430 }, { "epoch": 17.563865235098113, "grad_norm": 0.366608589887619, "learning_rate": 5.832756836866907e-05, "loss": 0.0284, "step": 47440 }, { "epoch": 17.56756756756757, "grad_norm": 0.18363212049007416, "learning_rate": 5.831126415922148e-05, "loss": 0.0299, "step": 47450 }, { "epoch": 17.571269900037024, "grad_norm": 0.1701408326625824, "learning_rate": 5.8294959040866505e-05, "loss": 0.045, "step": 47460 }, { "epoch": 17.57497223250648, "grad_norm": 0.14511556923389435, "learning_rate": 5.827865301538731e-05, "loss": 0.018, "step": 47470 }, { "epoch": 17.578674564975934, "grad_norm": 0.8048312664031982, "learning_rate": 5.8262346084567057e-05, "loss": 0.0271, "step": 47480 }, { "epoch": 17.58237689744539, "grad_norm": 0.17640621960163116, "learning_rate": 5.824603825018904e-05, "loss": 0.0247, "step": 47490 }, { "epoch": 17.586079229914848, "grad_norm": 0.24720348417758942, "learning_rate": 5.8229729514036705e-05, "loss": 0.0197, "step": 47500 }, { "epoch": 17.589781562384303, "grad_norm": 0.29343822598457336, "learning_rate": 5.8213419877893515e-05, "loss": 0.0222, "step": 47510 }, { "epoch": 17.593483894853758, "grad_norm": 0.30763906240463257, "learning_rate": 5.819710934354307e-05, "loss": 0.0225, "step": 47520 }, { "epoch": 17.597186227323213, "grad_norm": 0.1361953169107437, "learning_rate": 5.818079791276907e-05, "loss": 0.0122, "step": 47530 }, { "epoch": 17.60088855979267, "grad_norm": 0.21432800590991974, "learning_rate": 5.816448558735532e-05, "loss": 0.0331, "step": 47540 }, { "epoch": 17.604590892262124, "grad_norm": 0.2784886062145233, "learning_rate": 5.8148172369085686e-05, "loss": 0.0188, "step": 47550 }, { "epoch": 17.608293224731582, "grad_norm": 0.15745747089385986, "learning_rate": 5.813185825974419e-05, "loss": 0.0188, "step": 47560 }, { "epoch": 17.611995557201038, "grad_norm": 0.29972004890441895, "learning_rate": 5.811554326111489e-05, "loss": 0.0296, "step": 47570 }, { "epoch": 17.615697889670493, "grad_norm": 0.3347662091255188, "learning_rate": 5.809922737498198e-05, "loss": 0.0155, "step": 47580 }, { "epoch": 17.619400222139948, "grad_norm": 0.3064022660255432, "learning_rate": 5.808291060312975e-05, "loss": 0.0258, "step": 47590 }, { "epoch": 17.623102554609403, "grad_norm": 0.2505500614643097, "learning_rate": 5.8066592947342555e-05, "loss": 0.0185, "step": 47600 }, { "epoch": 17.62680488707886, "grad_norm": 0.21895284950733185, "learning_rate": 5.8050274409404896e-05, "loss": 0.0386, "step": 47610 }, { "epoch": 17.630507219548317, "grad_norm": 0.3366765081882477, "learning_rate": 5.803395499110131e-05, "loss": 0.0292, "step": 47620 }, { "epoch": 17.634209552017772, "grad_norm": 1.3050557374954224, "learning_rate": 5.801763469421652e-05, "loss": 0.0299, "step": 47630 }, { "epoch": 17.637911884487227, "grad_norm": 0.26928892731666565, "learning_rate": 5.800131352053524e-05, "loss": 0.0155, "step": 47640 }, { "epoch": 17.641614216956683, "grad_norm": 0.30594927072525024, "learning_rate": 5.798499147184233e-05, "loss": 0.0177, "step": 47650 }, { "epoch": 17.645316549426138, "grad_norm": 0.23652076721191406, "learning_rate": 5.796866854992276e-05, "loss": 0.0245, "step": 47660 }, { "epoch": 17.649018881895593, "grad_norm": 0.2683643400669098, "learning_rate": 5.795234475656159e-05, "loss": 0.0221, "step": 47670 }, { "epoch": 17.65272121436505, "grad_norm": 0.49788880348205566, "learning_rate": 5.793602009354395e-05, "loss": 0.0318, "step": 47680 }, { "epoch": 17.656423546834507, "grad_norm": 0.1715325564146042, "learning_rate": 5.7919694562655083e-05, "loss": 0.0304, "step": 47690 }, { "epoch": 17.660125879303962, "grad_norm": 0.15435156226158142, "learning_rate": 5.7903368165680327e-05, "loss": 0.0143, "step": 47700 }, { "epoch": 17.663828211773417, "grad_norm": 0.22597362101078033, "learning_rate": 5.788704090440511e-05, "loss": 0.016, "step": 47710 }, { "epoch": 17.667530544242872, "grad_norm": 0.21478046476840973, "learning_rate": 5.7870712780614953e-05, "loss": 0.0139, "step": 47720 }, { "epoch": 17.671232876712327, "grad_norm": 0.27471840381622314, "learning_rate": 5.785438379609549e-05, "loss": 0.0178, "step": 47730 }, { "epoch": 17.674935209181786, "grad_norm": 0.1744433492422104, "learning_rate": 5.783805395263242e-05, "loss": 0.0266, "step": 47740 }, { "epoch": 17.67863754165124, "grad_norm": 0.3763967454433441, "learning_rate": 5.782172325201155e-05, "loss": 0.0159, "step": 47750 }, { "epoch": 17.682339874120697, "grad_norm": 0.18706181645393372, "learning_rate": 5.780539169601879e-05, "loss": 0.025, "step": 47760 }, { "epoch": 17.68604220659015, "grad_norm": 0.3892013728618622, "learning_rate": 5.778905928644013e-05, "loss": 0.02, "step": 47770 }, { "epoch": 17.689744539059607, "grad_norm": 0.3115132749080658, "learning_rate": 5.777272602506165e-05, "loss": 0.0268, "step": 47780 }, { "epoch": 17.693446871529062, "grad_norm": 0.11764727532863617, "learning_rate": 5.775639191366954e-05, "loss": 0.0236, "step": 47790 }, { "epoch": 17.69714920399852, "grad_norm": 0.26174384355545044, "learning_rate": 5.7740056954050084e-05, "loss": 0.0195, "step": 47800 }, { "epoch": 17.700851536467976, "grad_norm": 0.2675294280052185, "learning_rate": 5.772372114798962e-05, "loss": 0.0252, "step": 47810 }, { "epoch": 17.70455386893743, "grad_norm": 0.36897724866867065, "learning_rate": 5.770738449727463e-05, "loss": 0.0155, "step": 47820 }, { "epoch": 17.708256201406886, "grad_norm": 0.24812977015972137, "learning_rate": 5.769104700369164e-05, "loss": 0.0195, "step": 47830 }, { "epoch": 17.71195853387634, "grad_norm": 0.16948753595352173, "learning_rate": 5.767470866902734e-05, "loss": 0.021, "step": 47840 }, { "epoch": 17.715660866345797, "grad_norm": 0.8060321807861328, "learning_rate": 5.765836949506843e-05, "loss": 0.0247, "step": 47850 }, { "epoch": 17.719363198815252, "grad_norm": 0.29155558347702026, "learning_rate": 5.7642029483601746e-05, "loss": 0.0152, "step": 47860 }, { "epoch": 17.72306553128471, "grad_norm": 0.16048990190029144, "learning_rate": 5.76256886364142e-05, "loss": 0.015, "step": 47870 }, { "epoch": 17.726767863754166, "grad_norm": 0.22845204174518585, "learning_rate": 5.760934695529282e-05, "loss": 0.0207, "step": 47880 }, { "epoch": 17.73047019622362, "grad_norm": 0.23406724631786346, "learning_rate": 5.75930044420247e-05, "loss": 0.0293, "step": 47890 }, { "epoch": 17.734172528693076, "grad_norm": 0.14110583066940308, "learning_rate": 5.757666109839702e-05, "loss": 0.0182, "step": 47900 }, { "epoch": 17.73787486116253, "grad_norm": 1.5118316411972046, "learning_rate": 5.7560316926197076e-05, "loss": 0.0135, "step": 47910 }, { "epoch": 17.74157719363199, "grad_norm": 0.15482859313488007, "learning_rate": 5.7543971927212257e-05, "loss": 0.0203, "step": 47920 }, { "epoch": 17.745279526101445, "grad_norm": 0.31439515948295593, "learning_rate": 5.752762610322999e-05, "loss": 0.026, "step": 47930 }, { "epoch": 17.7489818585709, "grad_norm": 0.21675772964954376, "learning_rate": 5.751127945603786e-05, "loss": 0.026, "step": 47940 }, { "epoch": 17.752684191040355, "grad_norm": 0.2601664066314697, "learning_rate": 5.74949319874235e-05, "loss": 0.0237, "step": 47950 }, { "epoch": 17.75638652350981, "grad_norm": 0.4232177138328552, "learning_rate": 5.747858369917465e-05, "loss": 0.0275, "step": 47960 }, { "epoch": 17.760088855979266, "grad_norm": 0.2605708837509155, "learning_rate": 5.7462234593079113e-05, "loss": 0.0138, "step": 47970 }, { "epoch": 17.76379118844872, "grad_norm": 0.21284620463848114, "learning_rate": 5.744588467092483e-05, "loss": 0.0218, "step": 47980 }, { "epoch": 17.76749352091818, "grad_norm": 0.24235905706882477, "learning_rate": 5.74295339344998e-05, "loss": 0.03, "step": 47990 }, { "epoch": 17.771195853387635, "grad_norm": 0.387849360704422, "learning_rate": 5.74131823855921e-05, "loss": 0.0264, "step": 48000 }, { "epoch": 17.77489818585709, "grad_norm": 0.11427769064903259, "learning_rate": 5.739683002598993e-05, "loss": 0.0167, "step": 48010 }, { "epoch": 17.778600518326545, "grad_norm": 0.1514459103345871, "learning_rate": 5.7380476857481544e-05, "loss": 0.0243, "step": 48020 }, { "epoch": 17.782302850796, "grad_norm": 0.4631892740726471, "learning_rate": 5.73641228818553e-05, "loss": 0.0183, "step": 48030 }, { "epoch": 17.786005183265456, "grad_norm": 0.30395278334617615, "learning_rate": 5.7347768100899655e-05, "loss": 0.0192, "step": 48040 }, { "epoch": 17.789707515734914, "grad_norm": 0.28131383657455444, "learning_rate": 5.733141251640315e-05, "loss": 0.0245, "step": 48050 }, { "epoch": 17.79340984820437, "grad_norm": 0.20369286835193634, "learning_rate": 5.7315056130154374e-05, "loss": 0.0188, "step": 48060 }, { "epoch": 17.797112180673825, "grad_norm": 0.21132613718509674, "learning_rate": 5.729869894394205e-05, "loss": 0.0164, "step": 48070 }, { "epoch": 17.80081451314328, "grad_norm": 0.34341323375701904, "learning_rate": 5.7282340959555e-05, "loss": 0.0214, "step": 48080 }, { "epoch": 17.804516845612735, "grad_norm": 3.8151118755340576, "learning_rate": 5.726598217878211e-05, "loss": 0.0283, "step": 48090 }, { "epoch": 17.80821917808219, "grad_norm": 0.6055464148521423, "learning_rate": 5.72496226034123e-05, "loss": 0.0203, "step": 48100 }, { "epoch": 17.81192151055165, "grad_norm": 0.25960269570350647, "learning_rate": 5.723326223523467e-05, "loss": 0.0207, "step": 48110 }, { "epoch": 17.815623843021104, "grad_norm": 0.4811680018901825, "learning_rate": 5.721690107603838e-05, "loss": 0.0223, "step": 48120 }, { "epoch": 17.81932617549056, "grad_norm": 0.20385684072971344, "learning_rate": 5.7200539127612604e-05, "loss": 0.018, "step": 48130 }, { "epoch": 17.823028507960014, "grad_norm": 0.19624854624271393, "learning_rate": 5.718417639174672e-05, "loss": 0.0176, "step": 48140 }, { "epoch": 17.82673084042947, "grad_norm": 0.14679111540317535, "learning_rate": 5.7167812870230094e-05, "loss": 0.0167, "step": 48150 }, { "epoch": 17.830433172898925, "grad_norm": 2.14599609375, "learning_rate": 5.7151448564852236e-05, "loss": 0.0235, "step": 48160 }, { "epoch": 17.834135505368383, "grad_norm": 0.262247771024704, "learning_rate": 5.713508347740272e-05, "loss": 0.0188, "step": 48170 }, { "epoch": 17.83783783783784, "grad_norm": 0.2797951400279999, "learning_rate": 5.7118717609671194e-05, "loss": 0.0221, "step": 48180 }, { "epoch": 17.841540170307294, "grad_norm": 0.10393549501895905, "learning_rate": 5.71023509634474e-05, "loss": 0.0164, "step": 48190 }, { "epoch": 17.84524250277675, "grad_norm": 0.17268140614032745, "learning_rate": 5.7085983540521216e-05, "loss": 0.0289, "step": 48200 }, { "epoch": 17.848944835246204, "grad_norm": 0.2109012007713318, "learning_rate": 5.70696153426825e-05, "loss": 0.0143, "step": 48210 }, { "epoch": 17.85264716771566, "grad_norm": 0.1773819774389267, "learning_rate": 5.705324637172128e-05, "loss": 0.0251, "step": 48220 }, { "epoch": 17.856349500185118, "grad_norm": 0.2821255922317505, "learning_rate": 5.7036876629427646e-05, "loss": 0.0168, "step": 48230 }, { "epoch": 17.860051832654573, "grad_norm": 0.1108584851026535, "learning_rate": 5.702050611759178e-05, "loss": 0.019, "step": 48240 }, { "epoch": 17.86375416512403, "grad_norm": 0.11794750392436981, "learning_rate": 5.70041348380039e-05, "loss": 0.0158, "step": 48250 }, { "epoch": 17.867456497593484, "grad_norm": 0.0753345787525177, "learning_rate": 5.698776279245437e-05, "loss": 0.0209, "step": 48260 }, { "epoch": 17.87115883006294, "grad_norm": 0.6354846358299255, "learning_rate": 5.6971389982733614e-05, "loss": 0.0244, "step": 48270 }, { "epoch": 17.874861162532394, "grad_norm": 0.27601224184036255, "learning_rate": 5.695501641063211e-05, "loss": 0.0285, "step": 48280 }, { "epoch": 17.878563495001853, "grad_norm": 0.15246079862117767, "learning_rate": 5.693864207794049e-05, "loss": 0.0189, "step": 48290 }, { "epoch": 17.882265827471308, "grad_norm": 0.41110941767692566, "learning_rate": 5.692226698644938e-05, "loss": 0.0219, "step": 48300 }, { "epoch": 17.885968159940763, "grad_norm": 0.3058362305164337, "learning_rate": 5.6905891137949584e-05, "loss": 0.0224, "step": 48310 }, { "epoch": 17.889670492410218, "grad_norm": 0.1418852061033249, "learning_rate": 5.68895145342319e-05, "loss": 0.0165, "step": 48320 }, { "epoch": 17.893372824879673, "grad_norm": 0.1782446950674057, "learning_rate": 5.687313717708728e-05, "loss": 0.0283, "step": 48330 }, { "epoch": 17.89707515734913, "grad_norm": 0.20925992727279663, "learning_rate": 5.685675906830671e-05, "loss": 0.0197, "step": 48340 }, { "epoch": 17.900777489818587, "grad_norm": 1.6448181867599487, "learning_rate": 5.6840380209681255e-05, "loss": 0.0332, "step": 48350 }, { "epoch": 17.904479822288042, "grad_norm": 0.14743785560131073, "learning_rate": 5.682400060300213e-05, "loss": 0.0268, "step": 48360 }, { "epoch": 17.908182154757498, "grad_norm": 0.2520364224910736, "learning_rate": 5.6807620250060554e-05, "loss": 0.0236, "step": 48370 }, { "epoch": 17.911884487226953, "grad_norm": 0.141525000333786, "learning_rate": 5.679123915264786e-05, "loss": 0.025, "step": 48380 }, { "epoch": 17.915586819696408, "grad_norm": 0.15290269255638123, "learning_rate": 5.677485731255545e-05, "loss": 0.0136, "step": 48390 }, { "epoch": 17.919289152165863, "grad_norm": 0.20522134006023407, "learning_rate": 5.675847473157485e-05, "loss": 0.0223, "step": 48400 }, { "epoch": 17.92299148463532, "grad_norm": 0.15671341121196747, "learning_rate": 5.67420914114976e-05, "loss": 0.0195, "step": 48410 }, { "epoch": 17.926693817104777, "grad_norm": 1.1239039897918701, "learning_rate": 5.6725707354115375e-05, "loss": 0.0271, "step": 48420 }, { "epoch": 17.930396149574232, "grad_norm": 0.4260673224925995, "learning_rate": 5.670932256121991e-05, "loss": 0.0267, "step": 48430 }, { "epoch": 17.934098482043687, "grad_norm": 0.31859830021858215, "learning_rate": 5.669293703460302e-05, "loss": 0.029, "step": 48440 }, { "epoch": 17.937800814513142, "grad_norm": 0.1880604326725006, "learning_rate": 5.667655077605659e-05, "loss": 0.023, "step": 48450 }, { "epoch": 17.941503146982598, "grad_norm": 0.21335679292678833, "learning_rate": 5.666016378737261e-05, "loss": 0.0309, "step": 48460 }, { "epoch": 17.945205479452056, "grad_norm": 0.1183689758181572, "learning_rate": 5.6643776070343134e-05, "loss": 0.0204, "step": 48470 }, { "epoch": 17.94890781192151, "grad_norm": 0.25250551104545593, "learning_rate": 5.6627387626760285e-05, "loss": 0.0194, "step": 48480 }, { "epoch": 17.952610144390967, "grad_norm": 0.31235000491142273, "learning_rate": 5.6610998458416296e-05, "loss": 0.0235, "step": 48490 }, { "epoch": 17.956312476860422, "grad_norm": 0.42154425382614136, "learning_rate": 5.6594608567103456e-05, "loss": 0.0156, "step": 48500 }, { "epoch": 17.960014809329877, "grad_norm": 0.278644323348999, "learning_rate": 5.6578217954614134e-05, "loss": 0.0323, "step": 48510 }, { "epoch": 17.963717141799332, "grad_norm": 0.37667685747146606, "learning_rate": 5.656182662274079e-05, "loss": 0.0161, "step": 48520 }, { "epoch": 17.96741947426879, "grad_norm": 0.8062044382095337, "learning_rate": 5.6545434573275946e-05, "loss": 0.0209, "step": 48530 }, { "epoch": 17.971121806738246, "grad_norm": 0.34464436769485474, "learning_rate": 5.6529041808012226e-05, "loss": 0.0301, "step": 48540 }, { "epoch": 17.9748241392077, "grad_norm": 0.3140277862548828, "learning_rate": 5.65126483287423e-05, "loss": 0.0238, "step": 48550 }, { "epoch": 17.978526471677156, "grad_norm": 0.1369304358959198, "learning_rate": 5.6496254137258956e-05, "loss": 0.0193, "step": 48560 }, { "epoch": 17.98222880414661, "grad_norm": 0.24393022060394287, "learning_rate": 5.647985923535501e-05, "loss": 0.022, "step": 48570 }, { "epoch": 17.985931136616067, "grad_norm": 0.5313047170639038, "learning_rate": 5.646346362482342e-05, "loss": 0.0279, "step": 48580 }, { "epoch": 17.989633469085526, "grad_norm": 0.5348042249679565, "learning_rate": 5.644706730745716e-05, "loss": 0.0251, "step": 48590 }, { "epoch": 17.99333580155498, "grad_norm": 2.6082041263580322, "learning_rate": 5.6430670285049314e-05, "loss": 0.0259, "step": 48600 }, { "epoch": 17.997038134024436, "grad_norm": 0.9738509058952332, "learning_rate": 5.641427255939302e-05, "loss": 0.0286, "step": 48610 }, { "epoch": 18.00074046649389, "grad_norm": 0.1509387195110321, "learning_rate": 5.6397874132281535e-05, "loss": 0.0216, "step": 48620 }, { "epoch": 18.004442798963346, "grad_norm": 0.2608649432659149, "learning_rate": 5.6381475005508154e-05, "loss": 0.0294, "step": 48630 }, { "epoch": 18.0081451314328, "grad_norm": 0.20320193469524384, "learning_rate": 5.636507518086626e-05, "loss": 0.0324, "step": 48640 }, { "epoch": 18.01184746390226, "grad_norm": 0.11661228537559509, "learning_rate": 5.634867466014932e-05, "loss": 0.0151, "step": 48650 }, { "epoch": 18.015549796371715, "grad_norm": 0.9653388857841492, "learning_rate": 5.633227344515085e-05, "loss": 0.0223, "step": 48660 }, { "epoch": 18.01925212884117, "grad_norm": 0.29593509435653687, "learning_rate": 5.631587153766448e-05, "loss": 0.0296, "step": 48670 }, { "epoch": 18.022954461310626, "grad_norm": 0.2520831823348999, "learning_rate": 5.62994689394839e-05, "loss": 0.0224, "step": 48680 }, { "epoch": 18.02665679378008, "grad_norm": 0.22173254191875458, "learning_rate": 5.628306565240287e-05, "loss": 0.0207, "step": 48690 }, { "epoch": 18.030359126249536, "grad_norm": 0.15275771915912628, "learning_rate": 5.6266661678215216e-05, "loss": 0.0171, "step": 48700 }, { "epoch": 18.034061458718995, "grad_norm": 0.20618954300880432, "learning_rate": 5.625025701871487e-05, "loss": 0.0217, "step": 48710 }, { "epoch": 18.03776379118845, "grad_norm": 0.2606505751609802, "learning_rate": 5.623385167569582e-05, "loss": 0.0192, "step": 48720 }, { "epoch": 18.041466123657905, "grad_norm": 0.7697197198867798, "learning_rate": 5.62174456509521e-05, "loss": 0.0149, "step": 48730 }, { "epoch": 18.04516845612736, "grad_norm": 0.24447578191757202, "learning_rate": 5.62010389462779e-05, "loss": 0.0215, "step": 48740 }, { "epoch": 18.048870788596815, "grad_norm": 0.18647202849388123, "learning_rate": 5.618463156346739e-05, "loss": 0.0232, "step": 48750 }, { "epoch": 18.05257312106627, "grad_norm": 0.41677671670913696, "learning_rate": 5.6168223504314863e-05, "loss": 0.0263, "step": 48760 }, { "epoch": 18.05627545353573, "grad_norm": 0.21125325560569763, "learning_rate": 5.615181477061471e-05, "loss": 0.024, "step": 48770 }, { "epoch": 18.059977786005184, "grad_norm": 0.3676145076751709, "learning_rate": 5.613540536416132e-05, "loss": 0.0321, "step": 48780 }, { "epoch": 18.06368011847464, "grad_norm": 0.3313562572002411, "learning_rate": 5.611899528674923e-05, "loss": 0.0224, "step": 48790 }, { "epoch": 18.067382450944095, "grad_norm": 0.18320903182029724, "learning_rate": 5.6102584540173006e-05, "loss": 0.0157, "step": 48800 }, { "epoch": 18.07108478341355, "grad_norm": 0.6282317638397217, "learning_rate": 5.6086173126227335e-05, "loss": 0.0191, "step": 48810 }, { "epoch": 18.074787115883005, "grad_norm": 0.25544610619544983, "learning_rate": 5.606976104670692e-05, "loss": 0.0302, "step": 48820 }, { "epoch": 18.07848944835246, "grad_norm": 0.19927702844142914, "learning_rate": 5.6053348303406536e-05, "loss": 0.0318, "step": 48830 }, { "epoch": 18.08219178082192, "grad_norm": 0.3623408377170563, "learning_rate": 5.603693489812111e-05, "loss": 0.0249, "step": 48840 }, { "epoch": 18.085894113291374, "grad_norm": 0.23241883516311646, "learning_rate": 5.602052083264555e-05, "loss": 0.0264, "step": 48850 }, { "epoch": 18.08959644576083, "grad_norm": 0.2535143196582794, "learning_rate": 5.600410610877488e-05, "loss": 0.0238, "step": 48860 }, { "epoch": 18.093298778230285, "grad_norm": 0.22638921439647675, "learning_rate": 5.5987690728304195e-05, "loss": 0.0126, "step": 48870 }, { "epoch": 18.09700111069974, "grad_norm": 0.1860397905111313, "learning_rate": 5.5971274693028655e-05, "loss": 0.0191, "step": 48880 }, { "epoch": 18.100703443169195, "grad_norm": 0.2680909037590027, "learning_rate": 5.595485800474349e-05, "loss": 0.0307, "step": 48890 }, { "epoch": 18.104405775638654, "grad_norm": 0.22529685497283936, "learning_rate": 5.5938440665244006e-05, "loss": 0.0153, "step": 48900 }, { "epoch": 18.10810810810811, "grad_norm": 0.25902262330055237, "learning_rate": 5.592202267632559e-05, "loss": 0.0278, "step": 48910 }, { "epoch": 18.111810440577564, "grad_norm": 0.25168460607528687, "learning_rate": 5.590560403978367e-05, "loss": 0.0206, "step": 48920 }, { "epoch": 18.11551277304702, "grad_norm": 0.27909502387046814, "learning_rate": 5.588918475741377e-05, "loss": 0.0202, "step": 48930 }, { "epoch": 18.119215105516474, "grad_norm": 0.23504991829395294, "learning_rate": 5.587276483101148e-05, "loss": 0.0163, "step": 48940 }, { "epoch": 18.12291743798593, "grad_norm": 0.21646776795387268, "learning_rate": 5.585634426237246e-05, "loss": 0.0251, "step": 48950 }, { "epoch": 18.126619770455388, "grad_norm": 0.11680961400270462, "learning_rate": 5.583992305329243e-05, "loss": 0.0183, "step": 48960 }, { "epoch": 18.130322102924843, "grad_norm": 0.23714549839496613, "learning_rate": 5.5823501205567206e-05, "loss": 0.02, "step": 48970 }, { "epoch": 18.1340244353943, "grad_norm": 0.15820111334323883, "learning_rate": 5.5807078720992645e-05, "loss": 0.0148, "step": 48980 }, { "epoch": 18.137726767863754, "grad_norm": 0.3533134460449219, "learning_rate": 5.579065560136467e-05, "loss": 0.0155, "step": 48990 }, { "epoch": 18.14142910033321, "grad_norm": 0.18290016055107117, "learning_rate": 5.577423184847932e-05, "loss": 0.0233, "step": 49000 }, { "epoch": 18.145131432802664, "grad_norm": 0.1878644824028015, "learning_rate": 5.575780746413264e-05, "loss": 0.0243, "step": 49010 }, { "epoch": 18.148833765272123, "grad_norm": 0.2529149651527405, "learning_rate": 5.574138245012081e-05, "loss": 0.0141, "step": 49020 }, { "epoch": 18.152536097741578, "grad_norm": 0.25700512528419495, "learning_rate": 5.5724956808240016e-05, "loss": 0.0348, "step": 49030 }, { "epoch": 18.156238430211033, "grad_norm": 0.19706159830093384, "learning_rate": 5.570853054028655e-05, "loss": 0.0263, "step": 49040 }, { "epoch": 18.15994076268049, "grad_norm": 0.3443583548069, "learning_rate": 5.569210364805677e-05, "loss": 0.0245, "step": 49050 }, { "epoch": 18.163643095149943, "grad_norm": 0.2381477653980255, "learning_rate": 5.5675676133347096e-05, "loss": 0.03, "step": 49060 }, { "epoch": 18.1673454276194, "grad_norm": 0.2868528962135315, "learning_rate": 5.5659247997954024e-05, "loss": 0.0231, "step": 49070 }, { "epoch": 18.171047760088857, "grad_norm": 0.4340657591819763, "learning_rate": 5.564281924367408e-05, "loss": 0.031, "step": 49080 }, { "epoch": 18.174750092558313, "grad_norm": 0.16478346288204193, "learning_rate": 5.562638987230392e-05, "loss": 0.0238, "step": 49090 }, { "epoch": 18.178452425027768, "grad_norm": 0.26278069615364075, "learning_rate": 5.560995988564023e-05, "loss": 0.0161, "step": 49100 }, { "epoch": 18.182154757497223, "grad_norm": 0.2914533317089081, "learning_rate": 5.559352928547974e-05, "loss": 0.0266, "step": 49110 }, { "epoch": 18.185857089966678, "grad_norm": 0.21880078315734863, "learning_rate": 5.5577098073619314e-05, "loss": 0.0206, "step": 49120 }, { "epoch": 18.189559422436133, "grad_norm": 0.1875912994146347, "learning_rate": 5.556066625185583e-05, "loss": 0.0178, "step": 49130 }, { "epoch": 18.193261754905592, "grad_norm": 0.1314847320318222, "learning_rate": 5.554423382198624e-05, "loss": 0.0157, "step": 49140 }, { "epoch": 18.196964087375047, "grad_norm": 0.16574938595294952, "learning_rate": 5.552780078580756e-05, "loss": 0.0238, "step": 49150 }, { "epoch": 18.200666419844502, "grad_norm": 0.15653608739376068, "learning_rate": 5.551136714511691e-05, "loss": 0.0284, "step": 49160 }, { "epoch": 18.204368752313957, "grad_norm": 0.20820894837379456, "learning_rate": 5.5494932901711435e-05, "loss": 0.034, "step": 49170 }, { "epoch": 18.208071084783413, "grad_norm": 0.22307060658931732, "learning_rate": 5.547849805738836e-05, "loss": 0.0301, "step": 49180 }, { "epoch": 18.211773417252868, "grad_norm": 0.09838513284921646, "learning_rate": 5.546206261394498e-05, "loss": 0.0205, "step": 49190 }, { "epoch": 18.215475749722327, "grad_norm": 0.270815372467041, "learning_rate": 5.544562657317863e-05, "loss": 0.0338, "step": 49200 }, { "epoch": 18.21917808219178, "grad_norm": 0.24754764139652252, "learning_rate": 5.542918993688675e-05, "loss": 0.0281, "step": 49210 }, { "epoch": 18.222880414661237, "grad_norm": 0.26407238841056824, "learning_rate": 5.541275270686683e-05, "loss": 0.0291, "step": 49220 }, { "epoch": 18.226582747130692, "grad_norm": 0.2017131894826889, "learning_rate": 5.539631488491641e-05, "loss": 0.0224, "step": 49230 }, { "epoch": 18.230285079600147, "grad_norm": 0.33088260889053345, "learning_rate": 5.537987647283309e-05, "loss": 0.0194, "step": 49240 }, { "epoch": 18.233987412069602, "grad_norm": 0.23605400323867798, "learning_rate": 5.5363437472414595e-05, "loss": 0.0178, "step": 49250 }, { "epoch": 18.23768974453906, "grad_norm": 0.14788727462291718, "learning_rate": 5.534699788545862e-05, "loss": 0.0169, "step": 49260 }, { "epoch": 18.241392077008516, "grad_norm": 0.14253932237625122, "learning_rate": 5.5330557713763e-05, "loss": 0.0178, "step": 49270 }, { "epoch": 18.24509440947797, "grad_norm": 0.6978016495704651, "learning_rate": 5.5314116959125605e-05, "loss": 0.0182, "step": 49280 }, { "epoch": 18.248796741947427, "grad_norm": 0.2003665417432785, "learning_rate": 5.529767562334437e-05, "loss": 0.016, "step": 49290 }, { "epoch": 18.252499074416882, "grad_norm": 0.15015779435634613, "learning_rate": 5.52812337082173e-05, "loss": 0.0237, "step": 49300 }, { "epoch": 18.256201406886337, "grad_norm": 0.2803850471973419, "learning_rate": 5.5264791215542436e-05, "loss": 0.0202, "step": 49310 }, { "epoch": 18.259903739355796, "grad_norm": 1.2217141389846802, "learning_rate": 5.5248348147117936e-05, "loss": 0.0219, "step": 49320 }, { "epoch": 18.26360607182525, "grad_norm": 0.1830778270959854, "learning_rate": 5.523190450474197e-05, "loss": 0.0184, "step": 49330 }, { "epoch": 18.267308404294706, "grad_norm": 0.4980965554714203, "learning_rate": 5.5215460290212784e-05, "loss": 0.0188, "step": 49340 }, { "epoch": 18.27101073676416, "grad_norm": 0.2084444910287857, "learning_rate": 5.519901550532871e-05, "loss": 0.018, "step": 49350 }, { "epoch": 18.274713069233616, "grad_norm": 0.12474244087934494, "learning_rate": 5.518257015188811e-05, "loss": 0.0214, "step": 49360 }, { "epoch": 18.27841540170307, "grad_norm": 0.15637990832328796, "learning_rate": 5.516612423168943e-05, "loss": 0.0249, "step": 49370 }, { "epoch": 18.28211773417253, "grad_norm": 0.23289430141448975, "learning_rate": 5.514967774653118e-05, "loss": 0.0198, "step": 49380 }, { "epoch": 18.285820066641985, "grad_norm": 0.429696649312973, "learning_rate": 5.5133230698211926e-05, "loss": 0.0237, "step": 49390 }, { "epoch": 18.28952239911144, "grad_norm": 1.8661071062088013, "learning_rate": 5.511678308853026e-05, "loss": 0.0197, "step": 49400 }, { "epoch": 18.293224731580896, "grad_norm": 0.27444130182266235, "learning_rate": 5.510033491928489e-05, "loss": 0.023, "step": 49410 }, { "epoch": 18.29692706405035, "grad_norm": 0.24410422146320343, "learning_rate": 5.508388619227457e-05, "loss": 0.0176, "step": 49420 }, { "epoch": 18.300629396519806, "grad_norm": 0.24314004182815552, "learning_rate": 5.506743690929809e-05, "loss": 0.0239, "step": 49430 }, { "epoch": 18.304331728989265, "grad_norm": 0.14485839009284973, "learning_rate": 5.5050987072154335e-05, "loss": 0.0191, "step": 49440 }, { "epoch": 18.30803406145872, "grad_norm": 0.18882067501544952, "learning_rate": 5.5034536682642224e-05, "loss": 0.0175, "step": 49450 }, { "epoch": 18.311736393928175, "grad_norm": 0.1686730831861496, "learning_rate": 5.5018085742560744e-05, "loss": 0.0163, "step": 49460 }, { "epoch": 18.31543872639763, "grad_norm": 0.26701921224594116, "learning_rate": 5.5001634253708965e-05, "loss": 0.0165, "step": 49470 }, { "epoch": 18.319141058867086, "grad_norm": 0.48381513357162476, "learning_rate": 5.4985182217885986e-05, "loss": 0.0291, "step": 49480 }, { "epoch": 18.32284339133654, "grad_norm": 0.249215766787529, "learning_rate": 5.496872963689096e-05, "loss": 0.0202, "step": 49490 }, { "epoch": 18.326545723806, "grad_norm": 0.19365404546260834, "learning_rate": 5.495227651252315e-05, "loss": 0.0192, "step": 49500 }, { "epoch": 18.330248056275455, "grad_norm": 0.23036977648735046, "learning_rate": 5.4935822846581794e-05, "loss": 0.0159, "step": 49510 }, { "epoch": 18.33395038874491, "grad_norm": 0.3242391049861908, "learning_rate": 5.491936864086631e-05, "loss": 0.0217, "step": 49520 }, { "epoch": 18.337652721214365, "grad_norm": 0.23078098893165588, "learning_rate": 5.490291389717603e-05, "loss": 0.0189, "step": 49530 }, { "epoch": 18.34135505368382, "grad_norm": 0.29005664587020874, "learning_rate": 5.4886458617310486e-05, "loss": 0.0256, "step": 49540 }, { "epoch": 18.345057386153275, "grad_norm": 0.16655009984970093, "learning_rate": 5.487000280306917e-05, "loss": 0.017, "step": 49550 }, { "epoch": 18.348759718622734, "grad_norm": 0.18054364621639252, "learning_rate": 5.485354645625167e-05, "loss": 0.0338, "step": 49560 }, { "epoch": 18.35246205109219, "grad_norm": 0.11559820920228958, "learning_rate": 5.483708957865763e-05, "loss": 0.0169, "step": 49570 }, { "epoch": 18.356164383561644, "grad_norm": 0.36553701758384705, "learning_rate": 5.482063217208674e-05, "loss": 0.0181, "step": 49580 }, { "epoch": 18.3598667160311, "grad_norm": 0.2329391986131668, "learning_rate": 5.4804174238338756e-05, "loss": 0.0266, "step": 49590 }, { "epoch": 18.363569048500555, "grad_norm": 0.6794987916946411, "learning_rate": 5.478771577921351e-05, "loss": 0.0214, "step": 49600 }, { "epoch": 18.36727138097001, "grad_norm": 0.27115264534950256, "learning_rate": 5.477125679651086e-05, "loss": 0.0177, "step": 49610 }, { "epoch": 18.37097371343947, "grad_norm": 0.34175899624824524, "learning_rate": 5.4754797292030734e-05, "loss": 0.0172, "step": 49620 }, { "epoch": 18.374676045908924, "grad_norm": 0.5542476773262024, "learning_rate": 5.473833726757314e-05, "loss": 0.0229, "step": 49630 }, { "epoch": 18.37837837837838, "grad_norm": 0.22967779636383057, "learning_rate": 5.4721876724938104e-05, "loss": 0.0101, "step": 49640 }, { "epoch": 18.382080710847834, "grad_norm": 0.19390632212162018, "learning_rate": 5.470541566592573e-05, "loss": 0.0227, "step": 49650 }, { "epoch": 18.38578304331729, "grad_norm": 0.1379380226135254, "learning_rate": 5.468895409233615e-05, "loss": 0.0235, "step": 49660 }, { "epoch": 18.389485375786744, "grad_norm": 0.19183389842510223, "learning_rate": 5.467249200596963e-05, "loss": 0.0176, "step": 49670 }, { "epoch": 18.3931877082562, "grad_norm": 0.15516109764575958, "learning_rate": 5.46560294086264e-05, "loss": 0.0204, "step": 49680 }, { "epoch": 18.39689004072566, "grad_norm": 0.3060532808303833, "learning_rate": 5.463956630210678e-05, "loss": 0.0194, "step": 49690 }, { "epoch": 18.400592373195114, "grad_norm": 0.19620256125926971, "learning_rate": 5.462310268821118e-05, "loss": 0.0194, "step": 49700 }, { "epoch": 18.40429470566457, "grad_norm": 0.227831169962883, "learning_rate": 5.460663856874002e-05, "loss": 0.0207, "step": 49710 }, { "epoch": 18.407997038134024, "grad_norm": 0.2166755199432373, "learning_rate": 5.459017394549378e-05, "loss": 0.0216, "step": 49720 }, { "epoch": 18.41169937060348, "grad_norm": 0.20294398069381714, "learning_rate": 5.4573708820273026e-05, "loss": 0.0321, "step": 49730 }, { "epoch": 18.415401703072934, "grad_norm": 0.19086015224456787, "learning_rate": 5.455724319487835e-05, "loss": 0.019, "step": 49740 }, { "epoch": 18.419104035542393, "grad_norm": 0.18901175260543823, "learning_rate": 5.454077707111042e-05, "loss": 0.0145, "step": 49750 }, { "epoch": 18.422806368011848, "grad_norm": 0.15998922288417816, "learning_rate": 5.4524310450769924e-05, "loss": 0.0176, "step": 49760 }, { "epoch": 18.426508700481303, "grad_norm": 0.28273841738700867, "learning_rate": 5.450784333565765e-05, "loss": 0.0241, "step": 49770 }, { "epoch": 18.43021103295076, "grad_norm": 0.21947608888149261, "learning_rate": 5.449137572757439e-05, "loss": 0.0204, "step": 49780 }, { "epoch": 18.433913365420214, "grad_norm": 0.12332223355770111, "learning_rate": 5.4474907628321046e-05, "loss": 0.0196, "step": 49790 }, { "epoch": 18.43761569788967, "grad_norm": 0.3485233187675476, "learning_rate": 5.445843903969854e-05, "loss": 0.0153, "step": 49800 }, { "epoch": 18.441318030359128, "grad_norm": 0.7599613070487976, "learning_rate": 5.4441969963507824e-05, "loss": 0.0174, "step": 49810 }, { "epoch": 18.445020362828583, "grad_norm": 0.10567459464073181, "learning_rate": 5.442550040154996e-05, "loss": 0.0201, "step": 49820 }, { "epoch": 18.448722695298038, "grad_norm": 0.1263905167579651, "learning_rate": 5.440903035562603e-05, "loss": 0.014, "step": 49830 }, { "epoch": 18.452425027767493, "grad_norm": 0.1448197215795517, "learning_rate": 5.439255982753717e-05, "loss": 0.0155, "step": 49840 }, { "epoch": 18.456127360236948, "grad_norm": 0.23224206268787384, "learning_rate": 5.4376088819084556e-05, "loss": 0.0221, "step": 49850 }, { "epoch": 18.459829692706403, "grad_norm": 0.18855473399162292, "learning_rate": 5.435961733206947e-05, "loss": 0.0213, "step": 49860 }, { "epoch": 18.463532025175862, "grad_norm": 0.12762458622455597, "learning_rate": 5.4343145368293166e-05, "loss": 0.0139, "step": 49870 }, { "epoch": 18.467234357645317, "grad_norm": 0.2791828513145447, "learning_rate": 5.432667292955702e-05, "loss": 0.0163, "step": 49880 }, { "epoch": 18.470936690114772, "grad_norm": 0.3831506371498108, "learning_rate": 5.431020001766244e-05, "loss": 0.0213, "step": 49890 }, { "epoch": 18.474639022584228, "grad_norm": 0.22329239547252655, "learning_rate": 5.4293726634410855e-05, "loss": 0.0179, "step": 49900 }, { "epoch": 18.478341355053683, "grad_norm": 0.5394971370697021, "learning_rate": 5.4277252781603784e-05, "loss": 0.0177, "step": 49910 }, { "epoch": 18.482043687523138, "grad_norm": 0.1544915735721588, "learning_rate": 5.4260778461042785e-05, "loss": 0.0214, "step": 49920 }, { "epoch": 18.485746019992597, "grad_norm": 0.1593763381242752, "learning_rate": 5.424430367452945e-05, "loss": 0.0262, "step": 49930 }, { "epoch": 18.489448352462052, "grad_norm": 0.11819926649332047, "learning_rate": 5.4227828423865444e-05, "loss": 0.0235, "step": 49940 }, { "epoch": 18.493150684931507, "grad_norm": 0.37775135040283203, "learning_rate": 5.4211352710852495e-05, "loss": 0.0106, "step": 49950 }, { "epoch": 18.496853017400962, "grad_norm": 0.1283109486103058, "learning_rate": 5.419487653729234e-05, "loss": 0.016, "step": 49960 }, { "epoch": 18.500555349870417, "grad_norm": 0.1665278822183609, "learning_rate": 5.417839990498678e-05, "loss": 0.0202, "step": 49970 }, { "epoch": 18.504257682339873, "grad_norm": 0.1792452335357666, "learning_rate": 5.4161922815737696e-05, "loss": 0.0152, "step": 49980 }, { "epoch": 18.50796001480933, "grad_norm": 0.2912718951702118, "learning_rate": 5.4145445271346986e-05, "loss": 0.0141, "step": 49990 }, { "epoch": 18.511662347278786, "grad_norm": 0.47005078196525574, "learning_rate": 5.4128967273616625e-05, "loss": 0.019, "step": 50000 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 38, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.81347303604224e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }