{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 1.5374420881271362, "learning_rate": 2.25e-07, "loss": 1.2524, "step": 10 }, { "grad_norm": 1.8701053857803345, "learning_rate": 4.75e-07, "loss": 1.2595, "step": 20 }, { "grad_norm": 1.4946467876434326, "learning_rate": 7.25e-07, "loss": 1.262, "step": 30 }, { "grad_norm": 1.3459829092025757, "learning_rate": 9.75e-07, "loss": 1.2579, "step": 40 }, { "grad_norm": 1.3111767768859863, "learning_rate": 1.2250000000000001e-06, "loss": 1.2368, "step": 50 }, { "grad_norm": 0.866030216217041, "learning_rate": 1.475e-06, "loss": 1.2088, "step": 60 }, { "grad_norm": 0.7889012098312378, "learning_rate": 1.7250000000000002e-06, "loss": 1.1884, "step": 70 }, { "grad_norm": 0.4208715856075287, "learning_rate": 1.975e-06, "loss": 1.1672, "step": 80 }, { "grad_norm": 0.40170571208000183, "learning_rate": 2.225e-06, "loss": 1.1518, "step": 90 }, { "grad_norm": 0.39282742142677307, "learning_rate": 2.4750000000000004e-06, "loss": 1.1523, "step": 100 }, { "grad_norm": 0.32241272926330566, "learning_rate": 2.725e-06, "loss": 1.1431, "step": 110 }, { "grad_norm": 0.4157978892326355, "learning_rate": 2.975e-06, "loss": 1.1444, "step": 120 }, { "grad_norm": 0.2832075357437134, "learning_rate": 3.225e-06, "loss": 1.1319, "step": 130 }, { "grad_norm": 0.2981403172016144, "learning_rate": 3.4750000000000006e-06, "loss": 1.1352, "step": 140 }, { "grad_norm": 0.3186399042606354, "learning_rate": 3.725e-06, "loss": 1.1367, "step": 150 }, { "grad_norm": 0.3825281858444214, "learning_rate": 3.975e-06, "loss": 1.1292, "step": 160 }, { "grad_norm": 0.35050058364868164, "learning_rate": 4.225e-06, "loss": 1.1238, "step": 170 }, { "grad_norm": 0.4038090705871582, "learning_rate": 4.475e-06, "loss": 1.1114, "step": 180 }, { "grad_norm": 0.3435182571411133, "learning_rate": 4.7250000000000005e-06, "loss": 1.0958, "step": 190 }, { "grad_norm": 0.4411773383617401, "learning_rate": 4.975000000000001e-06, "loss": 1.0911, "step": 200 }, { "grad_norm": 0.42298221588134766, "learning_rate": 5.225e-06, "loss": 1.0838, "step": 210 }, { "grad_norm": 0.38061875104904175, "learning_rate": 5.475e-06, "loss": 1.0799, "step": 220 }, { "grad_norm": 0.3617795407772064, "learning_rate": 5.725e-06, "loss": 1.0669, "step": 230 }, { "grad_norm": 0.4787779748439789, "learning_rate": 5.975e-06, "loss": 1.0692, "step": 240 }, { "grad_norm": 0.5046012997627258, "learning_rate": 6.2250000000000005e-06, "loss": 1.0568, "step": 250 }, { "grad_norm": 0.40797415375709534, "learning_rate": 6.475000000000001e-06, "loss": 1.0657, "step": 260 }, { "grad_norm": 0.5279905200004578, "learning_rate": 6.725000000000001e-06, "loss": 1.0622, "step": 270 }, { "grad_norm": 0.38201993703842163, "learning_rate": 6.975000000000001e-06, "loss": 1.0609, "step": 280 }, { "grad_norm": 0.40363433957099915, "learning_rate": 7.2249999999999994e-06, "loss": 1.0501, "step": 290 }, { "grad_norm": 0.44846677780151367, "learning_rate": 7.4750000000000004e-06, "loss": 1.0454, "step": 300 }, { "grad_norm": 0.4389367699623108, "learning_rate": 7.725e-06, "loss": 1.0449, "step": 310 }, { "grad_norm": 0.5368679761886597, "learning_rate": 7.975e-06, "loss": 1.0437, "step": 320 }, { "grad_norm": 0.5399627089500427, "learning_rate": 8.225e-06, "loss": 1.036, "step": 330 }, { "grad_norm": 0.5820310711860657, "learning_rate": 8.475000000000001e-06, "loss": 1.0215, "step": 340 }, { "grad_norm": 0.6386199593544006, "learning_rate": 8.725e-06, "loss": 1.0166, "step": 350 }, { "grad_norm": 1.67423415184021, "learning_rate": 8.975e-06, "loss": 0.9815, "step": 360 }, { "grad_norm": 0.9517745971679688, "learning_rate": 9.225e-06, "loss": 0.9591, "step": 370 }, { "grad_norm": 1.0896025896072388, "learning_rate": 9.475e-06, "loss": 0.9419, "step": 380 }, { "grad_norm": 1.0530717372894287, "learning_rate": 9.725000000000001e-06, "loss": 0.9267, "step": 390 }, { "grad_norm": 0.9728261828422546, "learning_rate": 9.975e-06, "loss": 0.894, "step": 400 }, { "grad_norm": 1.0596740245819092, "learning_rate": 1.0225e-05, "loss": 0.8639, "step": 410 }, { "grad_norm": 1.2152597904205322, "learning_rate": 1.0475e-05, "loss": 0.8226, "step": 420 }, { "grad_norm": 1.2999203205108643, "learning_rate": 1.0725e-05, "loss": 0.801, "step": 430 }, { "grad_norm": 1.396743893623352, "learning_rate": 1.0975e-05, "loss": 0.7717, "step": 440 }, { "grad_norm": 1.6090826988220215, "learning_rate": 1.1225e-05, "loss": 0.7454, "step": 450 }, { "grad_norm": 1.4230643510818481, "learning_rate": 1.1475000000000001e-05, "loss": 0.7005, "step": 460 }, { "grad_norm": 1.3035528659820557, "learning_rate": 1.1725e-05, "loss": 0.667, "step": 470 }, { "grad_norm": 1.6891943216323853, "learning_rate": 1.1975e-05, "loss": 0.6282, "step": 480 }, { "grad_norm": 1.9718265533447266, "learning_rate": 1.2225e-05, "loss": 0.5801, "step": 490 }, { "grad_norm": 1.5320810079574585, "learning_rate": 1.2475e-05, "loss": 0.5503, "step": 500 }, { "grad_norm": 1.5813798904418945, "learning_rate": 1.2725000000000001e-05, "loss": 0.5176, "step": 510 }, { "grad_norm": 1.5434144735336304, "learning_rate": 1.2975e-05, "loss": 0.4883, "step": 520 }, { "grad_norm": 2.1103360652923584, "learning_rate": 1.3225000000000001e-05, "loss": 0.4616, "step": 530 }, { "grad_norm": 1.5702450275421143, "learning_rate": 1.3475000000000002e-05, "loss": 0.4428, "step": 540 }, { "grad_norm": 1.424597978591919, "learning_rate": 1.3725000000000002e-05, "loss": 0.4133, "step": 550 }, { "grad_norm": 1.1719518899917603, "learning_rate": 1.3975000000000003e-05, "loss": 0.3905, "step": 560 }, { "grad_norm": 1.7527923583984375, "learning_rate": 1.4225e-05, "loss": 0.3694, "step": 570 }, { "grad_norm": 1.8145021200180054, "learning_rate": 1.4475e-05, "loss": 0.3392, "step": 580 }, { "grad_norm": 1.8114711046218872, "learning_rate": 1.4725e-05, "loss": 0.3079, "step": 590 }, { "grad_norm": 1.971382975578308, "learning_rate": 1.4975e-05, "loss": 0.3005, "step": 600 }, { "grad_norm": 1.7384554147720337, "learning_rate": 1.5225e-05, "loss": 0.2676, "step": 610 }, { "grad_norm": 1.752884864807129, "learning_rate": 1.5475e-05, "loss": 0.2633, "step": 620 }, { "grad_norm": 1.9697608947753906, "learning_rate": 1.5725e-05, "loss": 0.2455, "step": 630 }, { "grad_norm": 1.820151925086975, "learning_rate": 1.5975000000000002e-05, "loss": 0.2355, "step": 640 }, { "grad_norm": 2.1529152393341064, "learning_rate": 1.6225e-05, "loss": 0.2359, "step": 650 }, { "grad_norm": 1.621444582939148, "learning_rate": 1.6475e-05, "loss": 0.2377, "step": 660 }, { "grad_norm": 1.7641302347183228, "learning_rate": 1.6725000000000003e-05, "loss": 0.2212, "step": 670 }, { "grad_norm": 1.4949954748153687, "learning_rate": 1.6975000000000003e-05, "loss": 0.2149, "step": 680 }, { "grad_norm": 1.8603112697601318, "learning_rate": 1.7225e-05, "loss": 0.2118, "step": 690 }, { "grad_norm": 1.8610659837722778, "learning_rate": 1.7475e-05, "loss": 0.2031, "step": 700 }, { "grad_norm": 1.8866311311721802, "learning_rate": 1.7725e-05, "loss": 0.2093, "step": 710 }, { "grad_norm": 1.6355241537094116, "learning_rate": 1.7975e-05, "loss": 0.1829, "step": 720 }, { "grad_norm": 1.9842314720153809, "learning_rate": 1.8225e-05, "loss": 0.155, "step": 730 }, { "grad_norm": 1.649091124534607, "learning_rate": 1.8475000000000002e-05, "loss": 0.1588, "step": 740 }, { "grad_norm": 2.5398173332214355, "learning_rate": 1.8725e-05, "loss": 0.1541, "step": 750 }, { "grad_norm": 1.9353482723236084, "learning_rate": 1.8975e-05, "loss": 0.1585, "step": 760 }, { "grad_norm": 1.7311140298843384, "learning_rate": 1.9225e-05, "loss": 0.1481, "step": 770 }, { "grad_norm": 1.5283080339431763, "learning_rate": 1.9475000000000002e-05, "loss": 0.1457, "step": 780 }, { "grad_norm": 1.7381175756454468, "learning_rate": 1.9725000000000002e-05, "loss": 0.1404, "step": 790 }, { "grad_norm": 1.9841405153274536, "learning_rate": 1.9975e-05, "loss": 0.1446, "step": 800 }, { "grad_norm": 1.3846542835235596, "learning_rate": 2.0225000000000004e-05, "loss": 0.1368, "step": 810 }, { "grad_norm": 1.7769250869750977, "learning_rate": 2.0475e-05, "loss": 0.1313, "step": 820 }, { "grad_norm": 1.7192928791046143, "learning_rate": 2.0725e-05, "loss": 0.1341, "step": 830 }, { "grad_norm": 1.5942370891571045, "learning_rate": 2.0975e-05, "loss": 0.1256, "step": 840 }, { "grad_norm": 1.5805144309997559, "learning_rate": 2.1225e-05, "loss": 0.1321, "step": 850 }, { "grad_norm": 1.8758456707000732, "learning_rate": 2.1475e-05, "loss": 0.1372, "step": 860 }, { "grad_norm": 2.0747647285461426, "learning_rate": 2.1725e-05, "loss": 0.1266, "step": 870 }, { "grad_norm": 1.8952722549438477, "learning_rate": 2.1975000000000002e-05, "loss": 0.1292, "step": 880 }, { "grad_norm": 1.9105908870697021, "learning_rate": 2.2225e-05, "loss": 0.1168, "step": 890 }, { "grad_norm": 1.8793606758117676, "learning_rate": 2.2475e-05, "loss": 0.1095, "step": 900 }, { "grad_norm": 1.6377146244049072, "learning_rate": 2.2725000000000003e-05, "loss": 0.1106, "step": 910 }, { "grad_norm": 1.6358426809310913, "learning_rate": 2.2975000000000003e-05, "loss": 0.111, "step": 920 }, { "grad_norm": 2.1229193210601807, "learning_rate": 2.3225000000000002e-05, "loss": 0.1185, "step": 930 }, { "grad_norm": 1.8212212324142456, "learning_rate": 2.3475e-05, "loss": 0.1108, "step": 940 }, { "grad_norm": 1.5938177108764648, "learning_rate": 2.3725e-05, "loss": 0.1156, "step": 950 }, { "grad_norm": 1.8648502826690674, "learning_rate": 2.3975e-05, "loss": 0.1171, "step": 960 }, { "grad_norm": 1.5774849653244019, "learning_rate": 2.4225e-05, "loss": 0.113, "step": 970 }, { "grad_norm": 1.6125202178955078, "learning_rate": 2.4475000000000002e-05, "loss": 0.1096, "step": 980 }, { "grad_norm": 1.5128835439682007, "learning_rate": 2.4725e-05, "loss": 0.1102, "step": 990 }, { "grad_norm": 1.3836255073547363, "learning_rate": 2.4975e-05, "loss": 0.1147, "step": 1000 }, { "grad_norm": 1.3876839876174927, "learning_rate": 2.5225e-05, "loss": 0.1051, "step": 1010 }, { "grad_norm": 1.5396602153778076, "learning_rate": 2.5475e-05, "loss": 0.1106, "step": 1020 }, { "grad_norm": 1.3746628761291504, "learning_rate": 2.5725e-05, "loss": 0.1135, "step": 1030 }, { "grad_norm": 1.5163480043411255, "learning_rate": 2.5974999999999998e-05, "loss": 0.105, "step": 1040 }, { "grad_norm": 1.3157703876495361, "learning_rate": 2.6225e-05, "loss": 0.1005, "step": 1050 }, { "grad_norm": 1.851703405380249, "learning_rate": 2.6475e-05, "loss": 0.1004, "step": 1060 }, { "grad_norm": 1.2255969047546387, "learning_rate": 2.6725e-05, "loss": 0.0994, "step": 1070 }, { "grad_norm": 1.3843388557434082, "learning_rate": 2.6975000000000002e-05, "loss": 0.0941, "step": 1080 }, { "grad_norm": 1.6328489780426025, "learning_rate": 2.7225e-05, "loss": 0.0954, "step": 1090 }, { "grad_norm": 1.3767225742340088, "learning_rate": 2.7475e-05, "loss": 0.1051, "step": 1100 }, { "grad_norm": 1.2815425395965576, "learning_rate": 2.7725e-05, "loss": 0.0918, "step": 1110 }, { "grad_norm": 1.493270754814148, "learning_rate": 2.7975000000000002e-05, "loss": 0.1061, "step": 1120 }, { "grad_norm": 1.8160514831542969, "learning_rate": 2.8225e-05, "loss": 0.1021, "step": 1130 }, { "grad_norm": 1.2811086177825928, "learning_rate": 2.8475e-05, "loss": 0.0979, "step": 1140 }, { "grad_norm": 1.3650429248809814, "learning_rate": 2.8725e-05, "loss": 0.1027, "step": 1150 }, { "grad_norm": 1.5847651958465576, "learning_rate": 2.8975000000000003e-05, "loss": 0.0949, "step": 1160 }, { "grad_norm": 1.6571044921875, "learning_rate": 2.9225000000000002e-05, "loss": 0.0966, "step": 1170 }, { "grad_norm": 1.6426036357879639, "learning_rate": 2.9475e-05, "loss": 0.1009, "step": 1180 }, { "grad_norm": 1.2702823877334595, "learning_rate": 2.9725000000000004e-05, "loss": 0.0883, "step": 1190 }, { "grad_norm": 1.4664369821548462, "learning_rate": 2.9975000000000004e-05, "loss": 0.0973, "step": 1200 }, { "grad_norm": 1.1774901151657104, "learning_rate": 3.0225000000000003e-05, "loss": 0.0909, "step": 1210 }, { "grad_norm": 1.3524671792984009, "learning_rate": 3.0475000000000002e-05, "loss": 0.0824, "step": 1220 }, { "grad_norm": 1.3027154207229614, "learning_rate": 3.0725e-05, "loss": 0.0893, "step": 1230 }, { "grad_norm": 1.5789194107055664, "learning_rate": 3.0975e-05, "loss": 0.092, "step": 1240 }, { "grad_norm": 1.2760151624679565, "learning_rate": 3.122500000000001e-05, "loss": 0.0934, "step": 1250 }, { "grad_norm": 1.313791036605835, "learning_rate": 3.1475e-05, "loss": 0.0784, "step": 1260 }, { "grad_norm": 1.1423540115356445, "learning_rate": 3.1725e-05, "loss": 0.0962, "step": 1270 }, { "grad_norm": 1.6329028606414795, "learning_rate": 3.1975e-05, "loss": 0.0862, "step": 1280 }, { "grad_norm": 1.534610629081726, "learning_rate": 3.2225e-05, "loss": 0.0933, "step": 1290 }, { "grad_norm": 1.5817174911499023, "learning_rate": 3.2474999999999997e-05, "loss": 0.0884, "step": 1300 }, { "grad_norm": 1.3212560415267944, "learning_rate": 3.2725e-05, "loss": 0.0831, "step": 1310 }, { "grad_norm": 1.2008883953094482, "learning_rate": 3.2975e-05, "loss": 0.0984, "step": 1320 }, { "grad_norm": 1.2759214639663696, "learning_rate": 3.3225e-05, "loss": 0.082, "step": 1330 }, { "grad_norm": 1.7806296348571777, "learning_rate": 3.3475e-05, "loss": 0.0824, "step": 1340 }, { "grad_norm": 1.4445773363113403, "learning_rate": 3.3725e-05, "loss": 0.09, "step": 1350 }, { "grad_norm": 1.3388776779174805, "learning_rate": 3.3975e-05, "loss": 0.0911, "step": 1360 }, { "grad_norm": 1.407771348953247, "learning_rate": 3.4225e-05, "loss": 0.0829, "step": 1370 }, { "grad_norm": 1.386457085609436, "learning_rate": 3.4475000000000005e-05, "loss": 0.0804, "step": 1380 }, { "grad_norm": 1.2460911273956299, "learning_rate": 3.4725000000000004e-05, "loss": 0.0751, "step": 1390 }, { "grad_norm": 1.1201668977737427, "learning_rate": 3.4975e-05, "loss": 0.0784, "step": 1400 }, { "grad_norm": 1.3840147256851196, "learning_rate": 3.5225e-05, "loss": 0.0773, "step": 1410 }, { "grad_norm": 0.9109706282615662, "learning_rate": 3.5475e-05, "loss": 0.0801, "step": 1420 }, { "grad_norm": 1.5072592496871948, "learning_rate": 3.5725e-05, "loss": 0.0888, "step": 1430 }, { "grad_norm": 1.215524673461914, "learning_rate": 3.5975e-05, "loss": 0.0786, "step": 1440 }, { "grad_norm": 1.2683229446411133, "learning_rate": 3.6225000000000006e-05, "loss": 0.0764, "step": 1450 }, { "grad_norm": 1.2531319856643677, "learning_rate": 3.6475000000000006e-05, "loss": 0.0835, "step": 1460 }, { "grad_norm": 1.104944109916687, "learning_rate": 3.6725000000000005e-05, "loss": 0.0737, "step": 1470 }, { "grad_norm": 1.2927230596542358, "learning_rate": 3.6975000000000004e-05, "loss": 0.0747, "step": 1480 }, { "grad_norm": 1.0322929620742798, "learning_rate": 3.7225000000000004e-05, "loss": 0.0824, "step": 1490 }, { "grad_norm": 1.0104515552520752, "learning_rate": 3.7475e-05, "loss": 0.0792, "step": 1500 }, { "grad_norm": 1.4633547067642212, "learning_rate": 3.7725e-05, "loss": 0.0834, "step": 1510 }, { "grad_norm": 1.2494096755981445, "learning_rate": 3.7975e-05, "loss": 0.0792, "step": 1520 }, { "grad_norm": 1.1130144596099854, "learning_rate": 3.8225e-05, "loss": 0.072, "step": 1530 }, { "grad_norm": 0.8819894194602966, "learning_rate": 3.8475e-05, "loss": 0.0776, "step": 1540 }, { "grad_norm": 1.086320161819458, "learning_rate": 3.8725e-05, "loss": 0.0752, "step": 1550 }, { "grad_norm": 1.239625096321106, "learning_rate": 3.8975e-05, "loss": 0.0802, "step": 1560 }, { "grad_norm": 1.220015287399292, "learning_rate": 3.9225e-05, "loss": 0.0771, "step": 1570 }, { "grad_norm": 1.0933998823165894, "learning_rate": 3.9475000000000004e-05, "loss": 0.0719, "step": 1580 }, { "grad_norm": 1.2417328357696533, "learning_rate": 3.9725e-05, "loss": 0.0739, "step": 1590 }, { "grad_norm": 1.066192865371704, "learning_rate": 3.9975e-05, "loss": 0.0794, "step": 1600 }, { "grad_norm": 1.2965596914291382, "learning_rate": 4.0225e-05, "loss": 0.0742, "step": 1610 }, { "grad_norm": 1.2141427993774414, "learning_rate": 4.0475e-05, "loss": 0.0698, "step": 1620 }, { "grad_norm": 1.1107410192489624, "learning_rate": 4.0725e-05, "loss": 0.0765, "step": 1630 }, { "grad_norm": 1.0952140092849731, "learning_rate": 4.0975e-05, "loss": 0.0783, "step": 1640 }, { "grad_norm": 0.9865875244140625, "learning_rate": 4.1225e-05, "loss": 0.07, "step": 1650 }, { "grad_norm": 1.184139609336853, "learning_rate": 4.1475000000000005e-05, "loss": 0.0703, "step": 1660 }, { "grad_norm": 1.224378228187561, "learning_rate": 4.1725000000000005e-05, "loss": 0.0767, "step": 1670 }, { "grad_norm": 1.1607487201690674, "learning_rate": 4.1975000000000004e-05, "loss": 0.0707, "step": 1680 }, { "grad_norm": 1.1753695011138916, "learning_rate": 4.2225e-05, "loss": 0.0717, "step": 1690 }, { "grad_norm": 1.0489609241485596, "learning_rate": 4.2475e-05, "loss": 0.0823, "step": 1700 }, { "grad_norm": 0.8394931554794312, "learning_rate": 4.2725e-05, "loss": 0.0729, "step": 1710 }, { "grad_norm": 1.1587103605270386, "learning_rate": 4.2975e-05, "loss": 0.077, "step": 1720 }, { "grad_norm": 0.9085665345191956, "learning_rate": 4.322500000000001e-05, "loss": 0.0713, "step": 1730 }, { "grad_norm": 0.7699836492538452, "learning_rate": 4.3475000000000006e-05, "loss": 0.07, "step": 1740 }, { "grad_norm": 1.2427946329116821, "learning_rate": 4.3725000000000006e-05, "loss": 0.0695, "step": 1750 }, { "grad_norm": 0.7750063538551331, "learning_rate": 4.3975e-05, "loss": 0.0636, "step": 1760 }, { "grad_norm": 1.133881688117981, "learning_rate": 4.4225e-05, "loss": 0.0811, "step": 1770 }, { "grad_norm": 0.9558221697807312, "learning_rate": 4.4475e-05, "loss": 0.0744, "step": 1780 }, { "grad_norm": 0.7709345817565918, "learning_rate": 4.4725e-05, "loss": 0.0693, "step": 1790 }, { "grad_norm": 1.6187602281570435, "learning_rate": 4.4975e-05, "loss": 0.0783, "step": 1800 }, { "grad_norm": 1.1132522821426392, "learning_rate": 4.5225e-05, "loss": 0.0726, "step": 1810 }, { "grad_norm": 0.8688352704048157, "learning_rate": 4.5475e-05, "loss": 0.0745, "step": 1820 }, { "grad_norm": 0.8423070311546326, "learning_rate": 4.5725e-05, "loss": 0.0678, "step": 1830 }, { "grad_norm": 1.0605367422103882, "learning_rate": 4.5975e-05, "loss": 0.0634, "step": 1840 }, { "grad_norm": 0.9752839803695679, "learning_rate": 4.6225e-05, "loss": 0.0703, "step": 1850 }, { "grad_norm": 0.9316169619560242, "learning_rate": 4.6475000000000005e-05, "loss": 0.0647, "step": 1860 }, { "grad_norm": 1.0067654848098755, "learning_rate": 4.6725000000000004e-05, "loss": 0.0684, "step": 1870 }, { "grad_norm": 1.004638433456421, "learning_rate": 4.6975000000000003e-05, "loss": 0.0664, "step": 1880 }, { "grad_norm": 0.7814590930938721, "learning_rate": 4.7225e-05, "loss": 0.0643, "step": 1890 }, { "grad_norm": 1.028395175933838, "learning_rate": 4.7475e-05, "loss": 0.0697, "step": 1900 }, { "grad_norm": 0.888927161693573, "learning_rate": 4.7725e-05, "loss": 0.0687, "step": 1910 }, { "grad_norm": 0.7912315726280212, "learning_rate": 4.7975e-05, "loss": 0.0609, "step": 1920 }, { "grad_norm": 0.618228018283844, "learning_rate": 4.822500000000001e-05, "loss": 0.063, "step": 1930 }, { "grad_norm": 0.8028368353843689, "learning_rate": 4.8475000000000006e-05, "loss": 0.0756, "step": 1940 }, { "grad_norm": 0.828797459602356, "learning_rate": 4.8725000000000005e-05, "loss": 0.065, "step": 1950 }, { "grad_norm": 0.7768414616584778, "learning_rate": 4.8975000000000005e-05, "loss": 0.0638, "step": 1960 }, { "grad_norm": 0.9830964803695679, "learning_rate": 4.9225000000000004e-05, "loss": 0.066, "step": 1970 }, { "grad_norm": 0.7625438570976257, "learning_rate": 4.9475e-05, "loss": 0.0638, "step": 1980 }, { "grad_norm": 0.9257563948631287, "learning_rate": 4.9725e-05, "loss": 0.0603, "step": 1990 }, { "grad_norm": 0.7663386464118958, "learning_rate": 4.9975e-05, "loss": 0.0608, "step": 2000 }, { "grad_norm": 0.987976610660553, "learning_rate": 5.0225e-05, "loss": 0.0674, "step": 2010 }, { "grad_norm": 0.7862051725387573, "learning_rate": 5.047500000000001e-05, "loss": 0.0664, "step": 2020 }, { "grad_norm": 0.7599007487297058, "learning_rate": 5.0725e-05, "loss": 0.0574, "step": 2030 }, { "grad_norm": 0.752254068851471, "learning_rate": 5.0975000000000006e-05, "loss": 0.0649, "step": 2040 }, { "grad_norm": 1.0505106449127197, "learning_rate": 5.1225e-05, "loss": 0.0666, "step": 2050 }, { "grad_norm": 0.9973055124282837, "learning_rate": 5.1475000000000004e-05, "loss": 0.0663, "step": 2060 }, { "grad_norm": 0.8185192942619324, "learning_rate": 5.1725000000000004e-05, "loss": 0.0668, "step": 2070 }, { "grad_norm": 1.081514596939087, "learning_rate": 5.197500000000001e-05, "loss": 0.0664, "step": 2080 }, { "grad_norm": 0.96794193983078, "learning_rate": 5.2225e-05, "loss": 0.0662, "step": 2090 }, { "grad_norm": 0.9108173251152039, "learning_rate": 5.247500000000001e-05, "loss": 0.0644, "step": 2100 }, { "grad_norm": 0.863815188407898, "learning_rate": 5.2725e-05, "loss": 0.0621, "step": 2110 }, { "grad_norm": 0.9709319472312927, "learning_rate": 5.297500000000001e-05, "loss": 0.0602, "step": 2120 }, { "grad_norm": 1.0606023073196411, "learning_rate": 5.3225e-05, "loss": 0.0664, "step": 2130 }, { "grad_norm": 0.8928080201148987, "learning_rate": 5.3475e-05, "loss": 0.065, "step": 2140 }, { "grad_norm": 0.8774266839027405, "learning_rate": 5.3725000000000005e-05, "loss": 0.0642, "step": 2150 }, { "grad_norm": 0.8187870383262634, "learning_rate": 5.3975e-05, "loss": 0.0524, "step": 2160 }, { "grad_norm": 0.6506102681159973, "learning_rate": 5.4225000000000003e-05, "loss": 0.0625, "step": 2170 }, { "grad_norm": 0.7786508202552795, "learning_rate": 5.4474999999999996e-05, "loss": 0.0655, "step": 2180 }, { "grad_norm": 0.7899206280708313, "learning_rate": 5.4725e-05, "loss": 0.0646, "step": 2190 }, { "grad_norm": 0.936127245426178, "learning_rate": 5.4975e-05, "loss": 0.0589, "step": 2200 }, { "grad_norm": 0.6947914361953735, "learning_rate": 5.522500000000001e-05, "loss": 0.058, "step": 2210 }, { "grad_norm": 0.7940956950187683, "learning_rate": 5.5475e-05, "loss": 0.0586, "step": 2220 }, { "grad_norm": 0.817393958568573, "learning_rate": 5.5725000000000006e-05, "loss": 0.0598, "step": 2230 }, { "grad_norm": 0.7645295262336731, "learning_rate": 5.5975e-05, "loss": 0.0613, "step": 2240 }, { "grad_norm": 0.7082880735397339, "learning_rate": 5.6225000000000005e-05, "loss": 0.0677, "step": 2250 }, { "grad_norm": 0.8778141736984253, "learning_rate": 5.6475e-05, "loss": 0.0706, "step": 2260 }, { "grad_norm": 0.8077853918075562, "learning_rate": 5.6725e-05, "loss": 0.0654, "step": 2270 }, { "grad_norm": 0.7380754947662354, "learning_rate": 5.6975e-05, "loss": 0.0594, "step": 2280 }, { "grad_norm": 0.7497959136962891, "learning_rate": 5.722500000000001e-05, "loss": 0.0559, "step": 2290 }, { "grad_norm": 0.7783060669898987, "learning_rate": 5.7475e-05, "loss": 0.0577, "step": 2300 }, { "grad_norm": 0.5894813537597656, "learning_rate": 5.772500000000001e-05, "loss": 0.0597, "step": 2310 }, { "grad_norm": 0.8221947550773621, "learning_rate": 5.7975e-05, "loss": 0.058, "step": 2320 }, { "grad_norm": 0.8726267218589783, "learning_rate": 5.8225000000000006e-05, "loss": 0.0633, "step": 2330 }, { "grad_norm": 0.7449456453323364, "learning_rate": 5.8475000000000005e-05, "loss": 0.058, "step": 2340 }, { "grad_norm": 0.6573740839958191, "learning_rate": 5.8725000000000004e-05, "loss": 0.0588, "step": 2350 }, { "grad_norm": 0.8543050289154053, "learning_rate": 5.8975000000000004e-05, "loss": 0.0628, "step": 2360 }, { "grad_norm": 0.7564703822135925, "learning_rate": 5.922500000000001e-05, "loss": 0.0606, "step": 2370 }, { "grad_norm": 0.7179650068283081, "learning_rate": 5.9475e-05, "loss": 0.0616, "step": 2380 }, { "grad_norm": 0.6850331425666809, "learning_rate": 5.9724999999999995e-05, "loss": 0.0574, "step": 2390 }, { "grad_norm": 0.6381823420524597, "learning_rate": 5.9975e-05, "loss": 0.0581, "step": 2400 }, { "grad_norm": 0.7992104887962341, "learning_rate": 6.0225e-05, "loss": 0.052, "step": 2410 }, { "grad_norm": 0.7479788064956665, "learning_rate": 6.0475000000000006e-05, "loss": 0.057, "step": 2420 }, { "grad_norm": 0.5687183141708374, "learning_rate": 6.0725e-05, "loss": 0.0559, "step": 2430 }, { "grad_norm": 0.7166382074356079, "learning_rate": 6.0975000000000005e-05, "loss": 0.0559, "step": 2440 }, { "grad_norm": 0.6339192390441895, "learning_rate": 6.1225e-05, "loss": 0.0577, "step": 2450 }, { "grad_norm": 0.7452266216278076, "learning_rate": 6.1475e-05, "loss": 0.0634, "step": 2460 }, { "grad_norm": 0.7064713835716248, "learning_rate": 6.1725e-05, "loss": 0.0554, "step": 2470 }, { "grad_norm": 0.7131314873695374, "learning_rate": 6.1975e-05, "loss": 0.0607, "step": 2480 }, { "grad_norm": 0.6354004144668579, "learning_rate": 6.2225e-05, "loss": 0.0554, "step": 2490 }, { "grad_norm": 0.5757215619087219, "learning_rate": 6.2475e-05, "loss": 0.0571, "step": 2500 }, { "grad_norm": 0.7169754505157471, "learning_rate": 6.2725e-05, "loss": 0.0608, "step": 2510 }, { "grad_norm": 0.6807665228843689, "learning_rate": 6.297500000000001e-05, "loss": 0.0545, "step": 2520 }, { "grad_norm": 0.7271147966384888, "learning_rate": 6.3225e-05, "loss": 0.0522, "step": 2530 }, { "grad_norm": 0.5380944013595581, "learning_rate": 6.347500000000001e-05, "loss": 0.0522, "step": 2540 }, { "grad_norm": 0.7722380757331848, "learning_rate": 6.3725e-05, "loss": 0.0599, "step": 2550 }, { "grad_norm": 0.6163679361343384, "learning_rate": 6.397500000000001e-05, "loss": 0.05, "step": 2560 }, { "grad_norm": 0.7963970303535461, "learning_rate": 6.4225e-05, "loss": 0.0521, "step": 2570 }, { "grad_norm": 0.6644709706306458, "learning_rate": 6.447500000000001e-05, "loss": 0.0595, "step": 2580 }, { "grad_norm": 0.6518495678901672, "learning_rate": 6.4725e-05, "loss": 0.0524, "step": 2590 }, { "grad_norm": 0.714096188545227, "learning_rate": 6.497500000000001e-05, "loss": 0.0565, "step": 2600 }, { "grad_norm": 0.7701051235198975, "learning_rate": 6.5225e-05, "loss": 0.055, "step": 2610 }, { "grad_norm": 0.6831240057945251, "learning_rate": 6.5475e-05, "loss": 0.0531, "step": 2620 }, { "grad_norm": 0.5925824046134949, "learning_rate": 6.5725e-05, "loss": 0.0585, "step": 2630 }, { "grad_norm": 0.5701717138290405, "learning_rate": 6.5975e-05, "loss": 0.054, "step": 2640 }, { "grad_norm": 0.7296027541160583, "learning_rate": 6.6225e-05, "loss": 0.0548, "step": 2650 }, { "grad_norm": 0.5579178333282471, "learning_rate": 6.6475e-05, "loss": 0.0514, "step": 2660 }, { "grad_norm": 0.5659551620483398, "learning_rate": 6.672500000000001e-05, "loss": 0.0547, "step": 2670 }, { "grad_norm": 0.7810350656509399, "learning_rate": 6.6975e-05, "loss": 0.0504, "step": 2680 }, { "grad_norm": 0.8533453345298767, "learning_rate": 6.722500000000001e-05, "loss": 0.057, "step": 2690 }, { "grad_norm": 0.7203193306922913, "learning_rate": 6.7475e-05, "loss": 0.056, "step": 2700 }, { "grad_norm": 0.7321145534515381, "learning_rate": 6.7725e-05, "loss": 0.0602, "step": 2710 }, { "grad_norm": 0.4385448396205902, "learning_rate": 6.7975e-05, "loss": 0.0526, "step": 2720 }, { "grad_norm": 0.5927621126174927, "learning_rate": 6.8225e-05, "loss": 0.0572, "step": 2730 }, { "grad_norm": 0.6471867561340332, "learning_rate": 6.8475e-05, "loss": 0.0541, "step": 2740 }, { "grad_norm": 0.7940738797187805, "learning_rate": 6.8725e-05, "loss": 0.0517, "step": 2750 }, { "grad_norm": 0.7724630832672119, "learning_rate": 6.8975e-05, "loss": 0.0559, "step": 2760 }, { "grad_norm": 0.7754489779472351, "learning_rate": 6.9225e-05, "loss": 0.0497, "step": 2770 }, { "grad_norm": 0.6990789175033569, "learning_rate": 6.9475e-05, "loss": 0.0552, "step": 2780 }, { "grad_norm": 0.630670964717865, "learning_rate": 6.9725e-05, "loss": 0.0539, "step": 2790 }, { "grad_norm": 0.8136194348335266, "learning_rate": 6.997500000000001e-05, "loss": 0.0532, "step": 2800 }, { "grad_norm": 0.6073182225227356, "learning_rate": 7.022500000000001e-05, "loss": 0.0485, "step": 2810 }, { "grad_norm": 0.7617588639259338, "learning_rate": 7.0475e-05, "loss": 0.0463, "step": 2820 }, { "grad_norm": 0.7375597953796387, "learning_rate": 7.072500000000001e-05, "loss": 0.0548, "step": 2830 }, { "grad_norm": 0.5705177783966064, "learning_rate": 7.0975e-05, "loss": 0.055, "step": 2840 }, { "grad_norm": 0.5283348560333252, "learning_rate": 7.122500000000001e-05, "loss": 0.0502, "step": 2850 }, { "grad_norm": 0.5416377186775208, "learning_rate": 7.1475e-05, "loss": 0.0517, "step": 2860 }, { "grad_norm": 0.4922686815261841, "learning_rate": 7.172500000000001e-05, "loss": 0.0506, "step": 2870 }, { "grad_norm": 0.649588406085968, "learning_rate": 7.1975e-05, "loss": 0.0484, "step": 2880 }, { "grad_norm": 0.6833872199058533, "learning_rate": 7.2225e-05, "loss": 0.0478, "step": 2890 }, { "grad_norm": 0.5413126349449158, "learning_rate": 7.2475e-05, "loss": 0.049, "step": 2900 }, { "grad_norm": 0.5075055956840515, "learning_rate": 7.272499999999999e-05, "loss": 0.0479, "step": 2910 }, { "grad_norm": 0.541929304599762, "learning_rate": 7.2975e-05, "loss": 0.048, "step": 2920 }, { "grad_norm": 0.729365348815918, "learning_rate": 7.3225e-05, "loss": 0.0531, "step": 2930 }, { "grad_norm": 0.6413490176200867, "learning_rate": 7.347500000000001e-05, "loss": 0.0574, "step": 2940 }, { "grad_norm": 0.4965779185295105, "learning_rate": 7.3725e-05, "loss": 0.0535, "step": 2950 }, { "grad_norm": 0.6286324858665466, "learning_rate": 7.397500000000001e-05, "loss": 0.055, "step": 2960 }, { "grad_norm": 0.43583038449287415, "learning_rate": 7.4225e-05, "loss": 0.0499, "step": 2970 }, { "grad_norm": 0.7039752006530762, "learning_rate": 7.447500000000001e-05, "loss": 0.0491, "step": 2980 }, { "grad_norm": 0.6009247303009033, "learning_rate": 7.4725e-05, "loss": 0.0497, "step": 2990 }, { "grad_norm": 0.6339594721794128, "learning_rate": 7.4975e-05, "loss": 0.0533, "step": 3000 }, { "grad_norm": 0.5137279629707336, "learning_rate": 7.5225e-05, "loss": 0.0564, "step": 3010 }, { "grad_norm": 0.5804732441902161, "learning_rate": 7.5475e-05, "loss": 0.049, "step": 3020 }, { "grad_norm": 0.6390947699546814, "learning_rate": 7.5725e-05, "loss": 0.0495, "step": 3030 }, { "grad_norm": 0.5833460688591003, "learning_rate": 7.5975e-05, "loss": 0.0449, "step": 3040 }, { "grad_norm": 0.6313241124153137, "learning_rate": 7.6225e-05, "loss": 0.0479, "step": 3050 }, { "grad_norm": 0.6497313380241394, "learning_rate": 7.6475e-05, "loss": 0.0496, "step": 3060 }, { "grad_norm": 0.6420734524726868, "learning_rate": 7.672500000000001e-05, "loss": 0.0504, "step": 3070 }, { "grad_norm": 0.5452121496200562, "learning_rate": 7.697500000000001e-05, "loss": 0.0514, "step": 3080 }, { "grad_norm": 0.5727158188819885, "learning_rate": 7.722500000000001e-05, "loss": 0.051, "step": 3090 }, { "grad_norm": 0.5355808734893799, "learning_rate": 7.747500000000001e-05, "loss": 0.0533, "step": 3100 }, { "grad_norm": 0.7543832063674927, "learning_rate": 7.7725e-05, "loss": 0.0578, "step": 3110 }, { "grad_norm": 0.5339260697364807, "learning_rate": 7.797500000000001e-05, "loss": 0.0491, "step": 3120 }, { "grad_norm": 0.5518702864646912, "learning_rate": 7.8225e-05, "loss": 0.0583, "step": 3130 }, { "grad_norm": 0.4459390640258789, "learning_rate": 7.8475e-05, "loss": 0.0467, "step": 3140 }, { "grad_norm": 0.6320958733558655, "learning_rate": 7.8725e-05, "loss": 0.0532, "step": 3150 }, { "grad_norm": 0.502372682094574, "learning_rate": 7.8975e-05, "loss": 0.0439, "step": 3160 }, { "grad_norm": 0.39788517355918884, "learning_rate": 7.9225e-05, "loss": 0.0482, "step": 3170 }, { "grad_norm": 0.4839414954185486, "learning_rate": 7.9475e-05, "loss": 0.0494, "step": 3180 }, { "grad_norm": 0.40750768780708313, "learning_rate": 7.9725e-05, "loss": 0.0514, "step": 3190 }, { "grad_norm": 0.6160529851913452, "learning_rate": 7.9975e-05, "loss": 0.0462, "step": 3200 }, { "grad_norm": 0.6370850205421448, "learning_rate": 8.022500000000001e-05, "loss": 0.0474, "step": 3210 }, { "grad_norm": 0.6463127136230469, "learning_rate": 8.0475e-05, "loss": 0.0481, "step": 3220 }, { "grad_norm": 0.4748713970184326, "learning_rate": 8.072500000000001e-05, "loss": 0.055, "step": 3230 }, { "grad_norm": 0.5154767632484436, "learning_rate": 8.0975e-05, "loss": 0.0503, "step": 3240 }, { "grad_norm": 0.4518986642360687, "learning_rate": 8.122500000000001e-05, "loss": 0.0478, "step": 3250 }, { "grad_norm": 0.6002172231674194, "learning_rate": 8.1475e-05, "loss": 0.0523, "step": 3260 }, { "grad_norm": 0.6093466877937317, "learning_rate": 8.172500000000001e-05, "loss": 0.049, "step": 3270 }, { "grad_norm": 0.6547258496284485, "learning_rate": 8.1975e-05, "loss": 0.0458, "step": 3280 }, { "grad_norm": 0.5519648790359497, "learning_rate": 8.2225e-05, "loss": 0.0505, "step": 3290 }, { "grad_norm": 0.6260554194450378, "learning_rate": 8.2475e-05, "loss": 0.0524, "step": 3300 }, { "grad_norm": 0.5051695108413696, "learning_rate": 8.2725e-05, "loss": 0.0492, "step": 3310 }, { "grad_norm": 0.5219120979309082, "learning_rate": 8.2975e-05, "loss": 0.052, "step": 3320 }, { "grad_norm": 0.5329957604408264, "learning_rate": 8.3225e-05, "loss": 0.0462, "step": 3330 }, { "grad_norm": 0.43162932991981506, "learning_rate": 8.347500000000001e-05, "loss": 0.0471, "step": 3340 }, { "grad_norm": 0.5521388053894043, "learning_rate": 8.3725e-05, "loss": 0.0522, "step": 3350 }, { "grad_norm": 0.5228402018547058, "learning_rate": 8.397500000000001e-05, "loss": 0.0466, "step": 3360 }, { "grad_norm": 0.42989516258239746, "learning_rate": 8.422500000000001e-05, "loss": 0.046, "step": 3370 }, { "grad_norm": 0.577372133731842, "learning_rate": 8.447500000000001e-05, "loss": 0.0453, "step": 3380 }, { "grad_norm": 0.6060503721237183, "learning_rate": 8.4725e-05, "loss": 0.0493, "step": 3390 }, { "grad_norm": 0.6050747036933899, "learning_rate": 8.4975e-05, "loss": 0.0522, "step": 3400 }, { "grad_norm": 0.5121099352836609, "learning_rate": 8.5225e-05, "loss": 0.055, "step": 3410 }, { "grad_norm": 0.49627789855003357, "learning_rate": 8.5475e-05, "loss": 0.0459, "step": 3420 }, { "grad_norm": 0.5670469403266907, "learning_rate": 8.5725e-05, "loss": 0.0457, "step": 3430 }, { "grad_norm": 0.6700046062469482, "learning_rate": 8.5975e-05, "loss": 0.0518, "step": 3440 }, { "grad_norm": 0.5057273507118225, "learning_rate": 8.6225e-05, "loss": 0.043, "step": 3450 }, { "grad_norm": 0.6416817903518677, "learning_rate": 8.6475e-05, "loss": 0.0474, "step": 3460 }, { "grad_norm": 0.6221009492874146, "learning_rate": 8.672500000000001e-05, "loss": 0.0547, "step": 3470 }, { "grad_norm": 0.525396466255188, "learning_rate": 8.6975e-05, "loss": 0.049, "step": 3480 }, { "grad_norm": 0.45955485105514526, "learning_rate": 8.7225e-05, "loss": 0.0471, "step": 3490 }, { "grad_norm": 0.4246608316898346, "learning_rate": 8.747500000000001e-05, "loss": 0.046, "step": 3500 }, { "grad_norm": 0.5377941131591797, "learning_rate": 8.7725e-05, "loss": 0.0566, "step": 3510 }, { "grad_norm": 0.558617115020752, "learning_rate": 8.797500000000001e-05, "loss": 0.0485, "step": 3520 }, { "grad_norm": 0.4880578815937042, "learning_rate": 8.8225e-05, "loss": 0.0479, "step": 3530 }, { "grad_norm": 0.579226016998291, "learning_rate": 8.847500000000001e-05, "loss": 0.0495, "step": 3540 }, { "grad_norm": 0.6123835444450378, "learning_rate": 8.8725e-05, "loss": 0.0483, "step": 3550 }, { "grad_norm": 0.46246424317359924, "learning_rate": 8.897500000000001e-05, "loss": 0.0489, "step": 3560 }, { "grad_norm": 0.37736058235168457, "learning_rate": 8.9225e-05, "loss": 0.0489, "step": 3570 }, { "grad_norm": 0.5436564683914185, "learning_rate": 8.9475e-05, "loss": 0.0494, "step": 3580 }, { "grad_norm": 0.4172821342945099, "learning_rate": 8.9725e-05, "loss": 0.0447, "step": 3590 }, { "grad_norm": 0.5463844537734985, "learning_rate": 8.9975e-05, "loss": 0.0443, "step": 3600 }, { "grad_norm": 0.5651423931121826, "learning_rate": 9.0225e-05, "loss": 0.0446, "step": 3610 }, { "grad_norm": 0.46732190251350403, "learning_rate": 9.0475e-05, "loss": 0.0443, "step": 3620 }, { "grad_norm": 0.5967239141464233, "learning_rate": 9.072500000000001e-05, "loss": 0.0463, "step": 3630 }, { "grad_norm": 0.4411846101284027, "learning_rate": 9.0975e-05, "loss": 0.0431, "step": 3640 }, { "grad_norm": 0.5942131280899048, "learning_rate": 9.122500000000001e-05, "loss": 0.041, "step": 3650 }, { "grad_norm": 0.47800973057746887, "learning_rate": 9.1475e-05, "loss": 0.0443, "step": 3660 }, { "grad_norm": 0.5287109017372131, "learning_rate": 9.172500000000001e-05, "loss": 0.0448, "step": 3670 }, { "grad_norm": 0.5302049517631531, "learning_rate": 9.1975e-05, "loss": 0.0435, "step": 3680 }, { "grad_norm": 0.5272785425186157, "learning_rate": 9.2225e-05, "loss": 0.0498, "step": 3690 }, { "grad_norm": 0.5358034372329712, "learning_rate": 9.2475e-05, "loss": 0.0401, "step": 3700 }, { "grad_norm": 0.500447154045105, "learning_rate": 9.2725e-05, "loss": 0.0435, "step": 3710 }, { "grad_norm": 0.44525232911109924, "learning_rate": 9.2975e-05, "loss": 0.045, "step": 3720 }, { "grad_norm": 0.3971233069896698, "learning_rate": 9.3225e-05, "loss": 0.0494, "step": 3730 }, { "grad_norm": 0.5521978139877319, "learning_rate": 9.3475e-05, "loss": 0.0473, "step": 3740 }, { "grad_norm": 0.5844800472259521, "learning_rate": 9.3725e-05, "loss": 0.0473, "step": 3750 }, { "grad_norm": 0.46830418705940247, "learning_rate": 9.397500000000001e-05, "loss": 0.0488, "step": 3760 }, { "grad_norm": 0.4028521776199341, "learning_rate": 9.422500000000001e-05, "loss": 0.0403, "step": 3770 }, { "grad_norm": 0.5043988823890686, "learning_rate": 9.4475e-05, "loss": 0.0473, "step": 3780 }, { "grad_norm": 0.49835655093193054, "learning_rate": 9.472500000000001e-05, "loss": 0.0439, "step": 3790 }, { "grad_norm": 0.5075227618217468, "learning_rate": 9.4975e-05, "loss": 0.0481, "step": 3800 }, { "grad_norm": 0.5908878445625305, "learning_rate": 9.522500000000001e-05, "loss": 0.0518, "step": 3810 }, { "grad_norm": 0.5253186225891113, "learning_rate": 9.5475e-05, "loss": 0.0518, "step": 3820 }, { "grad_norm": 0.44254595041275024, "learning_rate": 9.572500000000001e-05, "loss": 0.0457, "step": 3830 }, { "grad_norm": 0.48778197169303894, "learning_rate": 9.5975e-05, "loss": 0.0453, "step": 3840 }, { "grad_norm": 0.47673845291137695, "learning_rate": 9.622500000000001e-05, "loss": 0.0464, "step": 3850 }, { "grad_norm": 0.41416066884994507, "learning_rate": 9.6475e-05, "loss": 0.0481, "step": 3860 }, { "grad_norm": 0.37595033645629883, "learning_rate": 9.6725e-05, "loss": 0.0451, "step": 3870 }, { "grad_norm": 0.5165213942527771, "learning_rate": 9.6975e-05, "loss": 0.0436, "step": 3880 }, { "grad_norm": 0.4204738140106201, "learning_rate": 9.7225e-05, "loss": 0.0487, "step": 3890 }, { "grad_norm": 0.45040836930274963, "learning_rate": 9.747500000000001e-05, "loss": 0.0484, "step": 3900 }, { "grad_norm": 0.44099703431129456, "learning_rate": 9.7725e-05, "loss": 0.05, "step": 3910 }, { "grad_norm": 0.5076567530632019, "learning_rate": 9.797500000000001e-05, "loss": 0.0531, "step": 3920 }, { "grad_norm": 0.47302722930908203, "learning_rate": 9.8225e-05, "loss": 0.0433, "step": 3930 }, { "grad_norm": 0.4934600293636322, "learning_rate": 9.847500000000001e-05, "loss": 0.0464, "step": 3940 }, { "grad_norm": 0.52922523021698, "learning_rate": 9.8725e-05, "loss": 0.0468, "step": 3950 }, { "grad_norm": 0.6159229278564453, "learning_rate": 9.897500000000001e-05, "loss": 0.0458, "step": 3960 }, { "grad_norm": 0.43028244376182556, "learning_rate": 9.9225e-05, "loss": 0.0404, "step": 3970 }, { "grad_norm": 0.31863903999328613, "learning_rate": 9.9475e-05, "loss": 0.0393, "step": 3980 }, { "grad_norm": 0.34206074476242065, "learning_rate": 9.9725e-05, "loss": 0.0416, "step": 3990 }, { "grad_norm": 0.49606046080589294, "learning_rate": 9.9975e-05, "loss": 0.0459, "step": 4000 }, { "grad_norm": 0.4180985689163208, "learning_rate": 9.999999653982884e-05, "loss": 0.0417, "step": 4010 }, { "grad_norm": 0.4295831322669983, "learning_rate": 9.999998457874392e-05, "loss": 0.0427, "step": 4020 }, { "grad_norm": 0.48980405926704407, "learning_rate": 9.999996407402913e-05, "loss": 0.0445, "step": 4030 }, { "grad_norm": 0.5127744078636169, "learning_rate": 9.999993502568801e-05, "loss": 0.043, "step": 4040 }, { "grad_norm": 0.4278995990753174, "learning_rate": 9.999989743372548e-05, "loss": 0.0419, "step": 4050 }, { "grad_norm": 0.36709457635879517, "learning_rate": 9.999985129814798e-05, "loss": 0.0424, "step": 4060 }, { "grad_norm": 0.4465927183628082, "learning_rate": 9.99997966189634e-05, "loss": 0.0385, "step": 4070 }, { "grad_norm": 0.47604236006736755, "learning_rate": 9.999973339618107e-05, "loss": 0.0439, "step": 4080 }, { "grad_norm": 0.5551064610481262, "learning_rate": 9.999966162981179e-05, "loss": 0.0431, "step": 4090 }, { "grad_norm": 0.38670989871025085, "learning_rate": 9.999958131986784e-05, "loss": 0.0397, "step": 4100 }, { "grad_norm": 0.49753114581108093, "learning_rate": 9.999949246636293e-05, "loss": 0.0463, "step": 4110 }, { "grad_norm": 0.3967913091182709, "learning_rate": 9.999939506931224e-05, "loss": 0.0461, "step": 4120 }, { "grad_norm": 0.4697362780570984, "learning_rate": 9.999928912873243e-05, "loss": 0.0466, "step": 4130 }, { "grad_norm": 0.41395702958106995, "learning_rate": 9.999917464464159e-05, "loss": 0.0429, "step": 4140 }, { "grad_norm": 0.4534008502960205, "learning_rate": 9.999905161705929e-05, "loss": 0.0431, "step": 4150 }, { "grad_norm": 0.43647632002830505, "learning_rate": 9.999892004600653e-05, "loss": 0.0414, "step": 4160 }, { "grad_norm": 0.4620169699192047, "learning_rate": 9.999877993150581e-05, "loss": 0.0425, "step": 4170 }, { "grad_norm": 0.4180283844470978, "learning_rate": 9.999863127358108e-05, "loss": 0.0439, "step": 4180 }, { "grad_norm": 0.49559566378593445, "learning_rate": 9.999847407225773e-05, "loss": 0.0464, "step": 4190 }, { "grad_norm": 0.6210417747497559, "learning_rate": 9.999830832756262e-05, "loss": 0.0406, "step": 4200 }, { "grad_norm": 0.3351442217826843, "learning_rate": 9.999813403952407e-05, "loss": 0.0413, "step": 4210 }, { "grad_norm": 0.45679184794425964, "learning_rate": 9.999795120817187e-05, "loss": 0.0415, "step": 4220 }, { "grad_norm": 0.4164811074733734, "learning_rate": 9.999775983353725e-05, "loss": 0.0417, "step": 4230 }, { "grad_norm": 0.5456452965736389, "learning_rate": 9.999755991565292e-05, "loss": 0.0381, "step": 4240 }, { "grad_norm": 0.45842209458351135, "learning_rate": 9.999735145455303e-05, "loss": 0.043, "step": 4250 }, { "grad_norm": 0.39575114846229553, "learning_rate": 9.99971344502732e-05, "loss": 0.0439, "step": 4260 }, { "grad_norm": 0.4868561625480652, "learning_rate": 9.999690890285053e-05, "loss": 0.0424, "step": 4270 }, { "grad_norm": 0.517256498336792, "learning_rate": 9.999667481232356e-05, "loss": 0.0436, "step": 4280 }, { "grad_norm": 0.4816701114177704, "learning_rate": 9.999643217873225e-05, "loss": 0.038, "step": 4290 }, { "grad_norm": 0.5950254201889038, "learning_rate": 9.999618100211809e-05, "loss": 0.0366, "step": 4300 }, { "grad_norm": 0.42010438442230225, "learning_rate": 9.999592128252402e-05, "loss": 0.0388, "step": 4310 }, { "grad_norm": 0.475917786359787, "learning_rate": 9.999565301999437e-05, "loss": 0.0424, "step": 4320 }, { "grad_norm": 0.3905465602874756, "learning_rate": 9.999537621457502e-05, "loss": 0.0392, "step": 4330 }, { "grad_norm": 0.5185609459877014, "learning_rate": 9.999509086631323e-05, "loss": 0.0429, "step": 4340 }, { "grad_norm": 0.35246238112449646, "learning_rate": 9.99947969752578e-05, "loss": 0.0453, "step": 4350 }, { "grad_norm": 0.48326990008354187, "learning_rate": 9.999449454145891e-05, "loss": 0.0414, "step": 4360 }, { "grad_norm": 0.5086027979850769, "learning_rate": 9.999418356496827e-05, "loss": 0.0447, "step": 4370 }, { "grad_norm": 0.48885297775268555, "learning_rate": 9.999386404583899e-05, "loss": 0.0461, "step": 4380 }, { "grad_norm": 0.4840022027492523, "learning_rate": 9.999353598412568e-05, "loss": 0.0484, "step": 4390 }, { "grad_norm": 0.4596962332725525, "learning_rate": 9.999319937988442e-05, "loss": 0.0409, "step": 4400 }, { "grad_norm": 0.40147578716278076, "learning_rate": 9.999285423317268e-05, "loss": 0.0424, "step": 4410 }, { "grad_norm": 0.39820146560668945, "learning_rate": 9.999250054404947e-05, "loss": 0.0437, "step": 4420 }, { "grad_norm": 0.3372025489807129, "learning_rate": 9.99921383125752e-05, "loss": 0.0402, "step": 4430 }, { "grad_norm": 0.4212909936904907, "learning_rate": 9.99917675388118e-05, "loss": 0.0427, "step": 4440 }, { "grad_norm": 0.46622607111930847, "learning_rate": 9.99913882228226e-05, "loss": 0.0402, "step": 4450 }, { "grad_norm": 0.42122742533683777, "learning_rate": 9.999100036467242e-05, "loss": 0.0391, "step": 4460 }, { "grad_norm": 0.3796517848968506, "learning_rate": 9.999060396442753e-05, "loss": 0.0376, "step": 4470 }, { "grad_norm": 0.46171700954437256, "learning_rate": 9.999019902215566e-05, "loss": 0.0395, "step": 4480 }, { "grad_norm": 0.4445663392543793, "learning_rate": 9.998978553792602e-05, "loss": 0.0465, "step": 4490 }, { "grad_norm": 0.34532907605171204, "learning_rate": 9.998936351180926e-05, "loss": 0.0391, "step": 4500 }, { "grad_norm": 0.3280892074108124, "learning_rate": 9.998893294387747e-05, "loss": 0.0376, "step": 4510 }, { "grad_norm": 0.39585357904434204, "learning_rate": 9.998849383420426e-05, "loss": 0.0362, "step": 4520 }, { "grad_norm": 0.46473684906959534, "learning_rate": 9.998804618286465e-05, "loss": 0.0429, "step": 4530 }, { "grad_norm": 0.5078720450401306, "learning_rate": 9.99875899899351e-05, "loss": 0.0429, "step": 4540 }, { "grad_norm": 0.2908954322338104, "learning_rate": 9.99871252554936e-05, "loss": 0.0354, "step": 4550 }, { "grad_norm": 0.32932260632514954, "learning_rate": 9.998665197961955e-05, "loss": 0.0406, "step": 4560 }, { "grad_norm": 0.31356215476989746, "learning_rate": 9.998617016239379e-05, "loss": 0.0371, "step": 4570 }, { "grad_norm": 0.39544305205345154, "learning_rate": 9.998567980389869e-05, "loss": 0.0456, "step": 4580 }, { "grad_norm": 0.3815229535102844, "learning_rate": 9.998518090421802e-05, "loss": 0.0449, "step": 4590 }, { "grad_norm": 0.38353291153907776, "learning_rate": 9.998467346343703e-05, "loss": 0.0407, "step": 4600 }, { "grad_norm": 0.34873199462890625, "learning_rate": 9.998415748164243e-05, "loss": 0.0424, "step": 4610 }, { "grad_norm": 0.32553863525390625, "learning_rate": 9.998363295892238e-05, "loss": 0.0389, "step": 4620 }, { "grad_norm": 0.45304128527641296, "learning_rate": 9.998309989536652e-05, "loss": 0.0414, "step": 4630 }, { "grad_norm": 0.31198492646217346, "learning_rate": 9.998255829106593e-05, "loss": 0.0423, "step": 4640 }, { "grad_norm": 0.3784310817718506, "learning_rate": 9.998200814611316e-05, "loss": 0.0388, "step": 4650 }, { "grad_norm": 0.39435580372810364, "learning_rate": 9.998144946060219e-05, "loss": 0.0401, "step": 4660 }, { "grad_norm": 0.45397189259529114, "learning_rate": 9.998088223462852e-05, "loss": 0.0411, "step": 4670 }, { "grad_norm": 0.3047187328338623, "learning_rate": 9.998030646828905e-05, "loss": 0.0369, "step": 4680 }, { "grad_norm": 0.40832602977752686, "learning_rate": 9.997972216168217e-05, "loss": 0.0393, "step": 4690 }, { "grad_norm": 0.4438135325908661, "learning_rate": 9.997912931490771e-05, "loss": 0.0471, "step": 4700 }, { "grad_norm": 0.3741624057292938, "learning_rate": 9.9978527928067e-05, "loss": 0.0444, "step": 4710 }, { "grad_norm": 0.2908375859260559, "learning_rate": 9.997791800126277e-05, "loss": 0.0364, "step": 4720 }, { "grad_norm": 0.24278175830841064, "learning_rate": 9.997729953459927e-05, "loss": 0.0377, "step": 4730 }, { "grad_norm": 0.3740856349468231, "learning_rate": 9.997667252818214e-05, "loss": 0.0378, "step": 4740 }, { "grad_norm": 0.37404322624206543, "learning_rate": 9.997603698211855e-05, "loss": 0.0377, "step": 4750 }, { "grad_norm": 0.4858682453632355, "learning_rate": 9.99753928965171e-05, "loss": 0.0414, "step": 4760 }, { "grad_norm": 0.38060054183006287, "learning_rate": 9.997474027148781e-05, "loss": 0.0443, "step": 4770 }, { "grad_norm": 0.2874741554260254, "learning_rate": 9.997407910714223e-05, "loss": 0.0352, "step": 4780 }, { "grad_norm": 0.399440199136734, "learning_rate": 9.997340940359332e-05, "loss": 0.0406, "step": 4790 }, { "grad_norm": 0.30914726853370667, "learning_rate": 9.997273116095552e-05, "loss": 0.0406, "step": 4800 }, { "grad_norm": 0.31118422746658325, "learning_rate": 9.997204437934473e-05, "loss": 0.0345, "step": 4810 }, { "grad_norm": 0.40919291973114014, "learning_rate": 9.997134905887829e-05, "loss": 0.0372, "step": 4820 }, { "grad_norm": 0.28912439942359924, "learning_rate": 9.997064519967501e-05, "loss": 0.0372, "step": 4830 }, { "grad_norm": 0.3736850321292877, "learning_rate": 9.996993280185517e-05, "loss": 0.0398, "step": 4840 }, { "grad_norm": 0.37677761912345886, "learning_rate": 9.99692118655405e-05, "loss": 0.0372, "step": 4850 }, { "grad_norm": 0.38087499141693115, "learning_rate": 9.996848239085417e-05, "loss": 0.0348, "step": 4860 }, { "grad_norm": 0.36800417304039, "learning_rate": 9.996774437792085e-05, "loss": 0.0425, "step": 4870 }, { "grad_norm": 0.3152332007884979, "learning_rate": 9.996699782686664e-05, "loss": 0.0383, "step": 4880 }, { "grad_norm": 0.41233721375465393, "learning_rate": 9.996624273781909e-05, "loss": 0.0382, "step": 4890 }, { "grad_norm": 0.36510878801345825, "learning_rate": 9.996547911090725e-05, "loss": 0.0358, "step": 4900 }, { "grad_norm": 0.3398612141609192, "learning_rate": 9.996470694626157e-05, "loss": 0.0359, "step": 4910 }, { "grad_norm": 0.34864214062690735, "learning_rate": 9.996392624401403e-05, "loss": 0.0377, "step": 4920 }, { "grad_norm": 0.4832817614078522, "learning_rate": 9.996313700429801e-05, "loss": 0.0424, "step": 4930 }, { "grad_norm": 0.3137720823287964, "learning_rate": 9.996233922724836e-05, "loss": 0.0342, "step": 4940 }, { "grad_norm": 0.381591260433197, "learning_rate": 9.996153291300141e-05, "loss": 0.0366, "step": 4950 }, { "grad_norm": 0.41010046005249023, "learning_rate": 9.996071806169494e-05, "loss": 0.0381, "step": 4960 }, { "grad_norm": 0.3551969826221466, "learning_rate": 9.995989467346817e-05, "loss": 0.0332, "step": 4970 }, { "grad_norm": 0.4160730838775635, "learning_rate": 9.995906274846183e-05, "loss": 0.0397, "step": 4980 }, { "grad_norm": 0.25931423902511597, "learning_rate": 9.995822228681803e-05, "loss": 0.0388, "step": 4990 }, { "grad_norm": 0.29817086458206177, "learning_rate": 9.99573732886804e-05, "loss": 0.0378, "step": 5000 }, { "grad_norm": 0.34694433212280273, "learning_rate": 9.995651575419402e-05, "loss": 0.0384, "step": 5010 }, { "grad_norm": 0.43203428387641907, "learning_rate": 9.995564968350541e-05, "loss": 0.0406, "step": 5020 }, { "grad_norm": 0.3618549406528473, "learning_rate": 9.995477507676256e-05, "loss": 0.0335, "step": 5030 }, { "grad_norm": 0.5512619018554688, "learning_rate": 9.995389193411493e-05, "loss": 0.0357, "step": 5040 }, { "grad_norm": 0.49632176756858826, "learning_rate": 9.995300025571339e-05, "loss": 0.0387, "step": 5050 }, { "grad_norm": 0.330147922039032, "learning_rate": 9.995210004171034e-05, "loss": 0.0399, "step": 5060 }, { "grad_norm": 0.3401624858379364, "learning_rate": 9.995119129225956e-05, "loss": 0.0353, "step": 5070 }, { "grad_norm": 0.3490809202194214, "learning_rate": 9.995027400751637e-05, "loss": 0.0335, "step": 5080 }, { "grad_norm": 0.37520331144332886, "learning_rate": 9.994934818763751e-05, "loss": 0.0381, "step": 5090 }, { "grad_norm": 0.38936424255371094, "learning_rate": 9.994841383278115e-05, "loss": 0.0379, "step": 5100 }, { "grad_norm": 0.36220741271972656, "learning_rate": 9.994747094310695e-05, "loss": 0.0411, "step": 5110 }, { "grad_norm": 0.3571086525917053, "learning_rate": 9.994651951877604e-05, "loss": 0.0348, "step": 5120 }, { "grad_norm": 0.37946224212646484, "learning_rate": 9.994555955995099e-05, "loss": 0.0352, "step": 5130 }, { "grad_norm": 0.45556017756462097, "learning_rate": 9.994459106679581e-05, "loss": 0.0394, "step": 5140 }, { "grad_norm": 0.4064622223377228, "learning_rate": 9.994361403947603e-05, "loss": 0.0397, "step": 5150 }, { "grad_norm": 0.4316805303096771, "learning_rate": 9.994262847815854e-05, "loss": 0.0395, "step": 5160 }, { "grad_norm": 0.24891681969165802, "learning_rate": 9.99416343830118e-05, "loss": 0.036, "step": 5170 }, { "grad_norm": 0.22625470161437988, "learning_rate": 9.994063175420565e-05, "loss": 0.0333, "step": 5180 }, { "grad_norm": 0.3939727246761322, "learning_rate": 9.99396205919114e-05, "loss": 0.0387, "step": 5190 }, { "grad_norm": 0.2865230143070221, "learning_rate": 9.993860089630185e-05, "loss": 0.0354, "step": 5200 }, { "grad_norm": 0.39884084463119507, "learning_rate": 9.993757266755123e-05, "loss": 0.0425, "step": 5210 }, { "grad_norm": 0.3792526125907898, "learning_rate": 9.993653590583522e-05, "loss": 0.0433, "step": 5220 }, { "grad_norm": 0.3168577253818512, "learning_rate": 9.993549061133102e-05, "loss": 0.0387, "step": 5230 }, { "grad_norm": 0.5472336411476135, "learning_rate": 9.993443678421719e-05, "loss": 0.0338, "step": 5240 }, { "grad_norm": 0.24519160389900208, "learning_rate": 9.993337442467384e-05, "loss": 0.0371, "step": 5250 }, { "grad_norm": 0.30268964171409607, "learning_rate": 9.993230353288248e-05, "loss": 0.0343, "step": 5260 }, { "grad_norm": 0.3955518305301666, "learning_rate": 9.993122410902608e-05, "loss": 0.0364, "step": 5270 }, { "grad_norm": 0.3624086081981659, "learning_rate": 9.993013615328912e-05, "loss": 0.0357, "step": 5280 }, { "grad_norm": 0.30167660117149353, "learning_rate": 9.992903966585747e-05, "loss": 0.0353, "step": 5290 }, { "grad_norm": 0.35356080532073975, "learning_rate": 9.992793464691852e-05, "loss": 0.0397, "step": 5300 }, { "grad_norm": 0.4082872271537781, "learning_rate": 9.992682109666105e-05, "loss": 0.044, "step": 5310 }, { "grad_norm": 0.3716380298137665, "learning_rate": 9.992569901527538e-05, "loss": 0.0434, "step": 5320 }, { "grad_norm": 0.39021918177604675, "learning_rate": 9.99245684029532e-05, "loss": 0.0357, "step": 5330 }, { "grad_norm": 0.31539344787597656, "learning_rate": 9.992342925988774e-05, "loss": 0.0406, "step": 5340 }, { "grad_norm": 0.41254812479019165, "learning_rate": 9.992228158627361e-05, "loss": 0.0406, "step": 5350 }, { "grad_norm": 0.26043084263801575, "learning_rate": 9.992112538230693e-05, "loss": 0.0345, "step": 5360 }, { "grad_norm": 0.34754663705825806, "learning_rate": 9.991996064818527e-05, "loss": 0.035, "step": 5370 }, { "grad_norm": 0.2958452105522156, "learning_rate": 9.991878738410768e-05, "loss": 0.0366, "step": 5380 }, { "grad_norm": 0.2784283757209778, "learning_rate": 9.991760559027457e-05, "loss": 0.0337, "step": 5390 }, { "grad_norm": 0.34577178955078125, "learning_rate": 9.991641526688793e-05, "loss": 0.0325, "step": 5400 }, { "grad_norm": 0.27616938948631287, "learning_rate": 9.991521641415113e-05, "loss": 0.033, "step": 5410 }, { "grad_norm": 0.38137197494506836, "learning_rate": 9.991400903226904e-05, "loss": 0.0325, "step": 5420 }, { "grad_norm": 0.38424578309059143, "learning_rate": 9.991279312144794e-05, "loss": 0.0371, "step": 5430 }, { "grad_norm": 0.3470863103866577, "learning_rate": 9.991156868189564e-05, "loss": 0.0353, "step": 5440 }, { "grad_norm": 0.356317400932312, "learning_rate": 9.991033571382131e-05, "loss": 0.037, "step": 5450 }, { "grad_norm": 0.29297593235969543, "learning_rate": 9.990909421743569e-05, "loss": 0.0371, "step": 5460 }, { "grad_norm": 0.4779548943042755, "learning_rate": 9.990784419295085e-05, "loss": 0.0324, "step": 5470 }, { "grad_norm": 0.44616469740867615, "learning_rate": 9.990658564058044e-05, "loss": 0.0379, "step": 5480 }, { "grad_norm": 0.3331824839115143, "learning_rate": 9.990531856053948e-05, "loss": 0.0357, "step": 5490 }, { "grad_norm": 0.36746877431869507, "learning_rate": 9.99040429530445e-05, "loss": 0.0333, "step": 5500 }, { "grad_norm": 0.45146939158439636, "learning_rate": 9.990275881831346e-05, "loss": 0.0365, "step": 5510 }, { "grad_norm": 0.48004040122032166, "learning_rate": 9.990146615656577e-05, "loss": 0.0389, "step": 5520 }, { "grad_norm": 0.34952056407928467, "learning_rate": 9.990016496802233e-05, "loss": 0.0381, "step": 5530 }, { "grad_norm": 0.2745393216609955, "learning_rate": 9.989885525290548e-05, "loss": 0.0352, "step": 5540 }, { "grad_norm": 0.35235026478767395, "learning_rate": 9.989753701143897e-05, "loss": 0.0356, "step": 5550 }, { "grad_norm": 0.29709866642951965, "learning_rate": 9.989621024384812e-05, "loss": 0.0341, "step": 5560 }, { "grad_norm": 0.3342188596725464, "learning_rate": 9.989487495035959e-05, "loss": 0.0347, "step": 5570 }, { "grad_norm": 0.42383700609207153, "learning_rate": 9.989353113120156e-05, "loss": 0.032, "step": 5580 }, { "grad_norm": 0.2880796492099762, "learning_rate": 9.989217878660366e-05, "loss": 0.034, "step": 5590 }, { "grad_norm": 0.2878636121749878, "learning_rate": 9.989081791679695e-05, "loss": 0.0335, "step": 5600 }, { "grad_norm": 0.3054814338684082, "learning_rate": 9.988944852201397e-05, "loss": 0.032, "step": 5610 }, { "grad_norm": 0.381102055311203, "learning_rate": 9.988807060248873e-05, "loss": 0.0321, "step": 5620 }, { "grad_norm": 0.29753851890563965, "learning_rate": 9.988668415845665e-05, "loss": 0.0323, "step": 5630 }, { "grad_norm": 0.3280521631240845, "learning_rate": 9.988528919015466e-05, "loss": 0.0319, "step": 5640 }, { "grad_norm": 0.28142890334129333, "learning_rate": 9.988388569782112e-05, "loss": 0.0318, "step": 5650 }, { "grad_norm": 0.3997604548931122, "learning_rate": 9.988247368169583e-05, "loss": 0.0411, "step": 5660 }, { "grad_norm": 0.2951071560382843, "learning_rate": 9.988105314202007e-05, "loss": 0.0331, "step": 5670 }, { "grad_norm": 0.4105839133262634, "learning_rate": 9.987962407903659e-05, "loss": 0.0344, "step": 5680 }, { "grad_norm": 0.292222261428833, "learning_rate": 9.987818649298957e-05, "loss": 0.0309, "step": 5690 }, { "grad_norm": 0.39024025201797485, "learning_rate": 9.987674038412465e-05, "loss": 0.0354, "step": 5700 }, { "grad_norm": 0.4848085343837738, "learning_rate": 9.987528575268891e-05, "loss": 0.0336, "step": 5710 }, { "grad_norm": 0.34014463424682617, "learning_rate": 9.987382259893095e-05, "loss": 0.0353, "step": 5720 }, { "grad_norm": 0.33030110597610474, "learning_rate": 9.987235092310074e-05, "loss": 0.0352, "step": 5730 }, { "grad_norm": 0.35474392771720886, "learning_rate": 9.987087072544978e-05, "loss": 0.0365, "step": 5740 }, { "grad_norm": 0.3346961438655853, "learning_rate": 9.9869382006231e-05, "loss": 0.0362, "step": 5750 }, { "grad_norm": 0.2777644097805023, "learning_rate": 9.986788476569875e-05, "loss": 0.0315, "step": 5760 }, { "grad_norm": 0.30933165550231934, "learning_rate": 9.986637900410887e-05, "loss": 0.0347, "step": 5770 }, { "grad_norm": 0.40220457315444946, "learning_rate": 9.986486472171869e-05, "loss": 0.0341, "step": 5780 }, { "grad_norm": 0.31313595175743103, "learning_rate": 9.986334191878692e-05, "loss": 0.0337, "step": 5790 }, { "grad_norm": 0.30443206429481506, "learning_rate": 9.986181059557378e-05, "loss": 0.0328, "step": 5800 }, { "grad_norm": 0.36769434809684753, "learning_rate": 9.986027075234094e-05, "loss": 0.0301, "step": 5810 }, { "grad_norm": 0.31300345063209534, "learning_rate": 9.985872238935152e-05, "loss": 0.0332, "step": 5820 }, { "grad_norm": 0.2457679808139801, "learning_rate": 9.985716550687008e-05, "loss": 0.0354, "step": 5830 }, { "grad_norm": 0.38914754986763, "learning_rate": 9.985560010516264e-05, "loss": 0.0337, "step": 5840 }, { "grad_norm": 0.30064886808395386, "learning_rate": 9.985402618449668e-05, "loss": 0.0361, "step": 5850 }, { "grad_norm": 0.24531923234462738, "learning_rate": 9.985244374514118e-05, "loss": 0.0336, "step": 5860 }, { "grad_norm": 0.304599404335022, "learning_rate": 9.985085278736651e-05, "loss": 0.0376, "step": 5870 }, { "grad_norm": 0.30448195338249207, "learning_rate": 9.984925331144452e-05, "loss": 0.0353, "step": 5880 }, { "grad_norm": 0.3662973642349243, "learning_rate": 9.984764531764851e-05, "loss": 0.031, "step": 5890 }, { "grad_norm": 0.4185486435890198, "learning_rate": 9.984602880625326e-05, "loss": 0.0338, "step": 5900 }, { "grad_norm": 0.2728501558303833, "learning_rate": 9.9844403777535e-05, "loss": 0.0363, "step": 5910 }, { "grad_norm": 0.41229918599128723, "learning_rate": 9.984277023177135e-05, "loss": 0.0328, "step": 5920 }, { "grad_norm": 0.35289907455444336, "learning_rate": 9.984112816924148e-05, "loss": 0.0313, "step": 5930 }, { "grad_norm": 0.3302205204963684, "learning_rate": 9.983947759022596e-05, "loss": 0.0348, "step": 5940 }, { "grad_norm": 0.3574226498603821, "learning_rate": 9.983781849500682e-05, "loss": 0.031, "step": 5950 }, { "grad_norm": 0.34264859557151794, "learning_rate": 9.98361508838676e-05, "loss": 0.0312, "step": 5960 }, { "grad_norm": 0.32605716586112976, "learning_rate": 9.98344747570932e-05, "loss": 0.0352, "step": 5970 }, { "grad_norm": 0.3171367347240448, "learning_rate": 9.983279011497004e-05, "loss": 0.0323, "step": 5980 }, { "grad_norm": 0.28684133291244507, "learning_rate": 9.983109695778596e-05, "loss": 0.0345, "step": 5990 }, { "grad_norm": 0.36236077547073364, "learning_rate": 9.982939528583032e-05, "loss": 0.0358, "step": 6000 }, { "grad_norm": 0.34175780415534973, "learning_rate": 9.982768509939385e-05, "loss": 0.0307, "step": 6010 }, { "grad_norm": 0.3305891156196594, "learning_rate": 9.982596639876879e-05, "loss": 0.0313, "step": 6020 }, { "grad_norm": 0.29095458984375, "learning_rate": 9.982423918424881e-05, "loss": 0.0297, "step": 6030 }, { "grad_norm": 0.25121361017227173, "learning_rate": 9.982250345612908e-05, "loss": 0.034, "step": 6040 }, { "grad_norm": 0.38179904222488403, "learning_rate": 9.982075921470611e-05, "loss": 0.0339, "step": 6050 }, { "grad_norm": 0.3289402425289154, "learning_rate": 9.981900646027802e-05, "loss": 0.0304, "step": 6060 }, { "grad_norm": 0.3380860686302185, "learning_rate": 9.981724519314425e-05, "loss": 0.0325, "step": 6070 }, { "grad_norm": 0.3533649742603302, "learning_rate": 9.981547541360581e-05, "loss": 0.0362, "step": 6080 }, { "grad_norm": 0.3093990683555603, "learning_rate": 9.981369712196508e-05, "loss": 0.0291, "step": 6090 }, { "grad_norm": 0.30376654863357544, "learning_rate": 9.981191031852592e-05, "loss": 0.0371, "step": 6100 }, { "grad_norm": 0.38947921991348267, "learning_rate": 9.981011500359362e-05, "loss": 0.0341, "step": 6110 }, { "grad_norm": 0.36107349395751953, "learning_rate": 9.9808311177475e-05, "loss": 0.0393, "step": 6120 }, { "grad_norm": 0.3116806745529175, "learning_rate": 9.980649884047826e-05, "loss": 0.0339, "step": 6130 }, { "grad_norm": 0.2989245355129242, "learning_rate": 9.980467799291307e-05, "loss": 0.0341, "step": 6140 }, { "grad_norm": 0.40824875235557556, "learning_rate": 9.980284863509058e-05, "loss": 0.0361, "step": 6150 }, { "grad_norm": 0.3799442648887634, "learning_rate": 9.980101076732334e-05, "loss": 0.0332, "step": 6160 }, { "grad_norm": 0.3009151518344879, "learning_rate": 9.979916438992544e-05, "loss": 0.0342, "step": 6170 }, { "grad_norm": 0.292804479598999, "learning_rate": 9.979730950321237e-05, "loss": 0.0323, "step": 6180 }, { "grad_norm": 0.2744990587234497, "learning_rate": 9.979544610750104e-05, "loss": 0.0285, "step": 6190 }, { "grad_norm": 0.3342902660369873, "learning_rate": 9.97935742031099e-05, "loss": 0.0315, "step": 6200 }, { "grad_norm": 0.3327254354953766, "learning_rate": 9.979169379035878e-05, "loss": 0.0317, "step": 6210 }, { "grad_norm": 0.3202977180480957, "learning_rate": 9.978980486956899e-05, "loss": 0.0371, "step": 6220 }, { "grad_norm": 0.26722317934036255, "learning_rate": 9.978790744106332e-05, "loss": 0.0339, "step": 6230 }, { "grad_norm": 0.4024059772491455, "learning_rate": 9.978600150516594e-05, "loss": 0.0335, "step": 6240 }, { "grad_norm": 0.28512004017829895, "learning_rate": 9.978408706220259e-05, "loss": 0.0355, "step": 6250 }, { "grad_norm": 0.2888661324977875, "learning_rate": 9.978216411250032e-05, "loss": 0.0362, "step": 6260 }, { "grad_norm": 0.27441027760505676, "learning_rate": 9.978023265638778e-05, "loss": 0.0305, "step": 6270 }, { "grad_norm": 0.2719348073005676, "learning_rate": 9.977829269419495e-05, "loss": 0.0327, "step": 6280 }, { "grad_norm": 0.3087216019630432, "learning_rate": 9.977634422625335e-05, "loss": 0.028, "step": 6290 }, { "grad_norm": 0.3112841546535492, "learning_rate": 9.97743872528959e-05, "loss": 0.0366, "step": 6300 }, { "grad_norm": 0.3803221881389618, "learning_rate": 9.9772421774457e-05, "loss": 0.0332, "step": 6310 }, { "grad_norm": 0.318345308303833, "learning_rate": 9.977044779127252e-05, "loss": 0.0324, "step": 6320 }, { "grad_norm": 0.3648791015148163, "learning_rate": 9.976846530367971e-05, "loss": 0.0327, "step": 6330 }, { "grad_norm": 0.3619573414325714, "learning_rate": 9.976647431201735e-05, "loss": 0.0334, "step": 6340 }, { "grad_norm": 0.3865378499031067, "learning_rate": 9.976447481662568e-05, "loss": 0.0327, "step": 6350 }, { "grad_norm": 0.3224954605102539, "learning_rate": 9.976246681784629e-05, "loss": 0.0353, "step": 6360 }, { "grad_norm": 0.36603695154190063, "learning_rate": 9.976045031602234e-05, "loss": 0.035, "step": 6370 }, { "grad_norm": 0.3090134561061859, "learning_rate": 9.975842531149837e-05, "loss": 0.0355, "step": 6380 }, { "grad_norm": 0.3842290937900543, "learning_rate": 9.975639180462043e-05, "loss": 0.035, "step": 6390 }, { "grad_norm": 0.4608983099460602, "learning_rate": 9.975434979573596e-05, "loss": 0.0303, "step": 6400 }, { "grad_norm": 0.29932260513305664, "learning_rate": 9.97522992851939e-05, "loss": 0.0345, "step": 6410 }, { "grad_norm": 0.401941180229187, "learning_rate": 9.975024027334461e-05, "loss": 0.0354, "step": 6420 }, { "grad_norm": 0.2828305661678314, "learning_rate": 9.974817276053993e-05, "loss": 0.0309, "step": 6430 }, { "grad_norm": 0.32046589255332947, "learning_rate": 9.974609674713315e-05, "loss": 0.0308, "step": 6440 }, { "grad_norm": 0.3522738218307495, "learning_rate": 9.9744012233479e-05, "loss": 0.03, "step": 6450 }, { "grad_norm": 0.3775146007537842, "learning_rate": 9.974191921993366e-05, "loss": 0.0297, "step": 6460 }, { "grad_norm": 0.38201919198036194, "learning_rate": 9.973981770685474e-05, "loss": 0.0341, "step": 6470 }, { "grad_norm": 0.3030906021595001, "learning_rate": 9.97377076946014e-05, "loss": 0.0344, "step": 6480 }, { "grad_norm": 0.372117280960083, "learning_rate": 9.973558918353412e-05, "loss": 0.0369, "step": 6490 }, { "grad_norm": 0.3690277934074402, "learning_rate": 9.973346217401494e-05, "loss": 0.0314, "step": 6500 }, { "grad_norm": 0.33067747950553894, "learning_rate": 9.973132666640726e-05, "loss": 0.033, "step": 6510 }, { "grad_norm": 0.3108304440975189, "learning_rate": 9.972918266107602e-05, "loss": 0.0337, "step": 6520 }, { "grad_norm": 0.31917694211006165, "learning_rate": 9.972703015838756e-05, "loss": 0.0342, "step": 6530 }, { "grad_norm": 0.312173992395401, "learning_rate": 9.97248691587097e-05, "loss": 0.0331, "step": 6540 }, { "grad_norm": 0.33425700664520264, "learning_rate": 9.972269966241166e-05, "loss": 0.0336, "step": 6550 }, { "grad_norm": 0.32794177532196045, "learning_rate": 9.972052166986417e-05, "loss": 0.033, "step": 6560 }, { "grad_norm": 0.2956711947917938, "learning_rate": 9.971833518143938e-05, "loss": 0.036, "step": 6570 }, { "grad_norm": 0.3156307637691498, "learning_rate": 9.971614019751093e-05, "loss": 0.0303, "step": 6580 }, { "grad_norm": 0.27776384353637695, "learning_rate": 9.971393671845383e-05, "loss": 0.0332, "step": 6590 }, { "grad_norm": 0.2686438262462616, "learning_rate": 9.971172474464464e-05, "loss": 0.0353, "step": 6600 }, { "grad_norm": 0.33840954303741455, "learning_rate": 9.97095042764613e-05, "loss": 0.0383, "step": 6610 }, { "grad_norm": 0.20271019637584686, "learning_rate": 9.970727531428324e-05, "loss": 0.0275, "step": 6620 }, { "grad_norm": 0.2820912003517151, "learning_rate": 9.970503785849132e-05, "loss": 0.0371, "step": 6630 }, { "grad_norm": 0.33647826313972473, "learning_rate": 9.970279190946788e-05, "loss": 0.0343, "step": 6640 }, { "grad_norm": 0.31943199038505554, "learning_rate": 9.970053746759667e-05, "loss": 0.0303, "step": 6650 }, { "grad_norm": 0.29411786794662476, "learning_rate": 9.969827453326292e-05, "loss": 0.0355, "step": 6660 }, { "grad_norm": 0.38380539417266846, "learning_rate": 9.969600310685332e-05, "loss": 0.0341, "step": 6670 }, { "grad_norm": 0.39466649293899536, "learning_rate": 9.969372318875596e-05, "loss": 0.0323, "step": 6680 }, { "grad_norm": 0.3303551971912384, "learning_rate": 9.969143477936043e-05, "loss": 0.0306, "step": 6690 }, { "grad_norm": 0.29684263467788696, "learning_rate": 9.968913787905775e-05, "loss": 0.0316, "step": 6700 }, { "grad_norm": 0.2067209929227829, "learning_rate": 9.968683248824045e-05, "loss": 0.0331, "step": 6710 }, { "grad_norm": 0.26759183406829834, "learning_rate": 9.968451860730238e-05, "loss": 0.0313, "step": 6720 }, { "grad_norm": 0.3573274612426758, "learning_rate": 9.968219623663896e-05, "loss": 0.0292, "step": 6730 }, { "grad_norm": 0.2830476760864258, "learning_rate": 9.967986537664702e-05, "loss": 0.031, "step": 6740 }, { "grad_norm": 0.355923056602478, "learning_rate": 9.967752602772483e-05, "loss": 0.0292, "step": 6750 }, { "grad_norm": 0.4108932316303253, "learning_rate": 9.967517819027212e-05, "loss": 0.0315, "step": 6760 }, { "grad_norm": 0.29659274220466614, "learning_rate": 9.967282186469009e-05, "loss": 0.0313, "step": 6770 }, { "grad_norm": 0.302238404750824, "learning_rate": 9.967045705138135e-05, "loss": 0.0308, "step": 6780 }, { "grad_norm": 0.33241477608680725, "learning_rate": 9.966808375074998e-05, "loss": 0.0303, "step": 6790 }, { "grad_norm": 0.3170243203639984, "learning_rate": 9.966570196320154e-05, "loss": 0.03, "step": 6800 }, { "grad_norm": 0.2889682650566101, "learning_rate": 9.966331168914299e-05, "loss": 0.0311, "step": 6810 }, { "grad_norm": 0.33931848406791687, "learning_rate": 9.966091292898277e-05, "loss": 0.0333, "step": 6820 }, { "grad_norm": 0.29120543599128723, "learning_rate": 9.965850568313076e-05, "loss": 0.0304, "step": 6830 }, { "grad_norm": 0.3282957971096039, "learning_rate": 9.965608995199827e-05, "loss": 0.0319, "step": 6840 }, { "grad_norm": 0.33587634563446045, "learning_rate": 9.965366573599812e-05, "loss": 0.0351, "step": 6850 }, { "grad_norm": 0.36688876152038574, "learning_rate": 9.965123303554453e-05, "loss": 0.0392, "step": 6860 }, { "grad_norm": 0.29487913846969604, "learning_rate": 9.964879185105317e-05, "loss": 0.0302, "step": 6870 }, { "grad_norm": 0.2667387127876282, "learning_rate": 9.964634218294119e-05, "loss": 0.0358, "step": 6880 }, { "grad_norm": 0.23979437351226807, "learning_rate": 9.964388403162714e-05, "loss": 0.0262, "step": 6890 }, { "grad_norm": 0.2607864439487457, "learning_rate": 9.96414173975311e-05, "loss": 0.0298, "step": 6900 }, { "grad_norm": 0.29720932245254517, "learning_rate": 9.963894228107451e-05, "loss": 0.0283, "step": 6910 }, { "grad_norm": 0.29841578006744385, "learning_rate": 9.963645868268032e-05, "loss": 0.0299, "step": 6920 }, { "grad_norm": 0.3101212680339813, "learning_rate": 9.963396660277289e-05, "loss": 0.034, "step": 6930 }, { "grad_norm": 0.374271035194397, "learning_rate": 9.963146604177807e-05, "loss": 0.0295, "step": 6940 }, { "grad_norm": 0.38535043597221375, "learning_rate": 9.962895700012311e-05, "loss": 0.0303, "step": 6950 }, { "grad_norm": 0.29662302136421204, "learning_rate": 9.962643947823677e-05, "loss": 0.0352, "step": 6960 }, { "grad_norm": 0.2500973045825958, "learning_rate": 9.962391347654921e-05, "loss": 0.0307, "step": 6970 }, { "grad_norm": 0.21264077723026276, "learning_rate": 9.962137899549204e-05, "loss": 0.0304, "step": 6980 }, { "grad_norm": 0.24637088179588318, "learning_rate": 9.961883603549835e-05, "loss": 0.028, "step": 6990 }, { "grad_norm": 0.2505500018596649, "learning_rate": 9.961628459700267e-05, "loss": 0.0294, "step": 7000 }, { "grad_norm": 0.2831902503967285, "learning_rate": 9.961372468044095e-05, "loss": 0.0319, "step": 7010 }, { "grad_norm": 0.3399293124675751, "learning_rate": 9.961115628625062e-05, "loss": 0.0317, "step": 7020 }, { "grad_norm": 0.3054075837135315, "learning_rate": 9.960857941487056e-05, "loss": 0.0279, "step": 7030 }, { "grad_norm": 0.27637967467308044, "learning_rate": 9.960599406674106e-05, "loss": 0.0299, "step": 7040 }, { "grad_norm": 0.24918757379055023, "learning_rate": 9.960340024230393e-05, "loss": 0.0294, "step": 7050 }, { "grad_norm": 0.32056668400764465, "learning_rate": 9.960079794200232e-05, "loss": 0.03, "step": 7060 }, { "grad_norm": 0.33140912652015686, "learning_rate": 9.959818716628096e-05, "loss": 0.0294, "step": 7070 }, { "grad_norm": 0.2633589208126068, "learning_rate": 9.95955679155859e-05, "loss": 0.0301, "step": 7080 }, { "grad_norm": 0.24547787010669708, "learning_rate": 9.959294019036472e-05, "loss": 0.0291, "step": 7090 }, { "grad_norm": 0.295369952917099, "learning_rate": 9.959030399106646e-05, "loss": 0.0283, "step": 7100 }, { "grad_norm": 0.312034010887146, "learning_rate": 9.958765931814153e-05, "loss": 0.0307, "step": 7110 }, { "grad_norm": 0.22745037078857422, "learning_rate": 9.958500617204184e-05, "loss": 0.0306, "step": 7120 }, { "grad_norm": 0.37850284576416016, "learning_rate": 9.958234455322075e-05, "loss": 0.0268, "step": 7130 }, { "grad_norm": 0.3112441897392273, "learning_rate": 9.957967446213308e-05, "loss": 0.0323, "step": 7140 }, { "grad_norm": 0.32266315817832947, "learning_rate": 9.957699589923501e-05, "loss": 0.0271, "step": 7150 }, { "grad_norm": 0.3254435658454895, "learning_rate": 9.957430886498431e-05, "loss": 0.0275, "step": 7160 }, { "grad_norm": 0.2588164806365967, "learning_rate": 9.957161335984008e-05, "loss": 0.026, "step": 7170 }, { "grad_norm": 0.25905585289001465, "learning_rate": 9.956890938426291e-05, "loss": 0.0301, "step": 7180 }, { "grad_norm": 0.2824728488922119, "learning_rate": 9.956619693871482e-05, "loss": 0.0281, "step": 7190 }, { "grad_norm": 0.29256099462509155, "learning_rate": 9.956347602365934e-05, "loss": 0.0302, "step": 7200 }, { "grad_norm": 0.24998067319393158, "learning_rate": 9.956074663956135e-05, "loss": 0.0311, "step": 7210 }, { "grad_norm": 0.299710214138031, "learning_rate": 9.955800878688726e-05, "loss": 0.0293, "step": 7220 }, { "grad_norm": 0.3398659825325012, "learning_rate": 9.955526246610489e-05, "loss": 0.0304, "step": 7230 }, { "grad_norm": 0.3253668546676636, "learning_rate": 9.955250767768349e-05, "loss": 0.0263, "step": 7240 }, { "grad_norm": 0.2779759466648102, "learning_rate": 9.95497444220938e-05, "loss": 0.0262, "step": 7250 }, { "grad_norm": 0.3002147674560547, "learning_rate": 9.954697269980797e-05, "loss": 0.026, "step": 7260 }, { "grad_norm": 0.22349902987480164, "learning_rate": 9.954419251129962e-05, "loss": 0.0274, "step": 7270 }, { "grad_norm": 0.2270653247833252, "learning_rate": 9.95414038570438e-05, "loss": 0.025, "step": 7280 }, { "grad_norm": 0.2137085497379303, "learning_rate": 9.953860673751703e-05, "loss": 0.0291, "step": 7290 }, { "grad_norm": 0.28228211402893066, "learning_rate": 9.953580115319725e-05, "loss": 0.028, "step": 7300 }, { "grad_norm": 0.3132007122039795, "learning_rate": 9.953298710456387e-05, "loss": 0.0277, "step": 7310 }, { "grad_norm": 0.2528090476989746, "learning_rate": 9.953016459209771e-05, "loss": 0.0297, "step": 7320 }, { "grad_norm": 0.3125268816947937, "learning_rate": 9.952733361628108e-05, "loss": 0.0306, "step": 7330 }, { "grad_norm": 0.3831489086151123, "learning_rate": 9.952449417759772e-05, "loss": 0.0293, "step": 7340 }, { "grad_norm": 0.36307188868522644, "learning_rate": 9.952164627653279e-05, "loss": 0.0324, "step": 7350 }, { "grad_norm": 0.28852543234825134, "learning_rate": 9.951878991357292e-05, "loss": 0.0272, "step": 7360 }, { "grad_norm": 0.28808075189590454, "learning_rate": 9.951592508920622e-05, "loss": 0.0288, "step": 7370 }, { "grad_norm": 0.28094083070755005, "learning_rate": 9.951305180392219e-05, "loss": 0.028, "step": 7380 }, { "grad_norm": 0.2905932068824768, "learning_rate": 9.951017005821178e-05, "loss": 0.0304, "step": 7390 }, { "grad_norm": 0.2787303626537323, "learning_rate": 9.95072798525674e-05, "loss": 0.0273, "step": 7400 }, { "grad_norm": 0.22447362542152405, "learning_rate": 9.950438118748293e-05, "loss": 0.0314, "step": 7410 }, { "grad_norm": 0.18393456935882568, "learning_rate": 9.950147406345366e-05, "loss": 0.0295, "step": 7420 }, { "grad_norm": 0.28537797927856445, "learning_rate": 9.949855848097635e-05, "loss": 0.0362, "step": 7430 }, { "grad_norm": 0.27975285053253174, "learning_rate": 9.949563444054916e-05, "loss": 0.0318, "step": 7440 }, { "grad_norm": 0.3730669915676117, "learning_rate": 9.949270194267178e-05, "loss": 0.0313, "step": 7450 }, { "grad_norm": 0.30887213349342346, "learning_rate": 9.948976098784526e-05, "loss": 0.0304, "step": 7460 }, { "grad_norm": 0.3759748637676239, "learning_rate": 9.948681157657213e-05, "loss": 0.0364, "step": 7470 }, { "grad_norm": 0.4003210961818695, "learning_rate": 9.948385370935638e-05, "loss": 0.0295, "step": 7480 }, { "grad_norm": 0.34494438767433167, "learning_rate": 9.94808873867034e-05, "loss": 0.0322, "step": 7490 }, { "grad_norm": 0.25887271761894226, "learning_rate": 9.947791260912009e-05, "loss": 0.0313, "step": 7500 }, { "grad_norm": 0.4052548110485077, "learning_rate": 9.947492937711474e-05, "loss": 0.0269, "step": 7510 }, { "grad_norm": 0.27823805809020996, "learning_rate": 9.947193769119707e-05, "loss": 0.0285, "step": 7520 }, { "grad_norm": 0.26281383633613586, "learning_rate": 9.946893755187834e-05, "loss": 0.0294, "step": 7530 }, { "grad_norm": 0.2943587601184845, "learning_rate": 9.946592895967115e-05, "loss": 0.028, "step": 7540 }, { "grad_norm": 0.37568384408950806, "learning_rate": 9.94629119150896e-05, "loss": 0.0283, "step": 7550 }, { "grad_norm": 0.28669169545173645, "learning_rate": 9.94598864186492e-05, "loss": 0.033, "step": 7560 }, { "grad_norm": 0.21377070248126984, "learning_rate": 9.945685247086696e-05, "loss": 0.0274, "step": 7570 }, { "grad_norm": 0.3396747410297394, "learning_rate": 9.945381007226129e-05, "loss": 0.0331, "step": 7580 }, { "grad_norm": 0.3015192449092865, "learning_rate": 9.945075922335203e-05, "loss": 0.0255, "step": 7590 }, { "grad_norm": 0.2842022180557251, "learning_rate": 9.944769992466049e-05, "loss": 0.0259, "step": 7600 }, { "grad_norm": 0.2925052344799042, "learning_rate": 9.944463217670945e-05, "loss": 0.0298, "step": 7610 }, { "grad_norm": 0.2915891706943512, "learning_rate": 9.944155598002307e-05, "loss": 0.0287, "step": 7620 }, { "grad_norm": 0.2808391749858856, "learning_rate": 9.943847133512701e-05, "loss": 0.0279, "step": 7630 }, { "grad_norm": 0.33903488516807556, "learning_rate": 9.943537824254834e-05, "loss": 0.0349, "step": 7640 }, { "grad_norm": 0.32978183031082153, "learning_rate": 9.943227670281559e-05, "loss": 0.0327, "step": 7650 }, { "grad_norm": 0.3591379225254059, "learning_rate": 9.942916671645873e-05, "loss": 0.0264, "step": 7660 }, { "grad_norm": 0.32147717475891113, "learning_rate": 9.942604828400916e-05, "loss": 0.0296, "step": 7670 }, { "grad_norm": 0.34731796383857727, "learning_rate": 9.942292140599975e-05, "loss": 0.0292, "step": 7680 }, { "grad_norm": 0.30803829431533813, "learning_rate": 9.94197860829648e-05, "loss": 0.0311, "step": 7690 }, { "grad_norm": 0.2544945180416107, "learning_rate": 9.941664231544004e-05, "loss": 0.0295, "step": 7700 }, { "grad_norm": 0.303405225276947, "learning_rate": 9.941349010396264e-05, "loss": 0.0298, "step": 7710 }, { "grad_norm": 0.2519514262676239, "learning_rate": 9.941032944907125e-05, "loss": 0.0244, "step": 7720 }, { "grad_norm": 0.2523871064186096, "learning_rate": 9.940716035130596e-05, "loss": 0.0284, "step": 7730 }, { "grad_norm": 0.2602860927581787, "learning_rate": 9.940398281120821e-05, "loss": 0.0289, "step": 7740 }, { "grad_norm": 0.24820047616958618, "learning_rate": 9.940079682932102e-05, "loss": 0.0277, "step": 7750 }, { "grad_norm": 0.2655206322669983, "learning_rate": 9.939760240618877e-05, "loss": 0.0299, "step": 7760 }, { "grad_norm": 0.2825908660888672, "learning_rate": 9.939439954235729e-05, "loss": 0.0267, "step": 7770 }, { "grad_norm": 0.2553221583366394, "learning_rate": 9.939118823837387e-05, "loss": 0.0271, "step": 7780 }, { "grad_norm": 0.2603461444377899, "learning_rate": 9.938796849478725e-05, "loss": 0.0263, "step": 7790 }, { "grad_norm": 0.3806981146335602, "learning_rate": 9.938474031214755e-05, "loss": 0.0305, "step": 7800 }, { "grad_norm": 0.31233298778533936, "learning_rate": 9.938150369100643e-05, "loss": 0.0305, "step": 7810 }, { "grad_norm": 0.3379138708114624, "learning_rate": 9.93782586319169e-05, "loss": 0.0266, "step": 7820 }, { "grad_norm": 0.26952528953552246, "learning_rate": 9.937500513543348e-05, "loss": 0.027, "step": 7830 }, { "grad_norm": 0.25739720463752747, "learning_rate": 9.937174320211207e-05, "loss": 0.0283, "step": 7840 }, { "grad_norm": 0.2979893982410431, "learning_rate": 9.936847283251009e-05, "loss": 0.0246, "step": 7850 }, { "grad_norm": 0.26790788769721985, "learning_rate": 9.936519402718632e-05, "loss": 0.0265, "step": 7860 }, { "grad_norm": 0.2623661160469055, "learning_rate": 9.936190678670102e-05, "loss": 0.0272, "step": 7870 }, { "grad_norm": 0.2903725206851959, "learning_rate": 9.935861111161593e-05, "loss": 0.0311, "step": 7880 }, { "grad_norm": 0.19664554297924042, "learning_rate": 9.935530700249416e-05, "loss": 0.0254, "step": 7890 }, { "grad_norm": 0.26254338026046753, "learning_rate": 9.935199445990028e-05, "loss": 0.0275, "step": 7900 }, { "grad_norm": 0.2887759208679199, "learning_rate": 9.934867348440033e-05, "loss": 0.0327, "step": 7910 }, { "grad_norm": 0.25602543354034424, "learning_rate": 9.934534407656176e-05, "loss": 0.0277, "step": 7920 }, { "grad_norm": 0.23467670381069183, "learning_rate": 9.93420062369535e-05, "loss": 0.0305, "step": 7930 }, { "grad_norm": 0.28901249170303345, "learning_rate": 9.933865996614589e-05, "loss": 0.0312, "step": 7940 }, { "grad_norm": 0.23462854325771332, "learning_rate": 9.933530526471068e-05, "loss": 0.0254, "step": 7950 }, { "grad_norm": 0.23351065814495087, "learning_rate": 9.933194213322114e-05, "loss": 0.0279, "step": 7960 }, { "grad_norm": 0.2462201863527298, "learning_rate": 9.932857057225192e-05, "loss": 0.0264, "step": 7970 }, { "grad_norm": 0.23075060546398163, "learning_rate": 9.932519058237912e-05, "loss": 0.0289, "step": 7980 }, { "grad_norm": 0.18742844462394714, "learning_rate": 9.932180216418032e-05, "loss": 0.0254, "step": 7990 }, { "grad_norm": 0.24699735641479492, "learning_rate": 9.931840531823446e-05, "loss": 0.0254, "step": 8000 }, { "grad_norm": 0.28211626410484314, "learning_rate": 9.9315000045122e-05, "loss": 0.0271, "step": 8010 }, { "grad_norm": 0.24285010993480682, "learning_rate": 9.931158634542481e-05, "loss": 0.0291, "step": 8020 }, { "grad_norm": 0.27394118905067444, "learning_rate": 9.930816421972617e-05, "loss": 0.0249, "step": 8030 }, { "grad_norm": 0.31862834095954895, "learning_rate": 9.930473366861086e-05, "loss": 0.0299, "step": 8040 }, { "grad_norm": 0.2402433156967163, "learning_rate": 9.930129469266505e-05, "loss": 0.0244, "step": 8050 }, { "grad_norm": 0.2374587059020996, "learning_rate": 9.929784729247638e-05, "loss": 0.0247, "step": 8060 }, { "grad_norm": 0.3148080110549927, "learning_rate": 9.929439146863389e-05, "loss": 0.0287, "step": 8070 }, { "grad_norm": 0.2626878619194031, "learning_rate": 9.92909272217281e-05, "loss": 0.0289, "step": 8080 }, { "grad_norm": 0.23491959273815155, "learning_rate": 9.928745455235097e-05, "loss": 0.0236, "step": 8090 }, { "grad_norm": 0.36338356137275696, "learning_rate": 9.928397346109588e-05, "loss": 0.0286, "step": 8100 }, { "grad_norm": 0.315378874540329, "learning_rate": 9.928048394855762e-05, "loss": 0.0309, "step": 8110 }, { "grad_norm": 0.22462131083011627, "learning_rate": 9.92769860153325e-05, "loss": 0.0273, "step": 8120 }, { "grad_norm": 0.1949714869260788, "learning_rate": 9.927347966201819e-05, "loss": 0.0247, "step": 8130 }, { "grad_norm": 0.3172146677970886, "learning_rate": 9.926996488921383e-05, "loss": 0.0272, "step": 8140 }, { "grad_norm": 0.26683399081230164, "learning_rate": 9.926644169752001e-05, "loss": 0.0261, "step": 8150 }, { "grad_norm": 0.3005165755748749, "learning_rate": 9.926291008753875e-05, "loss": 0.0276, "step": 8160 }, { "grad_norm": 0.21768219769001007, "learning_rate": 9.92593700598735e-05, "loss": 0.0261, "step": 8170 }, { "grad_norm": 0.2814142107963562, "learning_rate": 9.925582161512915e-05, "loss": 0.0274, "step": 8180 }, { "grad_norm": 0.2408493012189865, "learning_rate": 9.925226475391205e-05, "loss": 0.0259, "step": 8190 }, { "grad_norm": 0.26034602522850037, "learning_rate": 9.924869947682993e-05, "loss": 0.0245, "step": 8200 }, { "grad_norm": 0.2788393795490265, "learning_rate": 9.924512578449204e-05, "loss": 0.0254, "step": 8210 }, { "grad_norm": 0.20665495097637177, "learning_rate": 9.924154367750901e-05, "loss": 0.0238, "step": 8220 }, { "grad_norm": 0.28702810406684875, "learning_rate": 9.923795315649293e-05, "loss": 0.0256, "step": 8230 }, { "grad_norm": 0.3176302909851074, "learning_rate": 9.92343542220573e-05, "loss": 0.0257, "step": 8240 }, { "grad_norm": 0.3391202390193939, "learning_rate": 9.92307468748171e-05, "loss": 0.0249, "step": 8250 }, { "grad_norm": 0.23491878807544708, "learning_rate": 9.922713111538873e-05, "loss": 0.0269, "step": 8260 }, { "grad_norm": 0.24631653726100922, "learning_rate": 9.922350694439003e-05, "loss": 0.0279, "step": 8270 }, { "grad_norm": 0.258461594581604, "learning_rate": 9.921987436244024e-05, "loss": 0.027, "step": 8280 }, { "grad_norm": 0.3010614216327667, "learning_rate": 9.921623337016008e-05, "loss": 0.0278, "step": 8290 }, { "grad_norm": 0.2844180166721344, "learning_rate": 9.921258396817172e-05, "loss": 0.0242, "step": 8300 }, { "grad_norm": 0.23872238397598267, "learning_rate": 9.920892615709874e-05, "loss": 0.0281, "step": 8310 }, { "grad_norm": 0.23292949795722961, "learning_rate": 9.920525993756612e-05, "loss": 0.0269, "step": 8320 }, { "grad_norm": 0.20526251196861267, "learning_rate": 9.920158531020036e-05, "loss": 0.025, "step": 8330 }, { "grad_norm": 0.3391219973564148, "learning_rate": 9.919790227562933e-05, "loss": 0.0244, "step": 8340 }, { "grad_norm": 0.30553150177001953, "learning_rate": 9.919421083448237e-05, "loss": 0.0253, "step": 8350 }, { "grad_norm": 0.33474358916282654, "learning_rate": 9.919051098739022e-05, "loss": 0.0239, "step": 8360 }, { "grad_norm": 0.2131931483745575, "learning_rate": 9.918680273498514e-05, "loss": 0.0249, "step": 8370 }, { "grad_norm": 0.2755624055862427, "learning_rate": 9.918308607790072e-05, "loss": 0.0254, "step": 8380 }, { "grad_norm": 0.28685224056243896, "learning_rate": 9.917936101677205e-05, "loss": 0.027, "step": 8390 }, { "grad_norm": 0.32575035095214844, "learning_rate": 9.917562755223564e-05, "loss": 0.0244, "step": 8400 }, { "grad_norm": 0.25040194392204285, "learning_rate": 9.917188568492944e-05, "loss": 0.0254, "step": 8410 }, { "grad_norm": 0.33060601353645325, "learning_rate": 9.916813541549283e-05, "loss": 0.0261, "step": 8420 }, { "grad_norm": 0.24100974202156067, "learning_rate": 9.916437674456663e-05, "loss": 0.025, "step": 8430 }, { "grad_norm": 0.20937266945838928, "learning_rate": 9.916060967279308e-05, "loss": 0.0259, "step": 8440 }, { "grad_norm": 0.29727861285209656, "learning_rate": 9.91568342008159e-05, "loss": 0.0267, "step": 8450 }, { "grad_norm": 0.2921645939350128, "learning_rate": 9.915305032928019e-05, "loss": 0.0269, "step": 8460 }, { "grad_norm": 0.2671777307987213, "learning_rate": 9.914925805883253e-05, "loss": 0.0278, "step": 8470 }, { "grad_norm": 0.31373995542526245, "learning_rate": 9.914545739012088e-05, "loss": 0.0251, "step": 8480 }, { "grad_norm": 0.2707233130931854, "learning_rate": 9.91416483237947e-05, "loss": 0.0241, "step": 8490 }, { "grad_norm": 0.3479993939399719, "learning_rate": 9.913783086050485e-05, "loss": 0.0272, "step": 8500 }, { "grad_norm": 0.3138609230518341, "learning_rate": 9.913400500090364e-05, "loss": 0.0277, "step": 8510 }, { "grad_norm": 0.2973633408546448, "learning_rate": 9.913017074564479e-05, "loss": 0.0285, "step": 8520 }, { "grad_norm": 0.277044415473938, "learning_rate": 9.912632809538348e-05, "loss": 0.0293, "step": 8530 }, { "grad_norm": 0.3020440340042114, "learning_rate": 9.912247705077629e-05, "loss": 0.029, "step": 8540 }, { "grad_norm": 0.2880062460899353, "learning_rate": 9.911861761248127e-05, "loss": 0.027, "step": 8550 }, { "grad_norm": 0.3118162751197815, "learning_rate": 9.91147497811579e-05, "loss": 0.0309, "step": 8560 }, { "grad_norm": 0.2701292634010315, "learning_rate": 9.911087355746709e-05, "loss": 0.0253, "step": 8570 }, { "grad_norm": 0.3174044191837311, "learning_rate": 9.910698894207117e-05, "loss": 0.0261, "step": 8580 }, { "grad_norm": 0.2066076397895813, "learning_rate": 9.910309593563392e-05, "loss": 0.0257, "step": 8590 }, { "grad_norm": 0.207026407122612, "learning_rate": 9.909919453882057e-05, "loss": 0.0244, "step": 8600 }, { "grad_norm": 0.18258196115493774, "learning_rate": 9.90952847522977e-05, "loss": 0.0256, "step": 8610 }, { "grad_norm": 0.2689923048019409, "learning_rate": 9.909136657673346e-05, "loss": 0.0254, "step": 8620 }, { "grad_norm": 0.2323203980922699, "learning_rate": 9.908744001279731e-05, "loss": 0.0232, "step": 8630 }, { "grad_norm": 0.24731189012527466, "learning_rate": 9.90835050611602e-05, "loss": 0.0242, "step": 8640 }, { "grad_norm": 0.2757399380207062, "learning_rate": 9.90795617224945e-05, "loss": 0.026, "step": 8650 }, { "grad_norm": 0.22234536707401276, "learning_rate": 9.907560999747405e-05, "loss": 0.0256, "step": 8660 }, { "grad_norm": 0.24195800721645355, "learning_rate": 9.907164988677408e-05, "loss": 0.0276, "step": 8670 }, { "grad_norm": 0.23798295855522156, "learning_rate": 9.906768139107124e-05, "loss": 0.026, "step": 8680 }, { "grad_norm": 0.20010139048099518, "learning_rate": 9.906370451104367e-05, "loss": 0.0251, "step": 8690 }, { "grad_norm": 0.30973872542381287, "learning_rate": 9.905971924737088e-05, "loss": 0.022, "step": 8700 }, { "grad_norm": 0.2917967438697815, "learning_rate": 9.905572560073387e-05, "loss": 0.0243, "step": 8710 }, { "grad_norm": 0.2721392512321472, "learning_rate": 9.905172357181501e-05, "loss": 0.0226, "step": 8720 }, { "grad_norm": 0.22939813137054443, "learning_rate": 9.904771316129817e-05, "loss": 0.0262, "step": 8730 }, { "grad_norm": 0.20791757106781006, "learning_rate": 9.904369436986862e-05, "loss": 0.0247, "step": 8740 }, { "grad_norm": 0.2012081742286682, "learning_rate": 9.903966719821303e-05, "loss": 0.0284, "step": 8750 }, { "grad_norm": 0.24155165255069733, "learning_rate": 9.903563164701956e-05, "loss": 0.0252, "step": 8760 }, { "grad_norm": 0.18655768036842346, "learning_rate": 9.903158771697778e-05, "loss": 0.0215, "step": 8770 }, { "grad_norm": 0.24078214168548584, "learning_rate": 9.902753540877867e-05, "loss": 0.0229, "step": 8780 }, { "grad_norm": 0.1948605328798294, "learning_rate": 9.902347472311466e-05, "loss": 0.0203, "step": 8790 }, { "grad_norm": 0.2134648859500885, "learning_rate": 9.901940566067962e-05, "loss": 0.0254, "step": 8800 }, { "grad_norm": 0.2843400239944458, "learning_rate": 9.901532822216883e-05, "loss": 0.0263, "step": 8810 }, { "grad_norm": 0.3828902244567871, "learning_rate": 9.901124240827904e-05, "loss": 0.0277, "step": 8820 }, { "grad_norm": 0.2630263864994049, "learning_rate": 9.900714821970835e-05, "loss": 0.0295, "step": 8830 }, { "grad_norm": 0.2380320131778717, "learning_rate": 9.900304565715641e-05, "loss": 0.026, "step": 8840 }, { "grad_norm": 0.23953847587108612, "learning_rate": 9.899893472132419e-05, "loss": 0.029, "step": 8850 }, { "grad_norm": 0.3168174624443054, "learning_rate": 9.899481541291415e-05, "loss": 0.0276, "step": 8860 }, { "grad_norm": 0.2286669760942459, "learning_rate": 9.899068773263016e-05, "loss": 0.0257, "step": 8870 }, { "grad_norm": 0.2791914641857147, "learning_rate": 9.898655168117754e-05, "loss": 0.0245, "step": 8880 }, { "grad_norm": 0.28504452109336853, "learning_rate": 9.898240725926302e-05, "loss": 0.029, "step": 8890 }, { "grad_norm": 0.2952522933483124, "learning_rate": 9.897825446759478e-05, "loss": 0.0222, "step": 8900 }, { "grad_norm": 0.3068820834159851, "learning_rate": 9.897409330688241e-05, "loss": 0.0249, "step": 8910 }, { "grad_norm": 0.24132667481899261, "learning_rate": 9.896992377783692e-05, "loss": 0.0227, "step": 8920 }, { "grad_norm": 0.21416360139846802, "learning_rate": 9.89657458811708e-05, "loss": 0.0242, "step": 8930 }, { "grad_norm": 0.2565059959888458, "learning_rate": 9.896155961759792e-05, "loss": 0.0228, "step": 8940 }, { "grad_norm": 0.27194976806640625, "learning_rate": 9.895736498783361e-05, "loss": 0.0258, "step": 8950 }, { "grad_norm": 0.23485788702964783, "learning_rate": 9.895316199259462e-05, "loss": 0.0286, "step": 8960 }, { "grad_norm": 0.2453605830669403, "learning_rate": 9.894895063259909e-05, "loss": 0.0277, "step": 8970 }, { "grad_norm": 0.2509019672870636, "learning_rate": 9.894473090856667e-05, "loss": 0.0269, "step": 8980 }, { "grad_norm": 0.4110601246356964, "learning_rate": 9.894050282121839e-05, "loss": 0.0263, "step": 8990 }, { "grad_norm": 0.26751846075057983, "learning_rate": 9.893626637127668e-05, "loss": 0.0294, "step": 9000 }, { "grad_norm": 0.27505698800086975, "learning_rate": 9.893202155946546e-05, "loss": 0.028, "step": 9010 }, { "grad_norm": 0.2441924661397934, "learning_rate": 9.892776838651006e-05, "loss": 0.0225, "step": 9020 }, { "grad_norm": 0.2531778812408447, "learning_rate": 9.892350685313722e-05, "loss": 0.0276, "step": 9030 }, { "grad_norm": 0.3647709786891937, "learning_rate": 9.891923696007513e-05, "loss": 0.026, "step": 9040 }, { "grad_norm": 0.26135024428367615, "learning_rate": 9.891495870805336e-05, "loss": 0.0233, "step": 9050 }, { "grad_norm": 0.22269341349601746, "learning_rate": 9.891067209780298e-05, "loss": 0.0232, "step": 9060 }, { "grad_norm": 0.24539688229560852, "learning_rate": 9.890637713005646e-05, "loss": 0.0247, "step": 9070 }, { "grad_norm": 0.2114863246679306, "learning_rate": 9.890207380554767e-05, "loss": 0.0252, "step": 9080 }, { "grad_norm": 0.25321120023727417, "learning_rate": 9.889776212501196e-05, "loss": 0.0259, "step": 9090 }, { "grad_norm": 0.21332262456417084, "learning_rate": 9.889344208918605e-05, "loss": 0.0248, "step": 9100 }, { "grad_norm": 0.2555060088634491, "learning_rate": 9.888911369880812e-05, "loss": 0.0244, "step": 9110 }, { "grad_norm": 0.3184070885181427, "learning_rate": 9.888477695461777e-05, "loss": 0.0253, "step": 9120 }, { "grad_norm": 0.27704161405563354, "learning_rate": 9.888043185735607e-05, "loss": 0.0245, "step": 9130 }, { "grad_norm": 0.29724979400634766, "learning_rate": 9.887607840776542e-05, "loss": 0.0254, "step": 9140 }, { "grad_norm": 0.22308042645454407, "learning_rate": 9.887171660658975e-05, "loss": 0.0228, "step": 9150 }, { "grad_norm": 0.2914009094238281, "learning_rate": 9.886734645457435e-05, "loss": 0.0237, "step": 9160 }, { "grad_norm": 0.2736971974372864, "learning_rate": 9.886296795246597e-05, "loss": 0.0275, "step": 9170 }, { "grad_norm": 0.2739271819591522, "learning_rate": 9.885858110101276e-05, "loss": 0.0232, "step": 9180 }, { "grad_norm": 0.2284928858280182, "learning_rate": 9.885418590096434e-05, "loss": 0.0241, "step": 9190 }, { "grad_norm": 0.2966867685317993, "learning_rate": 9.88497823530717e-05, "loss": 0.0248, "step": 9200 }, { "grad_norm": 0.25770479440689087, "learning_rate": 9.884537045808732e-05, "loss": 0.0258, "step": 9210 }, { "grad_norm": 0.2945628762245178, "learning_rate": 9.884095021676502e-05, "loss": 0.026, "step": 9220 }, { "grad_norm": 0.19446446001529694, "learning_rate": 9.883652162986017e-05, "loss": 0.0241, "step": 9230 }, { "grad_norm": 0.23758628964424133, "learning_rate": 9.883208469812943e-05, "loss": 0.0258, "step": 9240 }, { "grad_norm": 0.2403724044561386, "learning_rate": 9.882763942233098e-05, "loss": 0.0242, "step": 9250 }, { "grad_norm": 0.22912289202213287, "learning_rate": 9.882318580322441e-05, "loss": 0.0218, "step": 9260 }, { "grad_norm": 0.25627270340919495, "learning_rate": 9.881872384157067e-05, "loss": 0.0236, "step": 9270 }, { "grad_norm": 0.3178584575653076, "learning_rate": 9.881425353813225e-05, "loss": 0.0262, "step": 9280 }, { "grad_norm": 0.22086754441261292, "learning_rate": 9.880977489367296e-05, "loss": 0.0236, "step": 9290 }, { "grad_norm": 0.3000679314136505, "learning_rate": 9.88052879089581e-05, "loss": 0.0234, "step": 9300 }, { "grad_norm": 0.22717317938804626, "learning_rate": 9.880079258475434e-05, "loss": 0.0213, "step": 9310 }, { "grad_norm": 0.18658824265003204, "learning_rate": 9.879628892182985e-05, "loss": 0.0202, "step": 9320 }, { "grad_norm": 0.26592186093330383, "learning_rate": 9.879177692095416e-05, "loss": 0.0221, "step": 9330 }, { "grad_norm": 0.29425618052482605, "learning_rate": 9.878725658289825e-05, "loss": 0.0213, "step": 9340 }, { "grad_norm": 0.2057674080133438, "learning_rate": 9.878272790843454e-05, "loss": 0.024, "step": 9350 }, { "grad_norm": 0.21369394659996033, "learning_rate": 9.877819089833682e-05, "loss": 0.0234, "step": 9360 }, { "grad_norm": 0.24316628277301788, "learning_rate": 9.877364555338038e-05, "loss": 0.0212, "step": 9370 }, { "grad_norm": 0.32323116064071655, "learning_rate": 9.876909187434186e-05, "loss": 0.0248, "step": 9380 }, { "grad_norm": 0.22062614560127258, "learning_rate": 9.876452986199939e-05, "loss": 0.0214, "step": 9390 }, { "grad_norm": 0.22642730176448822, "learning_rate": 9.875995951713248e-05, "loss": 0.0235, "step": 9400 }, { "grad_norm": 0.2840273082256317, "learning_rate": 9.875538084052207e-05, "loss": 0.0209, "step": 9410 }, { "grad_norm": 0.23741567134857178, "learning_rate": 9.875079383295053e-05, "loss": 0.0246, "step": 9420 }, { "grad_norm": 0.2962169647216797, "learning_rate": 9.874619849520167e-05, "loss": 0.0265, "step": 9430 }, { "grad_norm": 0.2778750956058502, "learning_rate": 9.874159482806069e-05, "loss": 0.0233, "step": 9440 }, { "grad_norm": 0.23937393724918365, "learning_rate": 9.873698283231426e-05, "loss": 0.0237, "step": 9450 }, { "grad_norm": 0.2522561252117157, "learning_rate": 9.87323625087504e-05, "loss": 0.0206, "step": 9460 }, { "grad_norm": 0.1955396831035614, "learning_rate": 9.872773385815863e-05, "loss": 0.0209, "step": 9470 }, { "grad_norm": 0.2896108031272888, "learning_rate": 9.872309688132986e-05, "loss": 0.024, "step": 9480 }, { "grad_norm": 0.2110123634338379, "learning_rate": 9.871845157905639e-05, "loss": 0.0232, "step": 9490 }, { "grad_norm": 0.23059354722499847, "learning_rate": 9.871379795213201e-05, "loss": 0.0235, "step": 9500 }, { "grad_norm": 0.2566559314727783, "learning_rate": 9.87091360013519e-05, "loss": 0.0246, "step": 9510 }, { "grad_norm": 0.25323665142059326, "learning_rate": 9.870446572751262e-05, "loss": 0.0212, "step": 9520 }, { "grad_norm": 0.22453325986862183, "learning_rate": 9.869978713141224e-05, "loss": 0.018, "step": 9530 }, { "grad_norm": 0.225039541721344, "learning_rate": 9.869510021385016e-05, "loss": 0.0194, "step": 9540 }, { "grad_norm": 0.19739633798599243, "learning_rate": 9.869040497562727e-05, "loss": 0.0267, "step": 9550 }, { "grad_norm": 0.23039773106575012, "learning_rate": 9.868570141754587e-05, "loss": 0.0237, "step": 9560 }, { "grad_norm": 0.24782751500606537, "learning_rate": 9.868098954040965e-05, "loss": 0.023, "step": 9570 }, { "grad_norm": 0.240155890583992, "learning_rate": 9.867626934502374e-05, "loss": 0.0248, "step": 9580 }, { "grad_norm": 0.26592662930488586, "learning_rate": 9.86715408321947e-05, "loss": 0.026, "step": 9590 }, { "grad_norm": 0.18668431043624878, "learning_rate": 9.86668040027305e-05, "loss": 0.0212, "step": 9600 }, { "grad_norm": 0.1968608796596527, "learning_rate": 9.866205885744053e-05, "loss": 0.0228, "step": 9610 }, { "grad_norm": 0.3024444282054901, "learning_rate": 9.865730539713563e-05, "loss": 0.019, "step": 9620 }, { "grad_norm": 0.25478360056877136, "learning_rate": 9.8652543622628e-05, "loss": 0.0217, "step": 9630 }, { "grad_norm": 0.2643311619758606, "learning_rate": 9.864777353473132e-05, "loss": 0.0256, "step": 9640 }, { "grad_norm": 0.21843715012073517, "learning_rate": 9.864299513426068e-05, "loss": 0.0245, "step": 9650 }, { "grad_norm": 0.2162047177553177, "learning_rate": 9.863820842203254e-05, "loss": 0.0247, "step": 9660 }, { "grad_norm": 0.24678054451942444, "learning_rate": 9.863341339886483e-05, "loss": 0.0195, "step": 9670 }, { "grad_norm": 0.22724701464176178, "learning_rate": 9.86286100655769e-05, "loss": 0.0199, "step": 9680 }, { "grad_norm": 0.23253992199897766, "learning_rate": 9.862379842298953e-05, "loss": 0.0243, "step": 9690 }, { "grad_norm": 0.25402265787124634, "learning_rate": 9.861897847192485e-05, "loss": 0.0227, "step": 9700 }, { "grad_norm": 0.2355540692806244, "learning_rate": 9.86141502132065e-05, "loss": 0.0223, "step": 9710 }, { "grad_norm": 0.2546367049217224, "learning_rate": 9.860931364765946e-05, "loss": 0.0225, "step": 9720 }, { "grad_norm": 0.2801545262336731, "learning_rate": 9.860446877611021e-05, "loss": 0.024, "step": 9730 }, { "grad_norm": 0.2448757290840149, "learning_rate": 9.859961559938655e-05, "loss": 0.0219, "step": 9740 }, { "grad_norm": 0.2960597574710846, "learning_rate": 9.85947541183178e-05, "loss": 0.0257, "step": 9750 }, { "grad_norm": 0.24744437634944916, "learning_rate": 9.858988433373463e-05, "loss": 0.0232, "step": 9760 }, { "grad_norm": 0.17202965915203094, "learning_rate": 9.858500624646918e-05, "loss": 0.0212, "step": 9770 }, { "grad_norm": 0.19850370287895203, "learning_rate": 9.858011985735497e-05, "loss": 0.0192, "step": 9780 }, { "grad_norm": 0.2943878769874573, "learning_rate": 9.857522516722693e-05, "loss": 0.0213, "step": 9790 }, { "grad_norm": 0.19792218506336212, "learning_rate": 9.857032217692145e-05, "loss": 0.0213, "step": 9800 }, { "grad_norm": 0.21205469965934753, "learning_rate": 9.856541088727631e-05, "loss": 0.0305, "step": 9810 }, { "grad_norm": 0.2294398695230484, "learning_rate": 9.856049129913072e-05, "loss": 0.0242, "step": 9820 }, { "grad_norm": 0.2401529848575592, "learning_rate": 9.85555634133253e-05, "loss": 0.024, "step": 9830 }, { "grad_norm": 0.18413066864013672, "learning_rate": 9.855062723070208e-05, "loss": 0.0237, "step": 9840 }, { "grad_norm": 0.19735632836818695, "learning_rate": 9.854568275210454e-05, "loss": 0.0247, "step": 9850 }, { "grad_norm": 0.30107298493385315, "learning_rate": 9.854072997837754e-05, "loss": 0.0227, "step": 9860 }, { "grad_norm": 0.17048239707946777, "learning_rate": 9.853576891036737e-05, "loss": 0.0204, "step": 9870 }, { "grad_norm": 0.2614099681377411, "learning_rate": 9.853079954892177e-05, "loss": 0.0221, "step": 9880 }, { "grad_norm": 0.2268400937318802, "learning_rate": 9.852582189488983e-05, "loss": 0.0219, "step": 9890 }, { "grad_norm": 0.2199339121580124, "learning_rate": 9.852083594912212e-05, "loss": 0.0224, "step": 9900 }, { "grad_norm": 0.21014918386936188, "learning_rate": 9.851584171247058e-05, "loss": 0.0226, "step": 9910 }, { "grad_norm": 0.23987914621829987, "learning_rate": 9.851083918578863e-05, "loss": 0.0215, "step": 9920 }, { "grad_norm": 0.21438872814178467, "learning_rate": 9.850582836993103e-05, "loss": 0.0198, "step": 9930 }, { "grad_norm": 0.2252170294523239, "learning_rate": 9.850080926575397e-05, "loss": 0.0192, "step": 9940 }, { "grad_norm": 0.21537628769874573, "learning_rate": 9.849578187411515e-05, "loss": 0.0266, "step": 9950 }, { "grad_norm": 0.3174359202384949, "learning_rate": 9.849074619587354e-05, "loss": 0.0234, "step": 9960 }, { "grad_norm": 0.18866656720638275, "learning_rate": 9.848570223188964e-05, "loss": 0.0193, "step": 9970 }, { "grad_norm": 0.22614778578281403, "learning_rate": 9.848064998302531e-05, "loss": 0.0184, "step": 9980 }, { "grad_norm": 0.19308888912200928, "learning_rate": 9.847558945014386e-05, "loss": 0.0207, "step": 9990 }, { "grad_norm": 0.26961997151374817, "learning_rate": 9.847052063410996e-05, "loss": 0.0202, "step": 10000 }, { "grad_norm": 0.20809614658355713, "learning_rate": 9.846544353578977e-05, "loss": 0.0175, "step": 10010 }, { "grad_norm": 0.24848991632461548, "learning_rate": 9.846035815605081e-05, "loss": 0.0215, "step": 10020 }, { "grad_norm": 0.2785988450050354, "learning_rate": 9.845526449576204e-05, "loss": 0.0226, "step": 10030 }, { "grad_norm": 0.2725575864315033, "learning_rate": 9.845016255579383e-05, "loss": 0.0196, "step": 10040 }, { "grad_norm": 0.2217407524585724, "learning_rate": 9.844505233701794e-05, "loss": 0.0204, "step": 10050 }, { "grad_norm": 0.23323039710521698, "learning_rate": 9.843993384030757e-05, "loss": 0.0208, "step": 10060 }, { "grad_norm": 0.3047633171081543, "learning_rate": 9.843480706653737e-05, "loss": 0.02, "step": 10070 }, { "grad_norm": 0.19537346065044403, "learning_rate": 9.84296720165833e-05, "loss": 0.0263, "step": 10080 }, { "grad_norm": 0.23998971283435822, "learning_rate": 9.842452869132286e-05, "loss": 0.0201, "step": 10090 }, { "grad_norm": 0.17061205208301544, "learning_rate": 9.841937709163489e-05, "loss": 0.0197, "step": 10100 }, { "grad_norm": 0.1917419582605362, "learning_rate": 9.841421721839962e-05, "loss": 0.0183, "step": 10110 }, { "grad_norm": 0.2613951861858368, "learning_rate": 9.840904907249879e-05, "loss": 0.0215, "step": 10120 }, { "grad_norm": 0.2044459879398346, "learning_rate": 9.840387265481545e-05, "loss": 0.0228, "step": 10130 }, { "grad_norm": 0.22983524203300476, "learning_rate": 9.839868796623411e-05, "loss": 0.0206, "step": 10140 }, { "grad_norm": 0.2267732471227646, "learning_rate": 9.839349500764072e-05, "loss": 0.0205, "step": 10150 }, { "grad_norm": 0.3075944483280182, "learning_rate": 9.83882937799226e-05, "loss": 0.0223, "step": 10160 }, { "grad_norm": 0.32401660084724426, "learning_rate": 9.838308428396849e-05, "loss": 0.0239, "step": 10170 }, { "grad_norm": 0.32308608293533325, "learning_rate": 9.837786652066854e-05, "loss": 0.0251, "step": 10180 }, { "grad_norm": 0.23594045639038086, "learning_rate": 9.837264049091437e-05, "loss": 0.0227, "step": 10190 }, { "grad_norm": 0.2785346508026123, "learning_rate": 9.836740619559893e-05, "loss": 0.0223, "step": 10200 }, { "grad_norm": 0.3432246744632721, "learning_rate": 9.836216363561659e-05, "loss": 0.0257, "step": 10210 }, { "grad_norm": 0.348998486995697, "learning_rate": 9.835691281186322e-05, "loss": 0.0285, "step": 10220 }, { "grad_norm": 0.21793405711650848, "learning_rate": 9.8351653725236e-05, "loss": 0.0221, "step": 10230 }, { "grad_norm": 0.1746242642402649, "learning_rate": 9.83463863766336e-05, "loss": 0.0203, "step": 10240 }, { "grad_norm": 0.26554203033447266, "learning_rate": 9.834111076695602e-05, "loss": 0.0198, "step": 10250 }, { "grad_norm": 0.2234218269586563, "learning_rate": 9.833582689710477e-05, "loss": 0.0268, "step": 10260 }, { "grad_norm": 0.2906879186630249, "learning_rate": 9.833053476798268e-05, "loss": 0.025, "step": 10270 }, { "grad_norm": 0.2566686272621155, "learning_rate": 9.832523438049404e-05, "loss": 0.0245, "step": 10280 }, { "grad_norm": 0.24494850635528564, "learning_rate": 9.831992573554454e-05, "loss": 0.0209, "step": 10290 }, { "grad_norm": 0.2032083123922348, "learning_rate": 9.831460883404128e-05, "loss": 0.0211, "step": 10300 }, { "grad_norm": 0.2822588086128235, "learning_rate": 9.830928367689278e-05, "loss": 0.0203, "step": 10310 }, { "grad_norm": 0.2268766313791275, "learning_rate": 9.830395026500896e-05, "loss": 0.0203, "step": 10320 }, { "grad_norm": 0.1969042718410492, "learning_rate": 9.829860859930115e-05, "loss": 0.022, "step": 10330 }, { "grad_norm": 0.2267187535762787, "learning_rate": 9.829325868068212e-05, "loss": 0.0208, "step": 10340 }, { "grad_norm": 0.25103700160980225, "learning_rate": 9.8287900510066e-05, "loss": 0.0266, "step": 10350 }, { "grad_norm": 0.2981354296207428, "learning_rate": 9.828253408836834e-05, "loss": 0.0243, "step": 10360 }, { "grad_norm": 0.20996086299419403, "learning_rate": 9.827715941650615e-05, "loss": 0.0203, "step": 10370 }, { "grad_norm": 0.27647629380226135, "learning_rate": 9.82717764953978e-05, "loss": 0.0199, "step": 10380 }, { "grad_norm": 0.27861830592155457, "learning_rate": 9.826638532596308e-05, "loss": 0.0213, "step": 10390 }, { "grad_norm": 0.18612372875213623, "learning_rate": 9.82609859091232e-05, "loss": 0.0212, "step": 10400 }, { "grad_norm": 0.1686278134584427, "learning_rate": 9.825557824580076e-05, "loss": 0.0213, "step": 10410 }, { "grad_norm": 0.21589244902133942, "learning_rate": 9.82501623369198e-05, "loss": 0.0193, "step": 10420 }, { "grad_norm": 0.2259666472673416, "learning_rate": 9.824473818340574e-05, "loss": 0.0224, "step": 10430 }, { "grad_norm": 0.14523650705814362, "learning_rate": 9.823930578618541e-05, "loss": 0.0205, "step": 10440 }, { "grad_norm": 0.19132061302661896, "learning_rate": 9.823386514618709e-05, "loss": 0.0182, "step": 10450 }, { "grad_norm": 0.15157058835029602, "learning_rate": 9.82284162643404e-05, "loss": 0.0194, "step": 10460 }, { "grad_norm": 0.22968091070652008, "learning_rate": 9.822295914157642e-05, "loss": 0.0199, "step": 10470 }, { "grad_norm": 0.21209630370140076, "learning_rate": 9.821749377882763e-05, "loss": 0.0198, "step": 10480 }, { "grad_norm": 0.23055493831634521, "learning_rate": 9.821202017702791e-05, "loss": 0.0207, "step": 10490 }, { "grad_norm": 0.29555830359458923, "learning_rate": 9.820653833711253e-05, "loss": 0.0205, "step": 10500 }, { "grad_norm": 0.2967710494995117, "learning_rate": 9.820104826001822e-05, "loss": 0.0214, "step": 10510 }, { "grad_norm": 0.22918333113193512, "learning_rate": 9.819554994668305e-05, "loss": 0.0236, "step": 10520 }, { "grad_norm": 0.26998060941696167, "learning_rate": 9.819004339804654e-05, "loss": 0.0218, "step": 10530 }, { "grad_norm": 0.22261658310890198, "learning_rate": 9.818452861504961e-05, "loss": 0.0222, "step": 10540 }, { "grad_norm": 0.25885242223739624, "learning_rate": 9.81790055986346e-05, "loss": 0.0231, "step": 10550 }, { "grad_norm": 0.19105304777622223, "learning_rate": 9.817347434974523e-05, "loss": 0.0188, "step": 10560 }, { "grad_norm": 0.2614167332649231, "learning_rate": 9.816793486932664e-05, "loss": 0.0222, "step": 10570 }, { "grad_norm": 0.18128004670143127, "learning_rate": 9.816238715832538e-05, "loss": 0.0181, "step": 10580 }, { "grad_norm": 0.2572338879108429, "learning_rate": 9.815683121768939e-05, "loss": 0.0233, "step": 10590 }, { "grad_norm": 0.25759807229042053, "learning_rate": 9.815126704836804e-05, "loss": 0.025, "step": 10600 }, { "grad_norm": 0.16561737656593323, "learning_rate": 9.81456946513121e-05, "loss": 0.0244, "step": 10610 }, { "grad_norm": 0.24474148452281952, "learning_rate": 9.814011402747373e-05, "loss": 0.0256, "step": 10620 }, { "grad_norm": 0.2208680957555771, "learning_rate": 9.813452517780651e-05, "loss": 0.021, "step": 10630 }, { "grad_norm": 0.3400189280509949, "learning_rate": 9.81289281032654e-05, "loss": 0.0235, "step": 10640 }, { "grad_norm": 0.25788336992263794, "learning_rate": 9.812332280480683e-05, "loss": 0.0208, "step": 10650 }, { "grad_norm": 0.35556453466415405, "learning_rate": 9.811770928338854e-05, "loss": 0.0206, "step": 10660 }, { "grad_norm": 0.18611778318881989, "learning_rate": 9.811208753996979e-05, "loss": 0.0191, "step": 10670 }, { "grad_norm": 0.24126601219177246, "learning_rate": 9.810645757551113e-05, "loss": 0.0175, "step": 10680 }, { "grad_norm": 0.2546038329601288, "learning_rate": 9.810081939097459e-05, "loss": 0.0193, "step": 10690 }, { "grad_norm": 0.14464770257472992, "learning_rate": 9.809517298732356e-05, "loss": 0.0222, "step": 10700 }, { "grad_norm": 0.21817757189273834, "learning_rate": 9.80895183655229e-05, "loss": 0.02, "step": 10710 }, { "grad_norm": 0.23463736474514008, "learning_rate": 9.808385552653877e-05, "loss": 0.0246, "step": 10720 }, { "grad_norm": 0.24115689098834991, "learning_rate": 9.807818447133886e-05, "loss": 0.0234, "step": 10730 }, { "grad_norm": 0.28977805376052856, "learning_rate": 9.807250520089215e-05, "loss": 0.0212, "step": 10740 }, { "grad_norm": 0.21122904121875763, "learning_rate": 9.806681771616908e-05, "loss": 0.0193, "step": 10750 }, { "grad_norm": 0.2103225439786911, "learning_rate": 9.80611220181415e-05, "loss": 0.0219, "step": 10760 }, { "grad_norm": 0.19506509602069855, "learning_rate": 9.805541810778264e-05, "loss": 0.0217, "step": 10770 }, { "grad_norm": 0.24711866676807404, "learning_rate": 9.804970598606716e-05, "loss": 0.0197, "step": 10780 }, { "grad_norm": 0.2931860685348511, "learning_rate": 9.804398565397106e-05, "loss": 0.0254, "step": 10790 }, { "grad_norm": 0.27549341320991516, "learning_rate": 9.803825711247183e-05, "loss": 0.0222, "step": 10800 }, { "grad_norm": 0.2801666259765625, "learning_rate": 9.803252036254831e-05, "loss": 0.0189, "step": 10810 }, { "grad_norm": 0.2342822253704071, "learning_rate": 9.802677540518076e-05, "loss": 0.0231, "step": 10820 }, { "grad_norm": 0.18011580407619476, "learning_rate": 9.802102224135081e-05, "loss": 0.0213, "step": 10830 }, { "grad_norm": 0.2394137680530548, "learning_rate": 9.801526087204155e-05, "loss": 0.0215, "step": 10840 }, { "grad_norm": 0.26677748560905457, "learning_rate": 9.800949129823743e-05, "loss": 0.0176, "step": 10850 }, { "grad_norm": 0.2732047140598297, "learning_rate": 9.80037135209243e-05, "loss": 0.0214, "step": 10860 }, { "grad_norm": 0.20070677995681763, "learning_rate": 9.799792754108946e-05, "loss": 0.0202, "step": 10870 }, { "grad_norm": 0.2546859681606293, "learning_rate": 9.799213335972152e-05, "loss": 0.0203, "step": 10880 }, { "grad_norm": 0.15687482059001923, "learning_rate": 9.798633097781058e-05, "loss": 0.0231, "step": 10890 }, { "grad_norm": 0.25428488850593567, "learning_rate": 9.79805203963481e-05, "loss": 0.0176, "step": 10900 }, { "grad_norm": 0.2802523374557495, "learning_rate": 9.797470161632697e-05, "loss": 0.0224, "step": 10910 }, { "grad_norm": 0.2254740297794342, "learning_rate": 9.796887463874145e-05, "loss": 0.0216, "step": 10920 }, { "grad_norm": 0.24676060676574707, "learning_rate": 9.796303946458718e-05, "loss": 0.0213, "step": 10930 }, { "grad_norm": 0.1868765950202942, "learning_rate": 9.795719609486127e-05, "loss": 0.021, "step": 10940 }, { "grad_norm": 0.183708056807518, "learning_rate": 9.795134453056219e-05, "loss": 0.0175, "step": 10950 }, { "grad_norm": 0.21780797839164734, "learning_rate": 9.794548477268979e-05, "loss": 0.0181, "step": 10960 }, { "grad_norm": 0.34662503004074097, "learning_rate": 9.793961682224537e-05, "loss": 0.0167, "step": 10970 }, { "grad_norm": 0.19642207026481628, "learning_rate": 9.793374068023156e-05, "loss": 0.0211, "step": 10980 }, { "grad_norm": 0.21870143711566925, "learning_rate": 9.792785634765247e-05, "loss": 0.0224, "step": 10990 }, { "grad_norm": 0.19128194451332092, "learning_rate": 9.792196382551357e-05, "loss": 0.0203, "step": 11000 }, { "grad_norm": 0.23246031999588013, "learning_rate": 9.791606311482171e-05, "loss": 0.0191, "step": 11010 }, { "grad_norm": 0.21310578286647797, "learning_rate": 9.791015421658518e-05, "loss": 0.02, "step": 11020 }, { "grad_norm": 0.19358836114406586, "learning_rate": 9.790423713181362e-05, "loss": 0.0207, "step": 11030 }, { "grad_norm": 0.2539893388748169, "learning_rate": 9.789831186151814e-05, "loss": 0.021, "step": 11040 }, { "grad_norm": 0.27905043959617615, "learning_rate": 9.789237840671118e-05, "loss": 0.0213, "step": 11050 }, { "grad_norm": 0.24071796238422394, "learning_rate": 9.78864367684066e-05, "loss": 0.0213, "step": 11060 }, { "grad_norm": 0.2661580443382263, "learning_rate": 9.788048694761968e-05, "loss": 0.0208, "step": 11070 }, { "grad_norm": 0.26514729857444763, "learning_rate": 9.787452894536709e-05, "loss": 0.021, "step": 11080 }, { "grad_norm": 0.23766183853149414, "learning_rate": 9.786856276266685e-05, "loss": 0.02, "step": 11090 }, { "grad_norm": 0.257985383272171, "learning_rate": 9.786258840053845e-05, "loss": 0.0209, "step": 11100 }, { "grad_norm": 0.2039182037115097, "learning_rate": 9.785660586000273e-05, "loss": 0.0205, "step": 11110 }, { "grad_norm": 0.2639724612236023, "learning_rate": 9.785061514208196e-05, "loss": 0.0199, "step": 11120 }, { "grad_norm": 0.18440720438957214, "learning_rate": 9.784461624779977e-05, "loss": 0.0192, "step": 11130 }, { "grad_norm": 0.17821702361106873, "learning_rate": 9.783860917818123e-05, "loss": 0.0192, "step": 11140 }, { "grad_norm": 0.1801283210515976, "learning_rate": 9.783259393425277e-05, "loss": 0.0215, "step": 11150 }, { "grad_norm": 0.1837438941001892, "learning_rate": 9.782657051704221e-05, "loss": 0.0165, "step": 11160 }, { "grad_norm": 0.27288568019866943, "learning_rate": 9.782053892757883e-05, "loss": 0.0187, "step": 11170 }, { "grad_norm": 0.2442503422498703, "learning_rate": 9.781449916689324e-05, "loss": 0.0207, "step": 11180 }, { "grad_norm": 0.18398027122020721, "learning_rate": 9.780845123601746e-05, "loss": 0.0191, "step": 11190 }, { "grad_norm": 0.16121412813663483, "learning_rate": 9.780239513598492e-05, "loss": 0.0192, "step": 11200 }, { "grad_norm": 0.22384223341941833, "learning_rate": 9.779633086783047e-05, "loss": 0.0244, "step": 11210 }, { "grad_norm": 0.23557329177856445, "learning_rate": 9.779025843259031e-05, "loss": 0.0194, "step": 11220 }, { "grad_norm": 0.22464483976364136, "learning_rate": 9.778417783130204e-05, "loss": 0.0196, "step": 11230 }, { "grad_norm": 0.17542876303195953, "learning_rate": 9.777808906500468e-05, "loss": 0.0186, "step": 11240 }, { "grad_norm": 0.23094409704208374, "learning_rate": 9.777199213473862e-05, "loss": 0.0218, "step": 11250 }, { "grad_norm": 0.21684619784355164, "learning_rate": 9.77658870415457e-05, "loss": 0.023, "step": 11260 }, { "grad_norm": 0.29711437225341797, "learning_rate": 9.775977378646906e-05, "loss": 0.0257, "step": 11270 }, { "grad_norm": 0.2102448046207428, "learning_rate": 9.775365237055331e-05, "loss": 0.0199, "step": 11280 }, { "grad_norm": 0.25835880637168884, "learning_rate": 9.774752279484445e-05, "loss": 0.0198, "step": 11290 }, { "grad_norm": 0.20612594485282898, "learning_rate": 9.774138506038984e-05, "loss": 0.019, "step": 11300 }, { "grad_norm": 0.19647875428199768, "learning_rate": 9.773523916823826e-05, "loss": 0.0235, "step": 11310 }, { "grad_norm": 0.20673628151416779, "learning_rate": 9.772908511943986e-05, "loss": 0.0177, "step": 11320 }, { "grad_norm": 0.2060486227273941, "learning_rate": 9.77229229150462e-05, "loss": 0.0191, "step": 11330 }, { "grad_norm": 0.25069746375083923, "learning_rate": 9.771675255611024e-05, "loss": 0.0213, "step": 11340 }, { "grad_norm": 0.2518250346183777, "learning_rate": 9.771057404368632e-05, "loss": 0.022, "step": 11350 }, { "grad_norm": 0.2093028426170349, "learning_rate": 9.770438737883018e-05, "loss": 0.0189, "step": 11360 }, { "grad_norm": 0.19704517722129822, "learning_rate": 9.769819256259898e-05, "loss": 0.0197, "step": 11370 }, { "grad_norm": 0.2817666232585907, "learning_rate": 9.769198959605119e-05, "loss": 0.0219, "step": 11380 }, { "grad_norm": 0.21544700860977173, "learning_rate": 9.768577848024678e-05, "loss": 0.0216, "step": 11390 }, { "grad_norm": 0.2166435867547989, "learning_rate": 9.767955921624702e-05, "loss": 0.019, "step": 11400 }, { "grad_norm": 0.2450254112482071, "learning_rate": 9.767333180511465e-05, "loss": 0.0206, "step": 11410 }, { "grad_norm": 0.25445055961608887, "learning_rate": 9.766709624791373e-05, "loss": 0.0243, "step": 11420 }, { "grad_norm": 0.2795272171497345, "learning_rate": 9.766085254570975e-05, "loss": 0.0228, "step": 11430 }, { "grad_norm": 0.26265770196914673, "learning_rate": 9.76546006995696e-05, "loss": 0.0202, "step": 11440 }, { "grad_norm": 0.2101667821407318, "learning_rate": 9.764834071056155e-05, "loss": 0.0185, "step": 11450 }, { "grad_norm": 0.2032785415649414, "learning_rate": 9.764207257975526e-05, "loss": 0.0191, "step": 11460 }, { "grad_norm": 0.20200437307357788, "learning_rate": 9.763579630822179e-05, "loss": 0.0205, "step": 11470 }, { "grad_norm": 0.19481270015239716, "learning_rate": 9.762951189703356e-05, "loss": 0.0204, "step": 11480 }, { "grad_norm": 0.2546454071998596, "learning_rate": 9.762321934726442e-05, "loss": 0.02, "step": 11490 }, { "grad_norm": 0.25541990995407104, "learning_rate": 9.761691865998959e-05, "loss": 0.0231, "step": 11500 }, { "grad_norm": 0.23918718099594116, "learning_rate": 9.76106098362857e-05, "loss": 0.0209, "step": 11510 }, { "grad_norm": 0.18298447132110596, "learning_rate": 9.760429287723072e-05, "loss": 0.0225, "step": 11520 }, { "grad_norm": 0.2602711319923401, "learning_rate": 9.759796778390406e-05, "loss": 0.0213, "step": 11530 }, { "grad_norm": 0.18487848341464996, "learning_rate": 9.759163455738653e-05, "loss": 0.0215, "step": 11540 }, { "grad_norm": 0.20212407410144806, "learning_rate": 9.75852931987603e-05, "loss": 0.0174, "step": 11550 }, { "grad_norm": 0.2021971046924591, "learning_rate": 9.757894370910891e-05, "loss": 0.0223, "step": 11560 }, { "grad_norm": 0.2091476321220398, "learning_rate": 9.757258608951733e-05, "loss": 0.0223, "step": 11570 }, { "grad_norm": 0.20180189609527588, "learning_rate": 9.75662203410719e-05, "loss": 0.0175, "step": 11580 }, { "grad_norm": 0.20404857397079468, "learning_rate": 9.755984646486034e-05, "loss": 0.0173, "step": 11590 }, { "grad_norm": 0.22063903510570526, "learning_rate": 9.75534644619718e-05, "loss": 0.0165, "step": 11600 }, { "grad_norm": 0.20388542115688324, "learning_rate": 9.754707433349676e-05, "loss": 0.0209, "step": 11610 }, { "grad_norm": 0.21842126548290253, "learning_rate": 9.754067608052715e-05, "loss": 0.0228, "step": 11620 }, { "grad_norm": 0.22050881385803223, "learning_rate": 9.753426970415622e-05, "loss": 0.0184, "step": 11630 }, { "grad_norm": 0.21304497122764587, "learning_rate": 9.752785520547868e-05, "loss": 0.0184, "step": 11640 }, { "grad_norm": 0.24169524013996124, "learning_rate": 9.752143258559056e-05, "loss": 0.0197, "step": 11650 }, { "grad_norm": 0.20981931686401367, "learning_rate": 9.751500184558933e-05, "loss": 0.0195, "step": 11660 }, { "grad_norm": 0.2430279403924942, "learning_rate": 9.750856298657383e-05, "loss": 0.0167, "step": 11670 }, { "grad_norm": 0.1812673956155777, "learning_rate": 9.750211600964428e-05, "loss": 0.0169, "step": 11680 }, { "grad_norm": 0.24989870190620422, "learning_rate": 9.749566091590226e-05, "loss": 0.0226, "step": 11690 }, { "grad_norm": 0.23981137573719025, "learning_rate": 9.748919770645083e-05, "loss": 0.0207, "step": 11700 }, { "grad_norm": 0.24656595289707184, "learning_rate": 9.748272638239432e-05, "loss": 0.019, "step": 11710 }, { "grad_norm": 0.3150559365749359, "learning_rate": 9.747624694483855e-05, "loss": 0.0219, "step": 11720 }, { "grad_norm": 0.1882297247648239, "learning_rate": 9.746975939489065e-05, "loss": 0.0161, "step": 11730 }, { "grad_norm": 0.20836667716503143, "learning_rate": 9.746326373365918e-05, "loss": 0.0168, "step": 11740 }, { "grad_norm": 0.20403710007667542, "learning_rate": 9.745675996225403e-05, "loss": 0.0182, "step": 11750 }, { "grad_norm": 0.22024458646774292, "learning_rate": 9.745024808178657e-05, "loss": 0.0186, "step": 11760 }, { "grad_norm": 0.22555091977119446, "learning_rate": 9.744372809336947e-05, "loss": 0.0184, "step": 11770 }, { "grad_norm": 0.2297881692647934, "learning_rate": 9.743719999811682e-05, "loss": 0.0224, "step": 11780 }, { "grad_norm": 0.16630977392196655, "learning_rate": 9.743066379714412e-05, "loss": 0.0182, "step": 11790 }, { "grad_norm": 0.23821724951267242, "learning_rate": 9.74241194915682e-05, "loss": 0.0237, "step": 11800 }, { "grad_norm": 0.14319004118442535, "learning_rate": 9.741756708250731e-05, "loss": 0.0178, "step": 11810 }, { "grad_norm": 0.24049215018749237, "learning_rate": 9.741100657108109e-05, "loss": 0.0206, "step": 11820 }, { "grad_norm": 0.24477066099643707, "learning_rate": 9.740443795841054e-05, "loss": 0.0214, "step": 11830 }, { "grad_norm": 0.18658742308616638, "learning_rate": 9.739786124561805e-05, "loss": 0.0189, "step": 11840 }, { "grad_norm": 0.18931864202022552, "learning_rate": 9.73912764338274e-05, "loss": 0.0208, "step": 11850 }, { "grad_norm": 0.2403751015663147, "learning_rate": 9.738468352416377e-05, "loss": 0.0207, "step": 11860 }, { "grad_norm": 0.25772595405578613, "learning_rate": 9.737808251775369e-05, "loss": 0.0222, "step": 11870 }, { "grad_norm": 0.17878897488117218, "learning_rate": 9.737147341572512e-05, "loss": 0.0177, "step": 11880 }, { "grad_norm": 0.2152876853942871, "learning_rate": 9.736485621920735e-05, "loss": 0.0209, "step": 11890 }, { "grad_norm": 0.2875625491142273, "learning_rate": 9.735823092933108e-05, "loss": 0.0266, "step": 11900 }, { "grad_norm": 0.25403469800949097, "learning_rate": 9.735159754722838e-05, "loss": 0.0184, "step": 11910 }, { "grad_norm": 0.22179651260375977, "learning_rate": 9.734495607403275e-05, "loss": 0.0203, "step": 11920 }, { "grad_norm": 0.2035081386566162, "learning_rate": 9.733830651087901e-05, "loss": 0.0204, "step": 11930 }, { "grad_norm": 0.1774919182062149, "learning_rate": 9.733164885890338e-05, "loss": 0.016, "step": 11940 }, { "grad_norm": 0.2574309706687927, "learning_rate": 9.732498311924349e-05, "loss": 0.0228, "step": 11950 }, { "grad_norm": 0.15009112656116486, "learning_rate": 9.731830929303833e-05, "loss": 0.0216, "step": 11960 }, { "grad_norm": 0.20553629100322723, "learning_rate": 9.731162738142827e-05, "loss": 0.0209, "step": 11970 }, { "grad_norm": 0.21595603227615356, "learning_rate": 9.730493738555506e-05, "loss": 0.0227, "step": 11980 }, { "grad_norm": 0.2816123366355896, "learning_rate": 9.729823930656186e-05, "loss": 0.0218, "step": 11990 }, { "grad_norm": 0.2088700532913208, "learning_rate": 9.729153314559316e-05, "loss": 0.0215, "step": 12000 }, { "grad_norm": 0.19722174108028412, "learning_rate": 9.728481890379486e-05, "loss": 0.023, "step": 12010 }, { "grad_norm": 0.203772634267807, "learning_rate": 9.727809658231428e-05, "loss": 0.0202, "step": 12020 }, { "grad_norm": 0.3015415370464325, "learning_rate": 9.727136618230003e-05, "loss": 0.0212, "step": 12030 }, { "grad_norm": 0.20020794868469238, "learning_rate": 9.726462770490219e-05, "loss": 0.0216, "step": 12040 }, { "grad_norm": 0.17625494301319122, "learning_rate": 9.725788115127214e-05, "loss": 0.0169, "step": 12050 }, { "grad_norm": 0.21973876655101776, "learning_rate": 9.725112652256274e-05, "loss": 0.0185, "step": 12060 }, { "grad_norm": 0.1971815824508667, "learning_rate": 9.724436381992812e-05, "loss": 0.0195, "step": 12070 }, { "grad_norm": 0.2673432528972626, "learning_rate": 9.723759304452387e-05, "loss": 0.0175, "step": 12080 }, { "grad_norm": 0.2231379747390747, "learning_rate": 9.72308141975069e-05, "loss": 0.0171, "step": 12090 }, { "grad_norm": 0.1909802109003067, "learning_rate": 9.722402728003557e-05, "loss": 0.0209, "step": 12100 }, { "grad_norm": 0.2360478788614273, "learning_rate": 9.721723229326953e-05, "loss": 0.0195, "step": 12110 }, { "grad_norm": 0.23512516915798187, "learning_rate": 9.721042923836992e-05, "loss": 0.0212, "step": 12120 }, { "grad_norm": 0.1672394573688507, "learning_rate": 9.720361811649914e-05, "loss": 0.0184, "step": 12130 }, { "grad_norm": 0.18706119060516357, "learning_rate": 9.719679892882106e-05, "loss": 0.0156, "step": 12140 }, { "grad_norm": 0.1614978015422821, "learning_rate": 9.718997167650085e-05, "loss": 0.0189, "step": 12150 }, { "grad_norm": 0.19572335481643677, "learning_rate": 9.718313636070515e-05, "loss": 0.0188, "step": 12160 }, { "grad_norm": 0.28832268714904785, "learning_rate": 9.717629298260192e-05, "loss": 0.0204, "step": 12170 }, { "grad_norm": 0.1509818732738495, "learning_rate": 9.716944154336047e-05, "loss": 0.0169, "step": 12180 }, { "grad_norm": 0.22695580124855042, "learning_rate": 9.716258204415157e-05, "loss": 0.0177, "step": 12190 }, { "grad_norm": 0.2467280477285385, "learning_rate": 9.715571448614728e-05, "loss": 0.0218, "step": 12200 }, { "grad_norm": 0.20595453679561615, "learning_rate": 9.71488388705211e-05, "loss": 0.0174, "step": 12210 }, { "grad_norm": 0.23047621548175812, "learning_rate": 9.714195519844788e-05, "loss": 0.0183, "step": 12220 }, { "grad_norm": 0.27103984355926514, "learning_rate": 9.713506347110386e-05, "loss": 0.018, "step": 12230 }, { "grad_norm": 0.2478049248456955, "learning_rate": 9.712816368966662e-05, "loss": 0.0184, "step": 12240 }, { "grad_norm": 0.2189720720052719, "learning_rate": 9.712125585531517e-05, "loss": 0.0209, "step": 12250 }, { "grad_norm": 0.21080002188682556, "learning_rate": 9.711433996922988e-05, "loss": 0.0187, "step": 12260 }, { "grad_norm": 0.17510753870010376, "learning_rate": 9.710741603259245e-05, "loss": 0.0221, "step": 12270 }, { "grad_norm": 0.22005091607570648, "learning_rate": 9.710048404658603e-05, "loss": 0.021, "step": 12280 }, { "grad_norm": 0.23665376007556915, "learning_rate": 9.709354401239508e-05, "loss": 0.0196, "step": 12290 }, { "grad_norm": 0.1756831705570221, "learning_rate": 9.708659593120546e-05, "loss": 0.0194, "step": 12300 }, { "grad_norm": 0.2663521468639374, "learning_rate": 9.707963980420443e-05, "loss": 0.0216, "step": 12310 }, { "grad_norm": 0.26580309867858887, "learning_rate": 9.707267563258058e-05, "loss": 0.0187, "step": 12320 }, { "grad_norm": 0.22119109332561493, "learning_rate": 9.70657034175239e-05, "loss": 0.0206, "step": 12330 }, { "grad_norm": 0.2807902693748474, "learning_rate": 9.705872316022577e-05, "loss": 0.0232, "step": 12340 }, { "grad_norm": 0.24233554303646088, "learning_rate": 9.705173486187891e-05, "loss": 0.0196, "step": 12350 }, { "grad_norm": 0.23200400173664093, "learning_rate": 9.704473852367741e-05, "loss": 0.0263, "step": 12360 }, { "grad_norm": 0.2284305840730667, "learning_rate": 9.70377341468168e-05, "loss": 0.0195, "step": 12370 }, { "grad_norm": 0.23854856193065643, "learning_rate": 9.703072173249389e-05, "loss": 0.0204, "step": 12380 }, { "grad_norm": 0.21498742699623108, "learning_rate": 9.702370128190693e-05, "loss": 0.018, "step": 12390 }, { "grad_norm": 0.15965694189071655, "learning_rate": 9.701667279625552e-05, "loss": 0.0207, "step": 12400 }, { "grad_norm": 0.17208264768123627, "learning_rate": 9.700963627674065e-05, "loss": 0.0176, "step": 12410 }, { "grad_norm": 0.2724571228027344, "learning_rate": 9.700259172456466e-05, "loss": 0.0167, "step": 12420 }, { "grad_norm": 0.23109565675258636, "learning_rate": 9.699553914093124e-05, "loss": 0.0162, "step": 12430 }, { "grad_norm": 0.1808565855026245, "learning_rate": 9.698847852704553e-05, "loss": 0.0176, "step": 12440 }, { "grad_norm": 0.20511792600154877, "learning_rate": 9.6981409884114e-05, "loss": 0.0256, "step": 12450 }, { "grad_norm": 0.20235399901866913, "learning_rate": 9.697433321334443e-05, "loss": 0.0193, "step": 12460 }, { "grad_norm": 0.18111026287078857, "learning_rate": 9.696724851594607e-05, "loss": 0.0217, "step": 12470 }, { "grad_norm": 0.2200455516576767, "learning_rate": 9.696015579312952e-05, "loss": 0.0169, "step": 12480 }, { "grad_norm": 0.2099611461162567, "learning_rate": 9.695305504610668e-05, "loss": 0.0185, "step": 12490 }, { "grad_norm": 0.1961633861064911, "learning_rate": 9.694594627609092e-05, "loss": 0.0172, "step": 12500 }, { "grad_norm": 0.17398954927921295, "learning_rate": 9.693882948429691e-05, "loss": 0.0167, "step": 12510 }, { "grad_norm": 0.23561957478523254, "learning_rate": 9.693170467194071e-05, "loss": 0.0185, "step": 12520 }, { "grad_norm": 0.2791477143764496, "learning_rate": 9.692457184023977e-05, "loss": 0.0207, "step": 12530 }, { "grad_norm": 0.24666430056095123, "learning_rate": 9.691743099041291e-05, "loss": 0.0199, "step": 12540 }, { "grad_norm": 0.27030953764915466, "learning_rate": 9.691028212368027e-05, "loss": 0.0192, "step": 12550 }, { "grad_norm": 0.22009839117527008, "learning_rate": 9.690312524126342e-05, "loss": 0.0175, "step": 12560 }, { "grad_norm": 0.15474101901054382, "learning_rate": 9.689596034438527e-05, "loss": 0.0183, "step": 12570 }, { "grad_norm": 0.18434633314609528, "learning_rate": 9.688878743427012e-05, "loss": 0.0158, "step": 12580 }, { "grad_norm": 0.19694460928440094, "learning_rate": 9.688160651214359e-05, "loss": 0.0179, "step": 12590 }, { "grad_norm": 0.24440604448318481, "learning_rate": 9.687441757923273e-05, "loss": 0.0183, "step": 12600 }, { "grad_norm": 0.19175370037555695, "learning_rate": 9.68672206367659e-05, "loss": 0.0172, "step": 12610 }, { "grad_norm": 0.17769581079483032, "learning_rate": 9.686001568597291e-05, "loss": 0.0196, "step": 12620 }, { "grad_norm": 0.2525118291378021, "learning_rate": 9.685280272808486e-05, "loss": 0.0191, "step": 12630 }, { "grad_norm": 0.27539873123168945, "learning_rate": 9.684558176433424e-05, "loss": 0.0193, "step": 12640 }, { "grad_norm": 0.21469242870807648, "learning_rate": 9.683835279595495e-05, "loss": 0.0185, "step": 12650 }, { "grad_norm": 0.202436164021492, "learning_rate": 9.683111582418216e-05, "loss": 0.016, "step": 12660 }, { "grad_norm": 0.26156264543533325, "learning_rate": 9.682387085025254e-05, "loss": 0.0159, "step": 12670 }, { "grad_norm": 0.20535293221473694, "learning_rate": 9.681661787540401e-05, "loss": 0.019, "step": 12680 }, { "grad_norm": 0.22410210967063904, "learning_rate": 9.680935690087593e-05, "loss": 0.0189, "step": 12690 }, { "grad_norm": 0.20481887459754944, "learning_rate": 9.680208792790901e-05, "loss": 0.0165, "step": 12700 }, { "grad_norm": 0.21530932188034058, "learning_rate": 9.679481095774529e-05, "loss": 0.0155, "step": 12710 }, { "grad_norm": 0.2196047306060791, "learning_rate": 9.678752599162822e-05, "loss": 0.0176, "step": 12720 }, { "grad_norm": 0.223812997341156, "learning_rate": 9.678023303080259e-05, "loss": 0.0171, "step": 12730 }, { "grad_norm": 0.18366560339927673, "learning_rate": 9.677293207651459e-05, "loss": 0.0163, "step": 12740 }, { "grad_norm": 0.19625087082386017, "learning_rate": 9.676562313001173e-05, "loss": 0.0168, "step": 12750 }, { "grad_norm": 0.16528300940990448, "learning_rate": 9.675830619254293e-05, "loss": 0.0178, "step": 12760 }, { "grad_norm": 0.24494041502475739, "learning_rate": 9.675098126535843e-05, "loss": 0.0184, "step": 12770 }, { "grad_norm": 0.2103242725133896, "learning_rate": 9.674364834970988e-05, "loss": 0.0211, "step": 12780 }, { "grad_norm": 0.24709148705005646, "learning_rate": 9.673630744685028e-05, "loss": 0.0165, "step": 12790 }, { "grad_norm": 0.2066642791032791, "learning_rate": 9.672895855803397e-05, "loss": 0.0197, "step": 12800 }, { "grad_norm": 0.2721308171749115, "learning_rate": 9.672160168451667e-05, "loss": 0.0158, "step": 12810 }, { "grad_norm": 0.2875535190105438, "learning_rate": 9.671423682755549e-05, "loss": 0.0198, "step": 12820 }, { "grad_norm": 0.17737503349781036, "learning_rate": 9.670686398840888e-05, "loss": 0.0185, "step": 12830 }, { "grad_norm": 0.2120293527841568, "learning_rate": 9.669948316833664e-05, "loss": 0.0184, "step": 12840 }, { "grad_norm": 0.2515837550163269, "learning_rate": 9.669209436859997e-05, "loss": 0.018, "step": 12850 }, { "grad_norm": 0.19430780410766602, "learning_rate": 9.66846975904614e-05, "loss": 0.0196, "step": 12860 }, { "grad_norm": 0.2372366338968277, "learning_rate": 9.667729283518483e-05, "loss": 0.017, "step": 12870 }, { "grad_norm": 0.18086029589176178, "learning_rate": 9.666988010403557e-05, "loss": 0.0162, "step": 12880 }, { "grad_norm": 0.19561684131622314, "learning_rate": 9.66624593982802e-05, "loss": 0.0195, "step": 12890 }, { "grad_norm": 0.2118196338415146, "learning_rate": 9.665503071918675e-05, "loss": 0.0182, "step": 12900 }, { "grad_norm": 0.293753981590271, "learning_rate": 9.664759406802456e-05, "loss": 0.0172, "step": 12910 }, { "grad_norm": 0.1786370575428009, "learning_rate": 9.664014944606437e-05, "loss": 0.0181, "step": 12920 }, { "grad_norm": 0.2419387847185135, "learning_rate": 9.663269685457822e-05, "loss": 0.0192, "step": 12930 }, { "grad_norm": 0.2485974282026291, "learning_rate": 9.662523629483962e-05, "loss": 0.0194, "step": 12940 }, { "grad_norm": 0.25198012590408325, "learning_rate": 9.661776776812333e-05, "loss": 0.0176, "step": 12950 }, { "grad_norm": 0.224061980843544, "learning_rate": 9.661029127570553e-05, "loss": 0.0195, "step": 12960 }, { "grad_norm": 0.2205585390329361, "learning_rate": 9.660280681886373e-05, "loss": 0.0182, "step": 12970 }, { "grad_norm": 0.18702007830142975, "learning_rate": 9.659531439887685e-05, "loss": 0.0187, "step": 12980 }, { "grad_norm": 0.21981929242610931, "learning_rate": 9.658781401702511e-05, "loss": 0.0255, "step": 12990 }, { "grad_norm": 0.2063789963722229, "learning_rate": 9.658030567459015e-05, "loss": 0.0174, "step": 13000 }, { "grad_norm": 0.1635896861553192, "learning_rate": 9.65727893728549e-05, "loss": 0.0187, "step": 13010 }, { "grad_norm": 0.30426937341690063, "learning_rate": 9.656526511310375e-05, "loss": 0.0185, "step": 13020 }, { "grad_norm": 0.19814233481884003, "learning_rate": 9.655773289662233e-05, "loss": 0.017, "step": 13030 }, { "grad_norm": 0.23212900757789612, "learning_rate": 9.655019272469772e-05, "loss": 0.0226, "step": 13040 }, { "grad_norm": 0.2035931795835495, "learning_rate": 9.654264459861832e-05, "loss": 0.02, "step": 13050 }, { "grad_norm": 0.16950726509094238, "learning_rate": 9.653508851967391e-05, "loss": 0.0172, "step": 13060 }, { "grad_norm": 0.21827830374240875, "learning_rate": 9.65275244891556e-05, "loss": 0.0164, "step": 13070 }, { "grad_norm": 0.2471899390220642, "learning_rate": 9.651995250835591e-05, "loss": 0.0165, "step": 13080 }, { "grad_norm": 0.2148866057395935, "learning_rate": 9.651237257856862e-05, "loss": 0.0167, "step": 13090 }, { "grad_norm": 0.23740631341934204, "learning_rate": 9.6504784701089e-05, "loss": 0.0155, "step": 13100 }, { "grad_norm": 0.25026997923851013, "learning_rate": 9.649718887721357e-05, "loss": 0.0189, "step": 13110 }, { "grad_norm": 0.1651357263326645, "learning_rate": 9.648958510824028e-05, "loss": 0.0165, "step": 13120 }, { "grad_norm": 0.206197589635849, "learning_rate": 9.648197339546837e-05, "loss": 0.0184, "step": 13130 }, { "grad_norm": 0.1556624174118042, "learning_rate": 9.647435374019851e-05, "loss": 0.019, "step": 13140 }, { "grad_norm": 0.202991783618927, "learning_rate": 9.646672614373266e-05, "loss": 0.0204, "step": 13150 }, { "grad_norm": 0.24042703211307526, "learning_rate": 9.645909060737418e-05, "loss": 0.0172, "step": 13160 }, { "grad_norm": 0.14460647106170654, "learning_rate": 9.645144713242778e-05, "loss": 0.0201, "step": 13170 }, { "grad_norm": 0.23799745738506317, "learning_rate": 9.64437957201995e-05, "loss": 0.0177, "step": 13180 }, { "grad_norm": 0.24450938403606415, "learning_rate": 9.643613637199678e-05, "loss": 0.018, "step": 13190 }, { "grad_norm": 0.20990781486034393, "learning_rate": 9.642846908912839e-05, "loss": 0.0185, "step": 13200 }, { "grad_norm": 0.18253813683986664, "learning_rate": 9.642079387290444e-05, "loss": 0.0185, "step": 13210 }, { "grad_norm": 0.20193932950496674, "learning_rate": 9.641311072463644e-05, "loss": 0.0199, "step": 13220 }, { "grad_norm": 0.22477132081985474, "learning_rate": 9.640541964563722e-05, "loss": 0.0217, "step": 13230 }, { "grad_norm": 0.19230003654956818, "learning_rate": 9.639772063722096e-05, "loss": 0.0211, "step": 13240 }, { "grad_norm": 0.2528436481952667, "learning_rate": 9.639001370070324e-05, "loss": 0.0188, "step": 13250 }, { "grad_norm": 0.21001702547073364, "learning_rate": 9.638229883740095e-05, "loss": 0.0179, "step": 13260 }, { "grad_norm": 0.15041552484035492, "learning_rate": 9.637457604863233e-05, "loss": 0.0166, "step": 13270 }, { "grad_norm": 0.18092098832130432, "learning_rate": 9.636684533571703e-05, "loss": 0.0178, "step": 13280 }, { "grad_norm": 0.20051662623882294, "learning_rate": 9.635910669997599e-05, "loss": 0.0149, "step": 13290 }, { "grad_norm": 0.17714479565620422, "learning_rate": 9.635136014273154e-05, "loss": 0.0194, "step": 13300 }, { "grad_norm": 0.2299744337797165, "learning_rate": 9.634360566530735e-05, "loss": 0.018, "step": 13310 }, { "grad_norm": 0.2306700199842453, "learning_rate": 9.633584326902845e-05, "loss": 0.0169, "step": 13320 }, { "grad_norm": 0.21339738368988037, "learning_rate": 9.632807295522124e-05, "loss": 0.0181, "step": 13330 }, { "grad_norm": 0.20326539874076843, "learning_rate": 9.632029472521342e-05, "loss": 0.0171, "step": 13340 }, { "grad_norm": 0.20653587579727173, "learning_rate": 9.631250858033409e-05, "loss": 0.0162, "step": 13350 }, { "grad_norm": 0.19994832575321198, "learning_rate": 9.630471452191371e-05, "loss": 0.0167, "step": 13360 }, { "grad_norm": 0.24443717300891876, "learning_rate": 9.629691255128405e-05, "loss": 0.0175, "step": 13370 }, { "grad_norm": 0.26435863971710205, "learning_rate": 9.628910266977825e-05, "loss": 0.0174, "step": 13380 }, { "grad_norm": 0.18239755928516388, "learning_rate": 9.628128487873083e-05, "loss": 0.0159, "step": 13390 }, { "grad_norm": 0.19785724580287933, "learning_rate": 9.627345917947761e-05, "loss": 0.0182, "step": 13400 }, { "grad_norm": 0.1935376077890396, "learning_rate": 9.626562557335579e-05, "loss": 0.0176, "step": 13410 }, { "grad_norm": 0.1940050572156906, "learning_rate": 9.625778406170393e-05, "loss": 0.0161, "step": 13420 }, { "grad_norm": 0.16965435445308685, "learning_rate": 9.624993464586193e-05, "loss": 0.0192, "step": 13430 }, { "grad_norm": 0.21279039978981018, "learning_rate": 9.624207732717105e-05, "loss": 0.0195, "step": 13440 }, { "grad_norm": 0.30646732449531555, "learning_rate": 9.623421210697386e-05, "loss": 0.0179, "step": 13450 }, { "grad_norm": 0.2836243808269501, "learning_rate": 9.622633898661434e-05, "loss": 0.02, "step": 13460 }, { "grad_norm": 0.2004477083683014, "learning_rate": 9.621845796743778e-05, "loss": 0.0224, "step": 13470 }, { "grad_norm": 0.30587875843048096, "learning_rate": 9.621056905079082e-05, "loss": 0.0197, "step": 13480 }, { "grad_norm": 0.23421519994735718, "learning_rate": 9.620267223802149e-05, "loss": 0.0164, "step": 13490 }, { "grad_norm": 0.1538884937763214, "learning_rate": 9.619476753047911e-05, "loss": 0.0165, "step": 13500 }, { "grad_norm": 0.22118107974529266, "learning_rate": 9.618685492951438e-05, "loss": 0.0182, "step": 13510 }, { "grad_norm": 0.18994800746440887, "learning_rate": 9.617893443647938e-05, "loss": 0.0159, "step": 13520 }, { "grad_norm": 0.23333464562892914, "learning_rate": 9.617100605272746e-05, "loss": 0.0167, "step": 13530 }, { "grad_norm": 0.22451820969581604, "learning_rate": 9.616306977961338e-05, "loss": 0.0143, "step": 13540 }, { "grad_norm": 0.17059986293315887, "learning_rate": 9.615512561849326e-05, "loss": 0.0156, "step": 13550 }, { "grad_norm": 0.18828585743904114, "learning_rate": 9.61471735707245e-05, "loss": 0.0214, "step": 13560 }, { "grad_norm": 0.1794409155845642, "learning_rate": 9.613921363766592e-05, "loss": 0.0175, "step": 13570 }, { "grad_norm": 0.14726465940475464, "learning_rate": 9.613124582067763e-05, "loss": 0.0148, "step": 13580 }, { "grad_norm": 0.16564111411571503, "learning_rate": 9.612327012112112e-05, "loss": 0.0176, "step": 13590 }, { "grad_norm": 0.18326042592525482, "learning_rate": 9.611528654035921e-05, "loss": 0.0164, "step": 13600 }, { "grad_norm": 0.18307073414325714, "learning_rate": 9.610729507975611e-05, "loss": 0.0177, "step": 13610 }, { "grad_norm": 0.24898360669612885, "learning_rate": 9.609929574067731e-05, "loss": 0.0174, "step": 13620 }, { "grad_norm": 0.1877957582473755, "learning_rate": 9.609128852448967e-05, "loss": 0.0189, "step": 13630 }, { "grad_norm": 0.1698104292154312, "learning_rate": 9.608327343256143e-05, "loss": 0.0177, "step": 13640 }, { "grad_norm": 0.24922096729278564, "learning_rate": 9.607525046626216e-05, "loss": 0.0164, "step": 13650 }, { "grad_norm": 0.18832312524318695, "learning_rate": 9.606721962696272e-05, "loss": 0.0154, "step": 13660 }, { "grad_norm": 0.16172775626182556, "learning_rate": 9.60591809160354e-05, "loss": 0.0171, "step": 13670 }, { "grad_norm": 0.23236669600009918, "learning_rate": 9.605113433485378e-05, "loss": 0.0159, "step": 13680 }, { "grad_norm": 0.18395400047302246, "learning_rate": 9.604307988479279e-05, "loss": 0.0162, "step": 13690 }, { "grad_norm": 0.1498807817697525, "learning_rate": 9.603501756722876e-05, "loss": 0.0155, "step": 13700 }, { "grad_norm": 0.17850174009799957, "learning_rate": 9.602694738353927e-05, "loss": 0.0175, "step": 13710 }, { "grad_norm": 0.23903581500053406, "learning_rate": 9.601886933510331e-05, "loss": 0.0184, "step": 13720 }, { "grad_norm": 0.18870115280151367, "learning_rate": 9.60107834233012e-05, "loss": 0.0173, "step": 13730 }, { "grad_norm": 0.3134974539279938, "learning_rate": 9.60026896495146e-05, "loss": 0.0196, "step": 13740 }, { "grad_norm": 0.1972499042749405, "learning_rate": 9.599458801512652e-05, "loss": 0.0143, "step": 13750 }, { "grad_norm": 0.19169604778289795, "learning_rate": 9.598647852152129e-05, "loss": 0.0154, "step": 13760 }, { "grad_norm": 0.17857427895069122, "learning_rate": 9.597836117008462e-05, "loss": 0.0189, "step": 13770 }, { "grad_norm": 0.22455266118049622, "learning_rate": 9.597023596220356e-05, "loss": 0.0184, "step": 13780 }, { "grad_norm": 0.21145309507846832, "learning_rate": 9.596210289926643e-05, "loss": 0.0161, "step": 13790 }, { "grad_norm": 0.20868074893951416, "learning_rate": 9.5953961982663e-05, "loss": 0.0178, "step": 13800 }, { "grad_norm": 0.22679422795772552, "learning_rate": 9.594581321378431e-05, "loss": 0.0165, "step": 13810 }, { "grad_norm": 0.1723686158657074, "learning_rate": 9.593765659402276e-05, "loss": 0.0168, "step": 13820 }, { "grad_norm": 0.1406583935022354, "learning_rate": 9.59294921247721e-05, "loss": 0.0142, "step": 13830 }, { "grad_norm": 0.18189826607704163, "learning_rate": 9.59213198074274e-05, "loss": 0.0151, "step": 13840 }, { "grad_norm": 0.1742108315229416, "learning_rate": 9.59131396433851e-05, "loss": 0.0169, "step": 13850 }, { "grad_norm": 0.20823360979557037, "learning_rate": 9.590495163404297e-05, "loss": 0.0194, "step": 13860 }, { "grad_norm": 0.16518594324588776, "learning_rate": 9.589675578080009e-05, "loss": 0.0182, "step": 13870 }, { "grad_norm": 0.22140640020370483, "learning_rate": 9.588855208505694e-05, "loss": 0.0178, "step": 13880 }, { "grad_norm": 0.16803641617298126, "learning_rate": 9.588034054821529e-05, "loss": 0.0158, "step": 13890 }, { "grad_norm": 0.1800430417060852, "learning_rate": 9.587212117167826e-05, "loss": 0.0161, "step": 13900 }, { "grad_norm": 0.18882852792739868, "learning_rate": 9.586389395685033e-05, "loss": 0.0144, "step": 13910 }, { "grad_norm": 0.1807837039232254, "learning_rate": 9.585565890513733e-05, "loss": 0.0154, "step": 13920 }, { "grad_norm": 0.20858415961265564, "learning_rate": 9.584741601794636e-05, "loss": 0.0153, "step": 13930 }, { "grad_norm": 0.24757209420204163, "learning_rate": 9.58391652966859e-05, "loss": 0.0162, "step": 13940 }, { "grad_norm": 0.23194460570812225, "learning_rate": 9.583090674276583e-05, "loss": 0.0184, "step": 13950 }, { "grad_norm": 0.29531511664390564, "learning_rate": 9.582264035759726e-05, "loss": 0.0183, "step": 13960 }, { "grad_norm": 0.23708385229110718, "learning_rate": 9.58143661425927e-05, "loss": 0.021, "step": 13970 }, { "grad_norm": 0.20327085256576538, "learning_rate": 9.580608409916601e-05, "loss": 0.0201, "step": 13980 }, { "grad_norm": 0.24641291797161102, "learning_rate": 9.579779422873233e-05, "loss": 0.0211, "step": 13990 }, { "grad_norm": 0.2232898324728012, "learning_rate": 9.578949653270819e-05, "loss": 0.022, "step": 14000 }, { "grad_norm": 0.20794281363487244, "learning_rate": 9.578119101251144e-05, "loss": 0.017, "step": 14010 }, { "grad_norm": 0.15075863897800446, "learning_rate": 9.577287766956127e-05, "loss": 0.0167, "step": 14020 }, { "grad_norm": 0.2248222678899765, "learning_rate": 9.57645565052782e-05, "loss": 0.0162, "step": 14030 }, { "grad_norm": 0.2098083198070526, "learning_rate": 9.575622752108407e-05, "loss": 0.0183, "step": 14040 }, { "grad_norm": 0.2104147970676422, "learning_rate": 9.57478907184021e-05, "loss": 0.02, "step": 14050 }, { "grad_norm": 0.17693017423152924, "learning_rate": 9.573954609865681e-05, "loss": 0.0186, "step": 14060 }, { "grad_norm": 0.19454878568649292, "learning_rate": 9.573119366327408e-05, "loss": 0.019, "step": 14070 }, { "grad_norm": 0.20922373235225677, "learning_rate": 9.57228334136811e-05, "loss": 0.0173, "step": 14080 }, { "grad_norm": 0.19273672997951508, "learning_rate": 9.571446535130641e-05, "loss": 0.0188, "step": 14090 }, { "grad_norm": 0.19620917737483978, "learning_rate": 9.570608947757988e-05, "loss": 0.0173, "step": 14100 }, { "grad_norm": 0.18886299431324005, "learning_rate": 9.569770579393274e-05, "loss": 0.0153, "step": 14110 }, { "grad_norm": 0.1883373260498047, "learning_rate": 9.56893143017975e-05, "loss": 0.0168, "step": 14120 }, { "grad_norm": 0.23924002051353455, "learning_rate": 9.568091500260806e-05, "loss": 0.0161, "step": 14130 }, { "grad_norm": 0.18100959062576294, "learning_rate": 9.567250789779961e-05, "loss": 0.0149, "step": 14140 }, { "grad_norm": 0.21910874545574188, "learning_rate": 9.566409298880872e-05, "loss": 0.0167, "step": 14150 }, { "grad_norm": 0.1924504190683365, "learning_rate": 9.565567027707326e-05, "loss": 0.0176, "step": 14160 }, { "grad_norm": 0.2120586484670639, "learning_rate": 9.56472397640324e-05, "loss": 0.0165, "step": 14170 }, { "grad_norm": 0.20222018659114838, "learning_rate": 9.563880145112675e-05, "loss": 0.021, "step": 14180 }, { "grad_norm": 0.18427254259586334, "learning_rate": 9.563035533979814e-05, "loss": 0.0164, "step": 14190 }, { "grad_norm": 0.1533413827419281, "learning_rate": 9.562190143148981e-05, "loss": 0.0202, "step": 14200 }, { "grad_norm": 0.1838887333869934, "learning_rate": 9.561343972764627e-05, "loss": 0.0134, "step": 14210 }, { "grad_norm": 0.21518565714359283, "learning_rate": 9.560497022971343e-05, "loss": 0.0173, "step": 14220 }, { "grad_norm": 0.2068198174238205, "learning_rate": 9.559649293913847e-05, "loss": 0.0171, "step": 14230 }, { "grad_norm": 0.19233402609825134, "learning_rate": 9.558800785736993e-05, "loss": 0.0203, "step": 14240 }, { "grad_norm": 0.24281388521194458, "learning_rate": 9.557951498585767e-05, "loss": 0.0184, "step": 14250 }, { "grad_norm": 0.19444639980793, "learning_rate": 9.557101432605293e-05, "loss": 0.0165, "step": 14260 }, { "grad_norm": 0.2228144258260727, "learning_rate": 9.556250587940818e-05, "loss": 0.0184, "step": 14270 }, { "grad_norm": 0.2302861511707306, "learning_rate": 9.555398964737734e-05, "loss": 0.0168, "step": 14280 }, { "grad_norm": 0.20340856909751892, "learning_rate": 9.554546563141555e-05, "loss": 0.0176, "step": 14290 }, { "grad_norm": 0.22167351841926575, "learning_rate": 9.553693383297937e-05, "loss": 0.0207, "step": 14300 }, { "grad_norm": 0.21373192965984344, "learning_rate": 9.552839425352663e-05, "loss": 0.0161, "step": 14310 }, { "grad_norm": 0.20997655391693115, "learning_rate": 9.551984689451652e-05, "loss": 0.0198, "step": 14320 }, { "grad_norm": 0.21427537500858307, "learning_rate": 9.551129175740953e-05, "loss": 0.0164, "step": 14330 }, { "grad_norm": 0.1662493795156479, "learning_rate": 9.550272884366754e-05, "loss": 0.0154, "step": 14340 }, { "grad_norm": 0.1536032259464264, "learning_rate": 9.549415815475369e-05, "loss": 0.0171, "step": 14350 }, { "grad_norm": 0.20343175530433655, "learning_rate": 9.548557969213247e-05, "loss": 0.017, "step": 14360 }, { "grad_norm": 0.18067742884159088, "learning_rate": 9.547699345726972e-05, "loss": 0.0154, "step": 14370 }, { "grad_norm": 0.22906744480133057, "learning_rate": 9.546839945163257e-05, "loss": 0.0151, "step": 14380 }, { "grad_norm": 0.20455193519592285, "learning_rate": 9.545979767668953e-05, "loss": 0.0194, "step": 14390 }, { "grad_norm": 0.17674513161182404, "learning_rate": 9.54511881339104e-05, "loss": 0.0175, "step": 14400 }, { "grad_norm": 0.22247488796710968, "learning_rate": 9.54425708247663e-05, "loss": 0.0179, "step": 14410 }, { "grad_norm": 0.18724893033504486, "learning_rate": 9.543394575072972e-05, "loss": 0.0165, "step": 14420 }, { "grad_norm": 0.17932556569576263, "learning_rate": 9.542531291327441e-05, "loss": 0.016, "step": 14430 }, { "grad_norm": 0.1979820728302002, "learning_rate": 9.541667231387552e-05, "loss": 0.0164, "step": 14440 }, { "grad_norm": 0.21589617431163788, "learning_rate": 9.540802395400949e-05, "loss": 0.017, "step": 14450 }, { "grad_norm": 0.20512376725673676, "learning_rate": 9.539936783515406e-05, "loss": 0.0163, "step": 14460 }, { "grad_norm": 0.24764470756053925, "learning_rate": 9.539070395878835e-05, "loss": 0.0195, "step": 14470 }, { "grad_norm": 0.18259745836257935, "learning_rate": 9.538203232639277e-05, "loss": 0.0157, "step": 14480 }, { "grad_norm": 0.16614994406700134, "learning_rate": 9.537335293944907e-05, "loss": 0.0168, "step": 14490 }, { "grad_norm": 0.15071819722652435, "learning_rate": 9.536466579944032e-05, "loss": 0.0161, "step": 14500 }, { "grad_norm": 0.18969669938087463, "learning_rate": 9.535597090785091e-05, "loss": 0.0175, "step": 14510 }, { "grad_norm": 0.21365828812122345, "learning_rate": 9.534726826616656e-05, "loss": 0.0159, "step": 14520 }, { "grad_norm": 0.18403743207454681, "learning_rate": 9.53385578758743e-05, "loss": 0.017, "step": 14530 }, { "grad_norm": 0.1945379078388214, "learning_rate": 9.532983973846252e-05, "loss": 0.0183, "step": 14540 }, { "grad_norm": 0.17356865108013153, "learning_rate": 9.53211138554209e-05, "loss": 0.0144, "step": 14550 }, { "grad_norm": 0.14292418956756592, "learning_rate": 9.531238022824047e-05, "loss": 0.0153, "step": 14560 }, { "grad_norm": 0.22766119241714478, "learning_rate": 9.530363885841355e-05, "loss": 0.0185, "step": 14570 }, { "grad_norm": 0.1998056322336197, "learning_rate": 9.52948897474338e-05, "loss": 0.016, "step": 14580 }, { "grad_norm": 0.17921118438243866, "learning_rate": 9.528613289679622e-05, "loss": 0.0161, "step": 14590 }, { "grad_norm": 0.15146052837371826, "learning_rate": 9.52773683079971e-05, "loss": 0.0151, "step": 14600 }, { "grad_norm": 0.16708484292030334, "learning_rate": 9.526859598253407e-05, "loss": 0.0169, "step": 14610 }, { "grad_norm": 0.2810259163379669, "learning_rate": 9.525981592190609e-05, "loss": 0.018, "step": 14620 }, { "grad_norm": 0.1862398087978363, "learning_rate": 9.525102812761342e-05, "loss": 0.0182, "step": 14630 }, { "grad_norm": 0.2048797309398651, "learning_rate": 9.524223260115768e-05, "loss": 0.0168, "step": 14640 }, { "grad_norm": 0.22174841165542603, "learning_rate": 9.523342934404175e-05, "loss": 0.0133, "step": 14650 }, { "grad_norm": 0.17947302758693695, "learning_rate": 9.522461835776989e-05, "loss": 0.0199, "step": 14660 }, { "grad_norm": 0.24838972091674805, "learning_rate": 9.521579964384764e-05, "loss": 0.017, "step": 14670 }, { "grad_norm": 0.16766981780529022, "learning_rate": 9.52069732037819e-05, "loss": 0.0179, "step": 14680 }, { "grad_norm": 0.21215926110744476, "learning_rate": 9.519813903908083e-05, "loss": 0.0161, "step": 14690 }, { "grad_norm": 0.20570027828216553, "learning_rate": 9.5189297151254e-05, "loss": 0.0156, "step": 14700 }, { "grad_norm": 0.18040791153907776, "learning_rate": 9.518044754181218e-05, "loss": 0.0142, "step": 14710 }, { "grad_norm": 0.17823940515518188, "learning_rate": 9.51715902122676e-05, "loss": 0.017, "step": 14720 }, { "grad_norm": 0.18964332342147827, "learning_rate": 9.516272516413368e-05, "loss": 0.0165, "step": 14730 }, { "grad_norm": 0.16518110036849976, "learning_rate": 9.515385239892525e-05, "loss": 0.019, "step": 14740 }, { "grad_norm": 0.13334153592586517, "learning_rate": 9.514497191815839e-05, "loss": 0.0183, "step": 14750 }, { "grad_norm": 0.16689282655715942, "learning_rate": 9.513608372335055e-05, "loss": 0.0148, "step": 14760 }, { "grad_norm": 0.19916212558746338, "learning_rate": 9.512718781602045e-05, "loss": 0.0165, "step": 14770 }, { "grad_norm": 0.16902600228786469, "learning_rate": 9.511828419768823e-05, "loss": 0.0154, "step": 14780 }, { "grad_norm": 0.21583192050457, "learning_rate": 9.510937286987521e-05, "loss": 0.0164, "step": 14790 }, { "grad_norm": 0.17798671126365662, "learning_rate": 9.510045383410408e-05, "loss": 0.0147, "step": 14800 }, { "grad_norm": 0.2835707366466522, "learning_rate": 9.509152709189892e-05, "loss": 0.0148, "step": 14810 }, { "grad_norm": 0.1869775503873825, "learning_rate": 9.508259264478504e-05, "loss": 0.0184, "step": 14820 }, { "grad_norm": 0.17353777587413788, "learning_rate": 9.507365049428909e-05, "loss": 0.0183, "step": 14830 }, { "grad_norm": 0.17916040122509003, "learning_rate": 9.506470064193902e-05, "loss": 0.0179, "step": 14840 }, { "grad_norm": 0.23692329227924347, "learning_rate": 9.505574308926414e-05, "loss": 0.0181, "step": 14850 }, { "grad_norm": 0.12672768533229828, "learning_rate": 9.504677783779505e-05, "loss": 0.0144, "step": 14860 }, { "grad_norm": 0.1545877456665039, "learning_rate": 9.503780488906365e-05, "loss": 0.0148, "step": 14870 }, { "grad_norm": 0.26043614745140076, "learning_rate": 9.502882424460319e-05, "loss": 0.0178, "step": 14880 }, { "grad_norm": 0.22003722190856934, "learning_rate": 9.501983590594821e-05, "loss": 0.0182, "step": 14890 }, { "grad_norm": 0.23476830124855042, "learning_rate": 9.501083987463455e-05, "loss": 0.0166, "step": 14900 }, { "grad_norm": 0.20440922677516937, "learning_rate": 9.500183615219942e-05, "loss": 0.0154, "step": 14910 }, { "grad_norm": 0.18894720077514648, "learning_rate": 9.49928247401813e-05, "loss": 0.0189, "step": 14920 }, { "grad_norm": 0.1721775084733963, "learning_rate": 9.498380564011997e-05, "loss": 0.0173, "step": 14930 }, { "grad_norm": 0.18698391318321228, "learning_rate": 9.497477885355656e-05, "loss": 0.019, "step": 14940 }, { "grad_norm": 0.1907275766134262, "learning_rate": 9.496574438203353e-05, "loss": 0.0154, "step": 14950 }, { "grad_norm": 0.19880908727645874, "learning_rate": 9.495670222709459e-05, "loss": 0.0155, "step": 14960 }, { "grad_norm": 0.17491436004638672, "learning_rate": 9.494765239028483e-05, "loss": 0.0224, "step": 14970 }, { "grad_norm": 0.21181520819664001, "learning_rate": 9.493859487315057e-05, "loss": 0.0167, "step": 14980 }, { "grad_norm": 0.17495959997177124, "learning_rate": 9.492952967723953e-05, "loss": 0.0154, "step": 14990 }, { "grad_norm": 0.19873790442943573, "learning_rate": 9.492045680410068e-05, "loss": 0.0168, "step": 15000 }, { "grad_norm": 0.185705304145813, "learning_rate": 9.491137625528436e-05, "loss": 0.016, "step": 15010 }, { "grad_norm": 0.21184776723384857, "learning_rate": 9.490228803234215e-05, "loss": 0.016, "step": 15020 }, { "grad_norm": 0.20073246955871582, "learning_rate": 9.489319213682701e-05, "loss": 0.0141, "step": 15030 }, { "grad_norm": 0.1813969612121582, "learning_rate": 9.488408857029316e-05, "loss": 0.0139, "step": 15040 }, { "grad_norm": 0.18704642355442047, "learning_rate": 9.487497733429616e-05, "loss": 0.0153, "step": 15050 }, { "grad_norm": 0.17182348668575287, "learning_rate": 9.486585843039286e-05, "loss": 0.0153, "step": 15060 }, { "grad_norm": 0.20641890168190002, "learning_rate": 9.485673186014143e-05, "loss": 0.0154, "step": 15070 }, { "grad_norm": 0.23596692085266113, "learning_rate": 9.484759762510137e-05, "loss": 0.0156, "step": 15080 }, { "grad_norm": 0.2566509246826172, "learning_rate": 9.483845572683346e-05, "loss": 0.0169, "step": 15090 }, { "grad_norm": 0.2235187143087387, "learning_rate": 9.48293061668998e-05, "loss": 0.0159, "step": 15100 }, { "grad_norm": 0.22134855389595032, "learning_rate": 9.48201489468638e-05, "loss": 0.017, "step": 15110 }, { "grad_norm": 0.16857151687145233, "learning_rate": 9.481098406829016e-05, "loss": 0.0154, "step": 15120 }, { "grad_norm": 0.14539483189582825, "learning_rate": 9.480181153274495e-05, "loss": 0.016, "step": 15130 }, { "grad_norm": 0.2224392294883728, "learning_rate": 9.479263134179548e-05, "loss": 0.0198, "step": 15140 }, { "grad_norm": 0.23167508840560913, "learning_rate": 9.478344349701039e-05, "loss": 0.0202, "step": 15150 }, { "grad_norm": 0.19086478650569916, "learning_rate": 9.477424799995964e-05, "loss": 0.0172, "step": 15160 }, { "grad_norm": 0.22429387271404266, "learning_rate": 9.476504485221448e-05, "loss": 0.0156, "step": 15170 }, { "grad_norm": 0.30361440777778625, "learning_rate": 9.475583405534748e-05, "loss": 0.0175, "step": 15180 }, { "grad_norm": 0.1923341602087021, "learning_rate": 9.474661561093251e-05, "loss": 0.0191, "step": 15190 }, { "grad_norm": 0.1990882307291031, "learning_rate": 9.473738952054478e-05, "loss": 0.0155, "step": 15200 }, { "grad_norm": 0.1993861198425293, "learning_rate": 9.472815578576073e-05, "loss": 0.0177, "step": 15210 }, { "grad_norm": 0.14652487635612488, "learning_rate": 9.471891440815817e-05, "loss": 0.0151, "step": 15220 }, { "grad_norm": 0.1853862702846527, "learning_rate": 9.470966538931621e-05, "loss": 0.0157, "step": 15230 }, { "grad_norm": 0.2146259993314743, "learning_rate": 9.470040873081525e-05, "loss": 0.0161, "step": 15240 }, { "grad_norm": 0.15016792714595795, "learning_rate": 9.469114443423698e-05, "loss": 0.0146, "step": 15250 }, { "grad_norm": 0.16285859048366547, "learning_rate": 9.468187250116445e-05, "loss": 0.0153, "step": 15260 }, { "grad_norm": 0.16135652363300323, "learning_rate": 9.467259293318197e-05, "loss": 0.014, "step": 15270 }, { "grad_norm": 0.2334098219871521, "learning_rate": 9.466330573187514e-05, "loss": 0.0163, "step": 15280 }, { "grad_norm": 0.16114981472492218, "learning_rate": 9.46540108988309e-05, "loss": 0.0139, "step": 15290 }, { "grad_norm": 0.15708354115486145, "learning_rate": 9.46447084356375e-05, "loss": 0.0148, "step": 15300 }, { "grad_norm": 0.1783980429172516, "learning_rate": 9.463539834388447e-05, "loss": 0.0137, "step": 15310 }, { "grad_norm": 0.16762927174568176, "learning_rate": 9.462608062516263e-05, "loss": 0.0167, "step": 15320 }, { "grad_norm": 0.17670825123786926, "learning_rate": 9.461675528106413e-05, "loss": 0.0146, "step": 15330 }, { "grad_norm": 0.2190227508544922, "learning_rate": 9.460742231318244e-05, "loss": 0.0177, "step": 15340 }, { "grad_norm": 0.19035066664218903, "learning_rate": 9.459808172311229e-05, "loss": 0.0161, "step": 15350 }, { "grad_norm": 0.17247317731380463, "learning_rate": 9.458873351244972e-05, "loss": 0.0165, "step": 15360 }, { "grad_norm": 0.1732923686504364, "learning_rate": 9.457937768279211e-05, "loss": 0.0191, "step": 15370 }, { "grad_norm": 0.23104214668273926, "learning_rate": 9.45700142357381e-05, "loss": 0.0182, "step": 15380 }, { "grad_norm": 0.1452673375606537, "learning_rate": 9.456064317288765e-05, "loss": 0.0165, "step": 15390 }, { "grad_norm": 0.19116376340389252, "learning_rate": 9.455126449584201e-05, "loss": 0.0181, "step": 15400 }, { "grad_norm": 0.1881909817457199, "learning_rate": 9.454187820620375e-05, "loss": 0.0171, "step": 15410 }, { "grad_norm": 0.20265203714370728, "learning_rate": 9.453248430557673e-05, "loss": 0.0169, "step": 15420 }, { "grad_norm": 0.21585778892040253, "learning_rate": 9.452308279556611e-05, "loss": 0.018, "step": 15430 }, { "grad_norm": 0.16763469576835632, "learning_rate": 9.451367367777835e-05, "loss": 0.0179, "step": 15440 }, { "grad_norm": 0.1810198724269867, "learning_rate": 9.450425695382122e-05, "loss": 0.0157, "step": 15450 }, { "grad_norm": 0.1739109605550766, "learning_rate": 9.449483262530375e-05, "loss": 0.0166, "step": 15460 }, { "grad_norm": 0.19316348433494568, "learning_rate": 9.448540069383633e-05, "loss": 0.0172, "step": 15470 }, { "grad_norm": 0.23180703818798065, "learning_rate": 9.447596116103061e-05, "loss": 0.0182, "step": 15480 }, { "grad_norm": 0.17496372759342194, "learning_rate": 9.446651402849955e-05, "loss": 0.0218, "step": 15490 }, { "grad_norm": 0.24258875846862793, "learning_rate": 9.44570592978574e-05, "loss": 0.02, "step": 15500 }, { "grad_norm": 0.1818191558122635, "learning_rate": 9.444759697071972e-05, "loss": 0.018, "step": 15510 }, { "grad_norm": 0.2120419591665268, "learning_rate": 9.443812704870336e-05, "loss": 0.0163, "step": 15520 }, { "grad_norm": 0.15434695780277252, "learning_rate": 9.442864953342649e-05, "loss": 0.0166, "step": 15530 }, { "grad_norm": 0.1465461254119873, "learning_rate": 9.441916442650852e-05, "loss": 0.0147, "step": 15540 }, { "grad_norm": 0.2037661075592041, "learning_rate": 9.440967172957023e-05, "loss": 0.0158, "step": 15550 }, { "grad_norm": 0.21762233972549438, "learning_rate": 9.440017144423364e-05, "loss": 0.015, "step": 15560 }, { "grad_norm": 0.20739319920539856, "learning_rate": 9.439066357212209e-05, "loss": 0.0152, "step": 15570 }, { "grad_norm": 0.18874581158161163, "learning_rate": 9.438114811486022e-05, "loss": 0.0144, "step": 15580 }, { "grad_norm": 0.16213664412498474, "learning_rate": 9.4371625074074e-05, "loss": 0.0168, "step": 15590 }, { "grad_norm": 0.20125813782215118, "learning_rate": 9.436209445139059e-05, "loss": 0.0167, "step": 15600 }, { "grad_norm": 0.21817107498645782, "learning_rate": 9.435255624843855e-05, "loss": 0.0163, "step": 15610 }, { "grad_norm": 0.1503806710243225, "learning_rate": 9.43430104668477e-05, "loss": 0.014, "step": 15620 }, { "grad_norm": 0.17277535796165466, "learning_rate": 9.433345710824914e-05, "loss": 0.0155, "step": 15630 }, { "grad_norm": 0.1725858449935913, "learning_rate": 9.432389617427529e-05, "loss": 0.0158, "step": 15640 }, { "grad_norm": 0.18696987628936768, "learning_rate": 9.431432766655984e-05, "loss": 0.0152, "step": 15650 }, { "grad_norm": 0.3112759590148926, "learning_rate": 9.430475158673778e-05, "loss": 0.0183, "step": 15660 }, { "grad_norm": 0.16448502242565155, "learning_rate": 9.429516793644542e-05, "loss": 0.0182, "step": 15670 }, { "grad_norm": 0.20491710305213928, "learning_rate": 9.428557671732034e-05, "loss": 0.017, "step": 15680 }, { "grad_norm": 0.22474683821201324, "learning_rate": 9.42759779310014e-05, "loss": 0.0163, "step": 15690 }, { "grad_norm": 0.2035386860370636, "learning_rate": 9.426637157912879e-05, "loss": 0.0145, "step": 15700 }, { "grad_norm": 0.14866508543491364, "learning_rate": 9.425675766334397e-05, "loss": 0.0153, "step": 15710 }, { "grad_norm": 0.15668828785419464, "learning_rate": 9.424713618528968e-05, "loss": 0.0154, "step": 15720 }, { "grad_norm": 0.18738707900047302, "learning_rate": 9.423750714661e-05, "loss": 0.0173, "step": 15730 }, { "grad_norm": 0.18814516067504883, "learning_rate": 9.422787054895022e-05, "loss": 0.0169, "step": 15740 }, { "grad_norm": 0.15112614631652832, "learning_rate": 9.4218226393957e-05, "loss": 0.0174, "step": 15750 }, { "grad_norm": 0.162822425365448, "learning_rate": 9.420857468327828e-05, "loss": 0.0182, "step": 15760 }, { "grad_norm": 0.16981446743011475, "learning_rate": 9.419891541856323e-05, "loss": 0.0174, "step": 15770 }, { "grad_norm": 0.24034370481967926, "learning_rate": 9.41892486014624e-05, "loss": 0.0154, "step": 15780 }, { "grad_norm": 0.15182919800281525, "learning_rate": 9.417957423362756e-05, "loss": 0.0154, "step": 15790 }, { "grad_norm": 0.2541085183620453, "learning_rate": 9.416989231671178e-05, "loss": 0.0166, "step": 15800 }, { "grad_norm": 0.2202940434217453, "learning_rate": 9.416020285236946e-05, "loss": 0.0149, "step": 15810 }, { "grad_norm": 0.15045392513275146, "learning_rate": 9.415050584225626e-05, "loss": 0.0189, "step": 15820 }, { "grad_norm": 0.2389354258775711, "learning_rate": 9.414080128802914e-05, "loss": 0.0138, "step": 15830 }, { "grad_norm": 0.1669539511203766, "learning_rate": 9.413108919134632e-05, "loss": 0.0196, "step": 15840 }, { "grad_norm": 0.17385445535182953, "learning_rate": 9.412136955386734e-05, "loss": 0.0157, "step": 15850 }, { "grad_norm": 0.15857554972171783, "learning_rate": 9.411164237725303e-05, "loss": 0.0173, "step": 15860 }, { "grad_norm": 0.1298702508211136, "learning_rate": 9.41019076631655e-05, "loss": 0.0148, "step": 15870 }, { "grad_norm": 0.16555972397327423, "learning_rate": 9.409216541326815e-05, "loss": 0.017, "step": 15880 }, { "grad_norm": 0.33069029450416565, "learning_rate": 9.408241562922564e-05, "loss": 0.0168, "step": 15890 }, { "grad_norm": 0.21914222836494446, "learning_rate": 9.407265831270395e-05, "loss": 0.0202, "step": 15900 }, { "grad_norm": 0.2280777394771576, "learning_rate": 9.406289346537035e-05, "loss": 0.0187, "step": 15910 }, { "grad_norm": 0.2634629011154175, "learning_rate": 9.405312108889339e-05, "loss": 0.0205, "step": 15920 }, { "grad_norm": 0.20910605788230896, "learning_rate": 9.404334118494288e-05, "loss": 0.0173, "step": 15930 }, { "grad_norm": 0.26835083961486816, "learning_rate": 9.403355375518995e-05, "loss": 0.0163, "step": 15940 }, { "grad_norm": 0.2074998915195465, "learning_rate": 9.4023758801307e-05, "loss": 0.0178, "step": 15950 }, { "grad_norm": 0.2429390698671341, "learning_rate": 9.401395632496774e-05, "loss": 0.0155, "step": 15960 }, { "grad_norm": 0.16472536325454712, "learning_rate": 9.400414632784711e-05, "loss": 0.0151, "step": 15970 }, { "grad_norm": 0.15032869577407837, "learning_rate": 9.39943288116214e-05, "loss": 0.0143, "step": 15980 }, { "grad_norm": 0.15538857877254486, "learning_rate": 9.398450377796815e-05, "loss": 0.0154, "step": 15990 }, { "grad_norm": 0.1406938135623932, "learning_rate": 9.397467122856616e-05, "loss": 0.0187, "step": 16000 }, { "grad_norm": 0.20746517181396484, "learning_rate": 9.396483116509558e-05, "loss": 0.0176, "step": 16010 }, { "grad_norm": 0.1466626226902008, "learning_rate": 9.39549835892378e-05, "loss": 0.0178, "step": 16020 }, { "grad_norm": 0.21750134229660034, "learning_rate": 9.39451285026755e-05, "loss": 0.0153, "step": 16030 }, { "grad_norm": 0.17076170444488525, "learning_rate": 9.393526590709262e-05, "loss": 0.0153, "step": 16040 }, { "grad_norm": 0.20099303126335144, "learning_rate": 9.392539580417444e-05, "loss": 0.0167, "step": 16050 }, { "grad_norm": 0.22756193578243256, "learning_rate": 9.391551819560747e-05, "loss": 0.0162, "step": 16060 }, { "grad_norm": 0.17089428007602692, "learning_rate": 9.390563308307955e-05, "loss": 0.0134, "step": 16070 }, { "grad_norm": 0.16063442826271057, "learning_rate": 9.389574046827974e-05, "loss": 0.0164, "step": 16080 }, { "grad_norm": 0.1798606961965561, "learning_rate": 9.388584035289845e-05, "loss": 0.0144, "step": 16090 }, { "grad_norm": 0.20398348569869995, "learning_rate": 9.387593273862732e-05, "loss": 0.015, "step": 16100 }, { "grad_norm": 0.16362737119197845, "learning_rate": 9.386601762715929e-05, "loss": 0.0152, "step": 16110 }, { "grad_norm": 0.22290287911891937, "learning_rate": 9.38560950201886e-05, "loss": 0.015, "step": 16120 }, { "grad_norm": 0.1868409365415573, "learning_rate": 9.384616491941071e-05, "loss": 0.0164, "step": 16130 }, { "grad_norm": 0.23107613623142242, "learning_rate": 9.383622732652245e-05, "loss": 0.0166, "step": 16140 }, { "grad_norm": 0.3080769181251526, "learning_rate": 9.382628224322187e-05, "loss": 0.0167, "step": 16150 }, { "grad_norm": 0.21557216346263885, "learning_rate": 9.381632967120829e-05, "loss": 0.0172, "step": 16160 }, { "grad_norm": 0.16968126595020294, "learning_rate": 9.380636961218235e-05, "loss": 0.0158, "step": 16170 }, { "grad_norm": 0.1793704777956009, "learning_rate": 9.379640206784597e-05, "loss": 0.0169, "step": 16180 }, { "grad_norm": 0.17536145448684692, "learning_rate": 9.378642703990229e-05, "loss": 0.0152, "step": 16190 }, { "grad_norm": 0.20677804946899414, "learning_rate": 9.37764445300558e-05, "loss": 0.0151, "step": 16200 }, { "grad_norm": 0.21751046180725098, "learning_rate": 9.376645454001222e-05, "loss": 0.0159, "step": 16210 }, { "grad_norm": 0.16118231415748596, "learning_rate": 9.375645707147858e-05, "loss": 0.0166, "step": 16220 }, { "grad_norm": 0.17243754863739014, "learning_rate": 9.374645212616316e-05, "loss": 0.0149, "step": 16230 }, { "grad_norm": 0.1836068332195282, "learning_rate": 9.373643970577555e-05, "loss": 0.0177, "step": 16240 }, { "grad_norm": 0.1338600069284439, "learning_rate": 9.372641981202659e-05, "loss": 0.0134, "step": 16250 }, { "grad_norm": 0.22387392818927765, "learning_rate": 9.37163924466284e-05, "loss": 0.019, "step": 16260 }, { "grad_norm": 0.18942007422447205, "learning_rate": 9.370635761129438e-05, "loss": 0.016, "step": 16270 }, { "grad_norm": 0.20048804581165314, "learning_rate": 9.36963153077392e-05, "loss": 0.0198, "step": 16280 }, { "grad_norm": 0.16986685991287231, "learning_rate": 9.368626553767888e-05, "loss": 0.0136, "step": 16290 }, { "grad_norm": 0.23468385636806488, "learning_rate": 9.367620830283057e-05, "loss": 0.0158, "step": 16300 }, { "grad_norm": 0.18747954070568085, "learning_rate": 9.366614360491281e-05, "loss": 0.0191, "step": 16310 }, { "grad_norm": 0.14701540768146515, "learning_rate": 9.365607144564539e-05, "loss": 0.0167, "step": 16320 }, { "grad_norm": 0.20210565626621246, "learning_rate": 9.364599182674934e-05, "loss": 0.0139, "step": 16330 }, { "grad_norm": 0.16574330627918243, "learning_rate": 9.3635904749947e-05, "loss": 0.0133, "step": 16340 }, { "grad_norm": 0.16651317477226257, "learning_rate": 9.362581021696202e-05, "loss": 0.0168, "step": 16350 }, { "grad_norm": 0.16212907433509827, "learning_rate": 9.361570822951921e-05, "loss": 0.017, "step": 16360 }, { "grad_norm": 0.18674403429031372, "learning_rate": 9.360559878934476e-05, "loss": 0.0147, "step": 16370 }, { "grad_norm": 0.19468455016613007, "learning_rate": 9.359548189816611e-05, "loss": 0.0132, "step": 16380 }, { "grad_norm": 0.19777581095695496, "learning_rate": 9.358535755771193e-05, "loss": 0.0147, "step": 16390 }, { "grad_norm": 0.17550261318683624, "learning_rate": 9.357522576971221e-05, "loss": 0.0184, "step": 16400 }, { "grad_norm": 0.19864003360271454, "learning_rate": 9.356508653589819e-05, "loss": 0.0175, "step": 16410 }, { "grad_norm": 0.27079206705093384, "learning_rate": 9.355493985800237e-05, "loss": 0.0176, "step": 16420 }, { "grad_norm": 0.2501062750816345, "learning_rate": 9.354478573775857e-05, "loss": 0.0124, "step": 16430 }, { "grad_norm": 0.1963309347629547, "learning_rate": 9.353462417690186e-05, "loss": 0.0141, "step": 16440 }, { "grad_norm": 0.2129698246717453, "learning_rate": 9.352445517716853e-05, "loss": 0.0153, "step": 16450 }, { "grad_norm": 0.25038227438926697, "learning_rate": 9.351427874029621e-05, "loss": 0.0174, "step": 16460 }, { "grad_norm": 0.15815328061580658, "learning_rate": 9.350409486802379e-05, "loss": 0.0152, "step": 16470 }, { "grad_norm": 0.14434152841567993, "learning_rate": 9.349390356209138e-05, "loss": 0.0156, "step": 16480 }, { "grad_norm": 0.15880799293518066, "learning_rate": 9.348370482424042e-05, "loss": 0.014, "step": 16490 }, { "grad_norm": 0.19480374455451965, "learning_rate": 9.347349865621357e-05, "loss": 0.0152, "step": 16500 }, { "grad_norm": 0.15751321613788605, "learning_rate": 9.346328505975481e-05, "loss": 0.0131, "step": 16510 }, { "grad_norm": 0.1633833348751068, "learning_rate": 9.345306403660936e-05, "loss": 0.0154, "step": 16520 }, { "grad_norm": 0.15481233596801758, "learning_rate": 9.344283558852371e-05, "loss": 0.015, "step": 16530 }, { "grad_norm": 0.21014061570167542, "learning_rate": 9.343259971724563e-05, "loss": 0.0144, "step": 16540 }, { "grad_norm": 0.2019609808921814, "learning_rate": 9.342235642452413e-05, "loss": 0.0126, "step": 16550 }, { "grad_norm": 0.5102528929710388, "learning_rate": 9.341210571210954e-05, "loss": 0.0159, "step": 16560 }, { "grad_norm": 0.20188455283641815, "learning_rate": 9.340184758175338e-05, "loss": 0.0169, "step": 16570 }, { "grad_norm": 0.1498555690050125, "learning_rate": 9.339158203520854e-05, "loss": 0.0138, "step": 16580 }, { "grad_norm": 0.16392068564891815, "learning_rate": 9.338130907422908e-05, "loss": 0.0147, "step": 16590 }, { "grad_norm": 0.1801535189151764, "learning_rate": 9.337102870057037e-05, "loss": 0.0154, "step": 16600 }, { "grad_norm": 0.2127629816532135, "learning_rate": 9.336074091598907e-05, "loss": 0.014, "step": 16610 }, { "grad_norm": 0.1878816932439804, "learning_rate": 9.335044572224306e-05, "loss": 0.0157, "step": 16620 }, { "grad_norm": 0.16564048826694489, "learning_rate": 9.334014312109151e-05, "loss": 0.0166, "step": 16630 }, { "grad_norm": 0.17137601971626282, "learning_rate": 9.332983311429486e-05, "loss": 0.0145, "step": 16640 }, { "grad_norm": 0.1630014032125473, "learning_rate": 9.33195157036148e-05, "loss": 0.0211, "step": 16650 }, { "grad_norm": 0.17092397809028625, "learning_rate": 9.330919089081432e-05, "loss": 0.014, "step": 16660 }, { "grad_norm": 0.2431865930557251, "learning_rate": 9.32988586776576e-05, "loss": 0.0181, "step": 16670 }, { "grad_norm": 0.1478491723537445, "learning_rate": 9.328851906591016e-05, "loss": 0.0156, "step": 16680 }, { "grad_norm": 0.1768191009759903, "learning_rate": 9.327817205733875e-05, "loss": 0.0135, "step": 16690 }, { "grad_norm": 0.16100452840328217, "learning_rate": 9.326781765371142e-05, "loss": 0.013, "step": 16700 }, { "grad_norm": 0.15339772403240204, "learning_rate": 9.325745585679741e-05, "loss": 0.0151, "step": 16710 }, { "grad_norm": 0.17995451390743256, "learning_rate": 9.32470866683673e-05, "loss": 0.014, "step": 16720 }, { "grad_norm": 0.2305857241153717, "learning_rate": 9.323671009019288e-05, "loss": 0.0154, "step": 16730 }, { "grad_norm": 0.22034087777137756, "learning_rate": 9.322632612404725e-05, "loss": 0.0188, "step": 16740 }, { "grad_norm": 0.23400864005088806, "learning_rate": 9.321593477170471e-05, "loss": 0.0185, "step": 16750 }, { "grad_norm": 0.2342105507850647, "learning_rate": 9.320553603494088e-05, "loss": 0.0159, "step": 16760 }, { "grad_norm": 0.18291792273521423, "learning_rate": 9.319512991553261e-05, "loss": 0.0164, "step": 16770 }, { "grad_norm": 0.14909371733665466, "learning_rate": 9.318471641525803e-05, "loss": 0.0163, "step": 16780 }, { "grad_norm": 0.143172949552536, "learning_rate": 9.317429553589652e-05, "loss": 0.0142, "step": 16790 }, { "grad_norm": 0.14235475659370422, "learning_rate": 9.316386727922873e-05, "loss": 0.0148, "step": 16800 }, { "grad_norm": 0.18517255783081055, "learning_rate": 9.315343164703656e-05, "loss": 0.0138, "step": 16810 }, { "grad_norm": 0.16804690659046173, "learning_rate": 9.314298864110316e-05, "loss": 0.014, "step": 16820 }, { "grad_norm": 0.16143831610679626, "learning_rate": 9.313253826321295e-05, "loss": 0.0145, "step": 16830 }, { "grad_norm": 0.16203510761260986, "learning_rate": 9.312208051515165e-05, "loss": 0.0155, "step": 16840 }, { "grad_norm": 0.2146998792886734, "learning_rate": 9.311161539870618e-05, "loss": 0.0183, "step": 16850 }, { "grad_norm": 0.2330964356660843, "learning_rate": 9.310114291566474e-05, "loss": 0.0151, "step": 16860 }, { "grad_norm": 0.21834783256053925, "learning_rate": 9.309066306781679e-05, "loss": 0.0145, "step": 16870 }, { "grad_norm": 0.24738019704818726, "learning_rate": 9.308017585695306e-05, "loss": 0.0163, "step": 16880 }, { "grad_norm": 0.17857307195663452, "learning_rate": 9.306968128486552e-05, "loss": 0.0149, "step": 16890 }, { "grad_norm": 0.23162154853343964, "learning_rate": 9.30591793533474e-05, "loss": 0.0164, "step": 16900 }, { "grad_norm": 0.19484810531139374, "learning_rate": 9.304867006419321e-05, "loss": 0.0147, "step": 16910 }, { "grad_norm": 0.19130541384220123, "learning_rate": 9.303815341919868e-05, "loss": 0.0192, "step": 16920 }, { "grad_norm": 0.1497754603624344, "learning_rate": 9.302762942016084e-05, "loss": 0.0137, "step": 16930 }, { "grad_norm": 0.1549014300107956, "learning_rate": 9.301709806887792e-05, "loss": 0.0146, "step": 16940 }, { "grad_norm": 0.11985474824905396, "learning_rate": 9.300655936714948e-05, "loss": 0.0161, "step": 16950 }, { "grad_norm": 0.19335372745990753, "learning_rate": 9.299601331677627e-05, "loss": 0.0176, "step": 16960 }, { "grad_norm": 0.18046420812606812, "learning_rate": 9.298545991956033e-05, "loss": 0.0189, "step": 16970 }, { "grad_norm": 0.174277201294899, "learning_rate": 9.297489917730493e-05, "loss": 0.0159, "step": 16980 }, { "grad_norm": 0.15379254519939423, "learning_rate": 9.296433109181464e-05, "loss": 0.014, "step": 16990 }, { "grad_norm": 0.17667852342128754, "learning_rate": 9.295375566489523e-05, "loss": 0.0158, "step": 17000 }, { "grad_norm": 0.22008895874023438, "learning_rate": 9.294317289835379e-05, "loss": 0.0193, "step": 17010 }, { "grad_norm": 0.15606632828712463, "learning_rate": 9.293258279399859e-05, "loss": 0.0178, "step": 17020 }, { "grad_norm": 0.21387960016727448, "learning_rate": 9.292198535363919e-05, "loss": 0.0144, "step": 17030 }, { "grad_norm": 0.12404244393110275, "learning_rate": 9.291138057908641e-05, "loss": 0.0176, "step": 17040 }, { "grad_norm": 0.22140084207057953, "learning_rate": 9.290076847215234e-05, "loss": 0.0163, "step": 17050 }, { "grad_norm": 0.2028769701719284, "learning_rate": 9.289014903465025e-05, "loss": 0.0163, "step": 17060 }, { "grad_norm": 0.3112863600254059, "learning_rate": 9.287952226839475e-05, "loss": 0.0161, "step": 17070 }, { "grad_norm": 0.21812331676483154, "learning_rate": 9.286888817520164e-05, "loss": 0.0153, "step": 17080 }, { "grad_norm": 0.1678917109966278, "learning_rate": 9.285824675688803e-05, "loss": 0.0185, "step": 17090 }, { "grad_norm": 0.15170904994010925, "learning_rate": 9.28475980152722e-05, "loss": 0.0144, "step": 17100 }, { "grad_norm": 0.23351740837097168, "learning_rate": 9.283694195217379e-05, "loss": 0.0142, "step": 17110 }, { "grad_norm": 0.17536482214927673, "learning_rate": 9.282627856941356e-05, "loss": 0.0166, "step": 17120 }, { "grad_norm": 0.21259622275829315, "learning_rate": 9.281560786881363e-05, "loss": 0.0148, "step": 17130 }, { "grad_norm": 0.1909111589193344, "learning_rate": 9.280492985219733e-05, "loss": 0.0178, "step": 17140 }, { "grad_norm": 0.1574559211730957, "learning_rate": 9.279424452138924e-05, "loss": 0.015, "step": 17150 }, { "grad_norm": 0.2004327028989792, "learning_rate": 9.278355187821517e-05, "loss": 0.0168, "step": 17160 }, { "grad_norm": 0.1929813176393509, "learning_rate": 9.277285192450224e-05, "loss": 0.0137, "step": 17170 }, { "grad_norm": 0.20977206528186798, "learning_rate": 9.276214466207875e-05, "loss": 0.0138, "step": 17180 }, { "grad_norm": 0.1388510763645172, "learning_rate": 9.275143009277427e-05, "loss": 0.0169, "step": 17190 }, { "grad_norm": 0.20908133685588837, "learning_rate": 9.274070821841964e-05, "loss": 0.0135, "step": 17200 }, { "grad_norm": 0.18082541227340698, "learning_rate": 9.272997904084696e-05, "loss": 0.0123, "step": 17210 }, { "grad_norm": 0.22460603713989258, "learning_rate": 9.271924256188951e-05, "loss": 0.015, "step": 17220 }, { "grad_norm": 0.18679288029670715, "learning_rate": 9.270849878338189e-05, "loss": 0.0175, "step": 17230 }, { "grad_norm": 0.19336962699890137, "learning_rate": 9.269774770715991e-05, "loss": 0.0169, "step": 17240 }, { "grad_norm": 0.16160130500793457, "learning_rate": 9.268698933506061e-05, "loss": 0.014, "step": 17250 }, { "grad_norm": 0.15637333691120148, "learning_rate": 9.267622366892235e-05, "loss": 0.0161, "step": 17260 }, { "grad_norm": 0.2030116766691208, "learning_rate": 9.266545071058465e-05, "loss": 0.0167, "step": 17270 }, { "grad_norm": 0.16385041177272797, "learning_rate": 9.265467046188833e-05, "loss": 0.0144, "step": 17280 }, { "grad_norm": 0.1547798365354538, "learning_rate": 9.264388292467543e-05, "loss": 0.0133, "step": 17290 }, { "grad_norm": 0.1895754635334015, "learning_rate": 9.263308810078926e-05, "loss": 0.0149, "step": 17300 }, { "grad_norm": 0.17389707267284393, "learning_rate": 9.262228599207434e-05, "loss": 0.0147, "step": 17310 }, { "grad_norm": 0.17974016070365906, "learning_rate": 9.261147660037647e-05, "loss": 0.0154, "step": 17320 }, { "grad_norm": 0.2273060530424118, "learning_rate": 9.26006599275427e-05, "loss": 0.0177, "step": 17330 }, { "grad_norm": 0.2321760207414627, "learning_rate": 9.258983597542124e-05, "loss": 0.0154, "step": 17340 }, { "grad_norm": 0.1736520230770111, "learning_rate": 9.257900474586167e-05, "loss": 0.0171, "step": 17350 }, { "grad_norm": 0.1273392140865326, "learning_rate": 9.256816624071471e-05, "loss": 0.0148, "step": 17360 }, { "grad_norm": 0.1427561193704605, "learning_rate": 9.25573204618324e-05, "loss": 0.0138, "step": 17370 }, { "grad_norm": 0.17800211906433105, "learning_rate": 9.254646741106796e-05, "loss": 0.0155, "step": 17380 }, { "grad_norm": 0.171278178691864, "learning_rate": 9.253560709027589e-05, "loss": 0.0153, "step": 17390 }, { "grad_norm": 0.1948280781507492, "learning_rate": 9.252473950131192e-05, "loss": 0.0165, "step": 17400 }, { "grad_norm": 0.14672093093395233, "learning_rate": 9.251386464603302e-05, "loss": 0.016, "step": 17410 }, { "grad_norm": 0.17682453989982605, "learning_rate": 9.250298252629741e-05, "loss": 0.0147, "step": 17420 }, { "grad_norm": 0.20124387741088867, "learning_rate": 9.249209314396454e-05, "loss": 0.0138, "step": 17430 }, { "grad_norm": 0.18228666484355927, "learning_rate": 9.248119650089513e-05, "loss": 0.0162, "step": 17440 }, { "grad_norm": 0.23797069489955902, "learning_rate": 9.247029259895108e-05, "loss": 0.0152, "step": 17450 }, { "grad_norm": 0.1733875423669815, "learning_rate": 9.24593814399956e-05, "loss": 0.0134, "step": 17460 }, { "grad_norm": 0.16580912470817566, "learning_rate": 9.244846302589309e-05, "loss": 0.0156, "step": 17470 }, { "grad_norm": 0.2288675755262375, "learning_rate": 9.243753735850923e-05, "loss": 0.0162, "step": 17480 }, { "grad_norm": 0.1953766942024231, "learning_rate": 9.24266044397109e-05, "loss": 0.0165, "step": 17490 }, { "grad_norm": 0.1379387229681015, "learning_rate": 9.241566427136624e-05, "loss": 0.015, "step": 17500 }, { "grad_norm": 0.21355001628398895, "learning_rate": 9.240471685534463e-05, "loss": 0.016, "step": 17510 }, { "grad_norm": 0.2001180499792099, "learning_rate": 9.239376219351667e-05, "loss": 0.0126, "step": 17520 }, { "grad_norm": 0.194303497672081, "learning_rate": 9.238280028775425e-05, "loss": 0.0159, "step": 17530 }, { "grad_norm": 0.24694356322288513, "learning_rate": 9.237183113993041e-05, "loss": 0.0149, "step": 17540 }, { "grad_norm": 0.18623821437358856, "learning_rate": 9.236085475191952e-05, "loss": 0.0139, "step": 17550 }, { "grad_norm": 0.1796272248029709, "learning_rate": 9.234987112559709e-05, "loss": 0.016, "step": 17560 }, { "grad_norm": 0.22025157511234283, "learning_rate": 9.233888026283999e-05, "loss": 0.0166, "step": 17570 }, { "grad_norm": 0.14445637166500092, "learning_rate": 9.232788216552619e-05, "loss": 0.0161, "step": 17580 }, { "grad_norm": 0.11359013617038727, "learning_rate": 9.231687683553502e-05, "loss": 0.0144, "step": 17590 }, { "grad_norm": 0.1419796496629715, "learning_rate": 9.230586427474698e-05, "loss": 0.012, "step": 17600 }, { "grad_norm": 0.1682516485452652, "learning_rate": 9.229484448504379e-05, "loss": 0.0147, "step": 17610 }, { "grad_norm": 0.15540242195129395, "learning_rate": 9.228381746830843e-05, "loss": 0.0142, "step": 17620 }, { "grad_norm": 0.21465162932872772, "learning_rate": 9.227278322642514e-05, "loss": 0.0157, "step": 17630 }, { "grad_norm": 0.14892235398292542, "learning_rate": 9.226174176127937e-05, "loss": 0.0147, "step": 17640 }, { "grad_norm": 0.15291254222393036, "learning_rate": 9.22506930747578e-05, "loss": 0.0112, "step": 17650 }, { "grad_norm": 0.15935440361499786, "learning_rate": 9.223963716874831e-05, "loss": 0.0119, "step": 17660 }, { "grad_norm": 0.1546660214662552, "learning_rate": 9.222857404514012e-05, "loss": 0.0146, "step": 17670 }, { "grad_norm": 0.12865495681762695, "learning_rate": 9.221750370582355e-05, "loss": 0.0172, "step": 17680 }, { "grad_norm": 0.1847829669713974, "learning_rate": 9.220642615269028e-05, "loss": 0.0148, "step": 17690 }, { "grad_norm": 0.2012481391429901, "learning_rate": 9.219534138763311e-05, "loss": 0.0153, "step": 17700 }, { "grad_norm": 0.185409814119339, "learning_rate": 9.218424941254613e-05, "loss": 0.0142, "step": 17710 }, { "grad_norm": 0.17361003160476685, "learning_rate": 9.217315022932468e-05, "loss": 0.018, "step": 17720 }, { "grad_norm": 0.1442561000585556, "learning_rate": 9.216204383986528e-05, "loss": 0.014, "step": 17730 }, { "grad_norm": 0.1711854636669159, "learning_rate": 9.215093024606574e-05, "loss": 0.0185, "step": 17740 }, { "grad_norm": 0.2131822407245636, "learning_rate": 9.213980944982506e-05, "loss": 0.0145, "step": 17750 }, { "grad_norm": 0.23410001397132874, "learning_rate": 9.212868145304346e-05, "loss": 0.0201, "step": 17760 }, { "grad_norm": 0.2129627913236618, "learning_rate": 9.211754625762241e-05, "loss": 0.0171, "step": 17770 }, { "grad_norm": 0.17919035255908966, "learning_rate": 9.210640386546463e-05, "loss": 0.0153, "step": 17780 }, { "grad_norm": 0.1396772861480713, "learning_rate": 9.209525427847405e-05, "loss": 0.0136, "step": 17790 }, { "grad_norm": 0.24206286668777466, "learning_rate": 9.208409749855583e-05, "loss": 0.0142, "step": 17800 }, { "grad_norm": 0.17839226126670837, "learning_rate": 9.207293352761633e-05, "loss": 0.017, "step": 17810 }, { "grad_norm": 0.1518133580684662, "learning_rate": 9.206176236756319e-05, "loss": 0.0151, "step": 17820 }, { "grad_norm": 0.15279167890548706, "learning_rate": 9.205058402030525e-05, "loss": 0.0129, "step": 17830 }, { "grad_norm": 0.14225010573863983, "learning_rate": 9.203939848775259e-05, "loss": 0.0149, "step": 17840 }, { "grad_norm": 0.21409770846366882, "learning_rate": 9.202820577181652e-05, "loss": 0.0133, "step": 17850 }, { "grad_norm": 0.21313288807868958, "learning_rate": 9.201700587440953e-05, "loss": 0.0163, "step": 17860 }, { "grad_norm": 0.2127433568239212, "learning_rate": 9.200579879744544e-05, "loss": 0.0169, "step": 17870 }, { "grad_norm": 0.2076033502817154, "learning_rate": 9.199458454283918e-05, "loss": 0.0145, "step": 17880 }, { "grad_norm": 0.20250658690929413, "learning_rate": 9.198336311250697e-05, "loss": 0.0165, "step": 17890 }, { "grad_norm": 0.20180180668830872, "learning_rate": 9.197213450836626e-05, "loss": 0.0163, "step": 17900 }, { "grad_norm": 0.19497458636760712, "learning_rate": 9.19608987323357e-05, "loss": 0.0154, "step": 17910 }, { "grad_norm": 0.1647719293832779, "learning_rate": 9.194965578633517e-05, "loss": 0.0162, "step": 17920 }, { "grad_norm": 0.1812736690044403, "learning_rate": 9.193840567228582e-05, "loss": 0.0143, "step": 17930 }, { "grad_norm": 0.17275291681289673, "learning_rate": 9.192714839210994e-05, "loss": 0.0161, "step": 17940 }, { "grad_norm": 0.18675896525382996, "learning_rate": 9.19158839477311e-05, "loss": 0.0113, "step": 17950 }, { "grad_norm": 0.1946500837802887, "learning_rate": 9.190461234107411e-05, "loss": 0.0173, "step": 17960 }, { "grad_norm": 0.1683131903409958, "learning_rate": 9.189333357406496e-05, "loss": 0.0146, "step": 17970 }, { "grad_norm": 0.19408127665519714, "learning_rate": 9.188204764863089e-05, "loss": 0.0144, "step": 17980 }, { "grad_norm": 0.1528145968914032, "learning_rate": 9.187075456670033e-05, "loss": 0.0158, "step": 17990 }, { "grad_norm": 0.1805361956357956, "learning_rate": 9.1859454330203e-05, "loss": 0.0139, "step": 18000 }, { "grad_norm": 0.20382894575595856, "learning_rate": 9.18481469410698e-05, "loss": 0.0159, "step": 18010 }, { "grad_norm": 0.15340879559516907, "learning_rate": 9.183683240123281e-05, "loss": 0.0139, "step": 18020 }, { "grad_norm": 0.12306004762649536, "learning_rate": 9.182551071262541e-05, "loss": 0.0134, "step": 18030 }, { "grad_norm": 0.17450320720672607, "learning_rate": 9.181418187718218e-05, "loss": 0.0149, "step": 18040 }, { "grad_norm": 0.14664526283740997, "learning_rate": 9.180284589683888e-05, "loss": 0.0147, "step": 18050 }, { "grad_norm": 0.18246445059776306, "learning_rate": 9.17915027735325e-05, "loss": 0.0166, "step": 18060 }, { "grad_norm": 0.18822081387043, "learning_rate": 9.178015250920133e-05, "loss": 0.018, "step": 18070 }, { "grad_norm": 0.22067944705486298, "learning_rate": 9.176879510578477e-05, "loss": 0.0127, "step": 18080 }, { "grad_norm": 0.19958707690238953, "learning_rate": 9.17574305652235e-05, "loss": 0.0157, "step": 18090 }, { "grad_norm": 0.186444953083992, "learning_rate": 9.174605888945942e-05, "loss": 0.0158, "step": 18100 }, { "grad_norm": 0.16011279821395874, "learning_rate": 9.173468008043564e-05, "loss": 0.0141, "step": 18110 }, { "grad_norm": 0.16918177902698517, "learning_rate": 9.172329414009648e-05, "loss": 0.0189, "step": 18120 }, { "grad_norm": 0.1825774759054184, "learning_rate": 9.171190107038747e-05, "loss": 0.0146, "step": 18130 }, { "grad_norm": 0.19020673632621765, "learning_rate": 9.170050087325541e-05, "loss": 0.0119, "step": 18140 }, { "grad_norm": 0.15742170810699463, "learning_rate": 9.168909355064824e-05, "loss": 0.0117, "step": 18150 }, { "grad_norm": 0.22082039713859558, "learning_rate": 9.167767910451519e-05, "loss": 0.0147, "step": 18160 }, { "grad_norm": 0.234725683927536, "learning_rate": 9.166625753680669e-05, "loss": 0.014, "step": 18170 }, { "grad_norm": 0.19388574361801147, "learning_rate": 9.165482884947431e-05, "loss": 0.0138, "step": 18180 }, { "grad_norm": 0.1507689505815506, "learning_rate": 9.164339304447098e-05, "loss": 0.0155, "step": 18190 }, { "grad_norm": 0.1709357351064682, "learning_rate": 9.163195012375072e-05, "loss": 0.0155, "step": 18200 }, { "grad_norm": 0.15027908980846405, "learning_rate": 9.16205000892688e-05, "loss": 0.0138, "step": 18210 }, { "grad_norm": 0.14612005650997162, "learning_rate": 9.160904294298175e-05, "loss": 0.0126, "step": 18220 }, { "grad_norm": 0.17368662357330322, "learning_rate": 9.159757868684727e-05, "loss": 0.0157, "step": 18230 }, { "grad_norm": 0.16817724704742432, "learning_rate": 9.15861073228243e-05, "loss": 0.0175, "step": 18240 }, { "grad_norm": 0.15866069495677948, "learning_rate": 9.157462885287296e-05, "loss": 0.0154, "step": 18250 }, { "grad_norm": 0.19659653306007385, "learning_rate": 9.156314327895461e-05, "loss": 0.0147, "step": 18260 }, { "grad_norm": 0.15384455025196075, "learning_rate": 9.155165060303185e-05, "loss": 0.0166, "step": 18270 }, { "grad_norm": 0.14863774180412292, "learning_rate": 9.154015082706841e-05, "loss": 0.0141, "step": 18280 }, { "grad_norm": 0.15449585020542145, "learning_rate": 9.152864395302936e-05, "loss": 0.0118, "step": 18290 }, { "grad_norm": 0.17421191930770874, "learning_rate": 9.151712998288085e-05, "loss": 0.0128, "step": 18300 }, { "grad_norm": 0.20968681573867798, "learning_rate": 9.150560891859031e-05, "loss": 0.018, "step": 18310 }, { "grad_norm": 0.21650558710098267, "learning_rate": 9.14940807621264e-05, "loss": 0.0136, "step": 18320 }, { "grad_norm": 0.15670648217201233, "learning_rate": 9.148254551545894e-05, "loss": 0.017, "step": 18330 }, { "grad_norm": 0.23580265045166016, "learning_rate": 9.147100318055901e-05, "loss": 0.0154, "step": 18340 }, { "grad_norm": 0.15677814185619354, "learning_rate": 9.145945375939888e-05, "loss": 0.0135, "step": 18350 }, { "grad_norm": 0.20806443691253662, "learning_rate": 9.144789725395203e-05, "loss": 0.013, "step": 18360 }, { "grad_norm": 0.17741714417934418, "learning_rate": 9.14363336661931e-05, "loss": 0.0122, "step": 18370 }, { "grad_norm": 0.19178232550621033, "learning_rate": 9.142476299809806e-05, "loss": 0.0161, "step": 18380 }, { "grad_norm": 0.1400037556886673, "learning_rate": 9.1413185251644e-05, "loss": 0.0125, "step": 18390 }, { "grad_norm": 0.13288475573062897, "learning_rate": 9.140160042880923e-05, "loss": 0.0137, "step": 18400 }, { "grad_norm": 0.18454281985759735, "learning_rate": 9.139000853157327e-05, "loss": 0.0143, "step": 18410 }, { "grad_norm": 0.1912003457546234, "learning_rate": 9.137840956191688e-05, "loss": 0.0135, "step": 18420 }, { "grad_norm": 0.15829360485076904, "learning_rate": 9.136680352182199e-05, "loss": 0.0153, "step": 18430 }, { "grad_norm": 0.18152327835559845, "learning_rate": 9.135519041327177e-05, "loss": 0.0145, "step": 18440 }, { "grad_norm": 0.2319052517414093, "learning_rate": 9.134357023825058e-05, "loss": 0.0154, "step": 18450 }, { "grad_norm": 0.2022933065891266, "learning_rate": 9.133194299874398e-05, "loss": 0.0127, "step": 18460 }, { "grad_norm": 0.1712590456008911, "learning_rate": 9.132030869673876e-05, "loss": 0.0139, "step": 18470 }, { "grad_norm": 0.20038795471191406, "learning_rate": 9.130866733422288e-05, "loss": 0.0129, "step": 18480 }, { "grad_norm": 0.14008693397045135, "learning_rate": 9.129701891318556e-05, "loss": 0.0158, "step": 18490 }, { "grad_norm": 0.1762465536594391, "learning_rate": 9.128536343561718e-05, "loss": 0.0148, "step": 18500 }, { "grad_norm": 0.151472270488739, "learning_rate": 9.127370090350934e-05, "loss": 0.0152, "step": 18510 }, { "grad_norm": 0.17356179654598236, "learning_rate": 9.126203131885487e-05, "loss": 0.0149, "step": 18520 }, { "grad_norm": 0.11959119886159897, "learning_rate": 9.125035468364775e-05, "loss": 0.0134, "step": 18530 }, { "grad_norm": 0.18906265497207642, "learning_rate": 9.123867099988322e-05, "loss": 0.015, "step": 18540 }, { "grad_norm": 0.2180686593055725, "learning_rate": 9.122698026955769e-05, "loss": 0.0145, "step": 18550 }, { "grad_norm": 0.16667287051677704, "learning_rate": 9.12152824946688e-05, "loss": 0.0148, "step": 18560 }, { "grad_norm": 0.14874957501888275, "learning_rate": 9.120357767721538e-05, "loss": 0.0134, "step": 18570 }, { "grad_norm": 0.18015576899051666, "learning_rate": 9.119186581919745e-05, "loss": 0.0126, "step": 18580 }, { "grad_norm": 0.23122794926166534, "learning_rate": 9.118014692261624e-05, "loss": 0.0155, "step": 18590 }, { "grad_norm": 0.17063017189502716, "learning_rate": 9.116842098947422e-05, "loss": 0.0137, "step": 18600 }, { "grad_norm": 0.2004876434803009, "learning_rate": 9.115668802177499e-05, "loss": 0.0173, "step": 18610 }, { "grad_norm": 0.15322624146938324, "learning_rate": 9.114494802152342e-05, "loss": 0.0149, "step": 18620 }, { "grad_norm": 0.18772350251674652, "learning_rate": 9.113320099072555e-05, "loss": 0.0157, "step": 18630 }, { "grad_norm": 0.164973646402359, "learning_rate": 9.112144693138864e-05, "loss": 0.0119, "step": 18640 }, { "grad_norm": 0.1461000293493271, "learning_rate": 9.110968584552111e-05, "loss": 0.0128, "step": 18650 }, { "grad_norm": 0.10834341496229172, "learning_rate": 9.109791773513264e-05, "loss": 0.0126, "step": 18660 }, { "grad_norm": 0.1767684817314148, "learning_rate": 9.108614260223403e-05, "loss": 0.0126, "step": 18670 }, { "grad_norm": 0.18695347011089325, "learning_rate": 9.107436044883738e-05, "loss": 0.0161, "step": 18680 }, { "grad_norm": 0.22841498255729675, "learning_rate": 9.10625712769559e-05, "loss": 0.0145, "step": 18690 }, { "grad_norm": 0.164154514670372, "learning_rate": 9.105077508860406e-05, "loss": 0.0138, "step": 18700 }, { "grad_norm": 0.1541796624660492, "learning_rate": 9.103897188579751e-05, "loss": 0.0137, "step": 18710 }, { "grad_norm": 0.20747517049312592, "learning_rate": 9.102716167055308e-05, "loss": 0.0135, "step": 18720 }, { "grad_norm": 0.14527873694896698, "learning_rate": 9.10153444448888e-05, "loss": 0.016, "step": 18730 }, { "grad_norm": 0.17457233369350433, "learning_rate": 9.100352021082393e-05, "loss": 0.0147, "step": 18740 }, { "grad_norm": 0.177158385515213, "learning_rate": 9.099168897037891e-05, "loss": 0.0126, "step": 18750 }, { "grad_norm": 0.1573837846517563, "learning_rate": 9.097985072557538e-05, "loss": 0.0106, "step": 18760 }, { "grad_norm": 0.21297191083431244, "learning_rate": 9.096800547843615e-05, "loss": 0.0135, "step": 18770 }, { "grad_norm": 0.15280640125274658, "learning_rate": 9.095615323098526e-05, "loss": 0.0149, "step": 18780 }, { "grad_norm": 0.15483854711055756, "learning_rate": 9.094429398524795e-05, "loss": 0.0114, "step": 18790 }, { "grad_norm": 0.1657351404428482, "learning_rate": 9.093242774325061e-05, "loss": 0.0137, "step": 18800 }, { "grad_norm": 0.12052399665117264, "learning_rate": 9.092055450702088e-05, "loss": 0.012, "step": 18810 }, { "grad_norm": 0.19481724500656128, "learning_rate": 9.090867427858756e-05, "loss": 0.0167, "step": 18820 }, { "grad_norm": 0.15054042637348175, "learning_rate": 9.089678705998066e-05, "loss": 0.0133, "step": 18830 }, { "grad_norm": 0.17224569618701935, "learning_rate": 9.088489285323139e-05, "loss": 0.0135, "step": 18840 }, { "grad_norm": 0.19299139082431793, "learning_rate": 9.087299166037212e-05, "loss": 0.0135, "step": 18850 }, { "grad_norm": 0.19365514814853668, "learning_rate": 9.086108348343647e-05, "loss": 0.0109, "step": 18860 }, { "grad_norm": 0.15515057742595673, "learning_rate": 9.08491683244592e-05, "loss": 0.0114, "step": 18870 }, { "grad_norm": 0.1630600243806839, "learning_rate": 9.08372461854763e-05, "loss": 0.014, "step": 18880 }, { "grad_norm": 0.22788643836975098, "learning_rate": 9.082531706852492e-05, "loss": 0.0157, "step": 18890 }, { "grad_norm": 0.19533716142177582, "learning_rate": 9.081338097564342e-05, "loss": 0.0131, "step": 18900 }, { "grad_norm": 0.16461911797523499, "learning_rate": 9.080143790887137e-05, "loss": 0.014, "step": 18910 }, { "grad_norm": 0.16594178974628448, "learning_rate": 9.07894878702495e-05, "loss": 0.0146, "step": 18920 }, { "grad_norm": 0.21016667783260345, "learning_rate": 9.077753086181974e-05, "loss": 0.0153, "step": 18930 }, { "grad_norm": 0.20934933423995972, "learning_rate": 9.076556688562524e-05, "loss": 0.0142, "step": 18940 }, { "grad_norm": 0.1933712512254715, "learning_rate": 9.075359594371029e-05, "loss": 0.0159, "step": 18950 }, { "grad_norm": 0.20115256309509277, "learning_rate": 9.07416180381204e-05, "loss": 0.0173, "step": 18960 }, { "grad_norm": 0.18323475122451782, "learning_rate": 9.072963317090228e-05, "loss": 0.016, "step": 18970 }, { "grad_norm": 0.14180290699005127, "learning_rate": 9.071764134410382e-05, "loss": 0.0154, "step": 18980 }, { "grad_norm": 0.10982415825128555, "learning_rate": 9.070564255977407e-05, "loss": 0.0145, "step": 18990 }, { "grad_norm": 0.19468535482883453, "learning_rate": 9.06936368199633e-05, "loss": 0.0146, "step": 19000 }, { "grad_norm": 0.17123889923095703, "learning_rate": 9.0681624126723e-05, "loss": 0.014, "step": 19010 }, { "grad_norm": 0.15318985283374786, "learning_rate": 9.066960448210576e-05, "loss": 0.0136, "step": 19020 }, { "grad_norm": 0.16471454501152039, "learning_rate": 9.065757788816543e-05, "loss": 0.0165, "step": 19030 }, { "grad_norm": 0.15873272716999054, "learning_rate": 9.064554434695705e-05, "loss": 0.0119, "step": 19040 }, { "grad_norm": 0.17149707674980164, "learning_rate": 9.063350386053677e-05, "loss": 0.0129, "step": 19050 }, { "grad_norm": 0.14411050081253052, "learning_rate": 9.062145643096202e-05, "loss": 0.0128, "step": 19060 }, { "grad_norm": 0.1751519739627838, "learning_rate": 9.060940206029136e-05, "loss": 0.0147, "step": 19070 }, { "grad_norm": 0.17983992397785187, "learning_rate": 9.059734075058457e-05, "loss": 0.0172, "step": 19080 }, { "grad_norm": 0.17846184968948364, "learning_rate": 9.058527250390257e-05, "loss": 0.0155, "step": 19090 }, { "grad_norm": 0.16968998312950134, "learning_rate": 9.057319732230752e-05, "loss": 0.0136, "step": 19100 }, { "grad_norm": 0.12252268195152283, "learning_rate": 9.056111520786273e-05, "loss": 0.0135, "step": 19110 }, { "grad_norm": 0.15326760709285736, "learning_rate": 9.054902616263268e-05, "loss": 0.0141, "step": 19120 }, { "grad_norm": 0.1501753032207489, "learning_rate": 9.05369301886831e-05, "loss": 0.0147, "step": 19130 }, { "grad_norm": 0.13745062053203583, "learning_rate": 9.052482728808083e-05, "loss": 0.0131, "step": 19140 }, { "grad_norm": 0.14967240393161774, "learning_rate": 9.051271746289391e-05, "loss": 0.0128, "step": 19150 }, { "grad_norm": 0.1540239453315735, "learning_rate": 9.050060071519162e-05, "loss": 0.0119, "step": 19160 }, { "grad_norm": 0.19994568824768066, "learning_rate": 9.048847704704437e-05, "loss": 0.0133, "step": 19170 }, { "grad_norm": 0.15436820685863495, "learning_rate": 9.047634646052376e-05, "loss": 0.0164, "step": 19180 }, { "grad_norm": 0.17681877315044403, "learning_rate": 9.046420895770256e-05, "loss": 0.014, "step": 19190 }, { "grad_norm": 0.17202840745449066, "learning_rate": 9.045206454065473e-05, "loss": 0.0131, "step": 19200 }, { "grad_norm": 0.17109909653663635, "learning_rate": 9.043991321145546e-05, "loss": 0.0116, "step": 19210 }, { "grad_norm": 0.16465319693088531, "learning_rate": 9.042775497218105e-05, "loss": 0.0111, "step": 19220 }, { "grad_norm": 0.12419386208057404, "learning_rate": 9.041558982490901e-05, "loss": 0.0125, "step": 19230 }, { "grad_norm": 0.15776966512203217, "learning_rate": 9.040341777171805e-05, "loss": 0.0114, "step": 19240 }, { "grad_norm": 0.1328745037317276, "learning_rate": 9.039123881468802e-05, "loss": 0.0154, "step": 19250 }, { "grad_norm": 0.20106706023216248, "learning_rate": 9.037905295589998e-05, "loss": 0.0124, "step": 19260 }, { "grad_norm": 0.15983803570270538, "learning_rate": 9.036686019743617e-05, "loss": 0.0127, "step": 19270 }, { "grad_norm": 0.12670937180519104, "learning_rate": 9.035466054137997e-05, "loss": 0.0146, "step": 19280 }, { "grad_norm": 0.14453601837158203, "learning_rate": 9.0342453989816e-05, "loss": 0.015, "step": 19290 }, { "grad_norm": 0.13865071535110474, "learning_rate": 9.033024054483e-05, "loss": 0.0121, "step": 19300 }, { "grad_norm": 0.17834946513175964, "learning_rate": 9.031802020850894e-05, "loss": 0.0121, "step": 19310 }, { "grad_norm": 0.18375751376152039, "learning_rate": 9.030579298294092e-05, "loss": 0.0126, "step": 19320 }, { "grad_norm": 0.13923580944538116, "learning_rate": 9.029355887021524e-05, "loss": 0.0168, "step": 19330 }, { "grad_norm": 0.20416978001594543, "learning_rate": 9.028131787242238e-05, "loss": 0.0141, "step": 19340 }, { "grad_norm": 0.1441822499036789, "learning_rate": 9.026906999165399e-05, "loss": 0.0122, "step": 19350 }, { "grad_norm": 0.23256756365299225, "learning_rate": 9.025681523000291e-05, "loss": 0.0142, "step": 19360 }, { "grad_norm": 0.19255901873111725, "learning_rate": 9.024455358956315e-05, "loss": 0.0119, "step": 19370 }, { "grad_norm": 0.19259043037891388, "learning_rate": 9.023228507242984e-05, "loss": 0.0142, "step": 19380 }, { "grad_norm": 0.138695627450943, "learning_rate": 9.022000968069937e-05, "loss": 0.0149, "step": 19390 }, { "grad_norm": 0.17717097699642181, "learning_rate": 9.020772741646928e-05, "loss": 0.0126, "step": 19400 }, { "grad_norm": 0.1275445520877838, "learning_rate": 9.019543828183826e-05, "loss": 0.0157, "step": 19410 }, { "grad_norm": 0.17810706794261932, "learning_rate": 9.018314227890616e-05, "loss": 0.0139, "step": 19420 }, { "grad_norm": 0.1478692889213562, "learning_rate": 9.017083940977408e-05, "loss": 0.013, "step": 19430 }, { "grad_norm": 0.17625267803668976, "learning_rate": 9.015852967654422e-05, "loss": 0.0135, "step": 19440 }, { "grad_norm": 0.21869826316833496, "learning_rate": 9.014621308131996e-05, "loss": 0.0144, "step": 19450 }, { "grad_norm": 0.25416070222854614, "learning_rate": 9.01338896262059e-05, "loss": 0.0127, "step": 19460 }, { "grad_norm": 0.14610427618026733, "learning_rate": 9.012155931330777e-05, "loss": 0.0142, "step": 19470 }, { "grad_norm": 0.1920468807220459, "learning_rate": 9.010922214473246e-05, "loss": 0.015, "step": 19480 }, { "grad_norm": 0.21122747659683228, "learning_rate": 9.009687812258808e-05, "loss": 0.0122, "step": 19490 }, { "grad_norm": 0.1683853715658188, "learning_rate": 9.00845272489839e-05, "loss": 0.0126, "step": 19500 }, { "grad_norm": 0.15695056319236755, "learning_rate": 9.007216952603031e-05, "loss": 0.0156, "step": 19510 }, { "grad_norm": 0.13854561746120453, "learning_rate": 9.005980495583894e-05, "loss": 0.0128, "step": 19520 }, { "grad_norm": 0.18265250325202942, "learning_rate": 9.004743354052252e-05, "loss": 0.0139, "step": 19530 }, { "grad_norm": 0.1315876692533493, "learning_rate": 9.003505528219503e-05, "loss": 0.0128, "step": 19540 }, { "grad_norm": 0.21250075101852417, "learning_rate": 9.002267018297154e-05, "loss": 0.0151, "step": 19550 }, { "grad_norm": 0.19733354449272156, "learning_rate": 9.001027824496834e-05, "loss": 0.0124, "step": 19560 }, { "grad_norm": 0.1846744567155838, "learning_rate": 8.999787947030287e-05, "loss": 0.0142, "step": 19570 }, { "grad_norm": 0.1329020857810974, "learning_rate": 8.998547386109376e-05, "loss": 0.0099, "step": 19580 }, { "grad_norm": 0.2050841897726059, "learning_rate": 8.997306141946073e-05, "loss": 0.0146, "step": 19590 }, { "grad_norm": 0.1628722995519638, "learning_rate": 8.996064214752481e-05, "loss": 0.0161, "step": 19600 }, { "grad_norm": 0.1931181102991104, "learning_rate": 8.994821604740806e-05, "loss": 0.0126, "step": 19610 }, { "grad_norm": 0.22447673976421356, "learning_rate": 8.993578312123377e-05, "loss": 0.0151, "step": 19620 }, { "grad_norm": 0.18643689155578613, "learning_rate": 8.992334337112639e-05, "loss": 0.0129, "step": 19630 }, { "grad_norm": 0.1873546838760376, "learning_rate": 8.991089679921154e-05, "loss": 0.0166, "step": 19640 }, { "grad_norm": 0.1652456670999527, "learning_rate": 8.989844340761599e-05, "loss": 0.0137, "step": 19650 }, { "grad_norm": 0.21319974958896637, "learning_rate": 8.988598319846768e-05, "loss": 0.0132, "step": 19660 }, { "grad_norm": 0.1481136828660965, "learning_rate": 8.987351617389574e-05, "loss": 0.0137, "step": 19670 }, { "grad_norm": 0.16862550377845764, "learning_rate": 8.98610423360304e-05, "loss": 0.0156, "step": 19680 }, { "grad_norm": 0.16283193230628967, "learning_rate": 8.984856168700317e-05, "loss": 0.0134, "step": 19690 }, { "grad_norm": 0.11692789942026138, "learning_rate": 8.983607422894658e-05, "loss": 0.0128, "step": 19700 }, { "grad_norm": 0.16222110390663147, "learning_rate": 8.982357996399442e-05, "loss": 0.0154, "step": 19710 }, { "grad_norm": 0.14174796640872955, "learning_rate": 8.981107889428164e-05, "loss": 0.0122, "step": 19720 }, { "grad_norm": 0.1770855039358139, "learning_rate": 8.979857102194428e-05, "loss": 0.016, "step": 19730 }, { "grad_norm": 0.21254116296768188, "learning_rate": 8.978605634911968e-05, "loss": 0.014, "step": 19740 }, { "grad_norm": 0.1676366925239563, "learning_rate": 8.977353487794616e-05, "loss": 0.0137, "step": 19750 }, { "grad_norm": 0.18452998995780945, "learning_rate": 8.976100661056334e-05, "loss": 0.0125, "step": 19760 }, { "grad_norm": 0.21996045112609863, "learning_rate": 8.974847154911197e-05, "loss": 0.0155, "step": 19770 }, { "grad_norm": 0.19013026356697083, "learning_rate": 8.973592969573393e-05, "loss": 0.0137, "step": 19780 }, { "grad_norm": 0.1314350813627243, "learning_rate": 8.972338105257228e-05, "loss": 0.0153, "step": 19790 }, { "grad_norm": 0.12646959722042084, "learning_rate": 8.971082562177125e-05, "loss": 0.0129, "step": 19800 }, { "grad_norm": 0.18236736953258514, "learning_rate": 8.96982634054762e-05, "loss": 0.0142, "step": 19810 }, { "grad_norm": 0.19839009642601013, "learning_rate": 8.96856944058337e-05, "loss": 0.0151, "step": 19820 }, { "grad_norm": 0.1880313754081726, "learning_rate": 8.967311862499144e-05, "loss": 0.0139, "step": 19830 }, { "grad_norm": 0.1860601156949997, "learning_rate": 8.966053606509825e-05, "loss": 0.0144, "step": 19840 }, { "grad_norm": 0.16239745914936066, "learning_rate": 8.964794672830417e-05, "loss": 0.0121, "step": 19850 }, { "grad_norm": 0.14880549907684326, "learning_rate": 8.963535061676038e-05, "loss": 0.0142, "step": 19860 }, { "grad_norm": 0.17134444415569305, "learning_rate": 8.962274773261918e-05, "loss": 0.0153, "step": 19870 }, { "grad_norm": 0.1930658221244812, "learning_rate": 8.961013807803409e-05, "loss": 0.0154, "step": 19880 }, { "grad_norm": 0.1956813633441925, "learning_rate": 8.959752165515973e-05, "loss": 0.0138, "step": 19890 }, { "grad_norm": 0.2219718098640442, "learning_rate": 8.958489846615193e-05, "loss": 0.0144, "step": 19900 }, { "grad_norm": 0.20055319368839264, "learning_rate": 8.957226851316762e-05, "loss": 0.0178, "step": 19910 }, { "grad_norm": 0.1554572433233261, "learning_rate": 8.955963179836493e-05, "loss": 0.014, "step": 19920 }, { "grad_norm": 0.20797285437583923, "learning_rate": 8.954698832390312e-05, "loss": 0.0142, "step": 19930 }, { "grad_norm": 0.14583420753479004, "learning_rate": 8.953433809194263e-05, "loss": 0.0138, "step": 19940 }, { "grad_norm": 0.13616929948329926, "learning_rate": 8.9521681104645e-05, "loss": 0.0116, "step": 19950 }, { "grad_norm": 0.1652648150920868, "learning_rate": 8.9509017364173e-05, "loss": 0.0146, "step": 19960 }, { "grad_norm": 0.18458104133605957, "learning_rate": 8.949634687269052e-05, "loss": 0.0159, "step": 19970 }, { "grad_norm": 0.17755600810050964, "learning_rate": 8.948366963236259e-05, "loss": 0.0132, "step": 19980 }, { "grad_norm": 0.19062121212482452, "learning_rate": 8.947098564535538e-05, "loss": 0.0119, "step": 19990 }, { "grad_norm": 0.1547919064760208, "learning_rate": 8.945829491383627e-05, "loss": 0.0137, "step": 20000 }, { "grad_norm": 0.16712749004364014, "learning_rate": 8.944559743997374e-05, "loss": 0.0171, "step": 20010 }, { "grad_norm": 0.15045000612735748, "learning_rate": 8.943289322593746e-05, "loss": 0.0144, "step": 20020 }, { "grad_norm": 0.1265750527381897, "learning_rate": 8.942018227389821e-05, "loss": 0.0137, "step": 20030 }, { "grad_norm": 0.14640821516513824, "learning_rate": 8.940746458602795e-05, "loss": 0.0125, "step": 20040 }, { "grad_norm": 0.14573277533054352, "learning_rate": 8.939474016449979e-05, "loss": 0.0124, "step": 20050 }, { "grad_norm": 0.12674812972545624, "learning_rate": 8.938200901148799e-05, "loss": 0.0136, "step": 20060 }, { "grad_norm": 0.14912734925746918, "learning_rate": 8.936927112916795e-05, "loss": 0.0163, "step": 20070 }, { "grad_norm": 0.135565385222435, "learning_rate": 8.935652651971622e-05, "loss": 0.0137, "step": 20080 }, { "grad_norm": 0.21589724719524384, "learning_rate": 8.934377518531052e-05, "loss": 0.0154, "step": 20090 }, { "grad_norm": 0.21369117498397827, "learning_rate": 8.933101712812967e-05, "loss": 0.0186, "step": 20100 }, { "grad_norm": 0.22640763223171234, "learning_rate": 8.931825235035374e-05, "loss": 0.0155, "step": 20110 }, { "grad_norm": 0.18056857585906982, "learning_rate": 8.930548085416382e-05, "loss": 0.0157, "step": 20120 }, { "grad_norm": 0.17934750020503998, "learning_rate": 8.92927026417422e-05, "loss": 0.0149, "step": 20130 }, { "grad_norm": 0.17899715900421143, "learning_rate": 8.92799177152724e-05, "loss": 0.0136, "step": 20140 }, { "grad_norm": 0.18310704827308655, "learning_rate": 8.926712607693895e-05, "loss": 0.017, "step": 20150 }, { "grad_norm": 0.15883193910121918, "learning_rate": 8.925432772892762e-05, "loss": 0.0122, "step": 20160 }, { "grad_norm": 0.1803090125322342, "learning_rate": 8.924152267342529e-05, "loss": 0.0115, "step": 20170 }, { "grad_norm": 0.1118762269616127, "learning_rate": 8.922871091261998e-05, "loss": 0.0124, "step": 20180 }, { "grad_norm": 0.15347933769226074, "learning_rate": 8.92158924487009e-05, "loss": 0.0134, "step": 20190 }, { "grad_norm": 0.1477338671684265, "learning_rate": 8.920306728385834e-05, "loss": 0.0133, "step": 20200 }, { "grad_norm": 0.17612451314926147, "learning_rate": 8.919023542028379e-05, "loss": 0.0143, "step": 20210 }, { "grad_norm": 0.21617606282234192, "learning_rate": 8.917739686016988e-05, "loss": 0.0148, "step": 20220 }, { "grad_norm": 0.17714983224868774, "learning_rate": 8.916455160571033e-05, "loss": 0.0121, "step": 20230 }, { "grad_norm": 0.1893293708562851, "learning_rate": 8.915169965910008e-05, "loss": 0.0123, "step": 20240 }, { "grad_norm": 0.18036307394504547, "learning_rate": 8.913884102253514e-05, "loss": 0.0153, "step": 20250 }, { "grad_norm": 0.1751820147037506, "learning_rate": 8.912597569821273e-05, "loss": 0.0143, "step": 20260 }, { "grad_norm": 0.205885112285614, "learning_rate": 8.911310368833118e-05, "loss": 0.012, "step": 20270 }, { "grad_norm": 0.18704545497894287, "learning_rate": 8.910022499508994e-05, "loss": 0.0139, "step": 20280 }, { "grad_norm": 0.14935141801834106, "learning_rate": 8.908733962068965e-05, "loss": 0.0134, "step": 20290 }, { "grad_norm": 0.12389028072357178, "learning_rate": 8.907444756733207e-05, "loss": 0.0129, "step": 20300 }, { "grad_norm": 0.14901575446128845, "learning_rate": 8.906154883722006e-05, "loss": 0.015, "step": 20310 }, { "grad_norm": 0.20654843747615814, "learning_rate": 8.904864343255773e-05, "loss": 0.0128, "step": 20320 }, { "grad_norm": 0.17292805016040802, "learning_rate": 8.90357313555502e-05, "loss": 0.0135, "step": 20330 }, { "grad_norm": 0.18872950971126556, "learning_rate": 8.90228126084038e-05, "loss": 0.0129, "step": 20340 }, { "grad_norm": 0.16290771961212158, "learning_rate": 8.900988719332601e-05, "loss": 0.015, "step": 20350 }, { "grad_norm": 0.1481514424085617, "learning_rate": 8.899695511252542e-05, "loss": 0.014, "step": 20360 }, { "grad_norm": 0.1816163808107376, "learning_rate": 8.898401636821176e-05, "loss": 0.0143, "step": 20370 }, { "grad_norm": 0.23333920538425446, "learning_rate": 8.897107096259593e-05, "loss": 0.0158, "step": 20380 }, { "grad_norm": 0.15410113334655762, "learning_rate": 8.895811889788994e-05, "loss": 0.0154, "step": 20390 }, { "grad_norm": 0.12053615599870682, "learning_rate": 8.894516017630692e-05, "loss": 0.0145, "step": 20400 }, { "grad_norm": 0.18041875958442688, "learning_rate": 8.893219480006118e-05, "loss": 0.0131, "step": 20410 }, { "grad_norm": 0.16749168932437897, "learning_rate": 8.891922277136817e-05, "loss": 0.0101, "step": 20420 }, { "grad_norm": 0.16524547338485718, "learning_rate": 8.890624409244441e-05, "loss": 0.0104, "step": 20430 }, { "grad_norm": 0.14230455458164215, "learning_rate": 8.889325876550763e-05, "loss": 0.0129, "step": 20440 }, { "grad_norm": 0.19434860348701477, "learning_rate": 8.888026679277666e-05, "loss": 0.012, "step": 20450 }, { "grad_norm": 0.22512321174144745, "learning_rate": 8.886726817647147e-05, "loss": 0.0157, "step": 20460 }, { "grad_norm": 0.17009063065052032, "learning_rate": 8.885426291881319e-05, "loss": 0.0136, "step": 20470 }, { "grad_norm": 0.1543276607990265, "learning_rate": 8.884125102202401e-05, "loss": 0.0149, "step": 20480 }, { "grad_norm": 0.18976160883903503, "learning_rate": 8.882823248832736e-05, "loss": 0.0161, "step": 20490 }, { "grad_norm": 0.13555218279361725, "learning_rate": 8.881520731994772e-05, "loss": 0.0139, "step": 20500 }, { "grad_norm": 0.14013074338436127, "learning_rate": 8.880217551911077e-05, "loss": 0.013, "step": 20510 }, { "grad_norm": 0.15855464339256287, "learning_rate": 8.878913708804323e-05, "loss": 0.0171, "step": 20520 }, { "grad_norm": 0.14204365015029907, "learning_rate": 8.877609202897308e-05, "loss": 0.0154, "step": 20530 }, { "grad_norm": 0.11991337686777115, "learning_rate": 8.876304034412933e-05, "loss": 0.0118, "step": 20540 }, { "grad_norm": 0.1432671695947647, "learning_rate": 8.874998203574214e-05, "loss": 0.0136, "step": 20550 }, { "grad_norm": 0.14439018070697784, "learning_rate": 8.873691710604284e-05, "loss": 0.0152, "step": 20560 }, { "grad_norm": 0.10635829716920853, "learning_rate": 8.872384555726387e-05, "loss": 0.0132, "step": 20570 }, { "grad_norm": 0.18329071998596191, "learning_rate": 8.871076739163878e-05, "loss": 0.015, "step": 20580 }, { "grad_norm": 0.17236840724945068, "learning_rate": 8.86976826114023e-05, "loss": 0.0124, "step": 20590 }, { "grad_norm": 0.16411294043064117, "learning_rate": 8.868459121879023e-05, "loss": 0.0127, "step": 20600 }, { "grad_norm": 0.14076463878154755, "learning_rate": 8.867149321603956e-05, "loss": 0.0126, "step": 20610 }, { "grad_norm": 0.14183972775936127, "learning_rate": 8.865838860538835e-05, "loss": 0.0103, "step": 20620 }, { "grad_norm": 0.1915813386440277, "learning_rate": 8.864527738907585e-05, "loss": 0.0142, "step": 20630 }, { "grad_norm": 0.1764681488275528, "learning_rate": 8.863215956934239e-05, "loss": 0.0137, "step": 20640 }, { "grad_norm": 0.1811363697052002, "learning_rate": 8.861903514842947e-05, "loss": 0.0141, "step": 20650 }, { "grad_norm": 0.14658154547214508, "learning_rate": 8.860590412857966e-05, "loss": 0.0156, "step": 20660 }, { "grad_norm": 0.14379620552062988, "learning_rate": 8.85927665120367e-05, "loss": 0.0142, "step": 20670 }, { "grad_norm": 0.2013394683599472, "learning_rate": 8.857962230104546e-05, "loss": 0.0129, "step": 20680 }, { "grad_norm": 0.15869922935962677, "learning_rate": 8.856647149785193e-05, "loss": 0.0133, "step": 20690 }, { "grad_norm": 0.2206345796585083, "learning_rate": 8.855331410470322e-05, "loss": 0.0146, "step": 20700 }, { "grad_norm": 0.1640857607126236, "learning_rate": 8.854015012384756e-05, "loss": 0.0131, "step": 20710 }, { "grad_norm": 0.14111879467964172, "learning_rate": 8.852697955753433e-05, "loss": 0.0153, "step": 20720 }, { "grad_norm": 0.19966357946395874, "learning_rate": 8.851380240801399e-05, "loss": 0.0173, "step": 20730 }, { "grad_norm": 0.22385476529598236, "learning_rate": 8.850061867753818e-05, "loss": 0.0168, "step": 20740 }, { "grad_norm": 0.19193068146705627, "learning_rate": 8.848742836835963e-05, "loss": 0.0139, "step": 20750 }, { "grad_norm": 0.1840834617614746, "learning_rate": 8.847423148273221e-05, "loss": 0.0147, "step": 20760 }, { "grad_norm": 0.15601909160614014, "learning_rate": 8.846102802291092e-05, "loss": 0.0158, "step": 20770 }, { "grad_norm": 0.16946031153202057, "learning_rate": 8.844781799115183e-05, "loss": 0.0122, "step": 20780 }, { "grad_norm": 0.1613406538963318, "learning_rate": 8.84346013897122e-05, "loss": 0.0118, "step": 20790 }, { "grad_norm": 0.16716255247592926, "learning_rate": 8.842137822085038e-05, "loss": 0.0123, "step": 20800 }, { "grad_norm": 0.21193073689937592, "learning_rate": 8.840814848682585e-05, "loss": 0.0119, "step": 20810 }, { "grad_norm": 0.16257141530513763, "learning_rate": 8.83949121898992e-05, "loss": 0.0127, "step": 20820 }, { "grad_norm": 0.1540544182062149, "learning_rate": 8.838166933233217e-05, "loss": 0.0129, "step": 20830 }, { "grad_norm": 0.1401490867137909, "learning_rate": 8.83684199163876e-05, "loss": 0.0116, "step": 20840 }, { "grad_norm": 0.13685433566570282, "learning_rate": 8.835516394432943e-05, "loss": 0.0116, "step": 20850 }, { "grad_norm": 0.15344567596912384, "learning_rate": 8.834190141842276e-05, "loss": 0.0133, "step": 20860 }, { "grad_norm": 0.15758441388607025, "learning_rate": 8.83286323409338e-05, "loss": 0.012, "step": 20870 }, { "grad_norm": 0.13448332250118256, "learning_rate": 8.831535671412986e-05, "loss": 0.0132, "step": 20880 }, { "grad_norm": 0.1424795687198639, "learning_rate": 8.830207454027938e-05, "loss": 0.0115, "step": 20890 }, { "grad_norm": 0.12957924604415894, "learning_rate": 8.828878582165192e-05, "loss": 0.0106, "step": 20900 }, { "grad_norm": 0.17772828042507172, "learning_rate": 8.827549056051818e-05, "loss": 0.0134, "step": 20910 }, { "grad_norm": 0.15033212304115295, "learning_rate": 8.826218875914993e-05, "loss": 0.0129, "step": 20920 }, { "grad_norm": 0.16004578769207, "learning_rate": 8.82488804198201e-05, "loss": 0.012, "step": 20930 }, { "grad_norm": 0.19309240579605103, "learning_rate": 8.82355655448027e-05, "loss": 0.0159, "step": 20940 }, { "grad_norm": 0.14552225172519684, "learning_rate": 8.822224413637293e-05, "loss": 0.0119, "step": 20950 }, { "grad_norm": 0.20550845563411713, "learning_rate": 8.820891619680697e-05, "loss": 0.0121, "step": 20960 }, { "grad_norm": 0.1332225203514099, "learning_rate": 8.819558172838227e-05, "loss": 0.0132, "step": 20970 }, { "grad_norm": 0.1293896585702896, "learning_rate": 8.818224073337731e-05, "loss": 0.0137, "step": 20980 }, { "grad_norm": 0.19027838110923767, "learning_rate": 8.816889321407169e-05, "loss": 0.0114, "step": 20990 }, { "grad_norm": 0.15027284622192383, "learning_rate": 8.815553917274615e-05, "loss": 0.0157, "step": 21000 }, { "grad_norm": 0.1476898044347763, "learning_rate": 8.81421786116825e-05, "loss": 0.0117, "step": 21010 }, { "grad_norm": 0.17172032594680786, "learning_rate": 8.812881153316373e-05, "loss": 0.0145, "step": 21020 }, { "grad_norm": 0.15412573516368866, "learning_rate": 8.81154379394739e-05, "loss": 0.0129, "step": 21030 }, { "grad_norm": 0.17968866229057312, "learning_rate": 8.810205783289818e-05, "loss": 0.0146, "step": 21040 }, { "grad_norm": 0.14617407321929932, "learning_rate": 8.808867121572286e-05, "loss": 0.0167, "step": 21050 }, { "grad_norm": 0.1557944416999817, "learning_rate": 8.807527809023537e-05, "loss": 0.0119, "step": 21060 }, { "grad_norm": 0.1515052318572998, "learning_rate": 8.80618784587242e-05, "loss": 0.0161, "step": 21070 }, { "grad_norm": 0.1490851193666458, "learning_rate": 8.804847232347902e-05, "loss": 0.0139, "step": 21080 }, { "grad_norm": 0.22377856075763702, "learning_rate": 8.803505968679054e-05, "loss": 0.0148, "step": 21090 }, { "grad_norm": 0.1489691585302353, "learning_rate": 8.802164055095061e-05, "loss": 0.0142, "step": 21100 }, { "grad_norm": 0.1717820018529892, "learning_rate": 8.80082149182522e-05, "loss": 0.0172, "step": 21110 }, { "grad_norm": 0.19342336058616638, "learning_rate": 8.79947827909894e-05, "loss": 0.0115, "step": 21120 }, { "grad_norm": 0.20394043624401093, "learning_rate": 8.798134417145738e-05, "loss": 0.0155, "step": 21130 }, { "grad_norm": 0.21257284283638, "learning_rate": 8.796789906195243e-05, "loss": 0.0153, "step": 21140 }, { "grad_norm": 0.24907466769218445, "learning_rate": 8.795444746477195e-05, "loss": 0.0179, "step": 21150 }, { "grad_norm": 0.15078669786453247, "learning_rate": 8.794098938221446e-05, "loss": 0.0145, "step": 21160 }, { "grad_norm": 0.13384996354579926, "learning_rate": 8.792752481657957e-05, "loss": 0.0142, "step": 21170 }, { "grad_norm": 0.17152918875217438, "learning_rate": 8.791405377016802e-05, "loss": 0.0144, "step": 21180 }, { "grad_norm": 0.16588446497917175, "learning_rate": 8.790057624528163e-05, "loss": 0.016, "step": 21190 }, { "grad_norm": 0.19307786226272583, "learning_rate": 8.788709224422333e-05, "loss": 0.0146, "step": 21200 }, { "grad_norm": 0.22007180750370026, "learning_rate": 8.787360176929717e-05, "loss": 0.0112, "step": 21210 }, { "grad_norm": 0.19172453880310059, "learning_rate": 8.786010482280834e-05, "loss": 0.0143, "step": 21220 }, { "grad_norm": 0.14500324428081512, "learning_rate": 8.784660140706306e-05, "loss": 0.0133, "step": 21230 }, { "grad_norm": 0.15050089359283447, "learning_rate": 8.783309152436872e-05, "loss": 0.0135, "step": 21240 }, { "grad_norm": 0.1213625818490982, "learning_rate": 8.781957517703375e-05, "loss": 0.013, "step": 21250 }, { "grad_norm": 0.1471129059791565, "learning_rate": 8.780605236736776e-05, "loss": 0.0138, "step": 21260 }, { "grad_norm": 0.1878434717655182, "learning_rate": 8.779252309768142e-05, "loss": 0.0154, "step": 21270 }, { "grad_norm": 0.1415344923734665, "learning_rate": 8.777898737028652e-05, "loss": 0.0124, "step": 21280 }, { "grad_norm": 0.16626127064228058, "learning_rate": 8.776544518749591e-05, "loss": 0.0143, "step": 21290 }, { "grad_norm": 0.2078690379858017, "learning_rate": 8.775189655162364e-05, "loss": 0.0128, "step": 21300 }, { "grad_norm": 0.19269967079162598, "learning_rate": 8.773834146498474e-05, "loss": 0.0157, "step": 21310 }, { "grad_norm": 0.1712518185377121, "learning_rate": 8.772477992989545e-05, "loss": 0.0146, "step": 21320 }, { "grad_norm": 0.1496550291776657, "learning_rate": 8.771121194867304e-05, "loss": 0.0119, "step": 21330 }, { "grad_norm": 0.16985578835010529, "learning_rate": 8.769763752363589e-05, "loss": 0.0143, "step": 21340 }, { "grad_norm": 0.17649419605731964, "learning_rate": 8.768405665710352e-05, "loss": 0.0142, "step": 21350 }, { "grad_norm": 0.1644134223461151, "learning_rate": 8.767046935139655e-05, "loss": 0.0127, "step": 21360 }, { "grad_norm": 0.12571004033088684, "learning_rate": 8.765687560883666e-05, "loss": 0.0129, "step": 21370 }, { "grad_norm": 0.1624612659215927, "learning_rate": 8.764327543174664e-05, "loss": 0.0107, "step": 21380 }, { "grad_norm": 0.16752532124519348, "learning_rate": 8.762966882245038e-05, "loss": 0.0153, "step": 21390 }, { "grad_norm": 0.17273221909999847, "learning_rate": 8.761605578327291e-05, "loss": 0.0135, "step": 21400 }, { "grad_norm": 0.16428320109844208, "learning_rate": 8.76024363165403e-05, "loss": 0.0134, "step": 21410 }, { "grad_norm": 0.18322022259235382, "learning_rate": 8.758881042457976e-05, "loss": 0.013, "step": 21420 }, { "grad_norm": 0.1762966513633728, "learning_rate": 8.757517810971957e-05, "loss": 0.0129, "step": 21430 }, { "grad_norm": 0.13578782975673676, "learning_rate": 8.756153937428913e-05, "loss": 0.0138, "step": 21440 }, { "grad_norm": 0.11884541809558868, "learning_rate": 8.754789422061889e-05, "loss": 0.0158, "step": 21450 }, { "grad_norm": 0.13753363490104675, "learning_rate": 8.753424265104052e-05, "loss": 0.0118, "step": 21460 }, { "grad_norm": 0.11536481976509094, "learning_rate": 8.752058466788659e-05, "loss": 0.0112, "step": 21470 }, { "grad_norm": 0.15366944670677185, "learning_rate": 8.750692027349097e-05, "loss": 0.0122, "step": 21480 }, { "grad_norm": 0.15507854521274567, "learning_rate": 8.749324947018847e-05, "loss": 0.0113, "step": 21490 }, { "grad_norm": 0.15951679646968842, "learning_rate": 8.747957226031507e-05, "loss": 0.0138, "step": 21500 }, { "grad_norm": 0.15968595445156097, "learning_rate": 8.746588864620787e-05, "loss": 0.0114, "step": 21510 }, { "grad_norm": 0.1486419290304184, "learning_rate": 8.745219863020498e-05, "loss": 0.0126, "step": 21520 }, { "grad_norm": 0.12830257415771484, "learning_rate": 8.743850221464564e-05, "loss": 0.0126, "step": 21530 }, { "grad_norm": 0.17568770051002502, "learning_rate": 8.742479940187026e-05, "loss": 0.015, "step": 21540 }, { "grad_norm": 0.14825372397899628, "learning_rate": 8.74110901942202e-05, "loss": 0.0133, "step": 21550 }, { "grad_norm": 0.16060015559196472, "learning_rate": 8.739737459403803e-05, "loss": 0.0143, "step": 21560 }, { "grad_norm": 0.11669547110795975, "learning_rate": 8.738365260366737e-05, "loss": 0.0114, "step": 21570 }, { "grad_norm": 0.12380330264568329, "learning_rate": 8.736992422545292e-05, "loss": 0.0149, "step": 21580 }, { "grad_norm": 0.18540512025356293, "learning_rate": 8.73561894617405e-05, "loss": 0.0137, "step": 21590 }, { "grad_norm": 0.17578661441802979, "learning_rate": 8.734244831487697e-05, "loss": 0.0148, "step": 21600 }, { "grad_norm": 0.21341973543167114, "learning_rate": 8.732870078721035e-05, "loss": 0.014, "step": 21610 }, { "grad_norm": 0.167141392827034, "learning_rate": 8.731494688108972e-05, "loss": 0.0142, "step": 21620 }, { "grad_norm": 0.126710906624794, "learning_rate": 8.730118659886523e-05, "loss": 0.0134, "step": 21630 }, { "grad_norm": 0.12416554987430573, "learning_rate": 8.728741994288814e-05, "loss": 0.0135, "step": 21640 }, { "grad_norm": 0.1303633749485016, "learning_rate": 8.727364691551079e-05, "loss": 0.0128, "step": 21650 }, { "grad_norm": 0.21976038813591003, "learning_rate": 8.725986751908661e-05, "loss": 0.0129, "step": 21660 }, { "grad_norm": 0.14485177397727966, "learning_rate": 8.724608175597016e-05, "loss": 0.0135, "step": 21670 }, { "grad_norm": 0.17523974180221558, "learning_rate": 8.723228962851699e-05, "loss": 0.0136, "step": 21680 }, { "grad_norm": 0.2400628626346588, "learning_rate": 8.721849113908385e-05, "loss": 0.0156, "step": 21690 }, { "grad_norm": 0.10518354177474976, "learning_rate": 8.720468629002848e-05, "loss": 0.0138, "step": 21700 }, { "grad_norm": 0.16309964656829834, "learning_rate": 8.719087508370978e-05, "loss": 0.0129, "step": 21710 }, { "grad_norm": 0.15321700274944305, "learning_rate": 8.717705752248772e-05, "loss": 0.0174, "step": 21720 }, { "grad_norm": 0.15687181055545807, "learning_rate": 8.71632336087233e-05, "loss": 0.0157, "step": 21730 }, { "grad_norm": 0.21135534346103668, "learning_rate": 8.71494033447787e-05, "loss": 0.0115, "step": 21740 }, { "grad_norm": 0.1742962747812271, "learning_rate": 8.713556673301708e-05, "loss": 0.0127, "step": 21750 }, { "grad_norm": 0.18615417182445526, "learning_rate": 8.712172377580278e-05, "loss": 0.0135, "step": 21760 }, { "grad_norm": 0.1671704351902008, "learning_rate": 8.710787447550114e-05, "loss": 0.016, "step": 21770 }, { "grad_norm": 0.185220405459404, "learning_rate": 8.70940188344787e-05, "loss": 0.0131, "step": 21780 }, { "grad_norm": 0.18168196082115173, "learning_rate": 8.708015685510293e-05, "loss": 0.0127, "step": 21790 }, { "grad_norm": 0.15378683805465698, "learning_rate": 8.706628853974252e-05, "loss": 0.0126, "step": 21800 }, { "grad_norm": 0.16318176686763763, "learning_rate": 8.705241389076715e-05, "loss": 0.0138, "step": 21810 }, { "grad_norm": 0.1866101771593094, "learning_rate": 8.703853291054764e-05, "loss": 0.0117, "step": 21820 }, { "grad_norm": 0.12429940700531006, "learning_rate": 8.702464560145587e-05, "loss": 0.0124, "step": 21830 }, { "grad_norm": 0.12739472091197968, "learning_rate": 8.701075196586476e-05, "loss": 0.0121, "step": 21840 }, { "grad_norm": 0.18178223073482513, "learning_rate": 8.699685200614842e-05, "loss": 0.0153, "step": 21850 }, { "grad_norm": 0.13997134566307068, "learning_rate": 8.698294572468193e-05, "loss": 0.013, "step": 21860 }, { "grad_norm": 0.16400444507598877, "learning_rate": 8.696903312384148e-05, "loss": 0.0136, "step": 21870 }, { "grad_norm": 0.203622505068779, "learning_rate": 8.695511420600439e-05, "loss": 0.0122, "step": 21880 }, { "grad_norm": 0.1549222320318222, "learning_rate": 8.694118897354901e-05, "loss": 0.0125, "step": 21890 }, { "grad_norm": 0.12333611398935318, "learning_rate": 8.692725742885478e-05, "loss": 0.0141, "step": 21900 }, { "grad_norm": 0.16926145553588867, "learning_rate": 8.691331957430221e-05, "loss": 0.0122, "step": 21910 }, { "grad_norm": 0.17130780220031738, "learning_rate": 8.68993754122729e-05, "loss": 0.0122, "step": 21920 }, { "grad_norm": 0.16333018243312836, "learning_rate": 8.688542494514955e-05, "loss": 0.0145, "step": 21930 }, { "grad_norm": 0.16678187251091003, "learning_rate": 8.68714681753159e-05, "loss": 0.0128, "step": 21940 }, { "grad_norm": 0.20371340215206146, "learning_rate": 8.685750510515676e-05, "loss": 0.0102, "step": 21950 }, { "grad_norm": 0.17991159856319427, "learning_rate": 8.684353573705805e-05, "loss": 0.0116, "step": 21960 }, { "grad_norm": 0.1477205902338028, "learning_rate": 8.682956007340677e-05, "loss": 0.0141, "step": 21970 }, { "grad_norm": 0.20061935484409332, "learning_rate": 8.681557811659095e-05, "loss": 0.0147, "step": 21980 }, { "grad_norm": 0.16262847185134888, "learning_rate": 8.680158986899974e-05, "loss": 0.0127, "step": 21990 }, { "grad_norm": 0.20005860924720764, "learning_rate": 8.678759533302335e-05, "loss": 0.0106, "step": 22000 }, { "grad_norm": 0.15561848878860474, "learning_rate": 8.677359451105308e-05, "loss": 0.0131, "step": 22010 }, { "grad_norm": 0.1391676664352417, "learning_rate": 8.675958740548123e-05, "loss": 0.0137, "step": 22020 }, { "grad_norm": 0.18524473905563354, "learning_rate": 8.674557401870129e-05, "loss": 0.0116, "step": 22030 }, { "grad_norm": 0.1441892832517624, "learning_rate": 8.673155435310775e-05, "loss": 0.0135, "step": 22040 }, { "grad_norm": 0.1531236618757248, "learning_rate": 8.671752841109617e-05, "loss": 0.0119, "step": 22050 }, { "grad_norm": 0.14790315926074982, "learning_rate": 8.670349619506321e-05, "loss": 0.0108, "step": 22060 }, { "grad_norm": 0.16021929681301117, "learning_rate": 8.66894577074066e-05, "loss": 0.0132, "step": 22070 }, { "grad_norm": 0.15565961599349976, "learning_rate": 8.667541295052513e-05, "loss": 0.0122, "step": 22080 }, { "grad_norm": 0.18071532249450684, "learning_rate": 8.666136192681865e-05, "loss": 0.0114, "step": 22090 }, { "grad_norm": 0.16357752680778503, "learning_rate": 8.664730463868811e-05, "loss": 0.0149, "step": 22100 }, { "grad_norm": 0.18164362013339996, "learning_rate": 8.663324108853552e-05, "loss": 0.0125, "step": 22110 }, { "grad_norm": 0.13805145025253296, "learning_rate": 8.661917127876395e-05, "loss": 0.0134, "step": 22120 }, { "grad_norm": 0.1604243814945221, "learning_rate": 8.660509521177754e-05, "loss": 0.0152, "step": 22130 }, { "grad_norm": 0.11521239578723907, "learning_rate": 8.65910128899815e-05, "loss": 0.0128, "step": 22140 }, { "grad_norm": 0.18203692138195038, "learning_rate": 8.657692431578214e-05, "loss": 0.0123, "step": 22150 }, { "grad_norm": 0.15436530113220215, "learning_rate": 8.656282949158679e-05, "loss": 0.0123, "step": 22160 }, { "grad_norm": 0.16808833181858063, "learning_rate": 8.654872841980388e-05, "loss": 0.013, "step": 22170 }, { "grad_norm": 0.1542072743177414, "learning_rate": 8.653462110284289e-05, "loss": 0.0129, "step": 22180 }, { "grad_norm": 0.14698046445846558, "learning_rate": 8.652050754311437e-05, "loss": 0.0124, "step": 22190 }, { "grad_norm": 0.15728336572647095, "learning_rate": 8.650638774302995e-05, "loss": 0.0112, "step": 22200 }, { "grad_norm": 0.15184669196605682, "learning_rate": 8.649226170500233e-05, "loss": 0.013, "step": 22210 }, { "grad_norm": 0.16060039401054382, "learning_rate": 8.647812943144524e-05, "loss": 0.0139, "step": 22220 }, { "grad_norm": 0.1580822616815567, "learning_rate": 8.646399092477351e-05, "loss": 0.0126, "step": 22230 }, { "grad_norm": 0.12822894752025604, "learning_rate": 8.644984618740301e-05, "loss": 0.0126, "step": 22240 }, { "grad_norm": 0.16549162566661835, "learning_rate": 8.643569522175073e-05, "loss": 0.0162, "step": 22250 }, { "grad_norm": 0.16315902769565582, "learning_rate": 8.642153803023463e-05, "loss": 0.0149, "step": 22260 }, { "grad_norm": 0.1795201301574707, "learning_rate": 8.640737461527383e-05, "loss": 0.0125, "step": 22270 }, { "grad_norm": 0.173614963889122, "learning_rate": 8.639320497928845e-05, "loss": 0.0126, "step": 22280 }, { "grad_norm": 0.16704952716827393, "learning_rate": 8.637902912469969e-05, "loss": 0.0127, "step": 22290 }, { "grad_norm": 0.09932737797498703, "learning_rate": 8.636484705392982e-05, "loss": 0.0114, "step": 22300 }, { "grad_norm": 0.155056893825531, "learning_rate": 8.635065876940216e-05, "loss": 0.012, "step": 22310 }, { "grad_norm": 0.1667722463607788, "learning_rate": 8.633646427354112e-05, "loss": 0.0118, "step": 22320 }, { "grad_norm": 0.1677176058292389, "learning_rate": 8.632226356877213e-05, "loss": 0.0148, "step": 22330 }, { "grad_norm": 0.15521782636642456, "learning_rate": 8.630805665752173e-05, "loss": 0.0134, "step": 22340 }, { "grad_norm": 0.1581423580646515, "learning_rate": 8.629384354221748e-05, "loss": 0.0119, "step": 22350 }, { "grad_norm": 0.10682156682014465, "learning_rate": 8.627962422528797e-05, "loss": 0.0121, "step": 22360 }, { "grad_norm": 0.12229792028665543, "learning_rate": 8.626539870916296e-05, "loss": 0.012, "step": 22370 }, { "grad_norm": 0.14740607142448425, "learning_rate": 8.625116699627317e-05, "loss": 0.0106, "step": 22380 }, { "grad_norm": 0.17665652930736542, "learning_rate": 8.623692908905041e-05, "loss": 0.0157, "step": 22390 }, { "grad_norm": 0.16295458376407623, "learning_rate": 8.622268498992755e-05, "loss": 0.0135, "step": 22400 }, { "grad_norm": 0.1533687710762024, "learning_rate": 8.620843470133851e-05, "loss": 0.0116, "step": 22410 }, { "grad_norm": 0.11899887025356293, "learning_rate": 8.619417822571829e-05, "loss": 0.0135, "step": 22420 }, { "grad_norm": 0.17822423577308655, "learning_rate": 8.617991556550292e-05, "loss": 0.0129, "step": 22430 }, { "grad_norm": 0.17797937989234924, "learning_rate": 8.616564672312952e-05, "loss": 0.0145, "step": 22440 }, { "grad_norm": 0.18923349678516388, "learning_rate": 8.61513717010362e-05, "loss": 0.0113, "step": 22450 }, { "grad_norm": 0.19345971941947937, "learning_rate": 8.613709050166221e-05, "loss": 0.0116, "step": 22460 }, { "grad_norm": 0.11008413881063461, "learning_rate": 8.61228031274478e-05, "loss": 0.0111, "step": 22470 }, { "grad_norm": 0.15638162195682526, "learning_rate": 8.610850958083431e-05, "loss": 0.0109, "step": 22480 }, { "grad_norm": 0.19363434612751007, "learning_rate": 8.609420986426409e-05, "loss": 0.0116, "step": 22490 }, { "grad_norm": 0.13855619728565216, "learning_rate": 8.60799039801806e-05, "loss": 0.0129, "step": 22500 }, { "grad_norm": 0.14666907489299774, "learning_rate": 8.606559193102828e-05, "loss": 0.0119, "step": 22510 }, { "grad_norm": 0.1375860571861267, "learning_rate": 8.605127371925273e-05, "loss": 0.0132, "step": 22520 }, { "grad_norm": 0.09555696696043015, "learning_rate": 8.603694934730047e-05, "loss": 0.0116, "step": 22530 }, { "grad_norm": 0.13672171533107758, "learning_rate": 8.602261881761919e-05, "loss": 0.0135, "step": 22540 }, { "grad_norm": 0.15654252469539642, "learning_rate": 8.600828213265759e-05, "loss": 0.0123, "step": 22550 }, { "grad_norm": 0.20286771655082703, "learning_rate": 8.599393929486539e-05, "loss": 0.0144, "step": 22560 }, { "grad_norm": 0.15439893305301666, "learning_rate": 8.59795903066934e-05, "loss": 0.0132, "step": 22570 }, { "grad_norm": 0.13235439360141754, "learning_rate": 8.596523517059347e-05, "loss": 0.0102, "step": 22580 }, { "grad_norm": 0.14005321264266968, "learning_rate": 8.59508738890185e-05, "loss": 0.0114, "step": 22590 }, { "grad_norm": 0.12536074221134186, "learning_rate": 8.593650646442246e-05, "loss": 0.0121, "step": 22600 }, { "grad_norm": 0.1438731551170349, "learning_rate": 8.59221328992603e-05, "loss": 0.0117, "step": 22610 }, { "grad_norm": 0.1532890647649765, "learning_rate": 8.590775319598813e-05, "loss": 0.0109, "step": 22620 }, { "grad_norm": 0.17827318608760834, "learning_rate": 8.589336735706301e-05, "loss": 0.0124, "step": 22630 }, { "grad_norm": 0.1588418334722519, "learning_rate": 8.587897538494307e-05, "loss": 0.0123, "step": 22640 }, { "grad_norm": 0.16021937131881714, "learning_rate": 8.586457728208756e-05, "loss": 0.0137, "step": 22650 }, { "grad_norm": 0.1264418512582779, "learning_rate": 8.585017305095667e-05, "loss": 0.0115, "step": 22660 }, { "grad_norm": 0.13301491737365723, "learning_rate": 8.583576269401173e-05, "loss": 0.0174, "step": 22670 }, { "grad_norm": 0.20078511536121368, "learning_rate": 8.582134621371504e-05, "loss": 0.013, "step": 22680 }, { "grad_norm": 0.13923288881778717, "learning_rate": 8.580692361253e-05, "loss": 0.0134, "step": 22690 }, { "grad_norm": 0.1757393330335617, "learning_rate": 8.579249489292104e-05, "loss": 0.0131, "step": 22700 }, { "grad_norm": 0.1760152280330658, "learning_rate": 8.577806005735363e-05, "loss": 0.013, "step": 22710 }, { "grad_norm": 0.1642882227897644, "learning_rate": 8.576361910829429e-05, "loss": 0.0114, "step": 22720 }, { "grad_norm": 0.12696363031864166, "learning_rate": 8.574917204821057e-05, "loss": 0.0108, "step": 22730 }, { "grad_norm": 0.12924079596996307, "learning_rate": 8.57347188795711e-05, "loss": 0.0136, "step": 22740 }, { "grad_norm": 0.12152840942144394, "learning_rate": 8.572025960484551e-05, "loss": 0.0122, "step": 22750 }, { "grad_norm": 0.12798307836055756, "learning_rate": 8.57057942265045e-05, "loss": 0.0106, "step": 22760 }, { "grad_norm": 0.21291090548038483, "learning_rate": 8.569132274701984e-05, "loss": 0.0137, "step": 22770 }, { "grad_norm": 0.11142987012863159, "learning_rate": 8.567684516886427e-05, "loss": 0.013, "step": 22780 }, { "grad_norm": 0.1494983732700348, "learning_rate": 8.56623614945116e-05, "loss": 0.0126, "step": 22790 }, { "grad_norm": 0.14852410554885864, "learning_rate": 8.564787172643675e-05, "loss": 0.0122, "step": 22800 }, { "grad_norm": 0.2615726590156555, "learning_rate": 8.563337586711559e-05, "loss": 0.016, "step": 22810 }, { "grad_norm": 0.15508966147899628, "learning_rate": 8.561887391902506e-05, "loss": 0.0105, "step": 22820 }, { "grad_norm": 0.14180390536785126, "learning_rate": 8.560436588464316e-05, "loss": 0.0132, "step": 22830 }, { "grad_norm": 0.14388114213943481, "learning_rate": 8.55898517664489e-05, "loss": 0.0107, "step": 22840 }, { "grad_norm": 0.2146926075220108, "learning_rate": 8.557533156692236e-05, "loss": 0.0141, "step": 22850 }, { "grad_norm": 0.18083710968494415, "learning_rate": 8.556080528854467e-05, "loss": 0.0129, "step": 22860 }, { "grad_norm": 0.1896589994430542, "learning_rate": 8.554627293379791e-05, "loss": 0.0132, "step": 22870 }, { "grad_norm": 0.09797470271587372, "learning_rate": 8.553173450516531e-05, "loss": 0.0131, "step": 22880 }, { "grad_norm": 0.12338824570178986, "learning_rate": 8.551719000513108e-05, "loss": 0.012, "step": 22890 }, { "grad_norm": 0.15183423459529877, "learning_rate": 8.550263943618049e-05, "loss": 0.014, "step": 22900 }, { "grad_norm": 0.13343384861946106, "learning_rate": 8.54880828007998e-05, "loss": 0.0128, "step": 22910 }, { "grad_norm": 0.15391157567501068, "learning_rate": 8.547352010147637e-05, "loss": 0.0126, "step": 22920 }, { "grad_norm": 0.17223121225833893, "learning_rate": 8.545895134069855e-05, "loss": 0.0121, "step": 22930 }, { "grad_norm": 0.16938947141170502, "learning_rate": 8.544437652095576e-05, "loss": 0.0126, "step": 22940 }, { "grad_norm": 0.15074507892131805, "learning_rate": 8.542979564473843e-05, "loss": 0.0142, "step": 22950 }, { "grad_norm": 0.20594936609268188, "learning_rate": 8.541520871453802e-05, "loss": 0.0136, "step": 22960 }, { "grad_norm": 0.16608503460884094, "learning_rate": 8.540061573284705e-05, "loss": 0.0138, "step": 22970 }, { "grad_norm": 0.14158904552459717, "learning_rate": 8.538601670215906e-05, "loss": 0.0111, "step": 22980 }, { "grad_norm": 0.14858709275722504, "learning_rate": 8.537141162496864e-05, "loss": 0.0113, "step": 22990 }, { "grad_norm": 0.15600252151489258, "learning_rate": 8.535680050377137e-05, "loss": 0.0129, "step": 23000 }, { "grad_norm": 0.1259976327419281, "learning_rate": 8.534218334106391e-05, "loss": 0.0117, "step": 23010 }, { "grad_norm": 0.1131158173084259, "learning_rate": 8.532756013934393e-05, "loss": 0.0109, "step": 23020 }, { "grad_norm": 0.14764699339866638, "learning_rate": 8.531293090111012e-05, "loss": 0.0099, "step": 23030 }, { "grad_norm": 0.13532204926013947, "learning_rate": 8.529829562886225e-05, "loss": 0.0122, "step": 23040 }, { "grad_norm": 0.20401357114315033, "learning_rate": 8.528365432510105e-05, "loss": 0.0115, "step": 23050 }, { "grad_norm": 0.17627747356891632, "learning_rate": 8.526900699232833e-05, "loss": 0.0143, "step": 23060 }, { "grad_norm": 0.12673994898796082, "learning_rate": 8.525435363304695e-05, "loss": 0.0123, "step": 23070 }, { "grad_norm": 0.13875354826450348, "learning_rate": 8.523969424976072e-05, "loss": 0.0143, "step": 23080 }, { "grad_norm": 0.16099700331687927, "learning_rate": 8.522502884497457e-05, "loss": 0.0146, "step": 23090 }, { "grad_norm": 0.13762331008911133, "learning_rate": 8.521035742119437e-05, "loss": 0.0138, "step": 23100 }, { "grad_norm": 0.17578421533107758, "learning_rate": 8.519567998092712e-05, "loss": 0.0127, "step": 23110 }, { "grad_norm": 0.15430690348148346, "learning_rate": 8.518099652668075e-05, "loss": 0.0113, "step": 23120 }, { "grad_norm": 0.1277693808078766, "learning_rate": 8.516630706096429e-05, "loss": 0.0112, "step": 23130 }, { "grad_norm": 0.19903826713562012, "learning_rate": 8.515161158628773e-05, "loss": 0.0124, "step": 23140 }, { "grad_norm": 0.14929330348968506, "learning_rate": 8.513691010516216e-05, "loss": 0.0139, "step": 23150 }, { "grad_norm": 0.13026095926761627, "learning_rate": 8.512220262009966e-05, "loss": 0.0126, "step": 23160 }, { "grad_norm": 0.11925075948238373, "learning_rate": 8.510748913361332e-05, "loss": 0.0118, "step": 23170 }, { "grad_norm": 0.14960213005542755, "learning_rate": 8.509276964821726e-05, "loss": 0.0117, "step": 23180 }, { "grad_norm": 0.1801432967185974, "learning_rate": 8.507804416642669e-05, "loss": 0.0141, "step": 23190 }, { "grad_norm": 0.16829326748847961, "learning_rate": 8.506331269075774e-05, "loss": 0.0159, "step": 23200 }, { "grad_norm": 0.14139549434185028, "learning_rate": 8.504857522372765e-05, "loss": 0.0108, "step": 23210 }, { "grad_norm": 0.17230993509292603, "learning_rate": 8.503383176785461e-05, "loss": 0.012, "step": 23220 }, { "grad_norm": 0.18384382128715515, "learning_rate": 8.501908232565792e-05, "loss": 0.0126, "step": 23230 }, { "grad_norm": 0.14805251359939575, "learning_rate": 8.50043268996578e-05, "loss": 0.0119, "step": 23240 }, { "grad_norm": 0.15682387351989746, "learning_rate": 8.498956549237562e-05, "loss": 0.0099, "step": 23250 }, { "grad_norm": 0.14397303760051727, "learning_rate": 8.497479810633366e-05, "loss": 0.0099, "step": 23260 }, { "grad_norm": 0.13857433199882507, "learning_rate": 8.496002474405525e-05, "loss": 0.0121, "step": 23270 }, { "grad_norm": 0.19453881680965424, "learning_rate": 8.494524540806478e-05, "loss": 0.0135, "step": 23280 }, { "grad_norm": 0.15159350633621216, "learning_rate": 8.493046010088761e-05, "loss": 0.0126, "step": 23290 }, { "grad_norm": 0.12668375670909882, "learning_rate": 8.491566882505018e-05, "loss": 0.0117, "step": 23300 }, { "grad_norm": 0.14121709764003754, "learning_rate": 8.490087158307988e-05, "loss": 0.0107, "step": 23310 }, { "grad_norm": 0.1449390947818756, "learning_rate": 8.488606837750518e-05, "loss": 0.0129, "step": 23320 }, { "grad_norm": 0.15223044157028198, "learning_rate": 8.487125921085552e-05, "loss": 0.011, "step": 23330 }, { "grad_norm": 0.14398786425590515, "learning_rate": 8.485644408566141e-05, "loss": 0.0108, "step": 23340 }, { "grad_norm": 0.1628955900669098, "learning_rate": 8.484162300445431e-05, "loss": 0.0125, "step": 23350 }, { "grad_norm": 0.20054557919502258, "learning_rate": 8.482679596976676e-05, "loss": 0.014, "step": 23360 }, { "grad_norm": 0.12666063010692596, "learning_rate": 8.48119629841323e-05, "loss": 0.0137, "step": 23370 }, { "grad_norm": 0.1333237886428833, "learning_rate": 8.479712405008547e-05, "loss": 0.0145, "step": 23380 }, { "grad_norm": 0.1517612636089325, "learning_rate": 8.478227917016184e-05, "loss": 0.0119, "step": 23390 }, { "grad_norm": 0.19081108272075653, "learning_rate": 8.476742834689801e-05, "loss": 0.0124, "step": 23400 }, { "grad_norm": 0.11394978314638138, "learning_rate": 8.475257158283157e-05, "loss": 0.0124, "step": 23410 }, { "grad_norm": 0.11936739087104797, "learning_rate": 8.473770888050112e-05, "loss": 0.0141, "step": 23420 }, { "grad_norm": 0.1722259670495987, "learning_rate": 8.47228402424463e-05, "loss": 0.0167, "step": 23430 }, { "grad_norm": 0.21199432015419006, "learning_rate": 8.470796567120775e-05, "loss": 0.0124, "step": 23440 }, { "grad_norm": 0.19324679672718048, "learning_rate": 8.469308516932714e-05, "loss": 0.0153, "step": 23450 }, { "grad_norm": 0.1703164279460907, "learning_rate": 8.467819873934714e-05, "loss": 0.0138, "step": 23460 }, { "grad_norm": 0.1693006306886673, "learning_rate": 8.466330638381143e-05, "loss": 0.0136, "step": 23470 }, { "grad_norm": 0.17034390568733215, "learning_rate": 8.464840810526469e-05, "loss": 0.0126, "step": 23480 }, { "grad_norm": 0.1362106055021286, "learning_rate": 8.463350390625264e-05, "loss": 0.0103, "step": 23490 }, { "grad_norm": 0.17274941504001617, "learning_rate": 8.4618593789322e-05, "loss": 0.0122, "step": 23500 }, { "grad_norm": 0.16359843313694, "learning_rate": 8.46036777570205e-05, "loss": 0.0125, "step": 23510 }, { "grad_norm": 0.1889811009168625, "learning_rate": 8.458875581189688e-05, "loss": 0.0137, "step": 23520 }, { "grad_norm": 0.15997371077537537, "learning_rate": 8.457382795650092e-05, "loss": 0.0147, "step": 23530 }, { "grad_norm": 0.15582260489463806, "learning_rate": 8.455889419338335e-05, "loss": 0.0137, "step": 23540 }, { "grad_norm": 0.10642852634191513, "learning_rate": 8.454395452509593e-05, "loss": 0.0116, "step": 23550 }, { "grad_norm": 0.12509125471115112, "learning_rate": 8.452900895419146e-05, "loss": 0.011, "step": 23560 }, { "grad_norm": 0.1363651007413864, "learning_rate": 8.451405748322376e-05, "loss": 0.011, "step": 23570 }, { "grad_norm": 0.1486625224351883, "learning_rate": 8.449910011474759e-05, "loss": 0.0154, "step": 23580 }, { "grad_norm": 0.15472623705863953, "learning_rate": 8.448413685131876e-05, "loss": 0.012, "step": 23590 }, { "grad_norm": 0.1679830700159073, "learning_rate": 8.446916769549407e-05, "loss": 0.0116, "step": 23600 }, { "grad_norm": 0.10653870552778244, "learning_rate": 8.445419264983136e-05, "loss": 0.0109, "step": 23610 }, { "grad_norm": 0.17944826185703278, "learning_rate": 8.443921171688947e-05, "loss": 0.0099, "step": 23620 }, { "grad_norm": 0.14586342871189117, "learning_rate": 8.442422489922819e-05, "loss": 0.0119, "step": 23630 }, { "grad_norm": 0.18486644327640533, "learning_rate": 8.440923219940838e-05, "loss": 0.0124, "step": 23640 }, { "grad_norm": 0.17230354249477386, "learning_rate": 8.439423361999189e-05, "loss": 0.0113, "step": 23650 }, { "grad_norm": 0.18175828456878662, "learning_rate": 8.437922916354155e-05, "loss": 0.0118, "step": 23660 }, { "grad_norm": 0.1902206838130951, "learning_rate": 8.436421883262123e-05, "loss": 0.0122, "step": 23670 }, { "grad_norm": 0.14045549929141998, "learning_rate": 8.434920262979577e-05, "loss": 0.0108, "step": 23680 }, { "grad_norm": 0.22011181712150574, "learning_rate": 8.433418055763104e-05, "loss": 0.0101, "step": 23690 }, { "grad_norm": 0.14770178496837616, "learning_rate": 8.431915261869389e-05, "loss": 0.0117, "step": 23700 }, { "grad_norm": 0.14938616752624512, "learning_rate": 8.43041188155522e-05, "loss": 0.0103, "step": 23710 }, { "grad_norm": 0.14415296912193298, "learning_rate": 8.428907915077481e-05, "loss": 0.0128, "step": 23720 }, { "grad_norm": 0.20126228034496307, "learning_rate": 8.42740336269316e-05, "loss": 0.0111, "step": 23730 }, { "grad_norm": 0.1389048993587494, "learning_rate": 8.425898224659345e-05, "loss": 0.0133, "step": 23740 }, { "grad_norm": 0.1306774914264679, "learning_rate": 8.42439250123322e-05, "loss": 0.0109, "step": 23750 }, { "grad_norm": 0.18688207864761353, "learning_rate": 8.422886192672076e-05, "loss": 0.0126, "step": 23760 }, { "grad_norm": 0.180662602186203, "learning_rate": 8.421379299233297e-05, "loss": 0.0109, "step": 23770 }, { "grad_norm": 0.12380702793598175, "learning_rate": 8.419871821174371e-05, "loss": 0.0114, "step": 23780 }, { "grad_norm": 0.15729929506778717, "learning_rate": 8.418363758752884e-05, "loss": 0.0122, "step": 23790 }, { "grad_norm": 0.14965231716632843, "learning_rate": 8.416855112226523e-05, "loss": 0.0114, "step": 23800 }, { "grad_norm": 0.18766455352306366, "learning_rate": 8.415345881853075e-05, "loss": 0.0122, "step": 23810 }, { "grad_norm": 0.2194981724023819, "learning_rate": 8.413836067890426e-05, "loss": 0.0142, "step": 23820 }, { "grad_norm": 0.13372957706451416, "learning_rate": 8.41232567059656e-05, "loss": 0.0099, "step": 23830 }, { "grad_norm": 0.13174280524253845, "learning_rate": 8.410814690229565e-05, "loss": 0.0116, "step": 23840 }, { "grad_norm": 0.14646016061306, "learning_rate": 8.409303127047626e-05, "loss": 0.0115, "step": 23850 }, { "grad_norm": 0.17763245105743408, "learning_rate": 8.407790981309028e-05, "loss": 0.0112, "step": 23860 }, { "grad_norm": 0.1214187890291214, "learning_rate": 8.406278253272153e-05, "loss": 0.0118, "step": 23870 }, { "grad_norm": 0.1622522622346878, "learning_rate": 8.404764943195487e-05, "loss": 0.0117, "step": 23880 }, { "grad_norm": 0.15528841316699982, "learning_rate": 8.403251051337613e-05, "loss": 0.0132, "step": 23890 }, { "grad_norm": 0.1674230396747589, "learning_rate": 8.401736577957214e-05, "loss": 0.0119, "step": 23900 }, { "grad_norm": 0.12505123019218445, "learning_rate": 8.40022152331307e-05, "loss": 0.012, "step": 23910 }, { "grad_norm": 0.14542599022388458, "learning_rate": 8.398705887664064e-05, "loss": 0.0109, "step": 23920 }, { "grad_norm": 0.16918547451496124, "learning_rate": 8.397189671269177e-05, "loss": 0.0098, "step": 23930 }, { "grad_norm": 0.14797192811965942, "learning_rate": 8.395672874387488e-05, "loss": 0.0124, "step": 23940 }, { "grad_norm": 0.252231240272522, "learning_rate": 8.394155497278177e-05, "loss": 0.0131, "step": 23950 }, { "grad_norm": 0.10597778111696243, "learning_rate": 8.392637540200523e-05, "loss": 0.0127, "step": 23960 }, { "grad_norm": 0.13379992544651031, "learning_rate": 8.391119003413902e-05, "loss": 0.0091, "step": 23970 }, { "grad_norm": 0.1778348684310913, "learning_rate": 8.38959988717779e-05, "loss": 0.0098, "step": 23980 }, { "grad_norm": 0.12473509460687637, "learning_rate": 8.388080191751764e-05, "loss": 0.0116, "step": 23990 }, { "grad_norm": 0.1506722867488861, "learning_rate": 8.386559917395496e-05, "loss": 0.0116, "step": 24000 }, { "grad_norm": 0.13811413943767548, "learning_rate": 8.385039064368761e-05, "loss": 0.0114, "step": 24010 }, { "grad_norm": 0.16351759433746338, "learning_rate": 8.383517632931431e-05, "loss": 0.0131, "step": 24020 }, { "grad_norm": 0.13119973242282867, "learning_rate": 8.381995623343477e-05, "loss": 0.011, "step": 24030 }, { "grad_norm": 0.19598552584648132, "learning_rate": 8.380473035864968e-05, "loss": 0.0121, "step": 24040 }, { "grad_norm": 0.16005520522594452, "learning_rate": 8.378949870756076e-05, "loss": 0.0125, "step": 24050 }, { "grad_norm": 0.1882438063621521, "learning_rate": 8.377426128277063e-05, "loss": 0.014, "step": 24060 }, { "grad_norm": 0.16447384655475616, "learning_rate": 8.375901808688298e-05, "loss": 0.0107, "step": 24070 }, { "grad_norm": 0.1174616813659668, "learning_rate": 8.374376912250246e-05, "loss": 0.0103, "step": 24080 }, { "grad_norm": 0.16870293021202087, "learning_rate": 8.372851439223468e-05, "loss": 0.0127, "step": 24090 }, { "grad_norm": 0.13710680603981018, "learning_rate": 8.371325389868627e-05, "loss": 0.0108, "step": 24100 }, { "grad_norm": 0.17778296768665314, "learning_rate": 8.369798764446482e-05, "loss": 0.0115, "step": 24110 }, { "grad_norm": 0.14895200729370117, "learning_rate": 8.368271563217893e-05, "loss": 0.0119, "step": 24120 }, { "grad_norm": 0.13327831029891968, "learning_rate": 8.366743786443817e-05, "loss": 0.0117, "step": 24130 }, { "grad_norm": 0.13808093965053558, "learning_rate": 8.365215434385309e-05, "loss": 0.0112, "step": 24140 }, { "grad_norm": 0.13887184858322144, "learning_rate": 8.36368650730352e-05, "loss": 0.0107, "step": 24150 }, { "grad_norm": 0.16738395392894745, "learning_rate": 8.362157005459705e-05, "loss": 0.0122, "step": 24160 }, { "grad_norm": 0.1147041991353035, "learning_rate": 8.360626929115213e-05, "loss": 0.0103, "step": 24170 }, { "grad_norm": 0.165358304977417, "learning_rate": 8.359096278531492e-05, "loss": 0.0142, "step": 24180 }, { "grad_norm": 0.12540872395038605, "learning_rate": 8.357565053970088e-05, "loss": 0.0157, "step": 24190 }, { "grad_norm": 0.19941818714141846, "learning_rate": 8.356033255692647e-05, "loss": 0.011, "step": 24200 }, { "grad_norm": 0.16159477829933167, "learning_rate": 8.354500883960911e-05, "loss": 0.0111, "step": 24210 }, { "grad_norm": 0.16797712445259094, "learning_rate": 8.352967939036717e-05, "loss": 0.0145, "step": 24220 }, { "grad_norm": 0.11861052364110947, "learning_rate": 8.35143442118201e-05, "loss": 0.0135, "step": 24230 }, { "grad_norm": 0.13788112998008728, "learning_rate": 8.349900330658819e-05, "loss": 0.012, "step": 24240 }, { "grad_norm": 0.1602039784193039, "learning_rate": 8.348365667729284e-05, "loss": 0.0125, "step": 24250 }, { "grad_norm": 0.1615951657295227, "learning_rate": 8.346830432655633e-05, "loss": 0.0112, "step": 24260 }, { "grad_norm": 0.14395612478256226, "learning_rate": 8.345294625700195e-05, "loss": 0.0103, "step": 24270 }, { "grad_norm": 0.12758582830429077, "learning_rate": 8.343758247125402e-05, "loss": 0.0119, "step": 24280 }, { "grad_norm": 0.15191231667995453, "learning_rate": 8.342221297193776e-05, "loss": 0.0151, "step": 24290 }, { "grad_norm": 0.15252308547496796, "learning_rate": 8.34068377616794e-05, "loss": 0.0122, "step": 24300 }, { "grad_norm": 0.12985384464263916, "learning_rate": 8.339145684310615e-05, "loss": 0.013, "step": 24310 }, { "grad_norm": 0.155411958694458, "learning_rate": 8.337607021884618e-05, "loss": 0.011, "step": 24320 }, { "grad_norm": 0.12187878787517548, "learning_rate": 8.336067789152867e-05, "loss": 0.0121, "step": 24330 }, { "grad_norm": 0.15706640481948853, "learning_rate": 8.334527986378369e-05, "loss": 0.0122, "step": 24340 }, { "grad_norm": 0.11808659881353378, "learning_rate": 8.332987613824239e-05, "loss": 0.0115, "step": 24350 }, { "grad_norm": 0.19745640456676483, "learning_rate": 8.331446671753685e-05, "loss": 0.014, "step": 24360 }, { "grad_norm": 0.18065889179706573, "learning_rate": 8.329905160430007e-05, "loss": 0.0138, "step": 24370 }, { "grad_norm": 0.19356852769851685, "learning_rate": 8.328363080116611e-05, "loss": 0.012, "step": 24380 }, { "grad_norm": 0.12860752642154694, "learning_rate": 8.326820431076997e-05, "loss": 0.0114, "step": 24390 }, { "grad_norm": 0.13903489708900452, "learning_rate": 8.325277213574759e-05, "loss": 0.0112, "step": 24400 }, { "grad_norm": 0.15537840127944946, "learning_rate": 8.32373342787359e-05, "loss": 0.0115, "step": 24410 }, { "grad_norm": 0.1062636524438858, "learning_rate": 8.322189074237285e-05, "loss": 0.0111, "step": 24420 }, { "grad_norm": 0.14591607451438904, "learning_rate": 8.32064415292973e-05, "loss": 0.0143, "step": 24430 }, { "grad_norm": 0.15800617635250092, "learning_rate": 8.319098664214907e-05, "loss": 0.0137, "step": 24440 }, { "grad_norm": 0.1755717247724533, "learning_rate": 8.3175526083569e-05, "loss": 0.0124, "step": 24450 }, { "grad_norm": 0.11629556864500046, "learning_rate": 8.316005985619889e-05, "loss": 0.0108, "step": 24460 }, { "grad_norm": 0.1453036367893219, "learning_rate": 8.314458796268147e-05, "loss": 0.0104, "step": 24470 }, { "grad_norm": 0.136766716837883, "learning_rate": 8.312911040566047e-05, "loss": 0.0114, "step": 24480 }, { "grad_norm": 0.17022933065891266, "learning_rate": 8.31136271877806e-05, "loss": 0.0128, "step": 24490 }, { "grad_norm": 0.16922107338905334, "learning_rate": 8.309813831168748e-05, "loss": 0.0149, "step": 24500 }, { "grad_norm": 0.14134426414966583, "learning_rate": 8.308264378002777e-05, "loss": 0.0127, "step": 24510 }, { "grad_norm": 0.14756068587303162, "learning_rate": 8.306714359544906e-05, "loss": 0.0126, "step": 24520 }, { "grad_norm": 0.15570497512817383, "learning_rate": 8.30516377605999e-05, "loss": 0.0118, "step": 24530 }, { "grad_norm": 0.13212479650974274, "learning_rate": 8.30361262781298e-05, "loss": 0.0145, "step": 24540 }, { "grad_norm": 0.14979203045368195, "learning_rate": 8.302060915068924e-05, "loss": 0.0117, "step": 24550 }, { "grad_norm": 0.12192767858505249, "learning_rate": 8.300508638092972e-05, "loss": 0.012, "step": 24560 }, { "grad_norm": 0.13706254959106445, "learning_rate": 8.298955797150361e-05, "loss": 0.0146, "step": 24570 }, { "grad_norm": 0.1469642072916031, "learning_rate": 8.297402392506433e-05, "loss": 0.011, "step": 24580 }, { "grad_norm": 0.11938624829053879, "learning_rate": 8.295848424426617e-05, "loss": 0.01, "step": 24590 }, { "grad_norm": 0.15341010689735413, "learning_rate": 8.29429389317645e-05, "loss": 0.0142, "step": 24600 }, { "grad_norm": 0.17331580817699432, "learning_rate": 8.292738799021556e-05, "loss": 0.0116, "step": 24610 }, { "grad_norm": 0.2381897121667862, "learning_rate": 8.291183142227656e-05, "loss": 0.0133, "step": 24620 }, { "grad_norm": 0.15803560614585876, "learning_rate": 8.289626923060572e-05, "loss": 0.0109, "step": 24630 }, { "grad_norm": 0.15544292330741882, "learning_rate": 8.288070141786218e-05, "loss": 0.0105, "step": 24640 }, { "grad_norm": 0.15753117203712463, "learning_rate": 8.286512798670605e-05, "loss": 0.0115, "step": 24650 }, { "grad_norm": 0.20673592388629913, "learning_rate": 8.284954893979842e-05, "loss": 0.0127, "step": 24660 }, { "grad_norm": 0.2057354748249054, "learning_rate": 8.283396427980131e-05, "loss": 0.0132, "step": 24670 }, { "grad_norm": 0.12391418218612671, "learning_rate": 8.281837400937771e-05, "loss": 0.0123, "step": 24680 }, { "grad_norm": 0.1518607884645462, "learning_rate": 8.28027781311916e-05, "loss": 0.0137, "step": 24690 }, { "grad_norm": 0.14865216612815857, "learning_rate": 8.278717664790785e-05, "loss": 0.0118, "step": 24700 }, { "grad_norm": 0.15085093677043915, "learning_rate": 8.277156956219234e-05, "loss": 0.0108, "step": 24710 }, { "grad_norm": 0.15297631919384003, "learning_rate": 8.275595687671189e-05, "loss": 0.0149, "step": 24720 }, { "grad_norm": 0.13973669707775116, "learning_rate": 8.27403385941343e-05, "loss": 0.0113, "step": 24730 }, { "grad_norm": 0.16192544996738434, "learning_rate": 8.272471471712828e-05, "loss": 0.0121, "step": 24740 }, { "grad_norm": 0.15981069207191467, "learning_rate": 8.270908524836355e-05, "loss": 0.011, "step": 24750 }, { "grad_norm": 0.15787625312805176, "learning_rate": 8.269345019051074e-05, "loss": 0.0142, "step": 24760 }, { "grad_norm": 0.1515597105026245, "learning_rate": 8.267780954624147e-05, "loss": 0.0109, "step": 24770 }, { "grad_norm": 0.13865074515342712, "learning_rate": 8.266216331822827e-05, "loss": 0.0117, "step": 24780 }, { "grad_norm": 0.12431933730840683, "learning_rate": 8.264651150914469e-05, "loss": 0.0129, "step": 24790 }, { "grad_norm": 0.16811737418174744, "learning_rate": 8.263085412166517e-05, "loss": 0.0117, "step": 24800 }, { "grad_norm": 0.19489656388759613, "learning_rate": 8.261519115846514e-05, "loss": 0.0141, "step": 24810 }, { "grad_norm": 0.12691840529441833, "learning_rate": 8.259952262222096e-05, "loss": 0.0103, "step": 24820 }, { "grad_norm": 0.17330557107925415, "learning_rate": 8.258384851560997e-05, "loss": 0.011, "step": 24830 }, { "grad_norm": 0.11372408270835876, "learning_rate": 8.256816884131044e-05, "loss": 0.0108, "step": 24840 }, { "grad_norm": 0.17830945551395416, "learning_rate": 8.255248360200159e-05, "loss": 0.0121, "step": 24850 }, { "grad_norm": 0.14874069392681122, "learning_rate": 8.253679280036359e-05, "loss": 0.009, "step": 24860 }, { "grad_norm": 0.15131032466888428, "learning_rate": 8.252109643907762e-05, "loss": 0.0099, "step": 24870 }, { "grad_norm": 0.10488244891166687, "learning_rate": 8.250539452082569e-05, "loss": 0.0116, "step": 24880 }, { "grad_norm": 0.14381848275661469, "learning_rate": 8.248968704829087e-05, "loss": 0.0094, "step": 24890 }, { "grad_norm": 0.14338862895965576, "learning_rate": 8.247397402415714e-05, "loss": 0.0111, "step": 24900 }, { "grad_norm": 0.1129688173532486, "learning_rate": 8.24582554511094e-05, "loss": 0.0108, "step": 24910 }, { "grad_norm": 0.12409959733486176, "learning_rate": 8.244253133183355e-05, "loss": 0.0135, "step": 24920 }, { "grad_norm": 0.14364556968212128, "learning_rate": 8.24268016690164e-05, "loss": 0.0123, "step": 24930 }, { "grad_norm": 0.15165749192237854, "learning_rate": 8.241106646534571e-05, "loss": 0.0115, "step": 24940 }, { "grad_norm": 0.15991461277008057, "learning_rate": 8.23953257235102e-05, "loss": 0.0115, "step": 24950 }, { "grad_norm": 0.14650976657867432, "learning_rate": 8.237957944619956e-05, "loss": 0.0128, "step": 24960 }, { "grad_norm": 0.1566316783428192, "learning_rate": 8.236382763610437e-05, "loss": 0.0097, "step": 24970 }, { "grad_norm": 0.1738828420639038, "learning_rate": 8.234807029591619e-05, "loss": 0.014, "step": 24980 }, { "grad_norm": 0.1279086023569107, "learning_rate": 8.233230742832752e-05, "loss": 0.0115, "step": 24990 }, { "grad_norm": 0.17993155121803284, "learning_rate": 8.231653903603178e-05, "loss": 0.0129, "step": 25000 }, { "grad_norm": 0.1582684963941574, "learning_rate": 8.23007651217234e-05, "loss": 0.0122, "step": 25010 }, { "grad_norm": 0.1948581337928772, "learning_rate": 8.228498568809769e-05, "loss": 0.0104, "step": 25020 }, { "grad_norm": 0.15368354320526123, "learning_rate": 8.22692007378509e-05, "loss": 0.0133, "step": 25030 }, { "grad_norm": 0.18160352110862732, "learning_rate": 8.225341027368028e-05, "loss": 0.0112, "step": 25040 }, { "grad_norm": 0.13400723040103912, "learning_rate": 8.223761429828399e-05, "loss": 0.0126, "step": 25050 }, { "grad_norm": 0.14628705382347107, "learning_rate": 8.22218128143611e-05, "loss": 0.0105, "step": 25060 }, { "grad_norm": 0.1762632578611374, "learning_rate": 8.220600582461166e-05, "loss": 0.0142, "step": 25070 }, { "grad_norm": 0.14913542568683624, "learning_rate": 8.219019333173668e-05, "loss": 0.0112, "step": 25080 }, { "grad_norm": 0.18430466949939728, "learning_rate": 8.217437533843805e-05, "loss": 0.011, "step": 25090 }, { "grad_norm": 0.15984153747558594, "learning_rate": 8.215855184741867e-05, "loss": 0.0129, "step": 25100 }, { "grad_norm": 0.17200854420661926, "learning_rate": 8.21427228613823e-05, "loss": 0.011, "step": 25110 }, { "grad_norm": 0.17255541682243347, "learning_rate": 8.21268883830337e-05, "loss": 0.014, "step": 25120 }, { "grad_norm": 0.18337088823318481, "learning_rate": 8.211104841507855e-05, "loss": 0.012, "step": 25130 }, { "grad_norm": 0.16349047422409058, "learning_rate": 8.209520296022346e-05, "loss": 0.0136, "step": 25140 }, { "grad_norm": 0.21041369438171387, "learning_rate": 8.207935202117599e-05, "loss": 0.0122, "step": 25150 }, { "grad_norm": 0.15785902738571167, "learning_rate": 8.206349560064463e-05, "loss": 0.0117, "step": 25160 }, { "grad_norm": 0.1400385946035385, "learning_rate": 8.204763370133881e-05, "loss": 0.0121, "step": 25170 }, { "grad_norm": 0.13896715641021729, "learning_rate": 8.203176632596892e-05, "loss": 0.0127, "step": 25180 }, { "grad_norm": 0.16699303686618805, "learning_rate": 8.20158934772462e-05, "loss": 0.012, "step": 25190 }, { "grad_norm": 0.12961822748184204, "learning_rate": 8.200001515788294e-05, "loss": 0.0101, "step": 25200 }, { "grad_norm": 0.1552421599626541, "learning_rate": 8.198413137059228e-05, "loss": 0.0098, "step": 25210 }, { "grad_norm": 0.13356776535511017, "learning_rate": 8.196824211808835e-05, "loss": 0.0138, "step": 25220 }, { "grad_norm": 0.1472722589969635, "learning_rate": 8.195234740308617e-05, "loss": 0.012, "step": 25230 }, { "grad_norm": 0.17688632011413574, "learning_rate": 8.193644722830171e-05, "loss": 0.0108, "step": 25240 }, { "grad_norm": 0.1128082200884819, "learning_rate": 8.19205415964519e-05, "loss": 0.0101, "step": 25250 }, { "grad_norm": 0.1187625601887703, "learning_rate": 8.190463051025456e-05, "loss": 0.0105, "step": 25260 }, { "grad_norm": 0.17889481782913208, "learning_rate": 8.188871397242843e-05, "loss": 0.0119, "step": 25270 }, { "grad_norm": 0.16915646195411682, "learning_rate": 8.187279198569326e-05, "loss": 0.0112, "step": 25280 }, { "grad_norm": 0.13274697959423065, "learning_rate": 8.185686455276966e-05, "loss": 0.0123, "step": 25290 }, { "grad_norm": 0.14098148047924042, "learning_rate": 8.184093167637921e-05, "loss": 0.0118, "step": 25300 }, { "grad_norm": 0.17828640341758728, "learning_rate": 8.182499335924437e-05, "loss": 0.0129, "step": 25310 }, { "grad_norm": 0.1350916475057602, "learning_rate": 8.18090496040886e-05, "loss": 0.0122, "step": 25320 }, { "grad_norm": 0.14250269532203674, "learning_rate": 8.179310041363621e-05, "loss": 0.0108, "step": 25330 }, { "grad_norm": 0.16191741824150085, "learning_rate": 8.17771457906125e-05, "loss": 0.011, "step": 25340 }, { "grad_norm": 0.16163884103298187, "learning_rate": 8.176118573774371e-05, "loss": 0.0105, "step": 25350 }, { "grad_norm": 0.1328710913658142, "learning_rate": 8.174522025775692e-05, "loss": 0.0113, "step": 25360 }, { "grad_norm": 0.13460369408130646, "learning_rate": 8.172924935338022e-05, "loss": 0.0119, "step": 25370 }, { "grad_norm": 0.1584811955690384, "learning_rate": 8.171327302734262e-05, "loss": 0.0119, "step": 25380 }, { "grad_norm": 0.13603545725345612, "learning_rate": 8.169729128237401e-05, "loss": 0.011, "step": 25390 }, { "grad_norm": 0.13998553156852722, "learning_rate": 8.168130412120525e-05, "loss": 0.0123, "step": 25400 }, { "grad_norm": 0.11979334056377411, "learning_rate": 8.16653115465681e-05, "loss": 0.0154, "step": 25410 }, { "grad_norm": 0.21992036700248718, "learning_rate": 8.164931356119526e-05, "loss": 0.0125, "step": 25420 }, { "grad_norm": 0.10479602217674255, "learning_rate": 8.163331016782032e-05, "loss": 0.011, "step": 25430 }, { "grad_norm": 0.12628956139087677, "learning_rate": 8.161730136917785e-05, "loss": 0.0125, "step": 25440 }, { "grad_norm": 0.0946296751499176, "learning_rate": 8.160128716800333e-05, "loss": 0.0095, "step": 25450 }, { "grad_norm": 0.13979849219322205, "learning_rate": 8.158526756703313e-05, "loss": 0.0124, "step": 25460 }, { "grad_norm": 0.1715368777513504, "learning_rate": 8.156924256900455e-05, "loss": 0.0156, "step": 25470 }, { "grad_norm": 0.1805524379014969, "learning_rate": 8.155321217665584e-05, "loss": 0.0155, "step": 25480 }, { "grad_norm": 0.11591696739196777, "learning_rate": 8.153717639272614e-05, "loss": 0.014, "step": 25490 }, { "grad_norm": 0.19317583739757538, "learning_rate": 8.152113521995555e-05, "loss": 0.0121, "step": 25500 }, { "grad_norm": 0.127710223197937, "learning_rate": 8.150508866108505e-05, "loss": 0.0123, "step": 25510 }, { "grad_norm": 0.16213686764240265, "learning_rate": 8.148903671885657e-05, "loss": 0.0136, "step": 25520 }, { "grad_norm": 0.1497134566307068, "learning_rate": 8.147297939601292e-05, "loss": 0.0135, "step": 25530 }, { "grad_norm": 0.18006174266338348, "learning_rate": 8.145691669529792e-05, "loss": 0.0137, "step": 25540 }, { "grad_norm": 0.16763338446617126, "learning_rate": 8.144084861945618e-05, "loss": 0.0095, "step": 25550 }, { "grad_norm": 0.12199714779853821, "learning_rate": 8.142477517123333e-05, "loss": 0.011, "step": 25560 }, { "grad_norm": 0.1266680806875229, "learning_rate": 8.140869635337586e-05, "loss": 0.0123, "step": 25570 }, { "grad_norm": 0.15975899994373322, "learning_rate": 8.139261216863123e-05, "loss": 0.0134, "step": 25580 }, { "grad_norm": 0.17818699777126312, "learning_rate": 8.137652261974776e-05, "loss": 0.0136, "step": 25590 }, { "grad_norm": 0.10302584618330002, "learning_rate": 8.136042770947472e-05, "loss": 0.0142, "step": 25600 }, { "grad_norm": 0.13257813453674316, "learning_rate": 8.134432744056228e-05, "loss": 0.0104, "step": 25610 }, { "grad_norm": 0.11594008654356003, "learning_rate": 8.132822181576158e-05, "loss": 0.0111, "step": 25620 }, { "grad_norm": 0.1650087684392929, "learning_rate": 8.131211083782459e-05, "loss": 0.0115, "step": 25630 }, { "grad_norm": 0.19280801713466644, "learning_rate": 8.129599450950424e-05, "loss": 0.0111, "step": 25640 }, { "grad_norm": 0.12932123243808746, "learning_rate": 8.127987283355438e-05, "loss": 0.0107, "step": 25650 }, { "grad_norm": 0.19079335033893585, "learning_rate": 8.126374581272976e-05, "loss": 0.0113, "step": 25660 }, { "grad_norm": 0.1684059500694275, "learning_rate": 8.124761344978605e-05, "loss": 0.0159, "step": 25670 }, { "grad_norm": 0.14271825551986694, "learning_rate": 8.12314757474798e-05, "loss": 0.0124, "step": 25680 }, { "grad_norm": 0.10732117295265198, "learning_rate": 8.121533270856856e-05, "loss": 0.009, "step": 25690 }, { "grad_norm": 0.13523443043231964, "learning_rate": 8.119918433581069e-05, "loss": 0.0123, "step": 25700 }, { "grad_norm": 0.1403578519821167, "learning_rate": 8.118303063196551e-05, "loss": 0.0139, "step": 25710 }, { "grad_norm": 0.11490978300571442, "learning_rate": 8.116687159979326e-05, "loss": 0.0107, "step": 25720 }, { "grad_norm": 0.13848178088665009, "learning_rate": 8.115070724205508e-05, "loss": 0.0116, "step": 25730 }, { "grad_norm": 0.1506589949131012, "learning_rate": 8.113453756151296e-05, "loss": 0.0136, "step": 25740 }, { "grad_norm": 0.16234156489372253, "learning_rate": 8.111836256092995e-05, "loss": 0.0121, "step": 25750 }, { "grad_norm": 0.22545945644378662, "learning_rate": 8.110218224306985e-05, "loss": 0.0126, "step": 25760 }, { "grad_norm": 0.17237193882465363, "learning_rate": 8.108599661069745e-05, "loss": 0.0112, "step": 25770 }, { "grad_norm": 0.1809501200914383, "learning_rate": 8.106980566657845e-05, "loss": 0.0124, "step": 25780 }, { "grad_norm": 0.15627837181091309, "learning_rate": 8.10536094134794e-05, "loss": 0.0122, "step": 25790 }, { "grad_norm": 0.12852241098880768, "learning_rate": 8.103740785416783e-05, "loss": 0.0099, "step": 25800 }, { "grad_norm": 0.15449392795562744, "learning_rate": 8.102120099141212e-05, "loss": 0.0107, "step": 25810 }, { "grad_norm": 0.13309377431869507, "learning_rate": 8.100498882798163e-05, "loss": 0.0116, "step": 25820 }, { "grad_norm": 0.1299305409193039, "learning_rate": 8.09887713666465e-05, "loss": 0.0138, "step": 25830 }, { "grad_norm": 0.11300049722194672, "learning_rate": 8.09725486101779e-05, "loss": 0.0126, "step": 25840 }, { "grad_norm": 0.11404834687709808, "learning_rate": 8.095632056134784e-05, "loss": 0.0119, "step": 25850 }, { "grad_norm": 0.19789916276931763, "learning_rate": 8.094008722292925e-05, "loss": 0.015, "step": 25860 }, { "grad_norm": 0.14473755657672882, "learning_rate": 8.092384859769598e-05, "loss": 0.0108, "step": 25870 }, { "grad_norm": 0.14637944102287292, "learning_rate": 8.090760468842275e-05, "loss": 0.0159, "step": 25880 }, { "grad_norm": 0.16917622089385986, "learning_rate": 8.089135549788521e-05, "loss": 0.0114, "step": 25890 }, { "grad_norm": 0.11384990811347961, "learning_rate": 8.087510102885987e-05, "loss": 0.0112, "step": 25900 }, { "grad_norm": 0.1463475078344345, "learning_rate": 8.085884128412422e-05, "loss": 0.0132, "step": 25910 }, { "grad_norm": 0.15364031493663788, "learning_rate": 8.084257626645659e-05, "loss": 0.012, "step": 25920 }, { "grad_norm": 0.18138329684734344, "learning_rate": 8.08263059786362e-05, "loss": 0.0102, "step": 25930 }, { "grad_norm": 0.15162378549575806, "learning_rate": 8.081003042344325e-05, "loss": 0.0103, "step": 25940 }, { "grad_norm": 0.14809289574623108, "learning_rate": 8.079374960365872e-05, "loss": 0.0118, "step": 25950 }, { "grad_norm": 0.14299523830413818, "learning_rate": 8.077746352206463e-05, "loss": 0.0107, "step": 25960 }, { "grad_norm": 0.14303983747959137, "learning_rate": 8.076117218144377e-05, "loss": 0.0085, "step": 25970 }, { "grad_norm": 0.1341438740491867, "learning_rate": 8.074487558457991e-05, "loss": 0.0099, "step": 25980 }, { "grad_norm": 0.14886344969272614, "learning_rate": 8.072857373425768e-05, "loss": 0.0093, "step": 25990 }, { "grad_norm": 0.15586009621620178, "learning_rate": 8.071226663326264e-05, "loss": 0.01, "step": 26000 }, { "grad_norm": 0.17313462495803833, "learning_rate": 8.069595428438121e-05, "loss": 0.0149, "step": 26010 }, { "grad_norm": 0.15295690298080444, "learning_rate": 8.067963669040072e-05, "loss": 0.01, "step": 26020 }, { "grad_norm": 0.18264438211917877, "learning_rate": 8.066331385410942e-05, "loss": 0.013, "step": 26030 }, { "grad_norm": 0.15616187453269958, "learning_rate": 8.064698577829641e-05, "loss": 0.0107, "step": 26040 }, { "grad_norm": 0.15005266666412354, "learning_rate": 8.063065246575175e-05, "loss": 0.0093, "step": 26050 }, { "grad_norm": 0.16898444294929504, "learning_rate": 8.061431391926631e-05, "loss": 0.0109, "step": 26060 }, { "grad_norm": 0.19251655042171478, "learning_rate": 8.059797014163195e-05, "loss": 0.0114, "step": 26070 }, { "grad_norm": 0.15682008862495422, "learning_rate": 8.058162113564133e-05, "loss": 0.0126, "step": 26080 }, { "grad_norm": 0.16161251068115234, "learning_rate": 8.056526690408806e-05, "loss": 0.0131, "step": 26090 }, { "grad_norm": 0.17385952174663544, "learning_rate": 8.054890744976666e-05, "loss": 0.0126, "step": 26100 }, { "grad_norm": 0.19045722484588623, "learning_rate": 8.053254277547248e-05, "loss": 0.0118, "step": 26110 }, { "grad_norm": 0.1298605054616928, "learning_rate": 8.051617288400182e-05, "loss": 0.0103, "step": 26120 }, { "grad_norm": 0.13510997593402863, "learning_rate": 8.049979777815182e-05, "loss": 0.0098, "step": 26130 }, { "grad_norm": 0.17771726846694946, "learning_rate": 8.048341746072054e-05, "loss": 0.0112, "step": 26140 }, { "grad_norm": 0.13308395445346832, "learning_rate": 8.046703193450696e-05, "loss": 0.0131, "step": 26150 }, { "grad_norm": 0.18350286781787872, "learning_rate": 8.04506412023109e-05, "loss": 0.0101, "step": 26160 }, { "grad_norm": 0.16652335226535797, "learning_rate": 8.043424526693306e-05, "loss": 0.0135, "step": 26170 }, { "grad_norm": 0.15452101826667786, "learning_rate": 8.04178441311751e-05, "loss": 0.0095, "step": 26180 }, { "grad_norm": 0.10951674729585648, "learning_rate": 8.04014377978395e-05, "loss": 0.01, "step": 26190 }, { "grad_norm": 0.1474817842245102, "learning_rate": 8.038502626972967e-05, "loss": 0.0135, "step": 26200 }, { "grad_norm": 0.13989666104316711, "learning_rate": 8.036860954964989e-05, "loss": 0.0136, "step": 26210 }, { "grad_norm": 0.12488847225904465, "learning_rate": 8.035218764040531e-05, "loss": 0.0114, "step": 26220 }, { "grad_norm": 0.15653187036514282, "learning_rate": 8.033576054480199e-05, "loss": 0.0124, "step": 26230 }, { "grad_norm": 0.21816624701023102, "learning_rate": 8.031932826564688e-05, "loss": 0.0108, "step": 26240 }, { "grad_norm": 0.14133042097091675, "learning_rate": 8.030289080574782e-05, "loss": 0.0095, "step": 26250 }, { "grad_norm": 0.12459827214479446, "learning_rate": 8.028644816791349e-05, "loss": 0.0103, "step": 26260 }, { "grad_norm": 0.12759165465831757, "learning_rate": 8.027000035495351e-05, "loss": 0.0107, "step": 26270 }, { "grad_norm": 0.11871016025543213, "learning_rate": 8.025354736967836e-05, "loss": 0.0084, "step": 26280 }, { "grad_norm": 0.16668884456157684, "learning_rate": 8.023708921489941e-05, "loss": 0.0101, "step": 26290 }, { "grad_norm": 0.13617466390132904, "learning_rate": 8.022062589342887e-05, "loss": 0.0123, "step": 26300 }, { "grad_norm": 0.14906653761863708, "learning_rate": 8.020415740807993e-05, "loss": 0.0102, "step": 26310 }, { "grad_norm": 0.10449494421482086, "learning_rate": 8.018768376166656e-05, "loss": 0.0113, "step": 26320 }, { "grad_norm": 0.12820203602313995, "learning_rate": 8.017120495700368e-05, "loss": 0.0122, "step": 26330 }, { "grad_norm": 0.15811006724834442, "learning_rate": 8.015472099690704e-05, "loss": 0.0126, "step": 26340 }, { "grad_norm": 0.10659117996692657, "learning_rate": 8.013823188419332e-05, "loss": 0.0147, "step": 26350 }, { "grad_norm": 0.16035369038581848, "learning_rate": 8.012173762168006e-05, "loss": 0.01, "step": 26360 }, { "grad_norm": 0.14109289646148682, "learning_rate": 8.010523821218567e-05, "loss": 0.0105, "step": 26370 }, { "grad_norm": 0.13724078238010406, "learning_rate": 8.008873365852945e-05, "loss": 0.0129, "step": 26380 }, { "grad_norm": 0.13990895450115204, "learning_rate": 8.007222396353157e-05, "loss": 0.012, "step": 26390 }, { "grad_norm": 0.12510588765144348, "learning_rate": 8.00557091300131e-05, "loss": 0.0104, "step": 26400 }, { "grad_norm": 0.18982043862342834, "learning_rate": 8.003918916079597e-05, "loss": 0.012, "step": 26410 }, { "grad_norm": 0.1009538397192955, "learning_rate": 8.002266405870298e-05, "loss": 0.0112, "step": 26420 }, { "grad_norm": 0.10473239421844482, "learning_rate": 8.000613382655782e-05, "loss": 0.0108, "step": 26430 }, { "grad_norm": 0.1664637178182602, "learning_rate": 7.998959846718505e-05, "loss": 0.0124, "step": 26440 }, { "grad_norm": 0.12148254364728928, "learning_rate": 7.997305798341012e-05, "loss": 0.0109, "step": 26450 }, { "grad_norm": 0.16784235835075378, "learning_rate": 7.995651237805937e-05, "loss": 0.0119, "step": 26460 }, { "grad_norm": 0.15480156242847443, "learning_rate": 7.993996165395996e-05, "loss": 0.011, "step": 26470 }, { "grad_norm": 0.13830561935901642, "learning_rate": 7.992340581393996e-05, "loss": 0.0123, "step": 26480 }, { "grad_norm": 0.20115573704242706, "learning_rate": 7.990684486082831e-05, "loss": 0.012, "step": 26490 }, { "grad_norm": 0.1391242891550064, "learning_rate": 7.989027879745482e-05, "loss": 0.0107, "step": 26500 }, { "grad_norm": 0.1326046884059906, "learning_rate": 7.98737076266502e-05, "loss": 0.0124, "step": 26510 }, { "grad_norm": 0.15769092738628387, "learning_rate": 7.985713135124598e-05, "loss": 0.0123, "step": 26520 }, { "grad_norm": 0.14774100482463837, "learning_rate": 7.98405499740746e-05, "loss": 0.0101, "step": 26530 }, { "grad_norm": 0.1651613712310791, "learning_rate": 7.98239634979694e-05, "loss": 0.014, "step": 26540 }, { "grad_norm": 0.12928280234336853, "learning_rate": 7.98073719257645e-05, "loss": 0.0115, "step": 26550 }, { "grad_norm": 0.18176771700382233, "learning_rate": 7.979077526029499e-05, "loss": 0.0119, "step": 26560 }, { "grad_norm": 0.14925454556941986, "learning_rate": 7.977417350439675e-05, "loss": 0.0137, "step": 26570 }, { "grad_norm": 0.13249415159225464, "learning_rate": 7.97575666609066e-05, "loss": 0.0128, "step": 26580 }, { "grad_norm": 0.1348983347415924, "learning_rate": 7.974095473266216e-05, "loss": 0.0105, "step": 26590 }, { "grad_norm": 0.09612315148115158, "learning_rate": 7.972433772250198e-05, "loss": 0.0103, "step": 26600 }, { "grad_norm": 0.14354906976222992, "learning_rate": 7.970771563326544e-05, "loss": 0.0128, "step": 26610 }, { "grad_norm": 0.13501860201358795, "learning_rate": 7.96910884677928e-05, "loss": 0.013, "step": 26620 }, { "grad_norm": 0.20998366177082062, "learning_rate": 7.967445622892523e-05, "loss": 0.0118, "step": 26630 }, { "grad_norm": 0.08610311895608902, "learning_rate": 7.965781891950465e-05, "loss": 0.0109, "step": 26640 }, { "grad_norm": 0.13651321828365326, "learning_rate": 7.964117654237397e-05, "loss": 0.01, "step": 26650 }, { "grad_norm": 0.16097724437713623, "learning_rate": 7.962452910037692e-05, "loss": 0.011, "step": 26660 }, { "grad_norm": 0.1566552221775055, "learning_rate": 7.96078765963581e-05, "loss": 0.0103, "step": 26670 }, { "grad_norm": 0.11081795394420624, "learning_rate": 7.95912190331629e-05, "loss": 0.0101, "step": 26680 }, { "grad_norm": 0.14360210299491882, "learning_rate": 7.957455641363772e-05, "loss": 0.0104, "step": 26690 }, { "grad_norm": 0.18674376606941223, "learning_rate": 7.955788874062968e-05, "loss": 0.0111, "step": 26700 }, { "grad_norm": 0.19735018908977509, "learning_rate": 7.95412160169869e-05, "loss": 0.0139, "step": 26710 }, { "grad_norm": 0.14768460392951965, "learning_rate": 7.952453824555824e-05, "loss": 0.011, "step": 26720 }, { "grad_norm": 0.1439557671546936, "learning_rate": 7.95078554291935e-05, "loss": 0.0099, "step": 26730 }, { "grad_norm": 0.12858055531978607, "learning_rate": 7.94911675707433e-05, "loss": 0.0094, "step": 26740 }, { "grad_norm": 0.13590119779109955, "learning_rate": 7.947447467305915e-05, "loss": 0.0101, "step": 26750 }, { "grad_norm": 0.14676125347614288, "learning_rate": 7.94577767389934e-05, "loss": 0.0114, "step": 26760 }, { "grad_norm": 0.1200423538684845, "learning_rate": 7.944107377139928e-05, "loss": 0.0091, "step": 26770 }, { "grad_norm": 0.15201228857040405, "learning_rate": 7.942436577313088e-05, "loss": 0.0131, "step": 26780 }, { "grad_norm": 0.12959112226963043, "learning_rate": 7.940765274704312e-05, "loss": 0.008, "step": 26790 }, { "grad_norm": 0.17246241867542267, "learning_rate": 7.939093469599181e-05, "loss": 0.0097, "step": 26800 }, { "grad_norm": 0.12864945828914642, "learning_rate": 7.93742116228336e-05, "loss": 0.0107, "step": 26810 }, { "grad_norm": 0.16231071949005127, "learning_rate": 7.935748353042602e-05, "loss": 0.0118, "step": 26820 }, { "grad_norm": 0.22599147260189056, "learning_rate": 7.934075042162744e-05, "loss": 0.0123, "step": 26830 }, { "grad_norm": 0.14705263078212738, "learning_rate": 7.932401229929705e-05, "loss": 0.0109, "step": 26840 }, { "grad_norm": 0.12991863489151, "learning_rate": 7.9307269166295e-05, "loss": 0.0101, "step": 26850 }, { "grad_norm": 0.12677671015262604, "learning_rate": 7.92905210254822e-05, "loss": 0.0093, "step": 26860 }, { "grad_norm": 0.1695934385061264, "learning_rate": 7.927376787972045e-05, "loss": 0.0135, "step": 26870 }, { "grad_norm": 0.18913687765598297, "learning_rate": 7.92570097318724e-05, "loss": 0.0118, "step": 26880 }, { "grad_norm": 0.1844182014465332, "learning_rate": 7.924024658480158e-05, "loss": 0.0134, "step": 26890 }, { "grad_norm": 0.12773141264915466, "learning_rate": 7.922347844137233e-05, "loss": 0.0112, "step": 26900 }, { "grad_norm": 0.1694871485233307, "learning_rate": 7.92067053044499e-05, "loss": 0.0109, "step": 26910 }, { "grad_norm": 0.13859674334526062, "learning_rate": 7.918992717690031e-05, "loss": 0.0114, "step": 26920 }, { "grad_norm": 0.18267899751663208, "learning_rate": 7.917314406159053e-05, "loss": 0.0118, "step": 26930 }, { "grad_norm": 0.09789030998945236, "learning_rate": 7.915635596138832e-05, "loss": 0.0108, "step": 26940 }, { "grad_norm": 0.1762329339981079, "learning_rate": 7.913956287916228e-05, "loss": 0.0134, "step": 26950 }, { "grad_norm": 0.14599378407001495, "learning_rate": 7.912276481778193e-05, "loss": 0.0112, "step": 26960 }, { "grad_norm": 0.16256244480609894, "learning_rate": 7.910596178011759e-05, "loss": 0.0103, "step": 26970 }, { "grad_norm": 0.1435663104057312, "learning_rate": 7.908915376904043e-05, "loss": 0.0097, "step": 26980 }, { "grad_norm": 0.1567980796098709, "learning_rate": 7.907234078742247e-05, "loss": 0.0113, "step": 26990 }, { "grad_norm": 0.14131176471710205, "learning_rate": 7.90555228381366e-05, "loss": 0.0101, "step": 27000 }, { "grad_norm": 0.1666794717311859, "learning_rate": 7.903869992405656e-05, "loss": 0.0119, "step": 27010 }, { "grad_norm": 0.15918709337711334, "learning_rate": 7.902187204805691e-05, "loss": 0.0109, "step": 27020 }, { "grad_norm": 0.1568075716495514, "learning_rate": 7.900503921301308e-05, "loss": 0.0117, "step": 27030 }, { "grad_norm": 0.14675849676132202, "learning_rate": 7.898820142180133e-05, "loss": 0.0099, "step": 27040 }, { "grad_norm": 0.1435491144657135, "learning_rate": 7.897135867729879e-05, "loss": 0.0103, "step": 27050 }, { "grad_norm": 0.10348373651504517, "learning_rate": 7.89545109823834e-05, "loss": 0.0125, "step": 27060 }, { "grad_norm": 0.15595728158950806, "learning_rate": 7.8937658339934e-05, "loss": 0.0106, "step": 27070 }, { "grad_norm": 0.1712397336959839, "learning_rate": 7.892080075283026e-05, "loss": 0.0129, "step": 27080 }, { "grad_norm": 0.10279858112335205, "learning_rate": 7.890393822395263e-05, "loss": 0.0094, "step": 27090 }, { "grad_norm": 0.1108570396900177, "learning_rate": 7.88870707561825e-05, "loss": 0.0101, "step": 27100 }, { "grad_norm": 0.12467852979898453, "learning_rate": 7.887019835240203e-05, "loss": 0.0097, "step": 27110 }, { "grad_norm": 0.1515246033668518, "learning_rate": 7.885332101549427e-05, "loss": 0.0114, "step": 27120 }, { "grad_norm": 0.18123091757297516, "learning_rate": 7.883643874834308e-05, "loss": 0.011, "step": 27130 }, { "grad_norm": 0.1274695098400116, "learning_rate": 7.881955155383321e-05, "loss": 0.0091, "step": 27140 }, { "grad_norm": 0.17323389649391174, "learning_rate": 7.880265943485017e-05, "loss": 0.0112, "step": 27150 }, { "grad_norm": 0.11113240569829941, "learning_rate": 7.878576239428038e-05, "loss": 0.0124, "step": 27160 }, { "grad_norm": 0.12568753957748413, "learning_rate": 7.87688604350111e-05, "loss": 0.013, "step": 27170 }, { "grad_norm": 0.09576369822025299, "learning_rate": 7.875195355993042e-05, "loss": 0.0112, "step": 27180 }, { "grad_norm": 0.14298231899738312, "learning_rate": 7.873504177192724e-05, "loss": 0.0138, "step": 27190 }, { "grad_norm": 0.16169483959674835, "learning_rate": 7.87181250738913e-05, "loss": 0.0115, "step": 27200 }, { "grad_norm": 0.1285269558429718, "learning_rate": 7.870120346871324e-05, "loss": 0.0108, "step": 27210 }, { "grad_norm": 0.1344793289899826, "learning_rate": 7.86842769592845e-05, "loss": 0.01, "step": 27220 }, { "grad_norm": 0.1317881941795349, "learning_rate": 7.866734554849732e-05, "loss": 0.0091, "step": 27230 }, { "grad_norm": 0.12122010439634323, "learning_rate": 7.865040923924486e-05, "loss": 0.0092, "step": 27240 }, { "grad_norm": 0.11872538924217224, "learning_rate": 7.863346803442104e-05, "loss": 0.0107, "step": 27250 }, { "grad_norm": 0.12470261752605438, "learning_rate": 7.861652193692067e-05, "loss": 0.0107, "step": 27260 }, { "grad_norm": 0.14451104402542114, "learning_rate": 7.859957094963937e-05, "loss": 0.0105, "step": 27270 }, { "grad_norm": 0.11277925968170166, "learning_rate": 7.858261507547357e-05, "loss": 0.0104, "step": 27280 }, { "grad_norm": 0.12028439342975616, "learning_rate": 7.856565431732061e-05, "loss": 0.0113, "step": 27290 }, { "grad_norm": 0.14616553485393524, "learning_rate": 7.854868867807859e-05, "loss": 0.0126, "step": 27300 }, { "grad_norm": 0.13038691878318787, "learning_rate": 7.85317181606465e-05, "loss": 0.0107, "step": 27310 }, { "grad_norm": 0.14342407882213593, "learning_rate": 7.85147427679241e-05, "loss": 0.0102, "step": 27320 }, { "grad_norm": 0.15787789225578308, "learning_rate": 7.849776250281205e-05, "loss": 0.0092, "step": 27330 }, { "grad_norm": 0.11784125864505768, "learning_rate": 7.84807773682118e-05, "loss": 0.0104, "step": 27340 }, { "grad_norm": 0.15980824828147888, "learning_rate": 7.846378736702565e-05, "loss": 0.01, "step": 27350 }, { "grad_norm": 0.1609446406364441, "learning_rate": 7.844679250215671e-05, "loss": 0.0099, "step": 27360 }, { "grad_norm": 0.15037451684474945, "learning_rate": 7.842979277650898e-05, "loss": 0.0126, "step": 27370 }, { "grad_norm": 0.1399991512298584, "learning_rate": 7.84127881929872e-05, "loss": 0.0122, "step": 27380 }, { "grad_norm": 0.15538756549358368, "learning_rate": 7.839577875449704e-05, "loss": 0.0113, "step": 27390 }, { "grad_norm": 0.13487520813941956, "learning_rate": 7.837876446394489e-05, "loss": 0.0109, "step": 27400 }, { "grad_norm": 0.15987733006477356, "learning_rate": 7.836174532423805e-05, "loss": 0.0106, "step": 27410 }, { "grad_norm": 0.13042989373207092, "learning_rate": 7.834472133828466e-05, "loss": 0.0107, "step": 27420 }, { "grad_norm": 0.17114707827568054, "learning_rate": 7.832769250899359e-05, "loss": 0.0128, "step": 27430 }, { "grad_norm": 0.19385145604610443, "learning_rate": 7.831065883927464e-05, "loss": 0.0104, "step": 27440 }, { "grad_norm": 0.1511535495519638, "learning_rate": 7.829362033203841e-05, "loss": 0.0129, "step": 27450 }, { "grad_norm": 0.1412895917892456, "learning_rate": 7.827657699019628e-05, "loss": 0.0095, "step": 27460 }, { "grad_norm": 0.13873644173145294, "learning_rate": 7.825952881666052e-05, "loss": 0.0128, "step": 27470 }, { "grad_norm": 0.13589532673358917, "learning_rate": 7.824247581434418e-05, "loss": 0.0108, "step": 27480 }, { "grad_norm": 0.10316057503223419, "learning_rate": 7.822541798616116e-05, "loss": 0.0093, "step": 27490 }, { "grad_norm": 0.11042754352092743, "learning_rate": 7.820835533502617e-05, "loss": 0.0104, "step": 27500 }, { "grad_norm": 0.15589666366577148, "learning_rate": 7.819128786385475e-05, "loss": 0.0103, "step": 27510 }, { "grad_norm": 0.10374440252780914, "learning_rate": 7.817421557556329e-05, "loss": 0.01, "step": 27520 }, { "grad_norm": 0.14115191996097565, "learning_rate": 7.815713847306893e-05, "loss": 0.0098, "step": 27530 }, { "grad_norm": 0.13029296696186066, "learning_rate": 7.81400565592897e-05, "loss": 0.0112, "step": 27540 }, { "grad_norm": 0.11848962306976318, "learning_rate": 7.812296983714444e-05, "loss": 0.01, "step": 27550 }, { "grad_norm": 0.11817947030067444, "learning_rate": 7.810587830955281e-05, "loss": 0.0099, "step": 27560 }, { "grad_norm": 0.14658032357692719, "learning_rate": 7.808878197943528e-05, "loss": 0.0128, "step": 27570 }, { "grad_norm": 0.17389173805713654, "learning_rate": 7.807168084971312e-05, "loss": 0.0105, "step": 27580 }, { "grad_norm": 0.165241539478302, "learning_rate": 7.805457492330849e-05, "loss": 0.0138, "step": 27590 }, { "grad_norm": 0.12155657261610031, "learning_rate": 7.803746420314428e-05, "loss": 0.0109, "step": 27600 }, { "grad_norm": 0.13484665751457214, "learning_rate": 7.802034869214428e-05, "loss": 0.0115, "step": 27610 }, { "grad_norm": 0.20519080758094788, "learning_rate": 7.800322839323303e-05, "loss": 0.0152, "step": 27620 }, { "grad_norm": 0.17301009595394135, "learning_rate": 7.798610330933593e-05, "loss": 0.0133, "step": 27630 }, { "grad_norm": 0.14664705097675323, "learning_rate": 7.796897344337922e-05, "loss": 0.0108, "step": 27640 }, { "grad_norm": 0.14275437593460083, "learning_rate": 7.795183879828989e-05, "loss": 0.0118, "step": 27650 }, { "grad_norm": 0.12559854984283447, "learning_rate": 7.793469937699579e-05, "loss": 0.01, "step": 27660 }, { "grad_norm": 0.1339370608329773, "learning_rate": 7.791755518242558e-05, "loss": 0.0091, "step": 27670 }, { "grad_norm": 0.14940676093101501, "learning_rate": 7.790040621750876e-05, "loss": 0.0095, "step": 27680 }, { "grad_norm": 0.15586671233177185, "learning_rate": 7.788325248517558e-05, "loss": 0.0115, "step": 27690 }, { "grad_norm": 0.16539588570594788, "learning_rate": 7.786609398835715e-05, "loss": 0.0115, "step": 27700 }, { "grad_norm": 0.13191677629947662, "learning_rate": 7.784893072998541e-05, "loss": 0.0101, "step": 27710 }, { "grad_norm": 0.1388600915670395, "learning_rate": 7.783176271299306e-05, "loss": 0.009, "step": 27720 }, { "grad_norm": 0.11371959000825882, "learning_rate": 7.781458994031368e-05, "loss": 0.0088, "step": 27730 }, { "grad_norm": 0.11042200773954391, "learning_rate": 7.779741241488161e-05, "loss": 0.0103, "step": 27740 }, { "grad_norm": 0.10932382196187973, "learning_rate": 7.7780230139632e-05, "loss": 0.0112, "step": 27750 }, { "grad_norm": 0.10632707178592682, "learning_rate": 7.776304311750087e-05, "loss": 0.0096, "step": 27760 }, { "grad_norm": 0.1594437062740326, "learning_rate": 7.7745851351425e-05, "loss": 0.0134, "step": 27770 }, { "grad_norm": 0.1810271441936493, "learning_rate": 7.772865484434197e-05, "loss": 0.0092, "step": 27780 }, { "grad_norm": 0.10136423259973526, "learning_rate": 7.77114535991902e-05, "loss": 0.0113, "step": 27790 }, { "grad_norm": 0.09709183871746063, "learning_rate": 7.769424761890893e-05, "loss": 0.0103, "step": 27800 }, { "grad_norm": 0.1490950584411621, "learning_rate": 7.767703690643817e-05, "loss": 0.0099, "step": 27810 }, { "grad_norm": 0.2110801339149475, "learning_rate": 7.76598214647188e-05, "loss": 0.0121, "step": 27820 }, { "grad_norm": 0.145950049161911, "learning_rate": 7.764260129669241e-05, "loss": 0.0099, "step": 27830 }, { "grad_norm": 0.13147656619548798, "learning_rate": 7.76253764053015e-05, "loss": 0.0085, "step": 27840 }, { "grad_norm": 0.13091236352920532, "learning_rate": 7.760814679348932e-05, "loss": 0.0102, "step": 27850 }, { "grad_norm": 0.13243544101715088, "learning_rate": 7.759091246419992e-05, "loss": 0.0105, "step": 27860 }, { "grad_norm": 0.17960532009601593, "learning_rate": 7.75736734203782e-05, "loss": 0.0121, "step": 27870 }, { "grad_norm": 0.1588514894247055, "learning_rate": 7.755642966496985e-05, "loss": 0.0115, "step": 27880 }, { "grad_norm": 0.1691705733537674, "learning_rate": 7.753918120092132e-05, "loss": 0.0126, "step": 27890 }, { "grad_norm": 0.1376391053199768, "learning_rate": 7.752192803117993e-05, "loss": 0.0101, "step": 27900 }, { "grad_norm": 0.1346323937177658, "learning_rate": 7.750467015869377e-05, "loss": 0.0133, "step": 27910 }, { "grad_norm": 0.129929319024086, "learning_rate": 7.748740758641174e-05, "loss": 0.0098, "step": 27920 }, { "grad_norm": 0.1350039392709732, "learning_rate": 7.74701403172835e-05, "loss": 0.0135, "step": 27930 }, { "grad_norm": 0.15135598182678223, "learning_rate": 7.745286835425962e-05, "loss": 0.0107, "step": 27940 }, { "grad_norm": 0.1139671728014946, "learning_rate": 7.743559170029138e-05, "loss": 0.0124, "step": 27950 }, { "grad_norm": 0.173349529504776, "learning_rate": 7.741831035833087e-05, "loss": 0.0104, "step": 27960 }, { "grad_norm": 0.14025093615055084, "learning_rate": 7.740102433133102e-05, "loss": 0.0119, "step": 27970 }, { "grad_norm": 0.1833762675523758, "learning_rate": 7.738373362224553e-05, "loss": 0.011, "step": 27980 }, { "grad_norm": 0.15930841863155365, "learning_rate": 7.73664382340289e-05, "loss": 0.0117, "step": 27990 }, { "grad_norm": 0.14638589322566986, "learning_rate": 7.734913816963647e-05, "loss": 0.0103, "step": 28000 }, { "grad_norm": 0.1312103420495987, "learning_rate": 7.73318334320243e-05, "loss": 0.0124, "step": 28010 }, { "grad_norm": 0.12029518932104111, "learning_rate": 7.731452402414934e-05, "loss": 0.0107, "step": 28020 }, { "grad_norm": 0.1392386108636856, "learning_rate": 7.729720994896928e-05, "loss": 0.0114, "step": 28030 }, { "grad_norm": 0.1336323320865631, "learning_rate": 7.727989120944262e-05, "loss": 0.0102, "step": 28040 }, { "grad_norm": 0.15501941740512848, "learning_rate": 7.726256780852865e-05, "loss": 0.013, "step": 28050 }, { "grad_norm": 0.16296866536140442, "learning_rate": 7.724523974918749e-05, "loss": 0.0104, "step": 28060 }, { "grad_norm": 0.1828499436378479, "learning_rate": 7.722790703438002e-05, "loss": 0.0104, "step": 28070 }, { "grad_norm": 0.17465706169605255, "learning_rate": 7.72105696670679e-05, "loss": 0.0099, "step": 28080 }, { "grad_norm": 0.13699790835380554, "learning_rate": 7.719322765021364e-05, "loss": 0.009, "step": 28090 }, { "grad_norm": 0.1349341720342636, "learning_rate": 7.717588098678051e-05, "loss": 0.0131, "step": 28100 }, { "grad_norm": 0.16982196271419525, "learning_rate": 7.715852967973258e-05, "loss": 0.0094, "step": 28110 }, { "grad_norm": 0.1281210333108902, "learning_rate": 7.714117373203474e-05, "loss": 0.0087, "step": 28120 }, { "grad_norm": 0.10885895788669586, "learning_rate": 7.712381314665259e-05, "loss": 0.0096, "step": 28130 }, { "grad_norm": 0.1289922446012497, "learning_rate": 7.710644792655261e-05, "loss": 0.0111, "step": 28140 }, { "grad_norm": 0.1534818708896637, "learning_rate": 7.708907807470207e-05, "loss": 0.0081, "step": 28150 }, { "grad_norm": 0.12313074618577957, "learning_rate": 7.707170359406896e-05, "loss": 0.0107, "step": 28160 }, { "grad_norm": 0.1670345515012741, "learning_rate": 7.705432448762213e-05, "loss": 0.0123, "step": 28170 }, { "grad_norm": 0.1601702719926834, "learning_rate": 7.703694075833117e-05, "loss": 0.0095, "step": 28180 }, { "grad_norm": 0.17860177159309387, "learning_rate": 7.70195524091665e-05, "loss": 0.0137, "step": 28190 }, { "grad_norm": 0.15000346302986145, "learning_rate": 7.70021594430993e-05, "loss": 0.0105, "step": 28200 }, { "grad_norm": 0.11334006488323212, "learning_rate": 7.698476186310157e-05, "loss": 0.0104, "step": 28210 }, { "grad_norm": 0.10364092886447906, "learning_rate": 7.696735967214608e-05, "loss": 0.01, "step": 28220 }, { "grad_norm": 0.11632838100194931, "learning_rate": 7.694995287320636e-05, "loss": 0.0084, "step": 28230 }, { "grad_norm": 0.12577375769615173, "learning_rate": 7.693254146925679e-05, "loss": 0.0112, "step": 28240 }, { "grad_norm": 0.1503802239894867, "learning_rate": 7.691512546327251e-05, "loss": 0.0098, "step": 28250 }, { "grad_norm": 0.13917317986488342, "learning_rate": 7.689770485822939e-05, "loss": 0.0113, "step": 28260 }, { "grad_norm": 0.12958940863609314, "learning_rate": 7.688027965710416e-05, "loss": 0.0091, "step": 28270 }, { "grad_norm": 0.15049834549427032, "learning_rate": 7.686284986287433e-05, "loss": 0.0121, "step": 28280 }, { "grad_norm": 0.19362586736679077, "learning_rate": 7.684541547851817e-05, "loss": 0.0098, "step": 28290 }, { "grad_norm": 0.18035709857940674, "learning_rate": 7.68279765070147e-05, "loss": 0.0107, "step": 28300 }, { "grad_norm": 0.12535884976387024, "learning_rate": 7.68105329513438e-05, "loss": 0.01, "step": 28310 }, { "grad_norm": 0.1334780901670456, "learning_rate": 7.67930848144861e-05, "loss": 0.0112, "step": 28320 }, { "grad_norm": 0.1295054852962494, "learning_rate": 7.6775632099423e-05, "loss": 0.0096, "step": 28330 }, { "grad_norm": 0.12004731595516205, "learning_rate": 7.675817480913667e-05, "loss": 0.0098, "step": 28340 }, { "grad_norm": 0.10955700278282166, "learning_rate": 7.674071294661011e-05, "loss": 0.0094, "step": 28350 }, { "grad_norm": 0.13269977271556854, "learning_rate": 7.672324651482707e-05, "loss": 0.0092, "step": 28360 }, { "grad_norm": 0.14509317278862, "learning_rate": 7.670577551677209e-05, "loss": 0.0088, "step": 28370 }, { "grad_norm": 0.11780855804681778, "learning_rate": 7.668829995543047e-05, "loss": 0.0095, "step": 28380 }, { "grad_norm": 0.1417151540517807, "learning_rate": 7.667081983378832e-05, "loss": 0.0096, "step": 28390 }, { "grad_norm": 0.1406959742307663, "learning_rate": 7.66533351548325e-05, "loss": 0.0084, "step": 28400 }, { "grad_norm": 0.13940401375293732, "learning_rate": 7.663584592155069e-05, "loss": 0.013, "step": 28410 }, { "grad_norm": 0.13354510068893433, "learning_rate": 7.661835213693129e-05, "loss": 0.0108, "step": 28420 }, { "grad_norm": 0.1642637699842453, "learning_rate": 7.660085380396353e-05, "loss": 0.0109, "step": 28430 }, { "grad_norm": 0.14710797369480133, "learning_rate": 7.658335092563738e-05, "loss": 0.01, "step": 28440 }, { "grad_norm": 0.15593694150447845, "learning_rate": 7.656584350494362e-05, "loss": 0.0113, "step": 28450 }, { "grad_norm": 0.18158425390720367, "learning_rate": 7.654833154487378e-05, "loss": 0.01, "step": 28460 }, { "grad_norm": 0.25898870825767517, "learning_rate": 7.653081504842017e-05, "loss": 0.0125, "step": 28470 }, { "grad_norm": 0.17962533235549927, "learning_rate": 7.65132940185759e-05, "loss": 0.0121, "step": 28480 }, { "grad_norm": 0.11211709678173065, "learning_rate": 7.649576845833481e-05, "loss": 0.0097, "step": 28490 }, { "grad_norm": 0.18254803121089935, "learning_rate": 7.647823837069156e-05, "loss": 0.0103, "step": 28500 }, { "grad_norm": 0.15490888059139252, "learning_rate": 7.646070375864156e-05, "loss": 0.0086, "step": 28510 }, { "grad_norm": 0.11473540961742401, "learning_rate": 7.644316462518099e-05, "loss": 0.01, "step": 28520 }, { "grad_norm": 0.12377962470054626, "learning_rate": 7.642562097330679e-05, "loss": 0.011, "step": 28530 }, { "grad_norm": 0.12034673988819122, "learning_rate": 7.640807280601671e-05, "loss": 0.0094, "step": 28540 }, { "grad_norm": 0.1676167994737625, "learning_rate": 7.639052012630927e-05, "loss": 0.0112, "step": 28550 }, { "grad_norm": 0.137478768825531, "learning_rate": 7.63729629371837e-05, "loss": 0.0115, "step": 28560 }, { "grad_norm": 0.11860200762748718, "learning_rate": 7.635540124164009e-05, "loss": 0.0099, "step": 28570 }, { "grad_norm": 0.10891025513410568, "learning_rate": 7.633783504267922e-05, "loss": 0.0107, "step": 28580 }, { "grad_norm": 0.1279425472021103, "learning_rate": 7.632026434330269e-05, "loss": 0.0104, "step": 28590 }, { "grad_norm": 0.15350745618343353, "learning_rate": 7.630268914651282e-05, "loss": 0.01, "step": 28600 }, { "grad_norm": 0.17049090564250946, "learning_rate": 7.628510945531278e-05, "loss": 0.0094, "step": 28610 }, { "grad_norm": 0.14714251458644867, "learning_rate": 7.626752527270641e-05, "loss": 0.0102, "step": 28620 }, { "grad_norm": 0.13700979948043823, "learning_rate": 7.62499366016984e-05, "loss": 0.0124, "step": 28630 }, { "grad_norm": 0.13424581289291382, "learning_rate": 7.623234344529416e-05, "loss": 0.0086, "step": 28640 }, { "grad_norm": 0.15639634430408478, "learning_rate": 7.62147458064999e-05, "loss": 0.0101, "step": 28650 }, { "grad_norm": 0.0932815819978714, "learning_rate": 7.619714368832254e-05, "loss": 0.0085, "step": 28660 }, { "grad_norm": 0.13545824587345123, "learning_rate": 7.61795370937698e-05, "loss": 0.0089, "step": 28670 }, { "grad_norm": 0.16829580068588257, "learning_rate": 7.61619260258502e-05, "loss": 0.0109, "step": 28680 }, { "grad_norm": 0.13918697834014893, "learning_rate": 7.614431048757298e-05, "loss": 0.0109, "step": 28690 }, { "grad_norm": 0.14351493120193481, "learning_rate": 7.612669048194814e-05, "loss": 0.0103, "step": 28700 }, { "grad_norm": 0.15448719263076782, "learning_rate": 7.610906601198646e-05, "loss": 0.0105, "step": 28710 }, { "grad_norm": 0.16839765012264252, "learning_rate": 7.60914370806995e-05, "loss": 0.0086, "step": 28720 }, { "grad_norm": 0.1362506002187729, "learning_rate": 7.607380369109953e-05, "loss": 0.0114, "step": 28730 }, { "grad_norm": 0.15525883436203003, "learning_rate": 7.605616584619961e-05, "loss": 0.0107, "step": 28740 }, { "grad_norm": 0.13321217894554138, "learning_rate": 7.603852354901362e-05, "loss": 0.0108, "step": 28750 }, { "grad_norm": 0.11556889861822128, "learning_rate": 7.602087680255609e-05, "loss": 0.0104, "step": 28760 }, { "grad_norm": 0.10595717281103134, "learning_rate": 7.600322560984238e-05, "loss": 0.0104, "step": 28770 }, { "grad_norm": 0.10218752175569534, "learning_rate": 7.598556997388863e-05, "loss": 0.0104, "step": 28780 }, { "grad_norm": 0.1025034636259079, "learning_rate": 7.596790989771166e-05, "loss": 0.0078, "step": 28790 }, { "grad_norm": 0.12573489546775818, "learning_rate": 7.595024538432914e-05, "loss": 0.009, "step": 28800 }, { "grad_norm": 0.11665812134742737, "learning_rate": 7.59325764367594e-05, "loss": 0.0102, "step": 28810 }, { "grad_norm": 0.16075856983661652, "learning_rate": 7.59149030580216e-05, "loss": 0.0122, "step": 28820 }, { "grad_norm": 0.11944352835416794, "learning_rate": 7.589722525113562e-05, "loss": 0.0091, "step": 28830 }, { "grad_norm": 0.12584972381591797, "learning_rate": 7.587954301912216e-05, "loss": 0.0114, "step": 28840 }, { "grad_norm": 0.12498616427183151, "learning_rate": 7.586185636500263e-05, "loss": 0.0106, "step": 28850 }, { "grad_norm": 0.13877469301223755, "learning_rate": 7.584416529179914e-05, "loss": 0.0096, "step": 28860 }, { "grad_norm": 0.14522433280944824, "learning_rate": 7.582646980253465e-05, "loss": 0.0095, "step": 28870 }, { "grad_norm": 0.14161832630634308, "learning_rate": 7.580876990023282e-05, "loss": 0.0094, "step": 28880 }, { "grad_norm": 0.09473076462745667, "learning_rate": 7.579106558791809e-05, "loss": 0.0094, "step": 28890 }, { "grad_norm": 0.11437752097845078, "learning_rate": 7.577335686861565e-05, "loss": 0.0111, "step": 28900 }, { "grad_norm": 0.16508178412914276, "learning_rate": 7.575564374535141e-05, "loss": 0.0098, "step": 28910 }, { "grad_norm": 0.13275878131389618, "learning_rate": 7.573792622115207e-05, "loss": 0.0091, "step": 28920 }, { "grad_norm": 0.14813128113746643, "learning_rate": 7.572020429904507e-05, "loss": 0.0112, "step": 28930 }, { "grad_norm": 0.1199636161327362, "learning_rate": 7.570247798205861e-05, "loss": 0.0088, "step": 28940 }, { "grad_norm": 0.20458878576755524, "learning_rate": 7.568474727322164e-05, "loss": 0.0102, "step": 28950 }, { "grad_norm": 0.1430962234735489, "learning_rate": 7.566701217556384e-05, "loss": 0.0116, "step": 28960 }, { "grad_norm": 0.1687445342540741, "learning_rate": 7.564927269211564e-05, "loss": 0.0113, "step": 28970 }, { "grad_norm": 0.1568024456501007, "learning_rate": 7.563152882590824e-05, "loss": 0.012, "step": 28980 }, { "grad_norm": 0.13701581954956055, "learning_rate": 7.56137805799736e-05, "loss": 0.012, "step": 28990 }, { "grad_norm": 0.17128966748714447, "learning_rate": 7.559602795734439e-05, "loss": 0.0108, "step": 29000 }, { "grad_norm": 0.15747053921222687, "learning_rate": 7.557827096105408e-05, "loss": 0.0107, "step": 29010 }, { "grad_norm": 0.12152373790740967, "learning_rate": 7.55605095941368e-05, "loss": 0.0102, "step": 29020 }, { "grad_norm": 0.11408289521932602, "learning_rate": 7.554274385962752e-05, "loss": 0.0088, "step": 29030 }, { "grad_norm": 0.13707493245601654, "learning_rate": 7.552497376056191e-05, "loss": 0.0122, "step": 29040 }, { "grad_norm": 0.16409458220005035, "learning_rate": 7.550719929997639e-05, "loss": 0.0084, "step": 29050 }, { "grad_norm": 0.08883734792470932, "learning_rate": 7.548942048090813e-05, "loss": 0.009, "step": 29060 }, { "grad_norm": 0.11965928971767426, "learning_rate": 7.547163730639506e-05, "loss": 0.0112, "step": 29070 }, { "grad_norm": 0.1192512959241867, "learning_rate": 7.545384977947583e-05, "loss": 0.011, "step": 29080 }, { "grad_norm": 0.13139702379703522, "learning_rate": 7.543605790318981e-05, "loss": 0.0108, "step": 29090 }, { "grad_norm": 0.1646365076303482, "learning_rate": 7.54182616805772e-05, "loss": 0.0095, "step": 29100 }, { "grad_norm": 0.1416463404893875, "learning_rate": 7.540046111467885e-05, "loss": 0.009, "step": 29110 }, { "grad_norm": 0.1633150577545166, "learning_rate": 7.53826562085364e-05, "loss": 0.0104, "step": 29120 }, { "grad_norm": 0.11114548146724701, "learning_rate": 7.536484696519221e-05, "loss": 0.0105, "step": 29130 }, { "grad_norm": 0.15969796478748322, "learning_rate": 7.534703338768942e-05, "loss": 0.0118, "step": 29140 }, { "grad_norm": 0.153697207570076, "learning_rate": 7.532921547907185e-05, "loss": 0.0109, "step": 29150 }, { "grad_norm": 0.12581178545951843, "learning_rate": 7.531139324238412e-05, "loss": 0.0114, "step": 29160 }, { "grad_norm": 0.16865874826908112, "learning_rate": 7.529356668067157e-05, "loss": 0.011, "step": 29170 }, { "grad_norm": 0.12054918706417084, "learning_rate": 7.527573579698023e-05, "loss": 0.0106, "step": 29180 }, { "grad_norm": 0.15055853128433228, "learning_rate": 7.525790059435693e-05, "loss": 0.0108, "step": 29190 }, { "grad_norm": 0.1305660754442215, "learning_rate": 7.524006107584926e-05, "loss": 0.009, "step": 29200 }, { "grad_norm": 0.1359865963459015, "learning_rate": 7.522221724450544e-05, "loss": 0.009, "step": 29210 }, { "grad_norm": 0.15553371608257294, "learning_rate": 7.520436910337451e-05, "loss": 0.0107, "step": 29220 }, { "grad_norm": 0.16595368087291718, "learning_rate": 7.518651665550627e-05, "loss": 0.0108, "step": 29230 }, { "grad_norm": 0.1411639153957367, "learning_rate": 7.516865990395117e-05, "loss": 0.0135, "step": 29240 }, { "grad_norm": 0.11632505059242249, "learning_rate": 7.515079885176047e-05, "loss": 0.0106, "step": 29250 }, { "grad_norm": 0.12536901235580444, "learning_rate": 7.513293350198612e-05, "loss": 0.0108, "step": 29260 }, { "grad_norm": 0.09719109535217285, "learning_rate": 7.511506385768081e-05, "loss": 0.0096, "step": 29270 }, { "grad_norm": 0.13181646168231964, "learning_rate": 7.509718992189801e-05, "loss": 0.0095, "step": 29280 }, { "grad_norm": 0.1199241504073143, "learning_rate": 7.507931169769182e-05, "loss": 0.011, "step": 29290 }, { "grad_norm": 0.1308780014514923, "learning_rate": 7.506142918811722e-05, "loss": 0.0124, "step": 29300 }, { "grad_norm": 0.1329977661371231, "learning_rate": 7.504354239622978e-05, "loss": 0.0104, "step": 29310 }, { "grad_norm": 0.11422035098075867, "learning_rate": 7.50256513250859e-05, "loss": 0.0104, "step": 29320 }, { "grad_norm": 0.12891431152820587, "learning_rate": 7.500775597774265e-05, "loss": 0.0116, "step": 29330 }, { "grad_norm": 0.1356744021177292, "learning_rate": 7.498985635725788e-05, "loss": 0.0097, "step": 29340 }, { "grad_norm": 0.12669040262699127, "learning_rate": 7.497195246669012e-05, "loss": 0.0111, "step": 29350 }, { "grad_norm": 0.15258167684078217, "learning_rate": 7.495404430909868e-05, "loss": 0.0104, "step": 29360 }, { "grad_norm": 0.14733028411865234, "learning_rate": 7.493613188754356e-05, "loss": 0.0107, "step": 29370 }, { "grad_norm": 0.13701024651527405, "learning_rate": 7.49182152050855e-05, "loss": 0.009, "step": 29380 }, { "grad_norm": 0.10090357065200806, "learning_rate": 7.490029426478598e-05, "loss": 0.0106, "step": 29390 }, { "grad_norm": 0.1364269107580185, "learning_rate": 7.488236906970719e-05, "loss": 0.0108, "step": 29400 }, { "grad_norm": 0.1474427580833435, "learning_rate": 7.486443962291207e-05, "loss": 0.0086, "step": 29410 }, { "grad_norm": 0.10885058343410492, "learning_rate": 7.484650592746424e-05, "loss": 0.0082, "step": 29420 }, { "grad_norm": 0.14628177881240845, "learning_rate": 7.482856798642811e-05, "loss": 0.0106, "step": 29430 }, { "grad_norm": 0.10476706176996231, "learning_rate": 7.481062580286878e-05, "loss": 0.0099, "step": 29440 }, { "grad_norm": 0.11956343054771423, "learning_rate": 7.479267937985208e-05, "loss": 0.0095, "step": 29450 }, { "grad_norm": 0.1268688142299652, "learning_rate": 7.477472872044456e-05, "loss": 0.0091, "step": 29460 }, { "grad_norm": 0.14511720836162567, "learning_rate": 7.475677382771347e-05, "loss": 0.0081, "step": 29470 }, { "grad_norm": 0.13384661078453064, "learning_rate": 7.473881470472683e-05, "loss": 0.0102, "step": 29480 }, { "grad_norm": 0.11487521231174469, "learning_rate": 7.472085135455337e-05, "loss": 0.008, "step": 29490 }, { "grad_norm": 0.1566438525915146, "learning_rate": 7.470288378026256e-05, "loss": 0.0113, "step": 29500 }, { "grad_norm": 0.09902604669332504, "learning_rate": 7.468491198492451e-05, "loss": 0.0111, "step": 29510 }, { "grad_norm": 0.1466352641582489, "learning_rate": 7.466693597161013e-05, "loss": 0.0106, "step": 29520 }, { "grad_norm": 0.1241990327835083, "learning_rate": 7.464895574339104e-05, "loss": 0.0088, "step": 29530 }, { "grad_norm": 0.13684076070785522, "learning_rate": 7.463097130333958e-05, "loss": 0.0128, "step": 29540 }, { "grad_norm": 0.11762619018554688, "learning_rate": 7.461298265452876e-05, "loss": 0.0088, "step": 29550 }, { "grad_norm": 0.16651497781276703, "learning_rate": 7.459498980003239e-05, "loss": 0.012, "step": 29560 }, { "grad_norm": 0.111813984811306, "learning_rate": 7.457699274292493e-05, "loss": 0.0119, "step": 29570 }, { "grad_norm": 0.14114224910736084, "learning_rate": 7.455899148628159e-05, "loss": 0.0112, "step": 29580 }, { "grad_norm": 0.1149466335773468, "learning_rate": 7.45409860331783e-05, "loss": 0.0102, "step": 29590 }, { "grad_norm": 0.23354022204875946, "learning_rate": 7.452297638669169e-05, "loss": 0.0119, "step": 29600 }, { "grad_norm": 0.20726051926612854, "learning_rate": 7.450496254989911e-05, "loss": 0.0127, "step": 29610 }, { "grad_norm": 0.16241280734539032, "learning_rate": 7.448694452587866e-05, "loss": 0.0104, "step": 29620 }, { "grad_norm": 0.14391082525253296, "learning_rate": 7.44689223177091e-05, "loss": 0.0097, "step": 29630 }, { "grad_norm": 0.20851676166057587, "learning_rate": 7.445089592846994e-05, "loss": 0.0096, "step": 29640 }, { "grad_norm": 0.10915077477693558, "learning_rate": 7.443286536124141e-05, "loss": 0.0094, "step": 29650 }, { "grad_norm": 0.17798717319965363, "learning_rate": 7.441483061910443e-05, "loss": 0.0104, "step": 29660 }, { "grad_norm": 0.1333981454372406, "learning_rate": 7.439679170514064e-05, "loss": 0.0095, "step": 29670 }, { "grad_norm": 0.17937549948692322, "learning_rate": 7.43787486224324e-05, "loss": 0.0097, "step": 29680 }, { "grad_norm": 0.17742572724819183, "learning_rate": 7.436070137406276e-05, "loss": 0.0114, "step": 29690 }, { "grad_norm": 0.15795686841011047, "learning_rate": 7.434264996311556e-05, "loss": 0.0108, "step": 29700 }, { "grad_norm": 0.12141293287277222, "learning_rate": 7.432459439267525e-05, "loss": 0.0106, "step": 29710 }, { "grad_norm": 0.12263722717761993, "learning_rate": 7.430653466582701e-05, "loss": 0.009, "step": 29720 }, { "grad_norm": 0.1197306215763092, "learning_rate": 7.42884707856568e-05, "loss": 0.0137, "step": 29730 }, { "grad_norm": 0.09462056308984756, "learning_rate": 7.427040275525122e-05, "loss": 0.0084, "step": 29740 }, { "grad_norm": 0.11936641484498978, "learning_rate": 7.42523305776976e-05, "loss": 0.0104, "step": 29750 }, { "grad_norm": 0.10043560713529587, "learning_rate": 7.4234254256084e-05, "loss": 0.0099, "step": 29760 }, { "grad_norm": 0.14798256754875183, "learning_rate": 7.421617379349915e-05, "loss": 0.0124, "step": 29770 }, { "grad_norm": 0.17654910683631897, "learning_rate": 7.41980891930325e-05, "loss": 0.0108, "step": 29780 }, { "grad_norm": 0.14072108268737793, "learning_rate": 7.418000045777425e-05, "loss": 0.0104, "step": 29790 }, { "grad_norm": 0.16243483126163483, "learning_rate": 7.416190759081523e-05, "loss": 0.0132, "step": 29800 }, { "grad_norm": 0.15597596764564514, "learning_rate": 7.414381059524704e-05, "loss": 0.0089, "step": 29810 }, { "grad_norm": 0.08484450727701187, "learning_rate": 7.412570947416195e-05, "loss": 0.0092, "step": 29820 }, { "grad_norm": 0.12919120490550995, "learning_rate": 7.410760423065295e-05, "loss": 0.0108, "step": 29830 }, { "grad_norm": 0.150302916765213, "learning_rate": 7.408949486781372e-05, "loss": 0.0102, "step": 29840 }, { "grad_norm": 0.1607242077589035, "learning_rate": 7.407138138873868e-05, "loss": 0.0118, "step": 29850 }, { "grad_norm": 0.15913811326026917, "learning_rate": 7.405326379652292e-05, "loss": 0.0118, "step": 29860 }, { "grad_norm": 0.1448216289281845, "learning_rate": 7.403514209426222e-05, "loss": 0.0114, "step": 29870 }, { "grad_norm": 0.13893873989582062, "learning_rate": 7.40170162850531e-05, "loss": 0.0122, "step": 29880 }, { "grad_norm": 0.1340334415435791, "learning_rate": 7.399888637199278e-05, "loss": 0.0125, "step": 29890 }, { "grad_norm": 0.18026848137378693, "learning_rate": 7.398075235817914e-05, "loss": 0.0105, "step": 29900 }, { "grad_norm": 0.1927584707736969, "learning_rate": 7.39626142467108e-05, "loss": 0.0126, "step": 29910 }, { "grad_norm": 0.10687064379453659, "learning_rate": 7.394447204068706e-05, "loss": 0.0097, "step": 29920 }, { "grad_norm": 0.148342102766037, "learning_rate": 7.392632574320793e-05, "loss": 0.0121, "step": 29930 }, { "grad_norm": 0.09489224851131439, "learning_rate": 7.390817535737411e-05, "loss": 0.0115, "step": 29940 }, { "grad_norm": 0.1680915653705597, "learning_rate": 7.389002088628703e-05, "loss": 0.011, "step": 29950 }, { "grad_norm": 0.11902885138988495, "learning_rate": 7.387186233304877e-05, "loss": 0.0088, "step": 29960 }, { "grad_norm": 0.14552396535873413, "learning_rate": 7.385369970076212e-05, "loss": 0.0082, "step": 29970 }, { "grad_norm": 0.11045894771814346, "learning_rate": 7.38355329925306e-05, "loss": 0.0077, "step": 29980 }, { "grad_norm": 0.10314492881298065, "learning_rate": 7.381736221145838e-05, "loss": 0.0089, "step": 29990 }, { "grad_norm": 0.1345749795436859, "learning_rate": 7.37991873606504e-05, "loss": 0.0086, "step": 30000 }, { "grad_norm": 0.15684927999973297, "learning_rate": 7.378100844321218e-05, "loss": 0.0091, "step": 30010 }, { "grad_norm": 0.17072844505310059, "learning_rate": 7.376282546225004e-05, "loss": 0.0141, "step": 30020 }, { "grad_norm": 0.2364114373922348, "learning_rate": 7.374463842087094e-05, "loss": 0.0105, "step": 30030 }, { "grad_norm": 0.09063352644443512, "learning_rate": 7.372644732218254e-05, "loss": 0.0091, "step": 30040 }, { "grad_norm": 0.09634294360876083, "learning_rate": 7.370825216929322e-05, "loss": 0.0117, "step": 30050 }, { "grad_norm": 0.16983504593372345, "learning_rate": 7.369005296531205e-05, "loss": 0.0117, "step": 30060 }, { "grad_norm": 0.13330459594726562, "learning_rate": 7.367184971334873e-05, "loss": 0.0097, "step": 30070 }, { "grad_norm": 0.1261105239391327, "learning_rate": 7.365364241651371e-05, "loss": 0.0094, "step": 30080 }, { "grad_norm": 0.11977528780698776, "learning_rate": 7.363543107791815e-05, "loss": 0.0108, "step": 30090 }, { "grad_norm": 0.14330291748046875, "learning_rate": 7.361721570067384e-05, "loss": 0.0101, "step": 30100 }, { "grad_norm": 0.13177737593650818, "learning_rate": 7.359899628789331e-05, "loss": 0.011, "step": 30110 }, { "grad_norm": 0.10941732674837112, "learning_rate": 7.358077284268974e-05, "loss": 0.0101, "step": 30120 }, { "grad_norm": 0.16177192330360413, "learning_rate": 7.356254536817702e-05, "loss": 0.01, "step": 30130 }, { "grad_norm": 0.18721291422843933, "learning_rate": 7.354431386746973e-05, "loss": 0.0135, "step": 30140 }, { "grad_norm": 0.1608598381280899, "learning_rate": 7.352607834368316e-05, "loss": 0.0105, "step": 30150 }, { "grad_norm": 0.1875276118516922, "learning_rate": 7.350783879993324e-05, "loss": 0.0114, "step": 30160 }, { "grad_norm": 0.13938181102275848, "learning_rate": 7.348959523933658e-05, "loss": 0.012, "step": 30170 }, { "grad_norm": 0.13338413834571838, "learning_rate": 7.347134766501057e-05, "loss": 0.0102, "step": 30180 }, { "grad_norm": 0.12127159535884857, "learning_rate": 7.345309608007315e-05, "loss": 0.0097, "step": 30190 }, { "grad_norm": 0.13119620084762573, "learning_rate": 7.343484048764308e-05, "loss": 0.0116, "step": 30200 }, { "grad_norm": 0.16025806963443756, "learning_rate": 7.341658089083972e-05, "loss": 0.011, "step": 30210 }, { "grad_norm": 0.13567593693733215, "learning_rate": 7.339831729278313e-05, "loss": 0.012, "step": 30220 }, { "grad_norm": 0.1258596032857895, "learning_rate": 7.338004969659404e-05, "loss": 0.0088, "step": 30230 }, { "grad_norm": 0.1338682621717453, "learning_rate": 7.336177810539391e-05, "loss": 0.0095, "step": 30240 }, { "grad_norm": 0.14304538071155548, "learning_rate": 7.334350252230485e-05, "loss": 0.0129, "step": 30250 }, { "grad_norm": 0.09396971762180328, "learning_rate": 7.332522295044965e-05, "loss": 0.0096, "step": 30260 }, { "grad_norm": 0.20178687572479248, "learning_rate": 7.33069393929518e-05, "loss": 0.0111, "step": 30270 }, { "grad_norm": 0.1523573398590088, "learning_rate": 7.328865185293545e-05, "loss": 0.0137, "step": 30280 }, { "grad_norm": 0.15315471589565277, "learning_rate": 7.327036033352546e-05, "loss": 0.0108, "step": 30290 }, { "grad_norm": 0.16182653605937958, "learning_rate": 7.325206483784733e-05, "loss": 0.0104, "step": 30300 }, { "grad_norm": 0.1136481910943985, "learning_rate": 7.323376536902724e-05, "loss": 0.0097, "step": 30310 }, { "grad_norm": 0.16965001821517944, "learning_rate": 7.321546193019213e-05, "loss": 0.0116, "step": 30320 }, { "grad_norm": 0.12274277955293655, "learning_rate": 7.31971545244695e-05, "loss": 0.0111, "step": 30330 }, { "grad_norm": 0.15249179303646088, "learning_rate": 7.31788431549876e-05, "loss": 0.013, "step": 30340 }, { "grad_norm": 0.19256183505058289, "learning_rate": 7.316052782487534e-05, "loss": 0.0105, "step": 30350 }, { "grad_norm": 0.19577650725841522, "learning_rate": 7.314220853726234e-05, "loss": 0.011, "step": 30360 }, { "grad_norm": 0.14583022892475128, "learning_rate": 7.312388529527884e-05, "loss": 0.0109, "step": 30370 }, { "grad_norm": 0.15608802437782288, "learning_rate": 7.310555810205577e-05, "loss": 0.0094, "step": 30380 }, { "grad_norm": 0.14240287244319916, "learning_rate": 7.308722696072476e-05, "loss": 0.0104, "step": 30390 }, { "grad_norm": 0.10842235386371613, "learning_rate": 7.306889187441811e-05, "loss": 0.0088, "step": 30400 }, { "grad_norm": 0.13293708860874176, "learning_rate": 7.305055284626876e-05, "loss": 0.0105, "step": 30410 }, { "grad_norm": 0.12480154633522034, "learning_rate": 7.303220987941037e-05, "loss": 0.0104, "step": 30420 }, { "grad_norm": 0.13939298689365387, "learning_rate": 7.301386297697726e-05, "loss": 0.007, "step": 30430 }, { "grad_norm": 0.11075839400291443, "learning_rate": 7.299551214210438e-05, "loss": 0.0089, "step": 30440 }, { "grad_norm": 0.1883809119462967, "learning_rate": 7.297715737792738e-05, "loss": 0.0103, "step": 30450 }, { "grad_norm": 0.10873688757419586, "learning_rate": 7.295879868758265e-05, "loss": 0.0094, "step": 30460 }, { "grad_norm": 0.20651088654994965, "learning_rate": 7.294043607420713e-05, "loss": 0.0103, "step": 30470 }, { "grad_norm": 0.14481258392333984, "learning_rate": 7.292206954093852e-05, "loss": 0.0109, "step": 30480 }, { "grad_norm": 0.14188355207443237, "learning_rate": 7.290369909091515e-05, "loss": 0.011, "step": 30490 }, { "grad_norm": 0.17666257917881012, "learning_rate": 7.2885324727276e-05, "loss": 0.0113, "step": 30500 }, { "grad_norm": 0.12329865247011185, "learning_rate": 7.286694645316076e-05, "loss": 0.0097, "step": 30510 }, { "grad_norm": 0.11977660655975342, "learning_rate": 7.284856427170982e-05, "loss": 0.0102, "step": 30520 }, { "grad_norm": 0.16234177350997925, "learning_rate": 7.283017818606414e-05, "loss": 0.0119, "step": 30530 }, { "grad_norm": 0.13226263225078583, "learning_rate": 7.28117881993654e-05, "loss": 0.0096, "step": 30540 }, { "grad_norm": 0.1364573836326599, "learning_rate": 7.279339431475598e-05, "loss": 0.0097, "step": 30550 }, { "grad_norm": 0.12469307333230972, "learning_rate": 7.277499653537887e-05, "loss": 0.0114, "step": 30560 }, { "grad_norm": 0.13000768423080444, "learning_rate": 7.275659486437776e-05, "loss": 0.0137, "step": 30570 }, { "grad_norm": 0.1277301013469696, "learning_rate": 7.273818930489695e-05, "loss": 0.0118, "step": 30580 }, { "grad_norm": 0.12636277079582214, "learning_rate": 7.271977986008151e-05, "loss": 0.0083, "step": 30590 }, { "grad_norm": 0.17332500219345093, "learning_rate": 7.270136653307705e-05, "loss": 0.0089, "step": 30600 }, { "grad_norm": 0.11866318434476852, "learning_rate": 7.268294932702994e-05, "loss": 0.0107, "step": 30610 }, { "grad_norm": 0.15035942196846008, "learning_rate": 7.266452824508719e-05, "loss": 0.008, "step": 30620 }, { "grad_norm": 0.14765508472919464, "learning_rate": 7.264610329039643e-05, "loss": 0.0112, "step": 30630 }, { "grad_norm": 0.1356213241815567, "learning_rate": 7.262767446610599e-05, "loss": 0.0101, "step": 30640 }, { "grad_norm": 0.11133482307195663, "learning_rate": 7.260924177536485e-05, "loss": 0.009, "step": 30650 }, { "grad_norm": 0.12500841915607452, "learning_rate": 7.259080522132265e-05, "loss": 0.0114, "step": 30660 }, { "grad_norm": 0.10367019474506378, "learning_rate": 7.257236480712972e-05, "loss": 0.0092, "step": 30670 }, { "grad_norm": 0.1318652331829071, "learning_rate": 7.255392053593697e-05, "loss": 0.0086, "step": 30680 }, { "grad_norm": 0.11948911100625992, "learning_rate": 7.253547241089607e-05, "loss": 0.0098, "step": 30690 }, { "grad_norm": 0.11644463986158371, "learning_rate": 7.251702043515927e-05, "loss": 0.0095, "step": 30700 }, { "grad_norm": 0.12396766245365143, "learning_rate": 7.249856461187952e-05, "loss": 0.011, "step": 30710 }, { "grad_norm": 0.09770944714546204, "learning_rate": 7.248010494421042e-05, "loss": 0.0096, "step": 30720 }, { "grad_norm": 0.135340616106987, "learning_rate": 7.246164143530622e-05, "loss": 0.0115, "step": 30730 }, { "grad_norm": 0.13007225096225739, "learning_rate": 7.244317408832181e-05, "loss": 0.0117, "step": 30740 }, { "grad_norm": 0.09310446679592133, "learning_rate": 7.242470290641279e-05, "loss": 0.0083, "step": 30750 }, { "grad_norm": 0.14083300530910492, "learning_rate": 7.240622789273536e-05, "loss": 0.0093, "step": 30760 }, { "grad_norm": 0.15216822922229767, "learning_rate": 7.238774905044638e-05, "loss": 0.0097, "step": 30770 }, { "grad_norm": 0.11801524460315704, "learning_rate": 7.236926638270341e-05, "loss": 0.0107, "step": 30780 }, { "grad_norm": 0.14884330332279205, "learning_rate": 7.23507798926646e-05, "loss": 0.0076, "step": 30790 }, { "grad_norm": 0.1309380680322647, "learning_rate": 7.23322895834888e-05, "loss": 0.0113, "step": 30800 }, { "grad_norm": 0.11074725538492203, "learning_rate": 7.231379545833552e-05, "loss": 0.0113, "step": 30810 }, { "grad_norm": 0.11864267289638519, "learning_rate": 7.229529752036487e-05, "loss": 0.0111, "step": 30820 }, { "grad_norm": 0.12410001456737518, "learning_rate": 7.227679577273765e-05, "loss": 0.0107, "step": 30830 }, { "grad_norm": 0.15775048732757568, "learning_rate": 7.225829021861529e-05, "loss": 0.0081, "step": 30840 }, { "grad_norm": 0.12327109277248383, "learning_rate": 7.223978086115992e-05, "loss": 0.0091, "step": 30850 }, { "grad_norm": 0.1559303104877472, "learning_rate": 7.222126770353425e-05, "loss": 0.0103, "step": 30860 }, { "grad_norm": 0.1321970522403717, "learning_rate": 7.22027507489017e-05, "loss": 0.0122, "step": 30870 }, { "grad_norm": 0.1587793081998825, "learning_rate": 7.218423000042627e-05, "loss": 0.0114, "step": 30880 }, { "grad_norm": 0.18119646608829498, "learning_rate": 7.216570546127268e-05, "loss": 0.0087, "step": 30890 }, { "grad_norm": 0.11581017822027206, "learning_rate": 7.214717713460626e-05, "loss": 0.0098, "step": 30900 }, { "grad_norm": 0.1913793683052063, "learning_rate": 7.2128645023593e-05, "loss": 0.009, "step": 30910 }, { "grad_norm": 0.14236438274383545, "learning_rate": 7.211010913139951e-05, "loss": 0.0097, "step": 30920 }, { "grad_norm": 0.13007661700248718, "learning_rate": 7.209156946119308e-05, "loss": 0.0091, "step": 30930 }, { "grad_norm": 0.09904663264751434, "learning_rate": 7.207302601614166e-05, "loss": 0.0088, "step": 30940 }, { "grad_norm": 0.11476331204175949, "learning_rate": 7.205447879941378e-05, "loss": 0.0087, "step": 30950 }, { "grad_norm": 0.14568214118480682, "learning_rate": 7.203592781417866e-05, "loss": 0.0103, "step": 30960 }, { "grad_norm": 0.09556527435779572, "learning_rate": 7.201737306360617e-05, "loss": 0.0095, "step": 30970 }, { "grad_norm": 0.09090495109558105, "learning_rate": 7.19988145508668e-05, "loss": 0.0087, "step": 30980 }, { "grad_norm": 0.11506107449531555, "learning_rate": 7.198025227913168e-05, "loss": 0.0124, "step": 30990 }, { "grad_norm": 0.13039694726467133, "learning_rate": 7.196168625157261e-05, "loss": 0.0091, "step": 31000 }, { "grad_norm": 0.11916035413742065, "learning_rate": 7.194311647136201e-05, "loss": 0.0102, "step": 31010 }, { "grad_norm": 0.12369786947965622, "learning_rate": 7.192454294167297e-05, "loss": 0.0104, "step": 31020 }, { "grad_norm": 0.10290399938821793, "learning_rate": 7.190596566567917e-05, "loss": 0.0086, "step": 31030 }, { "grad_norm": 0.15209531784057617, "learning_rate": 7.188738464655496e-05, "loss": 0.0116, "step": 31040 }, { "grad_norm": 0.13792213797569275, "learning_rate": 7.186879988747533e-05, "loss": 0.0099, "step": 31050 }, { "grad_norm": 0.1365036815404892, "learning_rate": 7.185021139161592e-05, "loss": 0.0092, "step": 31060 }, { "grad_norm": 0.11079082638025284, "learning_rate": 7.1831619162153e-05, "loss": 0.0099, "step": 31070 }, { "grad_norm": 0.12677699327468872, "learning_rate": 7.181302320226345e-05, "loss": 0.0107, "step": 31080 }, { "grad_norm": 0.1314263641834259, "learning_rate": 7.179442351512482e-05, "loss": 0.0084, "step": 31090 }, { "grad_norm": 0.1103416159749031, "learning_rate": 7.177582010391528e-05, "loss": 0.0106, "step": 31100 }, { "grad_norm": 0.1434856802225113, "learning_rate": 7.175721297181366e-05, "loss": 0.0098, "step": 31110 }, { "grad_norm": 0.14812637865543365, "learning_rate": 7.173860212199942e-05, "loss": 0.01, "step": 31120 }, { "grad_norm": 0.13842830061912537, "learning_rate": 7.171998755765263e-05, "loss": 0.0092, "step": 31130 }, { "grad_norm": 0.11111688613891602, "learning_rate": 7.170136928195398e-05, "loss": 0.0089, "step": 31140 }, { "grad_norm": 0.17501969635486603, "learning_rate": 7.168274729808489e-05, "loss": 0.0101, "step": 31150 }, { "grad_norm": 0.1353817880153656, "learning_rate": 7.166412160922728e-05, "loss": 0.01, "step": 31160 }, { "grad_norm": 0.12681007385253906, "learning_rate": 7.164549221856382e-05, "loss": 0.0084, "step": 31170 }, { "grad_norm": 0.14657339453697205, "learning_rate": 7.162685912927775e-05, "loss": 0.0086, "step": 31180 }, { "grad_norm": 0.15174855291843414, "learning_rate": 7.160822234455294e-05, "loss": 0.0088, "step": 31190 }, { "grad_norm": 0.1646859496831894, "learning_rate": 7.158958186757391e-05, "loss": 0.009, "step": 31200 }, { "grad_norm": 0.1316072940826416, "learning_rate": 7.157093770152582e-05, "loss": 0.0102, "step": 31210 }, { "grad_norm": 0.12327273190021515, "learning_rate": 7.155228984959446e-05, "loss": 0.0115, "step": 31220 }, { "grad_norm": 0.14189037680625916, "learning_rate": 7.153363831496621e-05, "loss": 0.01, "step": 31230 }, { "grad_norm": 0.1285688430070877, "learning_rate": 7.151498310082811e-05, "loss": 0.0128, "step": 31240 }, { "grad_norm": 0.1250811070203781, "learning_rate": 7.149632421036784e-05, "loss": 0.0086, "step": 31250 }, { "grad_norm": 0.11478279531002045, "learning_rate": 7.147766164677369e-05, "loss": 0.0082, "step": 31260 }, { "grad_norm": 0.10414396226406097, "learning_rate": 7.145899541323459e-05, "loss": 0.0093, "step": 31270 }, { "grad_norm": 0.12608906626701355, "learning_rate": 7.144032551294007e-05, "loss": 0.012, "step": 31280 }, { "grad_norm": 0.1452144980430603, "learning_rate": 7.14216519490803e-05, "loss": 0.0117, "step": 31290 }, { "grad_norm": 0.16755232214927673, "learning_rate": 7.140297472484609e-05, "loss": 0.012, "step": 31300 }, { "grad_norm": 0.13342340290546417, "learning_rate": 7.138429384342891e-05, "loss": 0.0087, "step": 31310 }, { "grad_norm": 0.20060290396213531, "learning_rate": 7.136560930802074e-05, "loss": 0.0088, "step": 31320 }, { "grad_norm": 0.14280526340007782, "learning_rate": 7.134692112181431e-05, "loss": 0.01, "step": 31330 }, { "grad_norm": 0.1784006804227829, "learning_rate": 7.13282292880029e-05, "loss": 0.0097, "step": 31340 }, { "grad_norm": 0.13913024961948395, "learning_rate": 7.130953380978043e-05, "loss": 0.01, "step": 31350 }, { "grad_norm": 0.16020436584949493, "learning_rate": 7.129083469034144e-05, "loss": 0.0111, "step": 31360 }, { "grad_norm": 0.11619344353675842, "learning_rate": 7.127213193288112e-05, "loss": 0.0101, "step": 31370 }, { "grad_norm": 0.12339527904987335, "learning_rate": 7.125342554059522e-05, "loss": 0.0101, "step": 31380 }, { "grad_norm": 0.1494312733411789, "learning_rate": 7.12347155166802e-05, "loss": 0.0135, "step": 31390 }, { "grad_norm": 0.16828912496566772, "learning_rate": 7.121600186433306e-05, "loss": 0.0101, "step": 31400 }, { "grad_norm": 0.14322903752326965, "learning_rate": 7.119728458675148e-05, "loss": 0.0111, "step": 31410 }, { "grad_norm": 0.17442010343074799, "learning_rate": 7.117856368713369e-05, "loss": 0.0126, "step": 31420 }, { "grad_norm": 0.1187237948179245, "learning_rate": 7.115983916867861e-05, "loss": 0.0095, "step": 31430 }, { "grad_norm": 0.14004643261432648, "learning_rate": 7.114111103458574e-05, "loss": 0.0092, "step": 31440 }, { "grad_norm": 0.17373088002204895, "learning_rate": 7.11223792880552e-05, "loss": 0.0088, "step": 31450 }, { "grad_norm": 0.1348227709531784, "learning_rate": 7.110364393228773e-05, "loss": 0.0109, "step": 31460 }, { "grad_norm": 0.10340471565723419, "learning_rate": 7.108490497048471e-05, "loss": 0.0098, "step": 31470 }, { "grad_norm": 0.10618460178375244, "learning_rate": 7.10661624058481e-05, "loss": 0.009, "step": 31480 }, { "grad_norm": 0.11991949379444122, "learning_rate": 7.10474162415805e-05, "loss": 0.0102, "step": 31490 }, { "grad_norm": 0.11724882572889328, "learning_rate": 7.102866648088511e-05, "loss": 0.0103, "step": 31500 }, { "grad_norm": 0.1302129179239273, "learning_rate": 7.100991312696576e-05, "loss": 0.0081, "step": 31510 }, { "grad_norm": 0.1296393722295761, "learning_rate": 7.099115618302686e-05, "loss": 0.0087, "step": 31520 }, { "grad_norm": 0.12285465002059937, "learning_rate": 7.097239565227349e-05, "loss": 0.0099, "step": 31530 }, { "grad_norm": 0.12966233491897583, "learning_rate": 7.09536315379113e-05, "loss": 0.0114, "step": 31540 }, { "grad_norm": 0.1579844057559967, "learning_rate": 7.093486384314656e-05, "loss": 0.0128, "step": 31550 }, { "grad_norm": 0.12271200120449066, "learning_rate": 7.091609257118616e-05, "loss": 0.0085, "step": 31560 }, { "grad_norm": 0.14593376219272614, "learning_rate": 7.08973177252376e-05, "loss": 0.0081, "step": 31570 }, { "grad_norm": 0.1230243593454361, "learning_rate": 7.087853930850898e-05, "loss": 0.0072, "step": 31580 }, { "grad_norm": 0.11291567236185074, "learning_rate": 7.085975732420903e-05, "loss": 0.0089, "step": 31590 }, { "grad_norm": 0.12133414298295975, "learning_rate": 7.084097177554706e-05, "loss": 0.0096, "step": 31600 }, { "grad_norm": 0.10515662282705307, "learning_rate": 7.082218266573301e-05, "loss": 0.0082, "step": 31610 }, { "grad_norm": 0.14068801701068878, "learning_rate": 7.080338999797743e-05, "loss": 0.0122, "step": 31620 }, { "grad_norm": 0.14102964103221893, "learning_rate": 7.07845937754915e-05, "loss": 0.0072, "step": 31630 }, { "grad_norm": 0.13925258815288544, "learning_rate": 7.076579400148693e-05, "loss": 0.0107, "step": 31640 }, { "grad_norm": 0.11207640171051025, "learning_rate": 7.074699067917611e-05, "loss": 0.0098, "step": 31650 }, { "grad_norm": 0.1829618364572525, "learning_rate": 7.072818381177201e-05, "loss": 0.0107, "step": 31660 }, { "grad_norm": 0.2521744668483734, "learning_rate": 7.070937340248823e-05, "loss": 0.0115, "step": 31670 }, { "grad_norm": 0.1164323166012764, "learning_rate": 7.069055945453893e-05, "loss": 0.0104, "step": 31680 }, { "grad_norm": 0.10183431208133698, "learning_rate": 7.067174197113892e-05, "loss": 0.0081, "step": 31690 }, { "grad_norm": 0.10791915655136108, "learning_rate": 7.065292095550355e-05, "loss": 0.0091, "step": 31700 }, { "grad_norm": 0.17097540199756622, "learning_rate": 7.063409641084887e-05, "loss": 0.0088, "step": 31710 }, { "grad_norm": 0.15634894371032715, "learning_rate": 7.061526834039145e-05, "loss": 0.0084, "step": 31720 }, { "grad_norm": 0.16770310699939728, "learning_rate": 7.05964367473485e-05, "loss": 0.0088, "step": 31730 }, { "grad_norm": 0.13554570078849792, "learning_rate": 7.057760163493783e-05, "loss": 0.01, "step": 31740 }, { "grad_norm": 0.1260569542646408, "learning_rate": 7.055876300637783e-05, "loss": 0.0103, "step": 31750 }, { "grad_norm": 0.12255702912807465, "learning_rate": 7.053992086488753e-05, "loss": 0.0095, "step": 31760 }, { "grad_norm": 0.19823352992534637, "learning_rate": 7.052107521368651e-05, "loss": 0.0114, "step": 31770 }, { "grad_norm": 0.13452064990997314, "learning_rate": 7.0502226055995e-05, "loss": 0.0091, "step": 31780 }, { "grad_norm": 0.1440981775522232, "learning_rate": 7.048337339503379e-05, "loss": 0.0101, "step": 31790 }, { "grad_norm": 0.17980556190013885, "learning_rate": 7.046451723402427e-05, "loss": 0.0107, "step": 31800 }, { "grad_norm": 0.14810070395469666, "learning_rate": 7.044565757618848e-05, "loss": 0.0111, "step": 31810 }, { "grad_norm": 0.11800433695316315, "learning_rate": 7.042679442474899e-05, "loss": 0.0089, "step": 31820 }, { "grad_norm": 0.12736721336841583, "learning_rate": 7.040792778292902e-05, "loss": 0.0103, "step": 31830 }, { "grad_norm": 0.13802441954612732, "learning_rate": 7.038905765395234e-05, "loss": 0.0093, "step": 31840 }, { "grad_norm": 0.16027435660362244, "learning_rate": 7.037018404104334e-05, "loss": 0.0098, "step": 31850 }, { "grad_norm": 0.1629565805196762, "learning_rate": 7.035130694742702e-05, "loss": 0.0096, "step": 31860 }, { "grad_norm": 0.12588059902191162, "learning_rate": 7.033242637632897e-05, "loss": 0.0105, "step": 31870 }, { "grad_norm": 0.10932037979364395, "learning_rate": 7.031354233097534e-05, "loss": 0.009, "step": 31880 }, { "grad_norm": 0.08521369844675064, "learning_rate": 7.029465481459289e-05, "loss": 0.0075, "step": 31890 }, { "grad_norm": 0.1495332419872284, "learning_rate": 7.027576383040898e-05, "loss": 0.0093, "step": 31900 }, { "grad_norm": 0.12030252069234848, "learning_rate": 7.025686938165159e-05, "loss": 0.0095, "step": 31910 }, { "grad_norm": 0.1256854087114334, "learning_rate": 7.023797147154924e-05, "loss": 0.009, "step": 31920 }, { "grad_norm": 0.14612631499767303, "learning_rate": 7.021907010333111e-05, "loss": 0.0088, "step": 31930 }, { "grad_norm": 0.16600313782691956, "learning_rate": 7.020016528022685e-05, "loss": 0.0119, "step": 31940 }, { "grad_norm": 0.1878722459077835, "learning_rate": 7.018125700546683e-05, "loss": 0.01, "step": 31950 }, { "grad_norm": 0.13909193873405457, "learning_rate": 7.016234528228196e-05, "loss": 0.0132, "step": 31960 }, { "grad_norm": 0.11231142282485962, "learning_rate": 7.014343011390372e-05, "loss": 0.0088, "step": 31970 }, { "grad_norm": 0.11788595467805862, "learning_rate": 7.01245115035642e-05, "loss": 0.0116, "step": 31980 }, { "grad_norm": 0.11374450474977493, "learning_rate": 7.010558945449606e-05, "loss": 0.0081, "step": 31990 }, { "grad_norm": 0.12928785383701324, "learning_rate": 7.008666396993258e-05, "loss": 0.0091, "step": 32000 }, { "grad_norm": 0.1123930811882019, "learning_rate": 7.006773505310759e-05, "loss": 0.011, "step": 32010 }, { "grad_norm": 0.12272828072309494, "learning_rate": 7.004880270725553e-05, "loss": 0.0098, "step": 32020 }, { "grad_norm": 0.1612859070301056, "learning_rate": 7.002986693561144e-05, "loss": 0.0094, "step": 32030 }, { "grad_norm": 0.10527011752128601, "learning_rate": 7.001092774141089e-05, "loss": 0.008, "step": 32040 }, { "grad_norm": 0.19817578792572021, "learning_rate": 6.999198512789009e-05, "loss": 0.008, "step": 32050 }, { "grad_norm": 0.18355029821395874, "learning_rate": 6.997303909828584e-05, "loss": 0.0097, "step": 32060 }, { "grad_norm": 0.12140623480081558, "learning_rate": 6.995408965583544e-05, "loss": 0.008, "step": 32070 }, { "grad_norm": 0.17812158167362213, "learning_rate": 6.993513680377688e-05, "loss": 0.0098, "step": 32080 }, { "grad_norm": 0.10484964400529861, "learning_rate": 6.991618054534868e-05, "loss": 0.0116, "step": 32090 }, { "grad_norm": 0.12737853825092316, "learning_rate": 6.989722088378991e-05, "loss": 0.0085, "step": 32100 }, { "grad_norm": 0.10785286128520966, "learning_rate": 6.987825782234027e-05, "loss": 0.0085, "step": 32110 }, { "grad_norm": 0.13895563781261444, "learning_rate": 6.985929136424006e-05, "loss": 0.0096, "step": 32120 }, { "grad_norm": 0.12956559658050537, "learning_rate": 6.984032151273012e-05, "loss": 0.0089, "step": 32130 }, { "grad_norm": 0.12025585025548935, "learning_rate": 6.982134827105186e-05, "loss": 0.0099, "step": 32140 }, { "grad_norm": 0.11118576675653458, "learning_rate": 6.980237164244729e-05, "loss": 0.011, "step": 32150 }, { "grad_norm": 0.16343359649181366, "learning_rate": 6.9783391630159e-05, "loss": 0.0083, "step": 32160 }, { "grad_norm": 0.1902928501367569, "learning_rate": 6.976440823743015e-05, "loss": 0.0091, "step": 32170 }, { "grad_norm": 0.14156275987625122, "learning_rate": 6.974542146750451e-05, "loss": 0.0098, "step": 32180 }, { "grad_norm": 0.11809518933296204, "learning_rate": 6.972643132362637e-05, "loss": 0.0086, "step": 32190 }, { "grad_norm": 0.11764025688171387, "learning_rate": 6.970743780904064e-05, "loss": 0.0081, "step": 32200 }, { "grad_norm": 0.17119915783405304, "learning_rate": 6.968844092699277e-05, "loss": 0.0135, "step": 32210 }, { "grad_norm": 0.14194715023040771, "learning_rate": 6.966944068072883e-05, "loss": 0.01, "step": 32220 }, { "grad_norm": 0.14764612913131714, "learning_rate": 6.965043707349545e-05, "loss": 0.0092, "step": 32230 }, { "grad_norm": 0.14881297945976257, "learning_rate": 6.963143010853982e-05, "loss": 0.0101, "step": 32240 }, { "grad_norm": 0.11893700808286667, "learning_rate": 6.961241978910971e-05, "loss": 0.0074, "step": 32250 }, { "grad_norm": 0.11109622567892075, "learning_rate": 6.959340611845344e-05, "loss": 0.0117, "step": 32260 }, { "grad_norm": 0.136417418718338, "learning_rate": 6.957438909981995e-05, "loss": 0.0088, "step": 32270 }, { "grad_norm": 0.166233092546463, "learning_rate": 6.955536873645872e-05, "loss": 0.0104, "step": 32280 }, { "grad_norm": 0.13255465030670166, "learning_rate": 6.953634503161982e-05, "loss": 0.0096, "step": 32290 }, { "grad_norm": 0.10777699947357178, "learning_rate": 6.951731798855387e-05, "loss": 0.0104, "step": 32300 }, { "grad_norm": 0.15165108442306519, "learning_rate": 6.949828761051208e-05, "loss": 0.0093, "step": 32310 }, { "grad_norm": 0.12306366115808487, "learning_rate": 6.947925390074622e-05, "loss": 0.0076, "step": 32320 }, { "grad_norm": 0.09412909299135208, "learning_rate": 6.946021686250863e-05, "loss": 0.0085, "step": 32330 }, { "grad_norm": 0.11062394827604294, "learning_rate": 6.94411764990522e-05, "loss": 0.0074, "step": 32340 }, { "grad_norm": 0.10722970217466354, "learning_rate": 6.942213281363044e-05, "loss": 0.0085, "step": 32350 }, { "grad_norm": 0.11399094760417938, "learning_rate": 6.940308580949737e-05, "loss": 0.0081, "step": 32360 }, { "grad_norm": 0.09884066134691238, "learning_rate": 6.93840354899076e-05, "loss": 0.0087, "step": 32370 }, { "grad_norm": 0.1312318593263626, "learning_rate": 6.936498185811633e-05, "loss": 0.0087, "step": 32380 }, { "grad_norm": 0.17373013496398926, "learning_rate": 6.93459249173793e-05, "loss": 0.0114, "step": 32390 }, { "grad_norm": 0.13972733914852142, "learning_rate": 6.932686467095279e-05, "loss": 0.0101, "step": 32400 }, { "grad_norm": 0.13232839107513428, "learning_rate": 6.930780112209373e-05, "loss": 0.0117, "step": 32410 }, { "grad_norm": 0.19192422926425934, "learning_rate": 6.92887342740595e-05, "loss": 0.0095, "step": 32420 }, { "grad_norm": 0.1588999330997467, "learning_rate": 6.926966413010816e-05, "loss": 0.0083, "step": 32430 }, { "grad_norm": 0.13005316257476807, "learning_rate": 6.925059069349824e-05, "loss": 0.0091, "step": 32440 }, { "grad_norm": 0.15808255970478058, "learning_rate": 6.923151396748886e-05, "loss": 0.0109, "step": 32450 }, { "grad_norm": 0.10800293833017349, "learning_rate": 6.921243395533974e-05, "loss": 0.01, "step": 32460 }, { "grad_norm": 0.12606243789196014, "learning_rate": 6.919335066031109e-05, "loss": 0.0105, "step": 32470 }, { "grad_norm": 0.11599144339561462, "learning_rate": 6.917426408566379e-05, "loss": 0.0102, "step": 32480 }, { "grad_norm": 0.10250746458768845, "learning_rate": 6.915517423465916e-05, "loss": 0.0091, "step": 32490 }, { "grad_norm": 0.08382853120565414, "learning_rate": 6.913608111055914e-05, "loss": 0.008, "step": 32500 }, { "grad_norm": 0.14344719052314758, "learning_rate": 6.911698471662623e-05, "loss": 0.0094, "step": 32510 }, { "grad_norm": 0.14701633155345917, "learning_rate": 6.90978850561235e-05, "loss": 0.0099, "step": 32520 }, { "grad_norm": 0.11477953940629959, "learning_rate": 6.907878213231454e-05, "loss": 0.0094, "step": 32530 }, { "grad_norm": 0.1367579698562622, "learning_rate": 6.90596759484635e-05, "loss": 0.0141, "step": 32540 }, { "grad_norm": 0.09951479732990265, "learning_rate": 6.904056650783514e-05, "loss": 0.0083, "step": 32550 }, { "grad_norm": 0.11639630049467087, "learning_rate": 6.902145381369471e-05, "loss": 0.0085, "step": 32560 }, { "grad_norm": 0.10620468854904175, "learning_rate": 6.900233786930808e-05, "loss": 0.0096, "step": 32570 }, { "grad_norm": 0.1294238269329071, "learning_rate": 6.898321867794161e-05, "loss": 0.011, "step": 32580 }, { "grad_norm": 0.11944781988859177, "learning_rate": 6.896409624286226e-05, "loss": 0.0112, "step": 32590 }, { "grad_norm": 0.10675608366727829, "learning_rate": 6.894497056733754e-05, "loss": 0.0128, "step": 32600 }, { "grad_norm": 0.1543404906988144, "learning_rate": 6.89258416546355e-05, "loss": 0.0072, "step": 32610 }, { "grad_norm": 0.1273985058069229, "learning_rate": 6.890670950802474e-05, "loss": 0.007, "step": 32620 }, { "grad_norm": 0.1481880247592926, "learning_rate": 6.88875741307744e-05, "loss": 0.0104, "step": 32630 }, { "grad_norm": 0.12029299885034561, "learning_rate": 6.886843552615425e-05, "loss": 0.0083, "step": 32640 }, { "grad_norm": 0.1583203375339508, "learning_rate": 6.884929369743451e-05, "loss": 0.0072, "step": 32650 }, { "grad_norm": 0.11726346611976624, "learning_rate": 6.8830148647886e-05, "loss": 0.0102, "step": 32660 }, { "grad_norm": 0.13233539462089539, "learning_rate": 6.88110003807801e-05, "loss": 0.0101, "step": 32670 }, { "grad_norm": 0.16485652327537537, "learning_rate": 6.87918488993887e-05, "loss": 0.0107, "step": 32680 }, { "grad_norm": 0.12746897339820862, "learning_rate": 6.877269420698431e-05, "loss": 0.009, "step": 32690 }, { "grad_norm": 0.1344207227230072, "learning_rate": 6.875353630683989e-05, "loss": 0.0095, "step": 32700 }, { "grad_norm": 0.1056370884180069, "learning_rate": 6.873437520222905e-05, "loss": 0.0075, "step": 32710 }, { "grad_norm": 0.10350770503282547, "learning_rate": 6.871521089642585e-05, "loss": 0.0083, "step": 32720 }, { "grad_norm": 0.13268987834453583, "learning_rate": 6.869604339270498e-05, "loss": 0.0112, "step": 32730 }, { "grad_norm": 0.12315332889556885, "learning_rate": 6.867687269434164e-05, "loss": 0.0078, "step": 32740 }, { "grad_norm": 0.10284596681594849, "learning_rate": 6.865769880461156e-05, "loss": 0.0101, "step": 32750 }, { "grad_norm": 0.10611461848020554, "learning_rate": 6.863852172679104e-05, "loss": 0.01, "step": 32760 }, { "grad_norm": 0.13251128792762756, "learning_rate": 6.861934146415693e-05, "loss": 0.0108, "step": 32770 }, { "grad_norm": 0.10541293770074844, "learning_rate": 6.86001580199866e-05, "loss": 0.0096, "step": 32780 }, { "grad_norm": 0.11912401020526886, "learning_rate": 6.858097139755798e-05, "loss": 0.0085, "step": 32790 }, { "grad_norm": 0.09203891456127167, "learning_rate": 6.856178160014955e-05, "loss": 0.0092, "step": 32800 }, { "grad_norm": 0.09692615270614624, "learning_rate": 6.85425886310403e-05, "loss": 0.0104, "step": 32810 }, { "grad_norm": 0.12095348536968231, "learning_rate": 6.852339249350979e-05, "loss": 0.0084, "step": 32820 }, { "grad_norm": 0.13660286366939545, "learning_rate": 6.850419319083812e-05, "loss": 0.0118, "step": 32830 }, { "grad_norm": 0.10156702995300293, "learning_rate": 6.848499072630592e-05, "loss": 0.0084, "step": 32840 }, { "grad_norm": 0.11673501878976822, "learning_rate": 6.846578510319439e-05, "loss": 0.0089, "step": 32850 }, { "grad_norm": 0.15448728203773499, "learning_rate": 6.844657632478519e-05, "loss": 0.0078, "step": 32860 }, { "grad_norm": 0.09277962148189545, "learning_rate": 6.842736439436063e-05, "loss": 0.0082, "step": 32870 }, { "grad_norm": 0.12989728152751923, "learning_rate": 6.84081493152035e-05, "loss": 0.0103, "step": 32880 }, { "grad_norm": 0.1269780695438385, "learning_rate": 6.83889310905971e-05, "loss": 0.0081, "step": 32890 }, { "grad_norm": 0.12359984219074249, "learning_rate": 6.836970972382533e-05, "loss": 0.0093, "step": 32900 }, { "grad_norm": 0.12140539288520813, "learning_rate": 6.835048521817257e-05, "loss": 0.0071, "step": 32910 }, { "grad_norm": 0.22820690274238586, "learning_rate": 6.833125757692379e-05, "loss": 0.0101, "step": 32920 }, { "grad_norm": 0.16751180589199066, "learning_rate": 6.831202680336441e-05, "loss": 0.0093, "step": 32930 }, { "grad_norm": 0.14854635298252106, "learning_rate": 6.829279290078052e-05, "loss": 0.0116, "step": 32940 }, { "grad_norm": 0.1299380362033844, "learning_rate": 6.827355587245863e-05, "loss": 0.0098, "step": 32950 }, { "grad_norm": 0.3053723871707916, "learning_rate": 6.82543157216858e-05, "loss": 0.0081, "step": 32960 }, { "grad_norm": 0.15179745852947235, "learning_rate": 6.823507245174969e-05, "loss": 0.0086, "step": 32970 }, { "grad_norm": 0.1777399778366089, "learning_rate": 6.821582606593841e-05, "loss": 0.0095, "step": 32980 }, { "grad_norm": 0.10114132612943649, "learning_rate": 6.81965765675407e-05, "loss": 0.0099, "step": 32990 }, { "grad_norm": 0.09270965307950974, "learning_rate": 6.81773239598457e-05, "loss": 0.0097, "step": 33000 }, { "grad_norm": 0.09621210396289825, "learning_rate": 6.815806824614319e-05, "loss": 0.0077, "step": 33010 }, { "grad_norm": 0.12187886983156204, "learning_rate": 6.813880942972343e-05, "loss": 0.0095, "step": 33020 }, { "grad_norm": 0.11958006024360657, "learning_rate": 6.811954751387726e-05, "loss": 0.0119, "step": 33030 }, { "grad_norm": 0.0973605215549469, "learning_rate": 6.810028250189598e-05, "loss": 0.0091, "step": 33040 }, { "grad_norm": 0.13649222254753113, "learning_rate": 6.808101439707147e-05, "loss": 0.0097, "step": 33050 }, { "grad_norm": 0.1472952961921692, "learning_rate": 6.806174320269609e-05, "loss": 0.0097, "step": 33060 }, { "grad_norm": 0.1651882529258728, "learning_rate": 6.804246892206281e-05, "loss": 0.009, "step": 33070 }, { "grad_norm": 0.1612326055765152, "learning_rate": 6.802319155846506e-05, "loss": 0.0118, "step": 33080 }, { "grad_norm": 0.13937389850616455, "learning_rate": 6.800391111519679e-05, "loss": 0.0113, "step": 33090 }, { "grad_norm": 0.10995670408010483, "learning_rate": 6.798462759555253e-05, "loss": 0.0103, "step": 33100 }, { "grad_norm": 0.10985682904720306, "learning_rate": 6.79653410028273e-05, "loss": 0.0094, "step": 33110 }, { "grad_norm": 0.10958170890808105, "learning_rate": 6.794605134031663e-05, "loss": 0.0089, "step": 33120 }, { "grad_norm": 0.15350961685180664, "learning_rate": 6.792675861131661e-05, "loss": 0.0087, "step": 33130 }, { "grad_norm": 0.16360872983932495, "learning_rate": 6.790746281912386e-05, "loss": 0.0128, "step": 33140 }, { "grad_norm": 0.11404059827327728, "learning_rate": 6.788816396703546e-05, "loss": 0.0096, "step": 33150 }, { "grad_norm": 0.11204639822244644, "learning_rate": 6.78688620583491e-05, "loss": 0.0097, "step": 33160 }, { "grad_norm": 0.11767775565385818, "learning_rate": 6.784955709636292e-05, "loss": 0.0109, "step": 33170 }, { "grad_norm": 0.14344137907028198, "learning_rate": 6.783024908437564e-05, "loss": 0.0107, "step": 33180 }, { "grad_norm": 0.1372833251953125, "learning_rate": 6.781093802568641e-05, "loss": 0.0084, "step": 33190 }, { "grad_norm": 0.13907228410243988, "learning_rate": 6.779162392359504e-05, "loss": 0.0081, "step": 33200 }, { "grad_norm": 0.16444863379001617, "learning_rate": 6.777230678140172e-05, "loss": 0.0123, "step": 33210 }, { "grad_norm": 0.12111644446849823, "learning_rate": 6.775298660240726e-05, "loss": 0.0088, "step": 33220 }, { "grad_norm": 0.12349922209978104, "learning_rate": 6.773366338991292e-05, "loss": 0.0092, "step": 33230 }, { "grad_norm": 0.1698487251996994, "learning_rate": 6.771433714722052e-05, "loss": 0.009, "step": 33240 }, { "grad_norm": 0.14103375375270844, "learning_rate": 6.769500787763239e-05, "loss": 0.0094, "step": 33250 }, { "grad_norm": 0.14802901446819305, "learning_rate": 6.76756755844514e-05, "loss": 0.0121, "step": 33260 }, { "grad_norm": 0.09868783503770828, "learning_rate": 6.765634027098087e-05, "loss": 0.0111, "step": 33270 }, { "grad_norm": 0.11681918054819107, "learning_rate": 6.763700194052468e-05, "loss": 0.0094, "step": 33280 }, { "grad_norm": 0.11786750704050064, "learning_rate": 6.761766059638723e-05, "loss": 0.0092, "step": 33290 }, { "grad_norm": 0.16025885939598083, "learning_rate": 6.759831624187345e-05, "loss": 0.0081, "step": 33300 }, { "grad_norm": 0.11102936416864395, "learning_rate": 6.757896888028871e-05, "loss": 0.0093, "step": 33310 }, { "grad_norm": 0.1467611938714981, "learning_rate": 6.7559618514939e-05, "loss": 0.0077, "step": 33320 }, { "grad_norm": 0.152387335896492, "learning_rate": 6.754026514913073e-05, "loss": 0.0121, "step": 33330 }, { "grad_norm": 0.18367555737495422, "learning_rate": 6.752090878617087e-05, "loss": 0.01, "step": 33340 }, { "grad_norm": 0.144243985414505, "learning_rate": 6.75015494293669e-05, "loss": 0.008, "step": 33350 }, { "grad_norm": 0.17789395153522491, "learning_rate": 6.74821870820268e-05, "loss": 0.0093, "step": 33360 }, { "grad_norm": 0.1620885133743286, "learning_rate": 6.746282174745907e-05, "loss": 0.0095, "step": 33370 }, { "grad_norm": 0.12029851973056793, "learning_rate": 6.744345342897271e-05, "loss": 0.0094, "step": 33380 }, { "grad_norm": 0.11043208837509155, "learning_rate": 6.742408212987724e-05, "loss": 0.01, "step": 33390 }, { "grad_norm": 0.10461723059415817, "learning_rate": 6.740470785348269e-05, "loss": 0.0089, "step": 33400 }, { "grad_norm": 0.13171786069869995, "learning_rate": 6.738533060309958e-05, "loss": 0.0101, "step": 33410 }, { "grad_norm": 0.15584295988082886, "learning_rate": 6.736595038203894e-05, "loss": 0.0087, "step": 33420 }, { "grad_norm": 0.14642012119293213, "learning_rate": 6.734656719361236e-05, "loss": 0.009, "step": 33430 }, { "grad_norm": 0.13175994157791138, "learning_rate": 6.732718104113189e-05, "loss": 0.0126, "step": 33440 }, { "grad_norm": 0.11988344043493271, "learning_rate": 6.730779192791006e-05, "loss": 0.008, "step": 33450 }, { "grad_norm": 0.11448116600513458, "learning_rate": 6.728839985725997e-05, "loss": 0.0075, "step": 33460 }, { "grad_norm": 0.12501771748065948, "learning_rate": 6.726900483249517e-05, "loss": 0.0096, "step": 33470 }, { "grad_norm": 0.11858642846345901, "learning_rate": 6.724960685692976e-05, "loss": 0.0103, "step": 33480 }, { "grad_norm": 0.12120742350816727, "learning_rate": 6.723020593387833e-05, "loss": 0.0083, "step": 33490 }, { "grad_norm": 0.11101709306240082, "learning_rate": 6.721080206665593e-05, "loss": 0.0074, "step": 33500 }, { "grad_norm": 0.14582477509975433, "learning_rate": 6.719139525857819e-05, "loss": 0.0116, "step": 33510 }, { "grad_norm": 0.12680618464946747, "learning_rate": 6.717198551296117e-05, "loss": 0.0082, "step": 33520 }, { "grad_norm": 0.10237190872430801, "learning_rate": 6.715257283312148e-05, "loss": 0.0117, "step": 33530 }, { "grad_norm": 0.12118330597877502, "learning_rate": 6.713315722237623e-05, "loss": 0.0079, "step": 33540 }, { "grad_norm": 0.11300300061702728, "learning_rate": 6.7113738684043e-05, "loss": 0.0088, "step": 33550 }, { "grad_norm": 0.0875084400177002, "learning_rate": 6.709431722143989e-05, "loss": 0.0089, "step": 33560 }, { "grad_norm": 0.14458708465099335, "learning_rate": 6.70748928378855e-05, "loss": 0.0085, "step": 33570 }, { "grad_norm": 0.14170153439044952, "learning_rate": 6.705546553669891e-05, "loss": 0.0096, "step": 33580 }, { "grad_norm": 0.15600556135177612, "learning_rate": 6.703603532119974e-05, "loss": 0.0091, "step": 33590 }, { "grad_norm": 0.09330105781555176, "learning_rate": 6.701660219470808e-05, "loss": 0.0076, "step": 33600 }, { "grad_norm": 0.12876977026462555, "learning_rate": 6.69971661605445e-05, "loss": 0.0102, "step": 33610 }, { "grad_norm": 0.08753987401723862, "learning_rate": 6.697772722203008e-05, "loss": 0.0086, "step": 33620 }, { "grad_norm": 0.14558926224708557, "learning_rate": 6.695828538248643e-05, "loss": 0.0098, "step": 33630 }, { "grad_norm": 0.1346265971660614, "learning_rate": 6.693884064523563e-05, "loss": 0.009, "step": 33640 }, { "grad_norm": 0.13922812044620514, "learning_rate": 6.691939301360023e-05, "loss": 0.0111, "step": 33650 }, { "grad_norm": 0.13031110167503357, "learning_rate": 6.689994249090333e-05, "loss": 0.0097, "step": 33660 }, { "grad_norm": 0.12636521458625793, "learning_rate": 6.688048908046845e-05, "loss": 0.0091, "step": 33670 }, { "grad_norm": 0.10308531671762466, "learning_rate": 6.686103278561969e-05, "loss": 0.008, "step": 33680 }, { "grad_norm": 0.11344656348228455, "learning_rate": 6.684157360968156e-05, "loss": 0.0105, "step": 33690 }, { "grad_norm": 0.08133026957511902, "learning_rate": 6.682211155597911e-05, "loss": 0.0074, "step": 33700 }, { "grad_norm": 0.15163645148277283, "learning_rate": 6.680264662783789e-05, "loss": 0.0105, "step": 33710 }, { "grad_norm": 0.09380505979061127, "learning_rate": 6.678317882858391e-05, "loss": 0.0087, "step": 33720 }, { "grad_norm": 0.12625141441822052, "learning_rate": 6.67637081615437e-05, "loss": 0.0082, "step": 33730 }, { "grad_norm": 0.11992734670639038, "learning_rate": 6.674423463004427e-05, "loss": 0.0081, "step": 33740 }, { "grad_norm": 0.0911652073264122, "learning_rate": 6.672475823741308e-05, "loss": 0.0083, "step": 33750 }, { "grad_norm": 0.10566894710063934, "learning_rate": 6.670527898697811e-05, "loss": 0.0081, "step": 33760 }, { "grad_norm": 0.1401548832654953, "learning_rate": 6.668579688206788e-05, "loss": 0.0097, "step": 33770 }, { "grad_norm": 0.14099112153053284, "learning_rate": 6.666631192601131e-05, "loss": 0.0092, "step": 33780 }, { "grad_norm": 0.15956445038318634, "learning_rate": 6.664682412213785e-05, "loss": 0.0094, "step": 33790 }, { "grad_norm": 0.14096884429454803, "learning_rate": 6.662733347377745e-05, "loss": 0.0113, "step": 33800 }, { "grad_norm": 0.15844912827014923, "learning_rate": 6.660783998426051e-05, "loss": 0.0096, "step": 33810 }, { "grad_norm": 0.23602735996246338, "learning_rate": 6.658834365691794e-05, "loss": 0.0084, "step": 33820 }, { "grad_norm": 0.14144253730773926, "learning_rate": 6.656884449508115e-05, "loss": 0.0104, "step": 33830 }, { "grad_norm": 0.09938454627990723, "learning_rate": 6.654934250208198e-05, "loss": 0.0077, "step": 33840 }, { "grad_norm": 0.13190288841724396, "learning_rate": 6.65298376812528e-05, "loss": 0.0091, "step": 33850 }, { "grad_norm": 0.08599203079938889, "learning_rate": 6.651033003592646e-05, "loss": 0.006, "step": 33860 }, { "grad_norm": 0.11174365878105164, "learning_rate": 6.649081956943626e-05, "loss": 0.0079, "step": 33870 }, { "grad_norm": 0.12921085953712463, "learning_rate": 6.647130628511604e-05, "loss": 0.0094, "step": 33880 }, { "grad_norm": 0.12141887843608856, "learning_rate": 6.645179018630005e-05, "loss": 0.009, "step": 33890 }, { "grad_norm": 0.12566441297531128, "learning_rate": 6.643227127632309e-05, "loss": 0.0108, "step": 33900 }, { "grad_norm": 0.1267399936914444, "learning_rate": 6.641274955852038e-05, "loss": 0.0073, "step": 33910 }, { "grad_norm": 0.13546481728553772, "learning_rate": 6.639322503622768e-05, "loss": 0.0098, "step": 33920 }, { "grad_norm": 0.1674976795911789, "learning_rate": 6.637369771278116e-05, "loss": 0.0118, "step": 33930 }, { "grad_norm": 0.12168249487876892, "learning_rate": 6.635416759151751e-05, "loss": 0.0079, "step": 33940 }, { "grad_norm": 0.10252176970243454, "learning_rate": 6.633463467577394e-05, "loss": 0.0075, "step": 33950 }, { "grad_norm": 0.11669144034385681, "learning_rate": 6.631509896888803e-05, "loss": 0.0096, "step": 33960 }, { "grad_norm": 0.1072634756565094, "learning_rate": 6.629556047419794e-05, "loss": 0.01, "step": 33970 }, { "grad_norm": 0.13936561346054077, "learning_rate": 6.627601919504223e-05, "loss": 0.0109, "step": 33980 }, { "grad_norm": 0.14923226833343506, "learning_rate": 6.625647513476001e-05, "loss": 0.0082, "step": 33990 }, { "grad_norm": 0.11971954256296158, "learning_rate": 6.62369282966908e-05, "loss": 0.0103, "step": 34000 }, { "grad_norm": 0.14883172512054443, "learning_rate": 6.621737868417464e-05, "loss": 0.0081, "step": 34010 }, { "grad_norm": 0.12368113547563553, "learning_rate": 6.619782630055198e-05, "loss": 0.0086, "step": 34020 }, { "grad_norm": 0.08593381196260452, "learning_rate": 6.617827114916382e-05, "loss": 0.0115, "step": 34030 }, { "grad_norm": 0.09910709410905838, "learning_rate": 6.615871323335161e-05, "loss": 0.0089, "step": 34040 }, { "grad_norm": 0.11887587606906891, "learning_rate": 6.613915255645725e-05, "loss": 0.0089, "step": 34050 }, { "grad_norm": 0.19485019147396088, "learning_rate": 6.611958912182312e-05, "loss": 0.0116, "step": 34060 }, { "grad_norm": 0.11338232457637787, "learning_rate": 6.610002293279207e-05, "loss": 0.009, "step": 34070 }, { "grad_norm": 0.11530760675668716, "learning_rate": 6.608045399270746e-05, "loss": 0.0096, "step": 34080 }, { "grad_norm": 0.0815650001168251, "learning_rate": 6.606088230491304e-05, "loss": 0.0097, "step": 34090 }, { "grad_norm": 0.12412282824516296, "learning_rate": 6.604130787275312e-05, "loss": 0.0085, "step": 34100 }, { "grad_norm": 0.14836303889751434, "learning_rate": 6.602173069957242e-05, "loss": 0.0107, "step": 34110 }, { "grad_norm": 0.0949583351612091, "learning_rate": 6.600215078871612e-05, "loss": 0.0093, "step": 34120 }, { "grad_norm": 0.15194882452487946, "learning_rate": 6.598256814352992e-05, "loss": 0.0098, "step": 34130 }, { "grad_norm": 0.09477300196886063, "learning_rate": 6.596298276735995e-05, "loss": 0.0076, "step": 34140 }, { "grad_norm": 0.10258696228265762, "learning_rate": 6.594339466355282e-05, "loss": 0.0096, "step": 34150 }, { "grad_norm": 0.15182366967201233, "learning_rate": 6.592380383545558e-05, "loss": 0.008, "step": 34160 }, { "grad_norm": 0.13096983730793, "learning_rate": 6.590421028641577e-05, "loss": 0.0094, "step": 34170 }, { "grad_norm": 0.11836019903421402, "learning_rate": 6.588461401978143e-05, "loss": 0.0096, "step": 34180 }, { "grad_norm": 0.12541094422340393, "learning_rate": 6.586501503890099e-05, "loss": 0.0077, "step": 34190 }, { "grad_norm": 0.1676388382911682, "learning_rate": 6.584541334712338e-05, "loss": 0.0075, "step": 34200 }, { "grad_norm": 0.0967741534113884, "learning_rate": 6.582580894779802e-05, "loss": 0.0073, "step": 34210 }, { "grad_norm": 0.10536019504070282, "learning_rate": 6.580620184427473e-05, "loss": 0.0081, "step": 34220 }, { "grad_norm": 0.16295696794986725, "learning_rate": 6.578659203990385e-05, "loss": 0.0093, "step": 34230 }, { "grad_norm": 0.12290266156196594, "learning_rate": 6.576697953803615e-05, "loss": 0.0098, "step": 34240 }, { "grad_norm": 0.10153600573539734, "learning_rate": 6.57473643420229e-05, "loss": 0.0088, "step": 34250 }, { "grad_norm": 0.17711472511291504, "learning_rate": 6.572774645521574e-05, "loss": 0.0099, "step": 34260 }, { "grad_norm": 0.1489308923482895, "learning_rate": 6.570812588096688e-05, "loss": 0.0091, "step": 34270 }, { "grad_norm": 0.1542840600013733, "learning_rate": 6.568850262262893e-05, "loss": 0.0103, "step": 34280 }, { "grad_norm": 0.13437612354755402, "learning_rate": 6.566887668355497e-05, "loss": 0.0087, "step": 34290 }, { "grad_norm": 0.10555127263069153, "learning_rate": 6.564924806709851e-05, "loss": 0.0076, "step": 34300 }, { "grad_norm": 0.15687550604343414, "learning_rate": 6.562961677661359e-05, "loss": 0.0082, "step": 34310 }, { "grad_norm": 0.16135652363300323, "learning_rate": 6.56099828154546e-05, "loss": 0.0111, "step": 34320 }, { "grad_norm": 0.0874035507440567, "learning_rate": 6.55903461869765e-05, "loss": 0.0091, "step": 34330 }, { "grad_norm": 0.13171643018722534, "learning_rate": 6.557070689453465e-05, "loss": 0.0078, "step": 34340 }, { "grad_norm": 0.11302786320447922, "learning_rate": 6.555106494148482e-05, "loss": 0.009, "step": 34350 }, { "grad_norm": 0.1397792100906372, "learning_rate": 6.553142033118333e-05, "loss": 0.0105, "step": 34360 }, { "grad_norm": 0.13478200137615204, "learning_rate": 6.551177306698688e-05, "loss": 0.0085, "step": 34370 }, { "grad_norm": 0.14441896975040436, "learning_rate": 6.549212315225267e-05, "loss": 0.0082, "step": 34380 }, { "grad_norm": 0.22311845421791077, "learning_rate": 6.547247059033833e-05, "loss": 0.0119, "step": 34390 }, { "grad_norm": 0.13552811741828918, "learning_rate": 6.545281538460193e-05, "loss": 0.0079, "step": 34400 }, { "grad_norm": 0.08422980457544327, "learning_rate": 6.543315753840202e-05, "loss": 0.0074, "step": 34410 }, { "grad_norm": 0.11664402484893799, "learning_rate": 6.541349705509758e-05, "loss": 0.0077, "step": 34420 }, { "grad_norm": 0.13634423911571503, "learning_rate": 6.539383393804805e-05, "loss": 0.0098, "step": 34430 }, { "grad_norm": 0.1058000698685646, "learning_rate": 6.537416819061333e-05, "loss": 0.008, "step": 34440 }, { "grad_norm": 0.1367958039045334, "learning_rate": 6.535449981615375e-05, "loss": 0.0099, "step": 34450 }, { "grad_norm": 0.1286652684211731, "learning_rate": 6.53348288180301e-05, "loss": 0.0092, "step": 34460 }, { "grad_norm": 0.18425226211547852, "learning_rate": 6.531515519960361e-05, "loss": 0.0086, "step": 34470 }, { "grad_norm": 0.12254404276609421, "learning_rate": 6.529547896423597e-05, "loss": 0.0098, "step": 34480 }, { "grad_norm": 0.12637948989868164, "learning_rate": 6.52758001152893e-05, "loss": 0.008, "step": 34490 }, { "grad_norm": 0.12148936092853546, "learning_rate": 6.525611865612618e-05, "loss": 0.0092, "step": 34500 }, { "grad_norm": 0.10881081968545914, "learning_rate": 6.523643459010966e-05, "loss": 0.0107, "step": 34510 }, { "grad_norm": 0.13573937118053436, "learning_rate": 6.521674792060317e-05, "loss": 0.0102, "step": 34520 }, { "grad_norm": 0.09231918305158615, "learning_rate": 6.519705865097063e-05, "loss": 0.0096, "step": 34530 }, { "grad_norm": 0.10051389783620834, "learning_rate": 6.517736678457641e-05, "loss": 0.0086, "step": 34540 }, { "grad_norm": 0.09292580187320709, "learning_rate": 6.515767232478534e-05, "loss": 0.0077, "step": 34550 }, { "grad_norm": 0.10695935040712357, "learning_rate": 6.51379752749626e-05, "loss": 0.0083, "step": 34560 }, { "grad_norm": 0.11653747409582138, "learning_rate": 6.511827563847393e-05, "loss": 0.0093, "step": 34570 }, { "grad_norm": 0.09097817540168762, "learning_rate": 6.509857341868542e-05, "loss": 0.011, "step": 34580 }, { "grad_norm": 0.1118568703532219, "learning_rate": 6.507886861896367e-05, "loss": 0.008, "step": 34590 }, { "grad_norm": 0.12524816393852234, "learning_rate": 6.505916124267567e-05, "loss": 0.0077, "step": 34600 }, { "grad_norm": 0.1269081085920334, "learning_rate": 6.503945129318891e-05, "loss": 0.0093, "step": 34610 }, { "grad_norm": 0.10736184567213058, "learning_rate": 6.501973877387122e-05, "loss": 0.0092, "step": 34620 }, { "grad_norm": 0.09772926568984985, "learning_rate": 6.500002368809098e-05, "loss": 0.0096, "step": 34630 }, { "grad_norm": 0.11577533185482025, "learning_rate": 6.498030603921694e-05, "loss": 0.0109, "step": 34640 }, { "grad_norm": 0.17292951047420502, "learning_rate": 6.496058583061832e-05, "loss": 0.0115, "step": 34650 }, { "grad_norm": 0.13815777003765106, "learning_rate": 6.494086306566475e-05, "loss": 0.0077, "step": 34660 }, { "grad_norm": 0.15831011533737183, "learning_rate": 6.492113774772632e-05, "loss": 0.0082, "step": 34670 }, { "grad_norm": 0.13991589844226837, "learning_rate": 6.490140988017354e-05, "loss": 0.0111, "step": 34680 }, { "grad_norm": 0.15884296596050262, "learning_rate": 6.488167946637736e-05, "loss": 0.0082, "step": 34690 }, { "grad_norm": 0.12598928809165955, "learning_rate": 6.486194650970915e-05, "loss": 0.01, "step": 34700 }, { "grad_norm": 0.11882246285676956, "learning_rate": 6.48422110135408e-05, "loss": 0.0092, "step": 34710 }, { "grad_norm": 0.13623180985450745, "learning_rate": 6.482247298124451e-05, "loss": 0.0088, "step": 34720 }, { "grad_norm": 0.1526723951101303, "learning_rate": 6.480273241619297e-05, "loss": 0.0084, "step": 34730 }, { "grad_norm": 0.15814203023910522, "learning_rate": 6.478298932175933e-05, "loss": 0.0101, "step": 34740 }, { "grad_norm": 0.14469271898269653, "learning_rate": 6.476324370131712e-05, "loss": 0.0097, "step": 34750 }, { "grad_norm": 0.12465514242649078, "learning_rate": 6.474349555824036e-05, "loss": 0.0076, "step": 34760 }, { "grad_norm": 0.12315351516008377, "learning_rate": 6.472374489590342e-05, "loss": 0.0091, "step": 34770 }, { "grad_norm": 0.11083969473838806, "learning_rate": 6.470399171768118e-05, "loss": 0.0071, "step": 34780 }, { "grad_norm": 0.14895427227020264, "learning_rate": 6.468423602694891e-05, "loss": 0.0081, "step": 34790 }, { "grad_norm": 0.12019781768321991, "learning_rate": 6.466447782708232e-05, "loss": 0.0078, "step": 34800 }, { "grad_norm": 0.13303998112678528, "learning_rate": 6.464471712145754e-05, "loss": 0.0101, "step": 34810 }, { "grad_norm": 0.12565302848815918, "learning_rate": 6.462495391345114e-05, "loss": 0.0082, "step": 34820 }, { "grad_norm": 0.11379872262477875, "learning_rate": 6.46051882064401e-05, "loss": 0.0099, "step": 34830 }, { "grad_norm": 0.12541329860687256, "learning_rate": 6.458542000380186e-05, "loss": 0.0083, "step": 34840 }, { "grad_norm": 0.10713499784469604, "learning_rate": 6.456564930891424e-05, "loss": 0.0076, "step": 34850 }, { "grad_norm": 0.09162051230669022, "learning_rate": 6.454587612515555e-05, "loss": 0.007, "step": 34860 }, { "grad_norm": 0.11826512962579727, "learning_rate": 6.452610045590444e-05, "loss": 0.0103, "step": 34870 }, { "grad_norm": 0.10354012250900269, "learning_rate": 6.450632230454005e-05, "loss": 0.0086, "step": 34880 }, { "grad_norm": 0.11204591393470764, "learning_rate": 6.448654167444195e-05, "loss": 0.0099, "step": 34890 }, { "grad_norm": 0.12001840025186539, "learning_rate": 6.446675856899005e-05, "loss": 0.0085, "step": 34900 }, { "grad_norm": 0.1147686019539833, "learning_rate": 6.444697299156481e-05, "loss": 0.0086, "step": 34910 }, { "grad_norm": 0.12168417870998383, "learning_rate": 6.442718494554701e-05, "loss": 0.0072, "step": 34920 }, { "grad_norm": 0.10295464843511581, "learning_rate": 6.440739443431787e-05, "loss": 0.0085, "step": 34930 }, { "grad_norm": 0.1170644611120224, "learning_rate": 6.438760146125906e-05, "loss": 0.01, "step": 34940 }, { "grad_norm": 0.1283058226108551, "learning_rate": 6.436780602975267e-05, "loss": 0.0084, "step": 34950 }, { "grad_norm": 0.18170228600502014, "learning_rate": 6.43480081431812e-05, "loss": 0.0089, "step": 34960 }, { "grad_norm": 0.12125880271196365, "learning_rate": 6.432820780492756e-05, "loss": 0.0074, "step": 34970 }, { "grad_norm": 0.11183720827102661, "learning_rate": 6.430840501837506e-05, "loss": 0.0111, "step": 34980 }, { "grad_norm": 0.11044421046972275, "learning_rate": 6.428859978690748e-05, "loss": 0.0087, "step": 34990 }, { "grad_norm": 0.09964030236005783, "learning_rate": 6.426879211390901e-05, "loss": 0.0085, "step": 35000 }, { "grad_norm": 0.17191007733345032, "learning_rate": 6.424898200276422e-05, "loss": 0.0125, "step": 35010 }, { "grad_norm": 0.12416122108697891, "learning_rate": 6.42291694568581e-05, "loss": 0.0087, "step": 35020 }, { "grad_norm": 0.14424766600131989, "learning_rate": 6.42093544795761e-05, "loss": 0.0105, "step": 35030 }, { "grad_norm": 0.11452123522758484, "learning_rate": 6.418953707430403e-05, "loss": 0.0099, "step": 35040 }, { "grad_norm": 0.10493705421686172, "learning_rate": 6.416971724442819e-05, "loss": 0.0083, "step": 35050 }, { "grad_norm": 0.08985060453414917, "learning_rate": 6.414989499333519e-05, "loss": 0.0083, "step": 35060 }, { "grad_norm": 0.133043572306633, "learning_rate": 6.413007032441214e-05, "loss": 0.0097, "step": 35070 }, { "grad_norm": 0.14110323786735535, "learning_rate": 6.411024324104653e-05, "loss": 0.0127, "step": 35080 }, { "grad_norm": 0.14299742877483368, "learning_rate": 6.409041374662628e-05, "loss": 0.0093, "step": 35090 }, { "grad_norm": 0.19844447076320648, "learning_rate": 6.407058184453967e-05, "loss": 0.0094, "step": 35100 }, { "grad_norm": 0.13737186789512634, "learning_rate": 6.405074753817548e-05, "loss": 0.0115, "step": 35110 }, { "grad_norm": 0.16067250072956085, "learning_rate": 6.40309108309228e-05, "loss": 0.0093, "step": 35120 }, { "grad_norm": 0.08830343931913376, "learning_rate": 6.401107172617122e-05, "loss": 0.009, "step": 35130 }, { "grad_norm": 0.16615839302539825, "learning_rate": 6.399123022731068e-05, "loss": 0.0081, "step": 35140 }, { "grad_norm": 0.1357366144657135, "learning_rate": 6.397138633773157e-05, "loss": 0.0081, "step": 35150 }, { "grad_norm": 0.1489349901676178, "learning_rate": 6.395154006082463e-05, "loss": 0.009, "step": 35160 }, { "grad_norm": 0.09780681133270264, "learning_rate": 6.393169139998109e-05, "loss": 0.0075, "step": 35170 }, { "grad_norm": 0.12466999888420105, "learning_rate": 6.39118403585925e-05, "loss": 0.0116, "step": 35180 }, { "grad_norm": 0.15027499198913574, "learning_rate": 6.38919869400509e-05, "loss": 0.0101, "step": 35190 }, { "grad_norm": 0.10334473848342896, "learning_rate": 6.387213114774865e-05, "loss": 0.0115, "step": 35200 }, { "grad_norm": 0.13428187370300293, "learning_rate": 6.385227298507863e-05, "loss": 0.0081, "step": 35210 }, { "grad_norm": 0.13955429196357727, "learning_rate": 6.3832412455434e-05, "loss": 0.0083, "step": 35220 }, { "grad_norm": 0.16147100925445557, "learning_rate": 6.381254956220841e-05, "loss": 0.0099, "step": 35230 }, { "grad_norm": 0.1423373818397522, "learning_rate": 6.379268430879586e-05, "loss": 0.0093, "step": 35240 }, { "grad_norm": 0.12530885636806488, "learning_rate": 6.37728166985908e-05, "loss": 0.009, "step": 35250 }, { "grad_norm": 0.10919447243213654, "learning_rate": 6.375294673498804e-05, "loss": 0.0098, "step": 35260 }, { "grad_norm": 0.1293153166770935, "learning_rate": 6.373307442138284e-05, "loss": 0.0086, "step": 35270 }, { "grad_norm": 0.11922139674425125, "learning_rate": 6.371319976117081e-05, "loss": 0.0075, "step": 35280 }, { "grad_norm": 0.10548485815525055, "learning_rate": 6.3693322757748e-05, "loss": 0.0092, "step": 35290 }, { "grad_norm": 0.17480549216270447, "learning_rate": 6.367344341451086e-05, "loss": 0.011, "step": 35300 }, { "grad_norm": 0.17168641090393066, "learning_rate": 6.36535617348562e-05, "loss": 0.0098, "step": 35310 }, { "grad_norm": 0.13843391835689545, "learning_rate": 6.363367772218128e-05, "loss": 0.0095, "step": 35320 }, { "grad_norm": 0.1318860650062561, "learning_rate": 6.36137913798837e-05, "loss": 0.0108, "step": 35330 }, { "grad_norm": 0.12354237586259842, "learning_rate": 6.359390271136151e-05, "loss": 0.0089, "step": 35340 }, { "grad_norm": 0.11298264563083649, "learning_rate": 6.357401172001314e-05, "loss": 0.0089, "step": 35350 }, { "grad_norm": 0.1078663244843483, "learning_rate": 6.355411840923742e-05, "loss": 0.008, "step": 35360 }, { "grad_norm": 0.13040350377559662, "learning_rate": 6.353422278243358e-05, "loss": 0.012, "step": 35370 }, { "grad_norm": 0.07520674914121628, "learning_rate": 6.351432484300121e-05, "loss": 0.0064, "step": 35380 }, { "grad_norm": 0.11954962462186813, "learning_rate": 6.349442459434036e-05, "loss": 0.0095, "step": 35390 }, { "grad_norm": 0.1967538744211197, "learning_rate": 6.34745220398514e-05, "loss": 0.0099, "step": 35400 }, { "grad_norm": 0.15794208645820618, "learning_rate": 6.345461718293518e-05, "loss": 0.0087, "step": 35410 }, { "grad_norm": 0.14859114587306976, "learning_rate": 6.343471002699286e-05, "loss": 0.0109, "step": 35420 }, { "grad_norm": 0.1126500740647316, "learning_rate": 6.341480057542602e-05, "loss": 0.0099, "step": 35430 }, { "grad_norm": 0.1669153869152069, "learning_rate": 6.339488883163667e-05, "loss": 0.0122, "step": 35440 }, { "grad_norm": 0.11334162205457687, "learning_rate": 6.337497479902716e-05, "loss": 0.0087, "step": 35450 }, { "grad_norm": 0.11082825809717178, "learning_rate": 6.335505848100027e-05, "loss": 0.0099, "step": 35460 }, { "grad_norm": 0.18751360476016998, "learning_rate": 6.333513988095915e-05, "loss": 0.0099, "step": 35470 }, { "grad_norm": 0.11460233479738235, "learning_rate": 6.331521900230735e-05, "loss": 0.0084, "step": 35480 }, { "grad_norm": 0.09592245519161224, "learning_rate": 6.329529584844878e-05, "loss": 0.0058, "step": 35490 }, { "grad_norm": 0.1091216504573822, "learning_rate": 6.327537042278777e-05, "loss": 0.0095, "step": 35500 }, { "grad_norm": 0.13639786839485168, "learning_rate": 6.325544272872905e-05, "loss": 0.0111, "step": 35510 }, { "grad_norm": 0.12018990516662598, "learning_rate": 6.323551276967771e-05, "loss": 0.0097, "step": 35520 }, { "grad_norm": 0.11746631562709808, "learning_rate": 6.321558054903922e-05, "loss": 0.0099, "step": 35530 }, { "grad_norm": 0.16161751747131348, "learning_rate": 6.319564607021947e-05, "loss": 0.0103, "step": 35540 }, { "grad_norm": 0.11509496718645096, "learning_rate": 6.31757093366247e-05, "loss": 0.0079, "step": 35550 }, { "grad_norm": 0.10751483589410782, "learning_rate": 6.315577035166154e-05, "loss": 0.0088, "step": 35560 }, { "grad_norm": 0.14737574756145477, "learning_rate": 6.313582911873708e-05, "loss": 0.0091, "step": 35570 }, { "grad_norm": 0.1328466385602951, "learning_rate": 6.311588564125865e-05, "loss": 0.008, "step": 35580 }, { "grad_norm": 0.13128146529197693, "learning_rate": 6.30959399226341e-05, "loss": 0.0081, "step": 35590 }, { "grad_norm": 0.11728838831186295, "learning_rate": 6.30759919662716e-05, "loss": 0.0087, "step": 35600 }, { "grad_norm": 0.11708241701126099, "learning_rate": 6.30560417755797e-05, "loss": 0.0082, "step": 35610 }, { "grad_norm": 0.1241539940237999, "learning_rate": 6.303608935396735e-05, "loss": 0.0107, "step": 35620 }, { "grad_norm": 0.1385555863380432, "learning_rate": 6.301613470484386e-05, "loss": 0.0108, "step": 35630 }, { "grad_norm": 0.14444082975387573, "learning_rate": 6.299617783161893e-05, "loss": 0.0111, "step": 35640 }, { "grad_norm": 0.17597194015979767, "learning_rate": 6.297621873770266e-05, "loss": 0.0087, "step": 35650 }, { "grad_norm": 0.09422633051872253, "learning_rate": 6.29562574265055e-05, "loss": 0.0091, "step": 35660 }, { "grad_norm": 0.10061515122652054, "learning_rate": 6.293629390143834e-05, "loss": 0.0084, "step": 35670 }, { "grad_norm": 0.10204146057367325, "learning_rate": 6.291632816591232e-05, "loss": 0.0094, "step": 35680 }, { "grad_norm": 0.11753793060779572, "learning_rate": 6.28963602233391e-05, "loss": 0.0101, "step": 35690 }, { "grad_norm": 0.15154635906219482, "learning_rate": 6.287639007713062e-05, "loss": 0.0101, "step": 35700 }, { "grad_norm": 0.1732439249753952, "learning_rate": 6.285641773069926e-05, "loss": 0.0082, "step": 35710 }, { "grad_norm": 0.13074027001857758, "learning_rate": 6.283644318745773e-05, "loss": 0.0107, "step": 35720 }, { "grad_norm": 0.15407830476760864, "learning_rate": 6.281646645081912e-05, "loss": 0.0102, "step": 35730 }, { "grad_norm": 0.14348044991493225, "learning_rate": 6.279648752419693e-05, "loss": 0.0082, "step": 35740 }, { "grad_norm": 0.12705089151859283, "learning_rate": 6.2776506411005e-05, "loss": 0.0083, "step": 35750 }, { "grad_norm": 0.14488476514816284, "learning_rate": 6.275652311465758e-05, "loss": 0.008, "step": 35760 }, { "grad_norm": 0.10163289308547974, "learning_rate": 6.273653763856926e-05, "loss": 0.0098, "step": 35770 }, { "grad_norm": 0.10382376611232758, "learning_rate": 6.271654998615501e-05, "loss": 0.0069, "step": 35780 }, { "grad_norm": 0.12991471588611603, "learning_rate": 6.269656016083013e-05, "loss": 0.0076, "step": 35790 }, { "grad_norm": 0.13759756088256836, "learning_rate": 6.267656816601038e-05, "loss": 0.0093, "step": 35800 }, { "grad_norm": 0.15447574853897095, "learning_rate": 6.265657400511185e-05, "loss": 0.0102, "step": 35810 }, { "grad_norm": 0.1636124700307846, "learning_rate": 6.263657768155098e-05, "loss": 0.0076, "step": 35820 }, { "grad_norm": 0.10199614614248276, "learning_rate": 6.261657919874457e-05, "loss": 0.0074, "step": 35830 }, { "grad_norm": 0.13261334598064423, "learning_rate": 6.259657856010986e-05, "loss": 0.0082, "step": 35840 }, { "grad_norm": 0.12978258728981018, "learning_rate": 6.257657576906439e-05, "loss": 0.0085, "step": 35850 }, { "grad_norm": 0.15533500909805298, "learning_rate": 6.255657082902609e-05, "loss": 0.0065, "step": 35860 }, { "grad_norm": 0.12450376152992249, "learning_rate": 6.253656374341325e-05, "loss": 0.011, "step": 35870 }, { "grad_norm": 0.11112458258867264, "learning_rate": 6.251655451564457e-05, "loss": 0.0069, "step": 35880 }, { "grad_norm": 0.10085614770650864, "learning_rate": 6.249654314913902e-05, "loss": 0.0077, "step": 35890 }, { "grad_norm": 0.12139612436294556, "learning_rate": 6.247652964731604e-05, "loss": 0.0083, "step": 35900 }, { "grad_norm": 0.09585444629192352, "learning_rate": 6.245651401359537e-05, "loss": 0.0113, "step": 35910 }, { "grad_norm": 0.15458740293979645, "learning_rate": 6.243649625139715e-05, "loss": 0.0071, "step": 35920 }, { "grad_norm": 0.11238930374383926, "learning_rate": 6.241647636414185e-05, "loss": 0.0083, "step": 35930 }, { "grad_norm": 0.10405780375003815, "learning_rate": 6.239645435525034e-05, "loss": 0.0082, "step": 35940 }, { "grad_norm": 0.11488831043243408, "learning_rate": 6.237643022814381e-05, "loss": 0.0089, "step": 35950 }, { "grad_norm": 0.11500148475170135, "learning_rate": 6.235640398624386e-05, "loss": 0.0079, "step": 35960 }, { "grad_norm": 0.1127910166978836, "learning_rate": 6.233637563297243e-05, "loss": 0.0102, "step": 35970 }, { "grad_norm": 0.12810949981212616, "learning_rate": 6.23163451717518e-05, "loss": 0.0096, "step": 35980 }, { "grad_norm": 0.12671642005443573, "learning_rate": 6.229631260600463e-05, "loss": 0.0094, "step": 35990 }, { "grad_norm": 0.11407192051410675, "learning_rate": 6.227627793915392e-05, "loss": 0.0097, "step": 36000 }, { "grad_norm": 0.0986645519733429, "learning_rate": 6.225624117462309e-05, "loss": 0.0082, "step": 36010 }, { "grad_norm": 0.12850455939769745, "learning_rate": 6.223620231583586e-05, "loss": 0.008, "step": 36020 }, { "grad_norm": 0.11694023758172989, "learning_rate": 6.221616136621629e-05, "loss": 0.011, "step": 36030 }, { "grad_norm": 0.11060391366481781, "learning_rate": 6.219611832918887e-05, "loss": 0.0095, "step": 36040 }, { "grad_norm": 0.131863534450531, "learning_rate": 6.217607320817838e-05, "loss": 0.0086, "step": 36050 }, { "grad_norm": 0.11641963571310043, "learning_rate": 6.215602600661001e-05, "loss": 0.0094, "step": 36060 }, { "grad_norm": 0.13673409819602966, "learning_rate": 6.213597672790925e-05, "loss": 0.0085, "step": 36070 }, { "grad_norm": 0.13938936591148376, "learning_rate": 6.2115925375502e-05, "loss": 0.0092, "step": 36080 }, { "grad_norm": 0.1084495261311531, "learning_rate": 6.209587195281447e-05, "loss": 0.01, "step": 36090 }, { "grad_norm": 0.10985356569290161, "learning_rate": 6.207581646327324e-05, "loss": 0.0099, "step": 36100 }, { "grad_norm": 0.1489786058664322, "learning_rate": 6.205575891030526e-05, "loss": 0.0095, "step": 36110 }, { "grad_norm": 0.12195243686437607, "learning_rate": 6.203569929733781e-05, "loss": 0.0087, "step": 36120 }, { "grad_norm": 0.09933586418628693, "learning_rate": 6.201563762779852e-05, "loss": 0.0086, "step": 36130 }, { "grad_norm": 0.1049957424402237, "learning_rate": 6.199557390511538e-05, "loss": 0.0072, "step": 36140 }, { "grad_norm": 0.1395234763622284, "learning_rate": 6.197550813271675e-05, "loss": 0.0088, "step": 36150 }, { "grad_norm": 0.09979946166276932, "learning_rate": 6.195544031403131e-05, "loss": 0.0083, "step": 36160 }, { "grad_norm": 0.1357329934835434, "learning_rate": 6.19353704524881e-05, "loss": 0.0111, "step": 36170 }, { "grad_norm": 0.13253189623355865, "learning_rate": 6.191529855151652e-05, "loss": 0.0084, "step": 36180 }, { "grad_norm": 0.12020669877529144, "learning_rate": 6.189522461454629e-05, "loss": 0.008, "step": 36190 }, { "grad_norm": 0.11493031680583954, "learning_rate": 6.187514864500752e-05, "loss": 0.0079, "step": 36200 }, { "grad_norm": 0.09095307439565659, "learning_rate": 6.185507064633062e-05, "loss": 0.0077, "step": 36210 }, { "grad_norm": 0.11243084073066711, "learning_rate": 6.18349906219464e-05, "loss": 0.0077, "step": 36220 }, { "grad_norm": 0.08893309533596039, "learning_rate": 6.181490857528596e-05, "loss": 0.0108, "step": 36230 }, { "grad_norm": 0.1323956400156021, "learning_rate": 6.179482450978077e-05, "loss": 0.0081, "step": 36240 }, { "grad_norm": 0.0808638334274292, "learning_rate": 6.177473842886269e-05, "loss": 0.0079, "step": 36250 }, { "grad_norm": 0.11951025575399399, "learning_rate": 6.175465033596382e-05, "loss": 0.0085, "step": 36260 }, { "grad_norm": 0.12722326815128326, "learning_rate": 6.173456023451671e-05, "loss": 0.0081, "step": 36270 }, { "grad_norm": 0.1178067997097969, "learning_rate": 6.171446812795422e-05, "loss": 0.0093, "step": 36280 }, { "grad_norm": 0.11421613395214081, "learning_rate": 6.169437401970949e-05, "loss": 0.0086, "step": 36290 }, { "grad_norm": 0.13231521844863892, "learning_rate": 6.16742779132161e-05, "loss": 0.0076, "step": 36300 }, { "grad_norm": 0.10512083768844604, "learning_rate": 6.165417981190789e-05, "loss": 0.0071, "step": 36310 }, { "grad_norm": 0.11701367050409317, "learning_rate": 6.16340797192191e-05, "loss": 0.0073, "step": 36320 }, { "grad_norm": 0.11883590370416641, "learning_rate": 6.161397763858427e-05, "loss": 0.0096, "step": 36330 }, { "grad_norm": 0.121933214366436, "learning_rate": 6.159387357343834e-05, "loss": 0.0106, "step": 36340 }, { "grad_norm": 0.11167030036449432, "learning_rate": 6.157376752721648e-05, "loss": 0.0081, "step": 36350 }, { "grad_norm": 0.10365016758441925, "learning_rate": 6.155365950335428e-05, "loss": 0.0067, "step": 36360 }, { "grad_norm": 0.10303054004907608, "learning_rate": 6.153354950528768e-05, "loss": 0.0084, "step": 36370 }, { "grad_norm": 0.14018113911151886, "learning_rate": 6.151343753645293e-05, "loss": 0.0094, "step": 36380 }, { "grad_norm": 0.10603264719247818, "learning_rate": 6.149332360028657e-05, "loss": 0.0114, "step": 36390 }, { "grad_norm": 0.10429234057664871, "learning_rate": 6.147320770022555e-05, "loss": 0.0075, "step": 36400 }, { "grad_norm": 0.16646085679531097, "learning_rate": 6.145308983970715e-05, "loss": 0.0084, "step": 36410 }, { "grad_norm": 0.12231240421533585, "learning_rate": 6.143297002216892e-05, "loss": 0.0075, "step": 36420 }, { "grad_norm": 0.12204910069704056, "learning_rate": 6.141284825104882e-05, "loss": 0.01, "step": 36430 }, { "grad_norm": 0.1053413674235344, "learning_rate": 6.13927245297851e-05, "loss": 0.0088, "step": 36440 }, { "grad_norm": 0.13155296444892883, "learning_rate": 6.137259886181633e-05, "loss": 0.0082, "step": 36450 }, { "grad_norm": 0.10133321583271027, "learning_rate": 6.135247125058145e-05, "loss": 0.0066, "step": 36460 }, { "grad_norm": 0.15138140320777893, "learning_rate": 6.133234169951974e-05, "loss": 0.0085, "step": 36470 }, { "grad_norm": 0.09375747293233871, "learning_rate": 6.131221021207078e-05, "loss": 0.008, "step": 36480 }, { "grad_norm": 0.1568908989429474, "learning_rate": 6.129207679167448e-05, "loss": 0.0084, "step": 36490 }, { "grad_norm": 0.1583486795425415, "learning_rate": 6.127194144177109e-05, "loss": 0.0079, "step": 36500 }, { "grad_norm": 0.12390193343162537, "learning_rate": 6.125180416580118e-05, "loss": 0.0073, "step": 36510 }, { "grad_norm": 0.10590222477912903, "learning_rate": 6.123166496720571e-05, "loss": 0.0064, "step": 36520 }, { "grad_norm": 0.10512951761484146, "learning_rate": 6.121152384942588e-05, "loss": 0.0066, "step": 36530 }, { "grad_norm": 0.12315746396780014, "learning_rate": 6.119138081590324e-05, "loss": 0.01, "step": 36540 }, { "grad_norm": 0.1573701649904251, "learning_rate": 6.117123587007971e-05, "loss": 0.0081, "step": 36550 }, { "grad_norm": 0.14332157373428345, "learning_rate": 6.11510890153975e-05, "loss": 0.0071, "step": 36560 }, { "grad_norm": 0.15852898359298706, "learning_rate": 6.113094025529916e-05, "loss": 0.0079, "step": 36570 }, { "grad_norm": 0.14213624596595764, "learning_rate": 6.111078959322757e-05, "loss": 0.0104, "step": 36580 }, { "grad_norm": 0.14594539999961853, "learning_rate": 6.109063703262592e-05, "loss": 0.0098, "step": 36590 }, { "grad_norm": 0.15126796066761017, "learning_rate": 6.107048257693772e-05, "loss": 0.009, "step": 36600 }, { "grad_norm": 0.1150115355849266, "learning_rate": 6.105032622960683e-05, "loss": 0.0096, "step": 36610 }, { "grad_norm": 0.09639438986778259, "learning_rate": 6.103016799407743e-05, "loss": 0.0089, "step": 36620 }, { "grad_norm": 0.1659899204969406, "learning_rate": 6.1010007873793984e-05, "loss": 0.0073, "step": 36630 }, { "grad_norm": 0.10545407980680466, "learning_rate": 6.098984587220131e-05, "loss": 0.008, "step": 36640 }, { "grad_norm": 0.13264499604701996, "learning_rate": 6.096968199274456e-05, "loss": 0.0101, "step": 36650 }, { "grad_norm": 0.12433066219091415, "learning_rate": 6.0949516238869166e-05, "loss": 0.007, "step": 36660 }, { "grad_norm": 0.11896353960037231, "learning_rate": 6.092934861402092e-05, "loss": 0.0078, "step": 36670 }, { "grad_norm": 0.08122167736291885, "learning_rate": 6.0909179121645924e-05, "loss": 0.0082, "step": 36680 }, { "grad_norm": 0.11529509723186493, "learning_rate": 6.0889007765190576e-05, "loss": 0.0093, "step": 36690 }, { "grad_norm": 0.11542511731386185, "learning_rate": 6.086883454810162e-05, "loss": 0.0087, "step": 36700 }, { "grad_norm": 0.1116614043712616, "learning_rate": 6.0848659473826084e-05, "loss": 0.0074, "step": 36710 }, { "grad_norm": 0.14059188961982727, "learning_rate": 6.082848254581138e-05, "loss": 0.0083, "step": 36720 }, { "grad_norm": 0.11203537881374359, "learning_rate": 6.080830376750517e-05, "loss": 0.0075, "step": 36730 }, { "grad_norm": 0.10634283721446991, "learning_rate": 6.0788123142355445e-05, "loss": 0.0081, "step": 36740 }, { "grad_norm": 0.11603248864412308, "learning_rate": 6.076794067381052e-05, "loss": 0.0107, "step": 36750 }, { "grad_norm": 0.13489268720149994, "learning_rate": 6.074775636531905e-05, "loss": 0.0099, "step": 36760 }, { "grad_norm": 0.12158222496509552, "learning_rate": 6.072757022032997e-05, "loss": 0.006, "step": 36770 }, { "grad_norm": 0.09232702851295471, "learning_rate": 6.070738224229253e-05, "loss": 0.0076, "step": 36780 }, { "grad_norm": 0.11814824491739273, "learning_rate": 6.0687192434656314e-05, "loss": 0.0083, "step": 36790 }, { "grad_norm": 0.14997440576553345, "learning_rate": 6.066700080087121e-05, "loss": 0.0097, "step": 36800 }, { "grad_norm": 0.11914105713367462, "learning_rate": 6.0646807344387424e-05, "loss": 0.0071, "step": 36810 }, { "grad_norm": 0.12324222177267075, "learning_rate": 6.062661206865543e-05, "loss": 0.0094, "step": 36820 }, { "grad_norm": 0.13097186386585236, "learning_rate": 6.06064149771261e-05, "loss": 0.0105, "step": 36830 }, { "grad_norm": 0.13593530654907227, "learning_rate": 6.058621607325051e-05, "loss": 0.0083, "step": 36840 }, { "grad_norm": 0.15428386628627777, "learning_rate": 6.056601536048014e-05, "loss": 0.0102, "step": 36850 }, { "grad_norm": 0.13194528222084045, "learning_rate": 6.0545812842266725e-05, "loss": 0.009, "step": 36860 }, { "grad_norm": 0.12721003592014313, "learning_rate": 6.052560852206232e-05, "loss": 0.0085, "step": 36870 }, { "grad_norm": 0.11760351806879044, "learning_rate": 6.05054024033193e-05, "loss": 0.0094, "step": 36880 }, { "grad_norm": 0.13080424070358276, "learning_rate": 6.048519448949032e-05, "loss": 0.0075, "step": 36890 }, { "grad_norm": 0.16370359063148499, "learning_rate": 6.046498478402839e-05, "loss": 0.0083, "step": 36900 }, { "grad_norm": 0.1146329939365387, "learning_rate": 6.044477329038677e-05, "loss": 0.0123, "step": 36910 }, { "grad_norm": 0.09469260275363922, "learning_rate": 6.042456001201906e-05, "loss": 0.0083, "step": 36920 }, { "grad_norm": 0.09749159216880798, "learning_rate": 6.040434495237917e-05, "loss": 0.0076, "step": 36930 }, { "grad_norm": 0.14042529463768005, "learning_rate": 6.0384128114921256e-05, "loss": 0.0087, "step": 36940 }, { "grad_norm": 0.11876687407493591, "learning_rate": 6.036390950309987e-05, "loss": 0.0115, "step": 36950 }, { "grad_norm": 0.1260482370853424, "learning_rate": 6.0343689120369805e-05, "loss": 0.0125, "step": 36960 }, { "grad_norm": 0.14396928250789642, "learning_rate": 6.032346697018616e-05, "loss": 0.0092, "step": 36970 }, { "grad_norm": 0.11183339357376099, "learning_rate": 6.0303243056004375e-05, "loss": 0.0072, "step": 36980 }, { "grad_norm": 0.13357144594192505, "learning_rate": 6.0283017381280136e-05, "loss": 0.008, "step": 36990 }, { "grad_norm": 0.10351172089576721, "learning_rate": 6.026278994946948e-05, "loss": 0.0107, "step": 37000 }, { "grad_norm": 0.20174112915992737, "learning_rate": 6.02425607640287e-05, "loss": 0.0085, "step": 37010 }, { "grad_norm": 0.10903395712375641, "learning_rate": 6.022232982841441e-05, "loss": 0.0082, "step": 37020 }, { "grad_norm": 0.11864224076271057, "learning_rate": 6.020209714608355e-05, "loss": 0.0082, "step": 37030 }, { "grad_norm": 0.10088852047920227, "learning_rate": 6.018186272049331e-05, "loss": 0.0107, "step": 37040 }, { "grad_norm": 0.14306305348873138, "learning_rate": 6.01616265551012e-05, "loss": 0.0093, "step": 37050 }, { "grad_norm": 0.131392702460289, "learning_rate": 6.014138865336503e-05, "loss": 0.0114, "step": 37060 }, { "grad_norm": 0.08698324859142303, "learning_rate": 6.0121149018742905e-05, "loss": 0.0086, "step": 37070 }, { "grad_norm": 0.1414525806903839, "learning_rate": 6.010090765469325e-05, "loss": 0.008, "step": 37080 }, { "grad_norm": 0.14757248759269714, "learning_rate": 6.008066456467473e-05, "loss": 0.0089, "step": 37090 }, { "grad_norm": 0.1026458889245987, "learning_rate": 6.0060419752146335e-05, "loss": 0.0087, "step": 37100 }, { "grad_norm": 0.10680662840604782, "learning_rate": 6.0040173220567353e-05, "loss": 0.0087, "step": 37110 }, { "grad_norm": 0.13765528798103333, "learning_rate": 6.001992497339737e-05, "loss": 0.0075, "step": 37120 }, { "grad_norm": 0.10433008521795273, "learning_rate": 5.999967501409626e-05, "loss": 0.0091, "step": 37130 }, { "grad_norm": 0.09038382768630981, "learning_rate": 5.997942334612418e-05, "loss": 0.0063, "step": 37140 }, { "grad_norm": 0.1300409734249115, "learning_rate": 5.995916997294158e-05, "loss": 0.0086, "step": 37150 }, { "grad_norm": 0.13248416781425476, "learning_rate": 5.9938914898009214e-05, "loss": 0.008, "step": 37160 }, { "grad_norm": 0.08550771325826645, "learning_rate": 5.991865812478813e-05, "loss": 0.0092, "step": 37170 }, { "grad_norm": 0.10917815566062927, "learning_rate": 5.989839965673964e-05, "loss": 0.009, "step": 37180 }, { "grad_norm": 0.1685791164636612, "learning_rate": 5.987813949732539e-05, "loss": 0.0123, "step": 37190 }, { "grad_norm": 0.11373158544301987, "learning_rate": 5.9857877650007255e-05, "loss": 0.0097, "step": 37200 }, { "grad_norm": 0.12019141763448715, "learning_rate": 5.983761411824744e-05, "loss": 0.0108, "step": 37210 }, { "grad_norm": 0.1095118448138237, "learning_rate": 5.981734890550844e-05, "loss": 0.0085, "step": 37220 }, { "grad_norm": 0.09936501085758209, "learning_rate": 5.979708201525301e-05, "loss": 0.0108, "step": 37230 }, { "grad_norm": 0.11763739585876465, "learning_rate": 5.977681345094422e-05, "loss": 0.0072, "step": 37240 }, { "grad_norm": 0.0751492828130722, "learning_rate": 5.97565432160454e-05, "loss": 0.0084, "step": 37250 }, { "grad_norm": 0.12460006773471832, "learning_rate": 5.9736271314020186e-05, "loss": 0.008, "step": 37260 }, { "grad_norm": 0.1734718382358551, "learning_rate": 5.971599774833251e-05, "loss": 0.0075, "step": 37270 }, { "grad_norm": 0.1411661058664322, "learning_rate": 5.9695722522446525e-05, "loss": 0.0101, "step": 37280 }, { "grad_norm": 0.11290790885686874, "learning_rate": 5.9675445639826765e-05, "loss": 0.0073, "step": 37290 }, { "grad_norm": 0.11742231994867325, "learning_rate": 5.965516710393796e-05, "loss": 0.0085, "step": 37300 }, { "grad_norm": 0.09420011192560196, "learning_rate": 5.963488691824516e-05, "loss": 0.0074, "step": 37310 }, { "grad_norm": 0.13713951408863068, "learning_rate": 5.96146050862137e-05, "loss": 0.0108, "step": 37320 }, { "grad_norm": 0.14288456737995148, "learning_rate": 5.959432161130919e-05, "loss": 0.0098, "step": 37330 }, { "grad_norm": 0.1195727288722992, "learning_rate": 5.9574036496997545e-05, "loss": 0.0097, "step": 37340 }, { "grad_norm": 0.0914650708436966, "learning_rate": 5.955374974674488e-05, "loss": 0.0084, "step": 37350 }, { "grad_norm": 0.10999329388141632, "learning_rate": 5.9533461364017696e-05, "loss": 0.0085, "step": 37360 }, { "grad_norm": 0.0962669849395752, "learning_rate": 5.9513171352282716e-05, "loss": 0.0072, "step": 37370 }, { "grad_norm": 0.11037256568670273, "learning_rate": 5.949287971500692e-05, "loss": 0.0081, "step": 37380 }, { "grad_norm": 0.1293564736843109, "learning_rate": 5.947258645565762e-05, "loss": 0.0097, "step": 37390 }, { "grad_norm": 0.09553637355566025, "learning_rate": 5.945229157770237e-05, "loss": 0.0077, "step": 37400 }, { "grad_norm": 0.10452590137720108, "learning_rate": 5.9431995084609006e-05, "loss": 0.0117, "step": 37410 }, { "grad_norm": 0.1569450944662094, "learning_rate": 5.941169697984564e-05, "loss": 0.0082, "step": 37420 }, { "grad_norm": 0.13466820120811462, "learning_rate": 5.9391397266880675e-05, "loss": 0.0074, "step": 37430 }, { "grad_norm": 0.16169150173664093, "learning_rate": 5.937109594918279e-05, "loss": 0.0114, "step": 37440 }, { "grad_norm": 0.15461702644824982, "learning_rate": 5.9350793030220884e-05, "loss": 0.008, "step": 37450 }, { "grad_norm": 0.16662374138832092, "learning_rate": 5.933048851346421e-05, "loss": 0.0108, "step": 37460 }, { "grad_norm": 0.0964822992682457, "learning_rate": 5.931018240238224e-05, "loss": 0.0099, "step": 37470 }, { "grad_norm": 0.11773243546485901, "learning_rate": 5.928987470044471e-05, "loss": 0.0085, "step": 37480 }, { "grad_norm": 0.12035542726516724, "learning_rate": 5.9269565411121695e-05, "loss": 0.0088, "step": 37490 }, { "grad_norm": 0.14978428184986115, "learning_rate": 5.924925453788347e-05, "loss": 0.0091, "step": 37500 }, { "grad_norm": 0.12636372447013855, "learning_rate": 5.92289420842006e-05, "loss": 0.0076, "step": 37510 }, { "grad_norm": 0.10595987737178802, "learning_rate": 5.9208628053543945e-05, "loss": 0.0105, "step": 37520 }, { "grad_norm": 0.18590444326400757, "learning_rate": 5.918831244938462e-05, "loss": 0.0089, "step": 37530 }, { "grad_norm": 0.21601204574108124, "learning_rate": 5.9167995275194e-05, "loss": 0.0082, "step": 37540 }, { "grad_norm": 0.18189798295497894, "learning_rate": 5.914767653444373e-05, "loss": 0.0082, "step": 37550 }, { "grad_norm": 0.1316954344511032, "learning_rate": 5.912735623060572e-05, "loss": 0.0089, "step": 37560 }, { "grad_norm": 0.09779508411884308, "learning_rate": 5.910703436715217e-05, "loss": 0.0106, "step": 37570 }, { "grad_norm": 0.11889342218637466, "learning_rate": 5.908671094755552e-05, "loss": 0.008, "step": 37580 }, { "grad_norm": 0.1587466597557068, "learning_rate": 5.906638597528851e-05, "loss": 0.0087, "step": 37590 }, { "grad_norm": 0.11069401353597641, "learning_rate": 5.9046059453824076e-05, "loss": 0.0056, "step": 37600 }, { "grad_norm": 0.11302969604730606, "learning_rate": 5.9025731386635505e-05, "loss": 0.0084, "step": 37610 }, { "grad_norm": 0.10513457655906677, "learning_rate": 5.900540177719629e-05, "loss": 0.0065, "step": 37620 }, { "grad_norm": 0.1136016696691513, "learning_rate": 5.898507062898021e-05, "loss": 0.0075, "step": 37630 }, { "grad_norm": 0.12345770746469498, "learning_rate": 5.8964737945461316e-05, "loss": 0.0069, "step": 37640 }, { "grad_norm": 0.11504033207893372, "learning_rate": 5.8944403730113885e-05, "loss": 0.0087, "step": 37650 }, { "grad_norm": 0.08191391825675964, "learning_rate": 5.892406798641248e-05, "loss": 0.0097, "step": 37660 }, { "grad_norm": 0.12078657001256943, "learning_rate": 5.890373071783193e-05, "loss": 0.0104, "step": 37670 }, { "grad_norm": 0.09908086806535721, "learning_rate": 5.888339192784732e-05, "loss": 0.011, "step": 37680 }, { "grad_norm": 0.17146924138069153, "learning_rate": 5.8863051619934003e-05, "loss": 0.01, "step": 37690 }, { "grad_norm": 0.11183289438486099, "learning_rate": 5.8842709797567554e-05, "loss": 0.0079, "step": 37700 }, { "grad_norm": 0.11162882298231125, "learning_rate": 5.8822366464223855e-05, "loss": 0.0086, "step": 37710 }, { "grad_norm": 0.10806884616613388, "learning_rate": 5.880202162337901e-05, "loss": 0.0071, "step": 37720 }, { "grad_norm": 0.11597786098718643, "learning_rate": 5.8781675278509405e-05, "loss": 0.0074, "step": 37730 }, { "grad_norm": 0.1268111914396286, "learning_rate": 5.8761327433091696e-05, "loss": 0.0093, "step": 37740 }, { "grad_norm": 0.1206643208861351, "learning_rate": 5.874097809060275e-05, "loss": 0.0084, "step": 37750 }, { "grad_norm": 0.11213331669569016, "learning_rate": 5.87206272545197e-05, "loss": 0.0089, "step": 37760 }, { "grad_norm": 0.1199919655919075, "learning_rate": 5.8700274928319955e-05, "loss": 0.0072, "step": 37770 }, { "grad_norm": 0.10249584913253784, "learning_rate": 5.867992111548118e-05, "loss": 0.0105, "step": 37780 }, { "grad_norm": 0.11450459063053131, "learning_rate": 5.865956581948131e-05, "loss": 0.007, "step": 37790 }, { "grad_norm": 0.09654436260461807, "learning_rate": 5.863920904379845e-05, "loss": 0.0077, "step": 37800 }, { "grad_norm": 0.11255251616239548, "learning_rate": 5.8618850791911064e-05, "loss": 0.0084, "step": 37810 }, { "grad_norm": 0.14918051660060883, "learning_rate": 5.859849106729779e-05, "loss": 0.0082, "step": 37820 }, { "grad_norm": 0.12246749550104141, "learning_rate": 5.857812987343758e-05, "loss": 0.0076, "step": 37830 }, { "grad_norm": 0.16613930463790894, "learning_rate": 5.855776721380957e-05, "loss": 0.0089, "step": 37840 }, { "grad_norm": 0.11591894924640656, "learning_rate": 5.8537403091893217e-05, "loss": 0.01, "step": 37850 }, { "grad_norm": 0.11733298748731613, "learning_rate": 5.851703751116816e-05, "loss": 0.0073, "step": 37860 }, { "grad_norm": 0.17434747517108917, "learning_rate": 5.8496670475114336e-05, "loss": 0.0067, "step": 37870 }, { "grad_norm": 0.11931917816400528, "learning_rate": 5.84763019872119e-05, "loss": 0.0077, "step": 37880 }, { "grad_norm": 0.19738930463790894, "learning_rate": 5.845593205094131e-05, "loss": 0.0075, "step": 37890 }, { "grad_norm": 0.1291947364807129, "learning_rate": 5.843556066978318e-05, "loss": 0.0066, "step": 37900 }, { "grad_norm": 0.11361970752477646, "learning_rate": 5.8415187847218455e-05, "loss": 0.0072, "step": 37910 }, { "grad_norm": 0.1281084418296814, "learning_rate": 5.839481358672827e-05, "loss": 0.0079, "step": 37920 }, { "grad_norm": 0.10255127400159836, "learning_rate": 5.837443789179407e-05, "loss": 0.0078, "step": 37930 }, { "grad_norm": 0.105781689286232, "learning_rate": 5.8354060765897445e-05, "loss": 0.0074, "step": 37940 }, { "grad_norm": 0.12024582177400589, "learning_rate": 5.8333682212520334e-05, "loss": 0.0076, "step": 37950 }, { "grad_norm": 0.08534953743219376, "learning_rate": 5.831330223514486e-05, "loss": 0.009, "step": 37960 }, { "grad_norm": 0.10520772635936737, "learning_rate": 5.8292920837253396e-05, "loss": 0.0081, "step": 37970 }, { "grad_norm": 0.17211976647377014, "learning_rate": 5.827253802232857e-05, "loss": 0.0091, "step": 37980 }, { "grad_norm": 0.08277498185634613, "learning_rate": 5.825215379385327e-05, "loss": 0.0066, "step": 37990 }, { "grad_norm": 0.12461671978235245, "learning_rate": 5.823176815531057e-05, "loss": 0.0105, "step": 38000 }, { "grad_norm": 0.15701816976070404, "learning_rate": 5.8211381110183826e-05, "loss": 0.0071, "step": 38010 }, { "grad_norm": 0.12474660575389862, "learning_rate": 5.8190992661956645e-05, "loss": 0.0072, "step": 38020 }, { "grad_norm": 0.17025694251060486, "learning_rate": 5.817060281411284e-05, "loss": 0.0068, "step": 38030 }, { "grad_norm": 0.055906664580106735, "learning_rate": 5.815021157013647e-05, "loss": 0.0065, "step": 38040 }, { "grad_norm": 0.12146671861410141, "learning_rate": 5.8129818933511856e-05, "loss": 0.0084, "step": 38050 }, { "grad_norm": 0.12588292360305786, "learning_rate": 5.8109424907723544e-05, "loss": 0.0071, "step": 38060 }, { "grad_norm": 0.12473947554826736, "learning_rate": 5.80890294962563e-05, "loss": 0.0067, "step": 38070 }, { "grad_norm": 0.126786008477211, "learning_rate": 5.806863270259515e-05, "loss": 0.0078, "step": 38080 }, { "grad_norm": 0.11957657337188721, "learning_rate": 5.804823453022536e-05, "loss": 0.0096, "step": 38090 }, { "grad_norm": 0.13996458053588867, "learning_rate": 5.80278349826324e-05, "loss": 0.0099, "step": 38100 }, { "grad_norm": 0.10578873753547668, "learning_rate": 5.8007434063302014e-05, "loss": 0.0077, "step": 38110 }, { "grad_norm": 0.13402242958545685, "learning_rate": 5.7987031775720136e-05, "loss": 0.0082, "step": 38120 }, { "grad_norm": 0.12051033228635788, "learning_rate": 5.7966628123372976e-05, "loss": 0.0089, "step": 38130 }, { "grad_norm": 0.11469528079032898, "learning_rate": 5.7946223109746956e-05, "loss": 0.0062, "step": 38140 }, { "grad_norm": 0.11457110196352005, "learning_rate": 5.7925816738328754e-05, "loss": 0.0069, "step": 38150 }, { "grad_norm": 0.1289360225200653, "learning_rate": 5.790540901260521e-05, "loss": 0.009, "step": 38160 }, { "grad_norm": 0.12237328290939331, "learning_rate": 5.788499993606351e-05, "loss": 0.0093, "step": 38170 }, { "grad_norm": 0.1119631975889206, "learning_rate": 5.786458951219096e-05, "loss": 0.0055, "step": 38180 }, { "grad_norm": 0.12014102190732956, "learning_rate": 5.784417774447517e-05, "loss": 0.0082, "step": 38190 }, { "grad_norm": 0.11403071880340576, "learning_rate": 5.782376463640393e-05, "loss": 0.0074, "step": 38200 }, { "grad_norm": 0.11432838439941406, "learning_rate": 5.780335019146531e-05, "loss": 0.0084, "step": 38210 }, { "grad_norm": 0.12114550173282623, "learning_rate": 5.778293441314755e-05, "loss": 0.0069, "step": 38220 }, { "grad_norm": 0.1091017872095108, "learning_rate": 5.776251730493917e-05, "loss": 0.009, "step": 38230 }, { "grad_norm": 0.09895719587802887, "learning_rate": 5.774209887032887e-05, "loss": 0.007, "step": 38240 }, { "grad_norm": 0.13567477464675903, "learning_rate": 5.772167911280565e-05, "loss": 0.0093, "step": 38250 }, { "grad_norm": 0.12847213447093964, "learning_rate": 5.770125803585864e-05, "loss": 0.0086, "step": 38260 }, { "grad_norm": 0.13059568405151367, "learning_rate": 5.768083564297726e-05, "loss": 0.0077, "step": 38270 }, { "grad_norm": 0.1509607434272766, "learning_rate": 5.766041193765114e-05, "loss": 0.0077, "step": 38280 }, { "grad_norm": 0.1338409185409546, "learning_rate": 5.763998692337015e-05, "loss": 0.0086, "step": 38290 }, { "grad_norm": 0.12016512453556061, "learning_rate": 5.761956060362433e-05, "loss": 0.0082, "step": 38300 }, { "grad_norm": 0.08679568767547607, "learning_rate": 5.7599132981904e-05, "loss": 0.009, "step": 38310 }, { "grad_norm": 0.1180046796798706, "learning_rate": 5.75787040616997e-05, "loss": 0.0071, "step": 38320 }, { "grad_norm": 0.09697161614894867, "learning_rate": 5.755827384650212e-05, "loss": 0.0096, "step": 38330 }, { "grad_norm": 0.08706361055374146, "learning_rate": 5.753784233980228e-05, "loss": 0.0107, "step": 38340 }, { "grad_norm": 0.14316953718662262, "learning_rate": 5.751740954509135e-05, "loss": 0.0082, "step": 38350 }, { "grad_norm": 0.1099645122885704, "learning_rate": 5.7496975465860715e-05, "loss": 0.0083, "step": 38360 }, { "grad_norm": 0.16079936921596527, "learning_rate": 5.747654010560202e-05, "loss": 0.0088, "step": 38370 }, { "grad_norm": 0.12303335964679718, "learning_rate": 5.7456103467807097e-05, "loss": 0.0077, "step": 38380 }, { "grad_norm": 0.12383593618869781, "learning_rate": 5.7435665555968046e-05, "loss": 0.0073, "step": 38390 }, { "grad_norm": 0.09433922916650772, "learning_rate": 5.74152263735771e-05, "loss": 0.0081, "step": 38400 }, { "grad_norm": 0.11883267760276794, "learning_rate": 5.739478592412677e-05, "loss": 0.0095, "step": 38410 }, { "grad_norm": 0.12529471516609192, "learning_rate": 5.7374344211109766e-05, "loss": 0.0087, "step": 38420 }, { "grad_norm": 0.09972811490297318, "learning_rate": 5.735390123801904e-05, "loss": 0.0085, "step": 38430 }, { "grad_norm": 0.12490773946046829, "learning_rate": 5.7333457008347704e-05, "loss": 0.0072, "step": 38440 }, { "grad_norm": 0.11533164978027344, "learning_rate": 5.7313011525589156e-05, "loss": 0.008, "step": 38450 }, { "grad_norm": 0.125447615981102, "learning_rate": 5.729256479323694e-05, "loss": 0.0099, "step": 38460 }, { "grad_norm": 0.11550713330507278, "learning_rate": 5.727211681478485e-05, "loss": 0.0086, "step": 38470 }, { "grad_norm": 0.1275232583284378, "learning_rate": 5.7251667593726886e-05, "loss": 0.008, "step": 38480 }, { "grad_norm": 0.09861744195222855, "learning_rate": 5.723121713355728e-05, "loss": 0.0086, "step": 38490 }, { "grad_norm": 0.11456207185983658, "learning_rate": 5.721076543777044e-05, "loss": 0.0101, "step": 38500 }, { "grad_norm": 0.11454357951879501, "learning_rate": 5.7190312509860986e-05, "loss": 0.0099, "step": 38510 }, { "grad_norm": 0.09681820869445801, "learning_rate": 5.716985835332379e-05, "loss": 0.0068, "step": 38520 }, { "grad_norm": 0.11580371111631393, "learning_rate": 5.714940297165389e-05, "loss": 0.0081, "step": 38530 }, { "grad_norm": 0.11090362817049026, "learning_rate": 5.712894636834656e-05, "loss": 0.0104, "step": 38540 }, { "grad_norm": 0.12833036482334137, "learning_rate": 5.7108488546897276e-05, "loss": 0.0105, "step": 38550 }, { "grad_norm": 0.11193793267011642, "learning_rate": 5.708802951080172e-05, "loss": 0.0079, "step": 38560 }, { "grad_norm": 0.10902775079011917, "learning_rate": 5.706756926355576e-05, "loss": 0.0075, "step": 38570 }, { "grad_norm": 0.12196052819490433, "learning_rate": 5.704710780865554e-05, "loss": 0.0084, "step": 38580 }, { "grad_norm": 0.12992063164710999, "learning_rate": 5.7026645149597325e-05, "loss": 0.0089, "step": 38590 }, { "grad_norm": 0.12772898375988007, "learning_rate": 5.700618128987764e-05, "loss": 0.0081, "step": 38600 }, { "grad_norm": 0.10598290711641312, "learning_rate": 5.698571623299317e-05, "loss": 0.0085, "step": 38610 }, { "grad_norm": 0.1300002932548523, "learning_rate": 5.696524998244086e-05, "loss": 0.0101, "step": 38620 }, { "grad_norm": 0.07855630666017532, "learning_rate": 5.6944782541717836e-05, "loss": 0.0079, "step": 38630 }, { "grad_norm": 0.14567378163337708, "learning_rate": 5.69243139143214e-05, "loss": 0.0082, "step": 38640 }, { "grad_norm": 0.08919693529605865, "learning_rate": 5.6903844103749125e-05, "loss": 0.0086, "step": 38650 }, { "grad_norm": 0.11079715937376022, "learning_rate": 5.688337311349869e-05, "loss": 0.008, "step": 38660 }, { "grad_norm": 0.1176462173461914, "learning_rate": 5.6862900947068074e-05, "loss": 0.0078, "step": 38670 }, { "grad_norm": 0.09305386990308762, "learning_rate": 5.6842427607955374e-05, "loss": 0.0087, "step": 38680 }, { "grad_norm": 0.16117721796035767, "learning_rate": 5.682195309965893e-05, "loss": 0.0077, "step": 38690 }, { "grad_norm": 0.1050504520535469, "learning_rate": 5.6801477425677294e-05, "loss": 0.0072, "step": 38700 }, { "grad_norm": 0.11408647894859314, "learning_rate": 5.678100058950917e-05, "loss": 0.0083, "step": 38710 }, { "grad_norm": 0.08131804317235947, "learning_rate": 5.676052259465352e-05, "loss": 0.0081, "step": 38720 }, { "grad_norm": 0.13383084535598755, "learning_rate": 5.674004344460945e-05, "loss": 0.0093, "step": 38730 }, { "grad_norm": 0.12384442985057831, "learning_rate": 5.6719563142876295e-05, "loss": 0.0093, "step": 38740 }, { "grad_norm": 0.15958496928215027, "learning_rate": 5.669908169295359e-05, "loss": 0.01, "step": 38750 }, { "grad_norm": 0.1641165018081665, "learning_rate": 5.667859909834105e-05, "loss": 0.0074, "step": 38760 }, { "grad_norm": 0.13984592258930206, "learning_rate": 5.6658115362538565e-05, "loss": 0.0103, "step": 38770 }, { "grad_norm": 0.10002529621124268, "learning_rate": 5.663763048904628e-05, "loss": 0.007, "step": 38780 }, { "grad_norm": 0.1137758195400238, "learning_rate": 5.661714448136447e-05, "loss": 0.0068, "step": 38790 }, { "grad_norm": 0.10118243843317032, "learning_rate": 5.659665734299366e-05, "loss": 0.0083, "step": 38800 }, { "grad_norm": 0.12274403870105743, "learning_rate": 5.6576169077434516e-05, "loss": 0.0082, "step": 38810 }, { "grad_norm": 0.11104324460029602, "learning_rate": 5.6555679688187944e-05, "loss": 0.008, "step": 38820 }, { "grad_norm": 0.1460246592760086, "learning_rate": 5.6535189178755e-05, "loss": 0.0093, "step": 38830 }, { "grad_norm": 0.09889449179172516, "learning_rate": 5.651469755263698e-05, "loss": 0.0083, "step": 38840 }, { "grad_norm": 0.08324373513460159, "learning_rate": 5.6494204813335316e-05, "loss": 0.0079, "step": 38850 }, { "grad_norm": 0.10910128802061081, "learning_rate": 5.647371096435168e-05, "loss": 0.0113, "step": 38860 }, { "grad_norm": 0.19051453471183777, "learning_rate": 5.645321600918788e-05, "loss": 0.0083, "step": 38870 }, { "grad_norm": 0.16085021197795868, "learning_rate": 5.643271995134597e-05, "loss": 0.0095, "step": 38880 }, { "grad_norm": 0.1077832281589508, "learning_rate": 5.641222279432814e-05, "loss": 0.008, "step": 38890 }, { "grad_norm": 0.16730515658855438, "learning_rate": 5.6391724541636834e-05, "loss": 0.01, "step": 38900 }, { "grad_norm": 0.106916144490242, "learning_rate": 5.6371225196774605e-05, "loss": 0.0084, "step": 38910 }, { "grad_norm": 0.1208840012550354, "learning_rate": 5.635072476324423e-05, "loss": 0.0074, "step": 38920 }, { "grad_norm": 0.10301487892866135, "learning_rate": 5.63302232445487e-05, "loss": 0.0062, "step": 38930 }, { "grad_norm": 0.09643519669771194, "learning_rate": 5.6309720644191144e-05, "loss": 0.0074, "step": 38940 }, { "grad_norm": 0.13249236345291138, "learning_rate": 5.628921696567491e-05, "loss": 0.0099, "step": 38950 }, { "grad_norm": 0.11266274750232697, "learning_rate": 5.62687122125035e-05, "loss": 0.0076, "step": 38960 }, { "grad_norm": 0.14811410009860992, "learning_rate": 5.624820638818062e-05, "loss": 0.0073, "step": 38970 }, { "grad_norm": 0.12423981726169586, "learning_rate": 5.6227699496210164e-05, "loss": 0.0077, "step": 38980 }, { "grad_norm": 0.1085599809885025, "learning_rate": 5.6207191540096195e-05, "loss": 0.0082, "step": 38990 }, { "grad_norm": 0.11790905892848969, "learning_rate": 5.618668252334296e-05, "loss": 0.0064, "step": 39000 }, { "grad_norm": 0.1465645283460617, "learning_rate": 5.616617244945488e-05, "loss": 0.009, "step": 39010 }, { "grad_norm": 0.10807660222053528, "learning_rate": 5.614566132193656e-05, "loss": 0.0074, "step": 39020 }, { "grad_norm": 0.14810723066329956, "learning_rate": 5.612514914429282e-05, "loss": 0.0099, "step": 39030 }, { "grad_norm": 0.13895536959171295, "learning_rate": 5.610463592002863e-05, "loss": 0.0093, "step": 39040 }, { "grad_norm": 0.10064518451690674, "learning_rate": 5.608412165264909e-05, "loss": 0.0064, "step": 39050 }, { "grad_norm": 0.0987563356757164, "learning_rate": 5.606360634565959e-05, "loss": 0.0098, "step": 39060 }, { "grad_norm": 0.13642963767051697, "learning_rate": 5.604309000256559e-05, "loss": 0.0086, "step": 39070 }, { "grad_norm": 0.12359462678432465, "learning_rate": 5.6022572626872785e-05, "loss": 0.0061, "step": 39080 }, { "grad_norm": 0.10117096453905106, "learning_rate": 5.600205422208704e-05, "loss": 0.008, "step": 39090 }, { "grad_norm": 0.09440003335475922, "learning_rate": 5.59815347917144e-05, "loss": 0.009, "step": 39100 }, { "grad_norm": 0.0934314876794815, "learning_rate": 5.596101433926103e-05, "loss": 0.0061, "step": 39110 }, { "grad_norm": 0.11805842816829681, "learning_rate": 5.5940492868233364e-05, "loss": 0.0113, "step": 39120 }, { "grad_norm": 0.09478338062763214, "learning_rate": 5.591997038213793e-05, "loss": 0.0095, "step": 39130 }, { "grad_norm": 0.12519751489162445, "learning_rate": 5.5899446884481475e-05, "loss": 0.0081, "step": 39140 }, { "grad_norm": 0.10082903504371643, "learning_rate": 5.5878922378770906e-05, "loss": 0.0072, "step": 39150 }, { "grad_norm": 0.14229387044906616, "learning_rate": 5.5858396868513285e-05, "loss": 0.0084, "step": 39160 }, { "grad_norm": 0.08642034232616425, "learning_rate": 5.583787035721586e-05, "loss": 0.0065, "step": 39170 }, { "grad_norm": 0.10848913341760635, "learning_rate": 5.581734284838606e-05, "loss": 0.0066, "step": 39180 }, { "grad_norm": 0.09358995407819748, "learning_rate": 5.579681434553147e-05, "loss": 0.0084, "step": 39190 }, { "grad_norm": 0.16764678061008453, "learning_rate": 5.5776284852159854e-05, "loss": 0.0079, "step": 39200 }, { "grad_norm": 0.109534852206707, "learning_rate": 5.575575437177913e-05, "loss": 0.0078, "step": 39210 }, { "grad_norm": 0.10462023317813873, "learning_rate": 5.573522290789742e-05, "loss": 0.0086, "step": 39220 }, { "grad_norm": 0.12332797050476074, "learning_rate": 5.571469046402298e-05, "loss": 0.0079, "step": 39230 }, { "grad_norm": 0.1946808099746704, "learning_rate": 5.5694157043664205e-05, "loss": 0.0122, "step": 39240 }, { "grad_norm": 0.08103945851325989, "learning_rate": 5.567362265032975e-05, "loss": 0.0064, "step": 39250 }, { "grad_norm": 0.10954541712999344, "learning_rate": 5.565308728752836e-05, "loss": 0.0064, "step": 39260 }, { "grad_norm": 0.1618465930223465, "learning_rate": 5.5632550958768956e-05, "loss": 0.0072, "step": 39270 }, { "grad_norm": 0.13012363016605377, "learning_rate": 5.5612013667560636e-05, "loss": 0.0097, "step": 39280 }, { "grad_norm": 0.10082210600376129, "learning_rate": 5.5591475417412676e-05, "loss": 0.0102, "step": 39290 }, { "grad_norm": 0.1159055083990097, "learning_rate": 5.557093621183451e-05, "loss": 0.0082, "step": 39300 }, { "grad_norm": 0.1562989354133606, "learning_rate": 5.55503960543357e-05, "loss": 0.0082, "step": 39310 }, { "grad_norm": 0.14342786371707916, "learning_rate": 5.552985494842601e-05, "loss": 0.0079, "step": 39320 }, { "grad_norm": 0.08896257728338242, "learning_rate": 5.550931289761534e-05, "loss": 0.0089, "step": 39330 }, { "grad_norm": 0.1053403690457344, "learning_rate": 5.548876990541378e-05, "loss": 0.0067, "step": 39340 }, { "grad_norm": 0.17049074172973633, "learning_rate": 5.5468225975331564e-05, "loss": 0.0101, "step": 39350 }, { "grad_norm": 0.13192002475261688, "learning_rate": 5.544768111087909e-05, "loss": 0.0088, "step": 39360 }, { "grad_norm": 0.1131104826927185, "learning_rate": 5.54271353155669e-05, "loss": 0.0066, "step": 39370 }, { "grad_norm": 0.10873185843229294, "learning_rate": 5.5406588592905715e-05, "loss": 0.0077, "step": 39380 }, { "grad_norm": 0.10326164215803146, "learning_rate": 5.5386040946406416e-05, "loss": 0.0073, "step": 39390 }, { "grad_norm": 0.10621654242277145, "learning_rate": 5.536549237958004e-05, "loss": 0.007, "step": 39400 }, { "grad_norm": 0.12954817712306976, "learning_rate": 5.5344942895937744e-05, "loss": 0.0075, "step": 39410 }, { "grad_norm": 0.12610168755054474, "learning_rate": 5.5324392498990904e-05, "loss": 0.0079, "step": 39420 }, { "grad_norm": 0.13304120302200317, "learning_rate": 5.5303841192251e-05, "loss": 0.0087, "step": 39430 }, { "grad_norm": 0.09674583375453949, "learning_rate": 5.52832889792297e-05, "loss": 0.0099, "step": 39440 }, { "grad_norm": 0.10256851464509964, "learning_rate": 5.526273586343881e-05, "loss": 0.0083, "step": 39450 }, { "grad_norm": 0.1567581743001938, "learning_rate": 5.5242181848390306e-05, "loss": 0.0069, "step": 39460 }, { "grad_norm": 0.08697322756052017, "learning_rate": 5.5221626937596285e-05, "loss": 0.0069, "step": 39470 }, { "grad_norm": 0.10903681814670563, "learning_rate": 5.520107113456903e-05, "loss": 0.0075, "step": 39480 }, { "grad_norm": 0.10190286487340927, "learning_rate": 5.5180514442820974e-05, "loss": 0.0082, "step": 39490 }, { "grad_norm": 0.10146224498748779, "learning_rate": 5.515995686586469e-05, "loss": 0.0075, "step": 39500 }, { "grad_norm": 0.12528324127197266, "learning_rate": 5.5139398407212916e-05, "loss": 0.0115, "step": 39510 }, { "grad_norm": 0.14245107769966125, "learning_rate": 5.511883907037849e-05, "loss": 0.0091, "step": 39520 }, { "grad_norm": 0.09764297306537628, "learning_rate": 5.509827885887449e-05, "loss": 0.0077, "step": 39530 }, { "grad_norm": 0.09994565695524216, "learning_rate": 5.507771777621406e-05, "loss": 0.0083, "step": 39540 }, { "grad_norm": 0.12986601889133453, "learning_rate": 5.505715582591052e-05, "loss": 0.0073, "step": 39550 }, { "grad_norm": 0.08878956735134125, "learning_rate": 5.50365930114774e-05, "loss": 0.0064, "step": 39560 }, { "grad_norm": 0.1702568084001541, "learning_rate": 5.5016029336428255e-05, "loss": 0.0071, "step": 39570 }, { "grad_norm": 0.11697200685739517, "learning_rate": 5.49954648042769e-05, "loss": 0.0087, "step": 39580 }, { "grad_norm": 0.1134813204407692, "learning_rate": 5.4974899418537226e-05, "loss": 0.0084, "step": 39590 }, { "grad_norm": 0.15964290499687195, "learning_rate": 5.4954333182723316e-05, "loss": 0.007, "step": 39600 }, { "grad_norm": 0.1105453372001648, "learning_rate": 5.493376610034937e-05, "loss": 0.0081, "step": 39610 }, { "grad_norm": 0.11452367156744003, "learning_rate": 5.4913198174929735e-05, "loss": 0.0061, "step": 39620 }, { "grad_norm": 0.08027038723230362, "learning_rate": 5.489262940997889e-05, "loss": 0.0069, "step": 39630 }, { "grad_norm": 0.11183751374483109, "learning_rate": 5.487205980901151e-05, "loss": 0.0069, "step": 39640 }, { "grad_norm": 0.08219151198863983, "learning_rate": 5.485148937554234e-05, "loss": 0.006, "step": 39650 }, { "grad_norm": 0.1127895936369896, "learning_rate": 5.483091811308635e-05, "loss": 0.0078, "step": 39660 }, { "grad_norm": 0.10084232687950134, "learning_rate": 5.4810346025158564e-05, "loss": 0.0077, "step": 39670 }, { "grad_norm": 0.09895816445350647, "learning_rate": 5.478977311527421e-05, "loss": 0.0076, "step": 39680 }, { "grad_norm": 0.1001691147685051, "learning_rate": 5.476919938694863e-05, "loss": 0.0081, "step": 39690 }, { "grad_norm": 0.11854156106710434, "learning_rate": 5.474862484369733e-05, "loss": 0.0077, "step": 39700 }, { "grad_norm": 0.08836834132671356, "learning_rate": 5.472804948903589e-05, "loss": 0.0093, "step": 39710 }, { "grad_norm": 0.11634107679128647, "learning_rate": 5.470747332648013e-05, "loss": 0.0065, "step": 39720 }, { "grad_norm": 0.12066209316253662, "learning_rate": 5.468689635954591e-05, "loss": 0.0086, "step": 39730 }, { "grad_norm": 0.1361406296491623, "learning_rate": 5.46663185917493e-05, "loss": 0.0105, "step": 39740 }, { "grad_norm": 0.0852770283818245, "learning_rate": 5.464574002660645e-05, "loss": 0.0064, "step": 39750 }, { "grad_norm": 0.13603517413139343, "learning_rate": 5.4625160667633724e-05, "loss": 0.0082, "step": 39760 }, { "grad_norm": 0.1339753270149231, "learning_rate": 5.4604580518347505e-05, "loss": 0.0081, "step": 39770 }, { "grad_norm": 0.09723997861146927, "learning_rate": 5.458399958226442e-05, "loss": 0.0071, "step": 39780 }, { "grad_norm": 0.0902947187423706, "learning_rate": 5.456341786290119e-05, "loss": 0.0094, "step": 39790 }, { "grad_norm": 0.12657715380191803, "learning_rate": 5.454283536377465e-05, "loss": 0.0084, "step": 39800 }, { "grad_norm": 0.09819196164608002, "learning_rate": 5.452225208840179e-05, "loss": 0.0085, "step": 39810 }, { "grad_norm": 0.13495495915412903, "learning_rate": 5.450166804029975e-05, "loss": 0.0092, "step": 39820 }, { "grad_norm": 0.11353347450494766, "learning_rate": 5.448108322298574e-05, "loss": 0.0079, "step": 39830 }, { "grad_norm": 0.14045147597789764, "learning_rate": 5.446049763997717e-05, "loss": 0.0078, "step": 39840 }, { "grad_norm": 0.12532342970371246, "learning_rate": 5.4439911294791546e-05, "loss": 0.0085, "step": 39850 }, { "grad_norm": 0.10373662412166595, "learning_rate": 5.441932419094652e-05, "loss": 0.0077, "step": 39860 }, { "grad_norm": 0.11528890579938889, "learning_rate": 5.439873633195985e-05, "loss": 0.0071, "step": 39870 }, { "grad_norm": 0.14318816363811493, "learning_rate": 5.437814772134947e-05, "loss": 0.0079, "step": 39880 }, { "grad_norm": 0.1150628924369812, "learning_rate": 5.4357558362633366e-05, "loss": 0.0098, "step": 39890 }, { "grad_norm": 0.08387002348899841, "learning_rate": 5.4336968259329726e-05, "loss": 0.0088, "step": 39900 }, { "grad_norm": 0.15220874547958374, "learning_rate": 5.431637741495681e-05, "loss": 0.0077, "step": 39910 }, { "grad_norm": 0.10909558087587357, "learning_rate": 5.429578583303307e-05, "loss": 0.0085, "step": 39920 }, { "grad_norm": 0.08734720945358276, "learning_rate": 5.427519351707701e-05, "loss": 0.0092, "step": 39930 }, { "grad_norm": 0.11322776228189468, "learning_rate": 5.42546004706073e-05, "loss": 0.0071, "step": 39940 }, { "grad_norm": 0.09409381449222565, "learning_rate": 5.4234006697142735e-05, "loss": 0.0062, "step": 39950 }, { "grad_norm": 0.10799124091863632, "learning_rate": 5.421341220020224e-05, "loss": 0.0084, "step": 39960 }, { "grad_norm": 0.1353558450937271, "learning_rate": 5.419281698330482e-05, "loss": 0.0062, "step": 39970 }, { "grad_norm": 0.10665678232908249, "learning_rate": 5.4172221049969665e-05, "loss": 0.0081, "step": 39980 }, { "grad_norm": 0.127376526594162, "learning_rate": 5.415162440371604e-05, "loss": 0.0058, "step": 39990 }, { "grad_norm": 0.12170139700174332, "learning_rate": 5.413102704806334e-05, "loss": 0.0066, "step": 40000 } ], "logging_steps": 10, "max_steps": 80000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 48, "trial_name": null, "trial_params": null }