{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8333333333333334, "eval_steps": 500, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.3151160776615143, "learning_rate": 3.0000000000000004e-07, "loss": 1.3404, "step": 10 }, { "grad_norm": 0.3020654320716858, "learning_rate": 6.333333333333333e-07, "loss": 1.342, "step": 20 }, { "grad_norm": 0.3234117329120636, "learning_rate": 9.666666666666668e-07, "loss": 1.3377, "step": 30 }, { "grad_norm": 0.30097776651382446, "learning_rate": 1.3e-06, "loss": 1.3301, "step": 40 }, { "grad_norm": 0.35790786147117615, "learning_rate": 1.6333333333333333e-06, "loss": 1.3219, "step": 50 }, { "grad_norm": 0.4395120143890381, "learning_rate": 1.9666666666666668e-06, "loss": 1.3045, "step": 60 }, { "grad_norm": 0.49268093705177307, "learning_rate": 2.3e-06, "loss": 1.2686, "step": 70 }, { "grad_norm": 0.5123924016952515, "learning_rate": 2.6333333333333337e-06, "loss": 1.2318, "step": 80 }, { "grad_norm": 0.5240263342857361, "learning_rate": 2.966666666666667e-06, "loss": 1.193, "step": 90 }, { "grad_norm": 0.4608796536922455, "learning_rate": 3.3e-06, "loss": 1.1633, "step": 100 }, { "grad_norm": 0.3693615198135376, "learning_rate": 3.633333333333334e-06, "loss": 1.1402, "step": 110 }, { "grad_norm": 0.29207348823547363, "learning_rate": 3.966666666666667e-06, "loss": 1.1277, "step": 120 }, { "grad_norm": 0.433851420879364, "learning_rate": 4.2999999999999995e-06, "loss": 1.1316, "step": 130 }, { "grad_norm": 0.3808739185333252, "learning_rate": 4.633333333333334e-06, "loss": 1.1236, "step": 140 }, { "grad_norm": 1.092936635017395, "learning_rate": 4.966666666666667e-06, "loss": 1.1152, "step": 150 }, { "grad_norm": 1.8908064365386963, "learning_rate": 5.3e-06, "loss": 1.1041, "step": 160 }, { "grad_norm": 0.36102619767189026, "learning_rate": 5.633333333333333e-06, "loss": 1.0922, "step": 170 }, { "grad_norm": 0.6650304198265076, "learning_rate": 5.9666666666666666e-06, "loss": 1.0838, "step": 180 }, { "grad_norm": 0.9714380502700806, "learning_rate": 6.300000000000001e-06, "loss": 1.0699, "step": 190 }, { "grad_norm": 0.5087857246398926, "learning_rate": 6.633333333333333e-06, "loss": 1.0527, "step": 200 }, { "grad_norm": 0.7789682745933533, "learning_rate": 6.966666666666667e-06, "loss": 1.0264, "step": 210 }, { "grad_norm": 0.8788719177246094, "learning_rate": 7.2999999999999996e-06, "loss": 1.0008, "step": 220 }, { "grad_norm": 1.7668907642364502, "learning_rate": 7.633333333333334e-06, "loss": 0.97, "step": 230 }, { "grad_norm": 1.87691330909729, "learning_rate": 7.966666666666666e-06, "loss": 0.9382, "step": 240 }, { "grad_norm": 2.0903725624084473, "learning_rate": 8.3e-06, "loss": 0.913, "step": 250 }, { "grad_norm": 1.5885066986083984, "learning_rate": 8.633333333333334e-06, "loss": 0.8897, "step": 260 }, { "grad_norm": 2.089568853378296, "learning_rate": 8.966666666666668e-06, "loss": 0.865, "step": 270 }, { "grad_norm": 1.8302918672561646, "learning_rate": 9.3e-06, "loss": 0.8245, "step": 280 }, { "grad_norm": 1.7896313667297363, "learning_rate": 9.633333333333335e-06, "loss": 0.8019, "step": 290 }, { "grad_norm": 2.4243576526641846, "learning_rate": 9.966666666666667e-06, "loss": 0.7622, "step": 300 }, { "grad_norm": 2.4779114723205566, "learning_rate": 1.03e-05, "loss": 0.7461, "step": 310 }, { "grad_norm": 2.26315975189209, "learning_rate": 1.0633333333333334e-05, "loss": 0.7075, "step": 320 }, { "grad_norm": 2.9704113006591797, "learning_rate": 1.0966666666666666e-05, "loss": 0.6804, "step": 330 }, { "grad_norm": 2.7189061641693115, "learning_rate": 1.13e-05, "loss": 0.6456, "step": 340 }, { "grad_norm": 3.232788324356079, "learning_rate": 1.1633333333333334e-05, "loss": 0.6187, "step": 350 }, { "grad_norm": 2.760183572769165, "learning_rate": 1.1966666666666668e-05, "loss": 0.598, "step": 360 }, { "grad_norm": 3.3780648708343506, "learning_rate": 1.23e-05, "loss": 0.5944, "step": 370 }, { "grad_norm": 2.2730066776275635, "learning_rate": 1.2633333333333333e-05, "loss": 0.5714, "step": 380 }, { "grad_norm": 2.858290433883667, "learning_rate": 1.2966666666666669e-05, "loss": 0.5479, "step": 390 }, { "grad_norm": 3.4867379665374756, "learning_rate": 1.3300000000000001e-05, "loss": 0.5249, "step": 400 }, { "grad_norm": 3.99298095703125, "learning_rate": 1.3633333333333334e-05, "loss": 0.5133, "step": 410 }, { "grad_norm": 3.0357937812805176, "learning_rate": 1.3966666666666666e-05, "loss": 0.4933, "step": 420 }, { "grad_norm": 3.299088716506958, "learning_rate": 1.43e-05, "loss": 0.479, "step": 430 }, { "grad_norm": 4.71593713760376, "learning_rate": 1.4633333333333334e-05, "loss": 0.4583, "step": 440 }, { "grad_norm": 4.464345932006836, "learning_rate": 1.4966666666666668e-05, "loss": 0.4452, "step": 450 }, { "grad_norm": 4.618643283843994, "learning_rate": 1.53e-05, "loss": 0.4365, "step": 460 }, { "grad_norm": 4.148494243621826, "learning_rate": 1.563333333333333e-05, "loss": 0.4231, "step": 470 }, { "grad_norm": 4.1066460609436035, "learning_rate": 1.5966666666666667e-05, "loss": 0.4048, "step": 480 }, { "grad_norm": 5.027594089508057, "learning_rate": 1.63e-05, "loss": 0.3889, "step": 490 }, { "grad_norm": 4.382143020629883, "learning_rate": 1.6633333333333336e-05, "loss": 0.3726, "step": 500 }, { "grad_norm": 3.284897804260254, "learning_rate": 1.6966666666666668e-05, "loss": 0.3719, "step": 510 }, { "grad_norm": 3.9474563598632812, "learning_rate": 1.73e-05, "loss": 0.3524, "step": 520 }, { "grad_norm": 4.27362060546875, "learning_rate": 1.7633333333333336e-05, "loss": 0.3536, "step": 530 }, { "grad_norm": 4.156512260437012, "learning_rate": 1.796666666666667e-05, "loss": 0.3341, "step": 540 }, { "grad_norm": 2.9755077362060547, "learning_rate": 1.83e-05, "loss": 0.3261, "step": 550 }, { "grad_norm": 5.38554573059082, "learning_rate": 1.8633333333333333e-05, "loss": 0.3169, "step": 560 }, { "grad_norm": 4.22395658493042, "learning_rate": 1.896666666666667e-05, "loss": 0.3218, "step": 570 }, { "grad_norm": 5.229879379272461, "learning_rate": 1.93e-05, "loss": 0.3157, "step": 580 }, { "grad_norm": 4.932677745819092, "learning_rate": 1.9633333333333334e-05, "loss": 0.3274, "step": 590 }, { "grad_norm": 4.6989970207214355, "learning_rate": 1.9966666666666666e-05, "loss": 0.3089, "step": 600 }, { "grad_norm": 4.014418125152588, "learning_rate": 2.0300000000000002e-05, "loss": 0.3252, "step": 610 }, { "grad_norm": 5.63760232925415, "learning_rate": 2.0633333333333335e-05, "loss": 0.289, "step": 620 }, { "grad_norm": 3.517097234725952, "learning_rate": 2.0966666666666667e-05, "loss": 0.2773, "step": 630 }, { "grad_norm": 3.986159324645996, "learning_rate": 2.13e-05, "loss": 0.2817, "step": 640 }, { "grad_norm": 5.639315128326416, "learning_rate": 2.1633333333333332e-05, "loss": 0.2936, "step": 650 }, { "grad_norm": 3.5614218711853027, "learning_rate": 2.1966666666666668e-05, "loss": 0.281, "step": 660 }, { "grad_norm": 4.742738723754883, "learning_rate": 2.23e-05, "loss": 0.3028, "step": 670 }, { "grad_norm": 3.518442392349243, "learning_rate": 2.2633333333333336e-05, "loss": 0.2848, "step": 680 }, { "grad_norm": 5.053081035614014, "learning_rate": 2.2966666666666668e-05, "loss": 0.2697, "step": 690 }, { "grad_norm": 3.5047106742858887, "learning_rate": 2.3300000000000004e-05, "loss": 0.2645, "step": 700 }, { "grad_norm": 3.9816548824310303, "learning_rate": 2.3633333333333336e-05, "loss": 0.2807, "step": 710 }, { "grad_norm": 3.4382357597351074, "learning_rate": 2.396666666666667e-05, "loss": 0.2638, "step": 720 }, { "grad_norm": 3.374788999557495, "learning_rate": 2.43e-05, "loss": 0.2565, "step": 730 }, { "grad_norm": 4.029055595397949, "learning_rate": 2.4633333333333334e-05, "loss": 0.2342, "step": 740 }, { "grad_norm": 3.1926724910736084, "learning_rate": 2.496666666666667e-05, "loss": 0.2645, "step": 750 }, { "grad_norm": 3.3582568168640137, "learning_rate": 2.5300000000000002e-05, "loss": 0.2511, "step": 760 }, { "grad_norm": 3.9178779125213623, "learning_rate": 2.5633333333333338e-05, "loss": 0.2594, "step": 770 }, { "grad_norm": 3.3477022647857666, "learning_rate": 2.5966666666666667e-05, "loss": 0.2523, "step": 780 }, { "grad_norm": 3.4148569107055664, "learning_rate": 2.6300000000000002e-05, "loss": 0.251, "step": 790 }, { "grad_norm": 2.7684218883514404, "learning_rate": 2.663333333333333e-05, "loss": 0.2529, "step": 800 }, { "grad_norm": 2.6964213848114014, "learning_rate": 2.6966666666666667e-05, "loss": 0.2595, "step": 810 }, { "grad_norm": 3.314094066619873, "learning_rate": 2.7300000000000003e-05, "loss": 0.2514, "step": 820 }, { "grad_norm": 3.2154042720794678, "learning_rate": 2.7633333333333332e-05, "loss": 0.244, "step": 830 }, { "grad_norm": 3.1019206047058105, "learning_rate": 2.7966666666666668e-05, "loss": 0.2359, "step": 840 }, { "grad_norm": 3.6844875812530518, "learning_rate": 2.83e-05, "loss": 0.2535, "step": 850 }, { "grad_norm": 3.789260149002075, "learning_rate": 2.8633333333333336e-05, "loss": 0.2412, "step": 860 }, { "grad_norm": 3.2102692127227783, "learning_rate": 2.8966666666666668e-05, "loss": 0.2354, "step": 870 }, { "grad_norm": 4.382669448852539, "learning_rate": 2.93e-05, "loss": 0.2329, "step": 880 }, { "grad_norm": 3.451387882232666, "learning_rate": 2.9633333333333336e-05, "loss": 0.2271, "step": 890 }, { "grad_norm": 3.3067309856414795, "learning_rate": 2.9966666666666672e-05, "loss": 0.2497, "step": 900 }, { "grad_norm": 2.8440616130828857, "learning_rate": 3.03e-05, "loss": 0.2199, "step": 910 }, { "grad_norm": 2.817674160003662, "learning_rate": 3.063333333333334e-05, "loss": 0.2376, "step": 920 }, { "grad_norm": 3.366726875305176, "learning_rate": 3.096666666666666e-05, "loss": 0.2244, "step": 930 }, { "grad_norm": 3.036982774734497, "learning_rate": 3.13e-05, "loss": 0.2263, "step": 940 }, { "grad_norm": 3.142780065536499, "learning_rate": 3.1633333333333334e-05, "loss": 0.2474, "step": 950 }, { "grad_norm": 2.676440715789795, "learning_rate": 3.196666666666667e-05, "loss": 0.2233, "step": 960 }, { "grad_norm": 2.636388063430786, "learning_rate": 3.2300000000000006e-05, "loss": 0.2204, "step": 970 }, { "grad_norm": 3.7486138343811035, "learning_rate": 3.263333333333333e-05, "loss": 0.239, "step": 980 }, { "grad_norm": 2.8146631717681885, "learning_rate": 3.296666666666667e-05, "loss": 0.2319, "step": 990 }, { "grad_norm": 2.960897445678711, "learning_rate": 3.33e-05, "loss": 0.2285, "step": 1000 }, { "grad_norm": 3.27303409576416, "learning_rate": 3.3633333333333335e-05, "loss": 0.2106, "step": 1010 }, { "grad_norm": 2.7102749347686768, "learning_rate": 3.396666666666667e-05, "loss": 0.2263, "step": 1020 }, { "grad_norm": 2.666199207305908, "learning_rate": 3.430000000000001e-05, "loss": 0.2203, "step": 1030 }, { "grad_norm": 2.260812759399414, "learning_rate": 3.463333333333333e-05, "loss": 0.231, "step": 1040 }, { "grad_norm": 2.5958454608917236, "learning_rate": 3.496666666666667e-05, "loss": 0.2114, "step": 1050 }, { "grad_norm": 3.396493673324585, "learning_rate": 3.53e-05, "loss": 0.2432, "step": 1060 }, { "grad_norm": 2.5110738277435303, "learning_rate": 3.563333333333334e-05, "loss": 0.2428, "step": 1070 }, { "grad_norm": 2.500586748123169, "learning_rate": 3.596666666666667e-05, "loss": 0.2311, "step": 1080 }, { "grad_norm": 2.6546878814697266, "learning_rate": 3.63e-05, "loss": 0.2051, "step": 1090 }, { "grad_norm": 2.615257501602173, "learning_rate": 3.6633333333333334e-05, "loss": 0.2168, "step": 1100 }, { "grad_norm": 2.9406702518463135, "learning_rate": 3.6966666666666666e-05, "loss": 0.2266, "step": 1110 }, { "grad_norm": 2.8010807037353516, "learning_rate": 3.73e-05, "loss": 0.2181, "step": 1120 }, { "grad_norm": 2.3122730255126953, "learning_rate": 3.763333333333334e-05, "loss": 0.2294, "step": 1130 }, { "grad_norm": 2.6660008430480957, "learning_rate": 3.796666666666667e-05, "loss": 0.2182, "step": 1140 }, { "grad_norm": 2.3272788524627686, "learning_rate": 3.83e-05, "loss": 0.2167, "step": 1150 }, { "grad_norm": 3.0624020099639893, "learning_rate": 3.8633333333333335e-05, "loss": 0.2123, "step": 1160 }, { "grad_norm": 2.790330648422241, "learning_rate": 3.896666666666667e-05, "loss": 0.2135, "step": 1170 }, { "grad_norm": 2.6018900871276855, "learning_rate": 3.9300000000000007e-05, "loss": 0.2043, "step": 1180 }, { "grad_norm": 2.6131794452667236, "learning_rate": 3.963333333333333e-05, "loss": 0.208, "step": 1190 }, { "grad_norm": 2.269437551498413, "learning_rate": 3.996666666666667e-05, "loss": 0.2194, "step": 1200 }, { "grad_norm": 3.1890411376953125, "learning_rate": 4.0300000000000004e-05, "loss": 0.2118, "step": 1210 }, { "grad_norm": 2.834775447845459, "learning_rate": 4.0633333333333336e-05, "loss": 0.2034, "step": 1220 }, { "grad_norm": 2.375288724899292, "learning_rate": 4.096666666666667e-05, "loss": 0.208, "step": 1230 }, { "grad_norm": 2.4344964027404785, "learning_rate": 4.13e-05, "loss": 0.1974, "step": 1240 }, { "grad_norm": 2.663491725921631, "learning_rate": 4.1633333333333333e-05, "loss": 0.2126, "step": 1250 }, { "grad_norm": 2.5932223796844482, "learning_rate": 4.196666666666667e-05, "loss": 0.2084, "step": 1260 }, { "grad_norm": 2.1940386295318604, "learning_rate": 4.23e-05, "loss": 0.2162, "step": 1270 }, { "grad_norm": 2.7900402545928955, "learning_rate": 4.263333333333334e-05, "loss": 0.2131, "step": 1280 }, { "grad_norm": 2.897533893585205, "learning_rate": 4.296666666666666e-05, "loss": 0.2166, "step": 1290 }, { "grad_norm": 2.2424840927124023, "learning_rate": 4.33e-05, "loss": 0.2032, "step": 1300 }, { "grad_norm": 2.116037130355835, "learning_rate": 4.3633333333333335e-05, "loss": 0.2115, "step": 1310 }, { "grad_norm": 2.2839269638061523, "learning_rate": 4.396666666666667e-05, "loss": 0.2138, "step": 1320 }, { "grad_norm": 2.4370317459106445, "learning_rate": 4.43e-05, "loss": 0.1947, "step": 1330 }, { "grad_norm": 1.981080412864685, "learning_rate": 4.463333333333334e-05, "loss": 0.2023, "step": 1340 }, { "grad_norm": 1.715331792831421, "learning_rate": 4.496666666666667e-05, "loss": 0.1908, "step": 1350 }, { "grad_norm": 1.8498488664627075, "learning_rate": 4.53e-05, "loss": 0.2023, "step": 1360 }, { "grad_norm": 2.0724167823791504, "learning_rate": 4.5633333333333336e-05, "loss": 0.1865, "step": 1370 }, { "grad_norm": 2.0844929218292236, "learning_rate": 4.596666666666667e-05, "loss": 0.2106, "step": 1380 }, { "grad_norm": 2.3113696575164795, "learning_rate": 4.630000000000001e-05, "loss": 0.1926, "step": 1390 }, { "grad_norm": 2.6103477478027344, "learning_rate": 4.663333333333333e-05, "loss": 0.2069, "step": 1400 }, { "grad_norm": 2.0490918159484863, "learning_rate": 4.696666666666667e-05, "loss": 0.1947, "step": 1410 }, { "grad_norm": 2.0844857692718506, "learning_rate": 4.73e-05, "loss": 0.2047, "step": 1420 }, { "grad_norm": 2.8071389198303223, "learning_rate": 4.763333333333334e-05, "loss": 0.1951, "step": 1430 }, { "grad_norm": 2.2475125789642334, "learning_rate": 4.796666666666667e-05, "loss": 0.1998, "step": 1440 }, { "grad_norm": 2.2238211631774902, "learning_rate": 4.83e-05, "loss": 0.2014, "step": 1450 }, { "grad_norm": 1.8085145950317383, "learning_rate": 4.8633333333333334e-05, "loss": 0.1979, "step": 1460 }, { "grad_norm": 1.818709135055542, "learning_rate": 4.8966666666666667e-05, "loss": 0.2023, "step": 1470 }, { "grad_norm": 2.0284204483032227, "learning_rate": 4.93e-05, "loss": 0.2024, "step": 1480 }, { "grad_norm": 2.4772613048553467, "learning_rate": 4.963333333333334e-05, "loss": 0.1908, "step": 1490 }, { "grad_norm": 2.158778190612793, "learning_rate": 4.996666666666667e-05, "loss": 0.2125, "step": 1500 }, { "grad_norm": 2.5836544036865234, "learning_rate": 5.03e-05, "loss": 0.1964, "step": 1510 }, { "grad_norm": 2.0680391788482666, "learning_rate": 5.0633333333333335e-05, "loss": 0.1985, "step": 1520 }, { "grad_norm": 1.9237951040267944, "learning_rate": 5.0966666666666674e-05, "loss": 0.1966, "step": 1530 }, { "grad_norm": 1.8502761125564575, "learning_rate": 5.130000000000001e-05, "loss": 0.1997, "step": 1540 }, { "grad_norm": 1.9518537521362305, "learning_rate": 5.163333333333333e-05, "loss": 0.1906, "step": 1550 }, { "grad_norm": 1.9879329204559326, "learning_rate": 5.196666666666667e-05, "loss": 0.1903, "step": 1560 }, { "grad_norm": 2.0137035846710205, "learning_rate": 5.2300000000000004e-05, "loss": 0.1958, "step": 1570 }, { "grad_norm": 1.6730906963348389, "learning_rate": 5.2633333333333336e-05, "loss": 0.1864, "step": 1580 }, { "grad_norm": 1.8633357286453247, "learning_rate": 5.296666666666666e-05, "loss": 0.1936, "step": 1590 }, { "grad_norm": 1.6458967924118042, "learning_rate": 5.330000000000001e-05, "loss": 0.1919, "step": 1600 }, { "grad_norm": 1.8325773477554321, "learning_rate": 5.3633333333333334e-05, "loss": 0.1833, "step": 1610 }, { "grad_norm": 1.6758291721343994, "learning_rate": 5.3966666666666666e-05, "loss": 0.1814, "step": 1620 }, { "grad_norm": 1.7761826515197754, "learning_rate": 5.4300000000000005e-05, "loss": 0.1883, "step": 1630 }, { "grad_norm": 1.943997859954834, "learning_rate": 5.463333333333334e-05, "loss": 0.1967, "step": 1640 }, { "grad_norm": 1.9337615966796875, "learning_rate": 5.496666666666666e-05, "loss": 0.2039, "step": 1650 }, { "grad_norm": 1.8413279056549072, "learning_rate": 5.530000000000001e-05, "loss": 0.1714, "step": 1660 }, { "grad_norm": 1.6442506313323975, "learning_rate": 5.5633333333333335e-05, "loss": 0.1874, "step": 1670 }, { "grad_norm": 1.6270753145217896, "learning_rate": 5.596666666666667e-05, "loss": 0.1936, "step": 1680 }, { "grad_norm": 1.7841635942459106, "learning_rate": 5.63e-05, "loss": 0.1972, "step": 1690 }, { "grad_norm": 1.8798664808273315, "learning_rate": 5.663333333333334e-05, "loss": 0.1901, "step": 1700 }, { "grad_norm": 1.711655855178833, "learning_rate": 5.696666666666667e-05, "loss": 0.1948, "step": 1710 }, { "grad_norm": 1.8562830686569214, "learning_rate": 5.73e-05, "loss": 0.1933, "step": 1720 }, { "grad_norm": 1.716495394706726, "learning_rate": 5.7633333333333336e-05, "loss": 0.1944, "step": 1730 }, { "grad_norm": 1.5201752185821533, "learning_rate": 5.796666666666667e-05, "loss": 0.1876, "step": 1740 }, { "grad_norm": 1.5733308792114258, "learning_rate": 5.83e-05, "loss": 0.1923, "step": 1750 }, { "grad_norm": 1.8823238611221313, "learning_rate": 5.863333333333334e-05, "loss": 0.1988, "step": 1760 }, { "grad_norm": 1.4311455488204956, "learning_rate": 5.896666666666667e-05, "loss": 0.1941, "step": 1770 }, { "grad_norm": 1.61282217502594, "learning_rate": 5.93e-05, "loss": 0.1884, "step": 1780 }, { "grad_norm": 1.8670682907104492, "learning_rate": 5.9633333333333344e-05, "loss": 0.1966, "step": 1790 }, { "grad_norm": 1.4939537048339844, "learning_rate": 5.996666666666667e-05, "loss": 0.1913, "step": 1800 }, { "grad_norm": 1.583107829093933, "learning_rate": 6.03e-05, "loss": 0.1942, "step": 1810 }, { "grad_norm": 1.6234831809997559, "learning_rate": 6.063333333333333e-05, "loss": 0.1797, "step": 1820 }, { "grad_norm": 1.4413312673568726, "learning_rate": 6.0966666666666674e-05, "loss": 0.1874, "step": 1830 }, { "grad_norm": 1.6700791120529175, "learning_rate": 6.13e-05, "loss": 0.1907, "step": 1840 }, { "grad_norm": 1.5959205627441406, "learning_rate": 6.163333333333333e-05, "loss": 0.1843, "step": 1850 }, { "grad_norm": 1.699729084968567, "learning_rate": 6.196666666666668e-05, "loss": 0.2022, "step": 1860 }, { "grad_norm": 1.4388071298599243, "learning_rate": 6.23e-05, "loss": 0.1862, "step": 1870 }, { "grad_norm": 1.8072881698608398, "learning_rate": 6.263333333333333e-05, "loss": 0.1783, "step": 1880 }, { "grad_norm": 1.3298296928405762, "learning_rate": 6.296666666666667e-05, "loss": 0.1695, "step": 1890 }, { "grad_norm": 1.474398136138916, "learning_rate": 6.330000000000001e-05, "loss": 0.1917, "step": 1900 }, { "grad_norm": 1.611061453819275, "learning_rate": 6.363333333333334e-05, "loss": 0.1922, "step": 1910 }, { "grad_norm": 1.3351390361785889, "learning_rate": 6.396666666666667e-05, "loss": 0.192, "step": 1920 }, { "grad_norm": 1.708946943283081, "learning_rate": 6.43e-05, "loss": 0.1868, "step": 1930 }, { "grad_norm": 1.6847813129425049, "learning_rate": 6.463333333333334e-05, "loss": 0.1886, "step": 1940 }, { "grad_norm": 1.5406711101531982, "learning_rate": 6.496666666666667e-05, "loss": 0.1844, "step": 1950 }, { "grad_norm": 1.631949782371521, "learning_rate": 6.53e-05, "loss": 0.1656, "step": 1960 }, { "grad_norm": 1.4850841760635376, "learning_rate": 6.563333333333333e-05, "loss": 0.1901, "step": 1970 }, { "grad_norm": 1.3837077617645264, "learning_rate": 6.596666666666667e-05, "loss": 0.1886, "step": 1980 }, { "grad_norm": 1.477758765220642, "learning_rate": 6.630000000000001e-05, "loss": 0.1858, "step": 1990 }, { "grad_norm": 1.4698750972747803, "learning_rate": 6.663333333333333e-05, "loss": 0.179, "step": 2000 }, { "grad_norm": 1.3225125074386597, "learning_rate": 6.696666666666666e-05, "loss": 0.1787, "step": 2010 }, { "grad_norm": 1.2733415365219116, "learning_rate": 6.730000000000001e-05, "loss": 0.1851, "step": 2020 }, { "grad_norm": 1.643424391746521, "learning_rate": 6.763333333333334e-05, "loss": 0.1885, "step": 2030 }, { "grad_norm": 1.1354880332946777, "learning_rate": 6.796666666666666e-05, "loss": 0.1931, "step": 2040 }, { "grad_norm": 1.4755196571350098, "learning_rate": 6.83e-05, "loss": 0.1814, "step": 2050 }, { "grad_norm": 1.7366747856140137, "learning_rate": 6.863333333333334e-05, "loss": 0.1708, "step": 2060 }, { "grad_norm": 1.2957370281219482, "learning_rate": 6.896666666666667e-05, "loss": 0.1804, "step": 2070 }, { "grad_norm": 1.409996747970581, "learning_rate": 6.93e-05, "loss": 0.175, "step": 2080 }, { "grad_norm": 1.3313970565795898, "learning_rate": 6.963333333333334e-05, "loss": 0.1702, "step": 2090 }, { "grad_norm": 1.4074225425720215, "learning_rate": 6.996666666666667e-05, "loss": 0.1683, "step": 2100 }, { "grad_norm": 1.4532965421676636, "learning_rate": 7.03e-05, "loss": 0.1749, "step": 2110 }, { "grad_norm": 1.3772770166397095, "learning_rate": 7.063333333333333e-05, "loss": 0.1699, "step": 2120 }, { "grad_norm": 1.2422809600830078, "learning_rate": 7.096666666666667e-05, "loss": 0.1815, "step": 2130 }, { "grad_norm": 1.3627618551254272, "learning_rate": 7.13e-05, "loss": 0.1912, "step": 2140 }, { "grad_norm": 1.271766185760498, "learning_rate": 7.163333333333334e-05, "loss": 0.1896, "step": 2150 }, { "grad_norm": 1.301351547241211, "learning_rate": 7.196666666666668e-05, "loss": 0.1803, "step": 2160 }, { "grad_norm": 1.230063557624817, "learning_rate": 7.23e-05, "loss": 0.1678, "step": 2170 }, { "grad_norm": 1.4455220699310303, "learning_rate": 7.263333333333334e-05, "loss": 0.1825, "step": 2180 }, { "grad_norm": 1.2409735918045044, "learning_rate": 7.296666666666667e-05, "loss": 0.1681, "step": 2190 }, { "grad_norm": 1.06504225730896, "learning_rate": 7.33e-05, "loss": 0.1709, "step": 2200 }, { "grad_norm": 1.3993388414382935, "learning_rate": 7.363333333333334e-05, "loss": 0.1708, "step": 2210 }, { "grad_norm": 1.177734375, "learning_rate": 7.396666666666667e-05, "loss": 0.1664, "step": 2220 }, { "grad_norm": 1.2166202068328857, "learning_rate": 7.43e-05, "loss": 0.1734, "step": 2230 }, { "grad_norm": 1.1245884895324707, "learning_rate": 7.463333333333334e-05, "loss": 0.1666, "step": 2240 }, { "grad_norm": 1.2664999961853027, "learning_rate": 7.496666666666667e-05, "loss": 0.1833, "step": 2250 }, { "grad_norm": 1.4394556283950806, "learning_rate": 7.53e-05, "loss": 0.1773, "step": 2260 }, { "grad_norm": 1.462640404701233, "learning_rate": 7.563333333333333e-05, "loss": 0.17, "step": 2270 }, { "grad_norm": 1.235329270362854, "learning_rate": 7.596666666666668e-05, "loss": 0.1847, "step": 2280 }, { "grad_norm": 1.037428379058838, "learning_rate": 7.630000000000001e-05, "loss": 0.1752, "step": 2290 }, { "grad_norm": 1.0617107152938843, "learning_rate": 7.663333333333333e-05, "loss": 0.1644, "step": 2300 }, { "grad_norm": 1.3012402057647705, "learning_rate": 7.696666666666668e-05, "loss": 0.1792, "step": 2310 }, { "grad_norm": 1.3474661111831665, "learning_rate": 7.730000000000001e-05, "loss": 0.1631, "step": 2320 }, { "grad_norm": 1.287260890007019, "learning_rate": 7.763333333333334e-05, "loss": 0.1815, "step": 2330 }, { "grad_norm": 1.1314858198165894, "learning_rate": 7.796666666666666e-05, "loss": 0.1718, "step": 2340 }, { "grad_norm": 1.0691331624984741, "learning_rate": 7.83e-05, "loss": 0.1779, "step": 2350 }, { "grad_norm": 1.361555814743042, "learning_rate": 7.863333333333334e-05, "loss": 0.1695, "step": 2360 }, { "grad_norm": 1.2168221473693848, "learning_rate": 7.896666666666667e-05, "loss": 0.168, "step": 2370 }, { "grad_norm": 1.1273889541625977, "learning_rate": 7.93e-05, "loss": 0.1617, "step": 2380 }, { "grad_norm": 1.2248425483703613, "learning_rate": 7.963333333333334e-05, "loss": 0.1799, "step": 2390 }, { "grad_norm": 1.1511616706848145, "learning_rate": 7.996666666666667e-05, "loss": 0.1762, "step": 2400 }, { "grad_norm": 1.187149167060852, "learning_rate": 8.030000000000001e-05, "loss": 0.164, "step": 2410 }, { "grad_norm": 1.2486686706542969, "learning_rate": 8.063333333333333e-05, "loss": 0.1641, "step": 2420 }, { "grad_norm": 1.238765001296997, "learning_rate": 8.096666666666667e-05, "loss": 0.1691, "step": 2430 }, { "grad_norm": 1.2171342372894287, "learning_rate": 8.13e-05, "loss": 0.1709, "step": 2440 }, { "grad_norm": 1.0714538097381592, "learning_rate": 8.163333333333334e-05, "loss": 0.1708, "step": 2450 }, { "grad_norm": 1.2182875871658325, "learning_rate": 8.196666666666668e-05, "loss": 0.1776, "step": 2460 }, { "grad_norm": 0.9714898467063904, "learning_rate": 8.23e-05, "loss": 0.1649, "step": 2470 }, { "grad_norm": 1.054153323173523, "learning_rate": 8.263333333333334e-05, "loss": 0.1637, "step": 2480 }, { "grad_norm": 1.0961401462554932, "learning_rate": 8.296666666666667e-05, "loss": 0.1628, "step": 2490 }, { "grad_norm": 1.0621126890182495, "learning_rate": 8.33e-05, "loss": 0.1822, "step": 2500 }, { "grad_norm": 1.0368319749832153, "learning_rate": 8.363333333333334e-05, "loss": 0.1743, "step": 2510 }, { "grad_norm": 1.1104371547698975, "learning_rate": 8.396666666666667e-05, "loss": 0.1643, "step": 2520 }, { "grad_norm": 1.0499911308288574, "learning_rate": 8.43e-05, "loss": 0.1609, "step": 2530 }, { "grad_norm": 1.1318551301956177, "learning_rate": 8.463333333333335e-05, "loss": 0.1724, "step": 2540 }, { "grad_norm": 1.012404203414917, "learning_rate": 8.496666666666667e-05, "loss": 0.1781, "step": 2550 }, { "grad_norm": 1.0706509351730347, "learning_rate": 8.53e-05, "loss": 0.1665, "step": 2560 }, { "grad_norm": 1.0580800771713257, "learning_rate": 8.563333333333333e-05, "loss": 0.1674, "step": 2570 }, { "grad_norm": 0.8989593982696533, "learning_rate": 8.596666666666668e-05, "loss": 0.1664, "step": 2580 }, { "grad_norm": 1.2026270627975464, "learning_rate": 8.63e-05, "loss": 0.1653, "step": 2590 }, { "grad_norm": 1.068744421005249, "learning_rate": 8.663333333333333e-05, "loss": 0.1652, "step": 2600 }, { "grad_norm": 1.0193761587142944, "learning_rate": 8.696666666666668e-05, "loss": 0.1591, "step": 2610 }, { "grad_norm": 1.1416807174682617, "learning_rate": 8.730000000000001e-05, "loss": 0.1672, "step": 2620 }, { "grad_norm": 1.0519858598709106, "learning_rate": 8.763333333333334e-05, "loss": 0.1543, "step": 2630 }, { "grad_norm": 0.9407383799552917, "learning_rate": 8.796666666666667e-05, "loss": 0.1566, "step": 2640 }, { "grad_norm": 0.8274550437927246, "learning_rate": 8.83e-05, "loss": 0.1697, "step": 2650 }, { "grad_norm": 0.9271885752677917, "learning_rate": 8.863333333333334e-05, "loss": 0.1718, "step": 2660 }, { "grad_norm": 1.0419094562530518, "learning_rate": 8.896666666666667e-05, "loss": 0.1657, "step": 2670 }, { "grad_norm": 0.8690840005874634, "learning_rate": 8.93e-05, "loss": 0.1663, "step": 2680 }, { "grad_norm": 0.9361692070960999, "learning_rate": 8.963333333333333e-05, "loss": 0.164, "step": 2690 }, { "grad_norm": 0.9394407868385315, "learning_rate": 8.996666666666667e-05, "loss": 0.1715, "step": 2700 }, { "grad_norm": 0.8534783124923706, "learning_rate": 9.030000000000001e-05, "loss": 0.1607, "step": 2710 }, { "grad_norm": 0.8205949664115906, "learning_rate": 9.063333333333333e-05, "loss": 0.1627, "step": 2720 }, { "grad_norm": 0.7320767045021057, "learning_rate": 9.096666666666666e-05, "loss": 0.1668, "step": 2730 }, { "grad_norm": 1.032417893409729, "learning_rate": 9.130000000000001e-05, "loss": 0.162, "step": 2740 }, { "grad_norm": 0.9331495761871338, "learning_rate": 9.163333333333334e-05, "loss": 0.1527, "step": 2750 }, { "grad_norm": 0.7410256266593933, "learning_rate": 9.196666666666666e-05, "loss": 0.1559, "step": 2760 }, { "grad_norm": 0.9585258364677429, "learning_rate": 9.230000000000001e-05, "loss": 0.151, "step": 2770 }, { "grad_norm": 0.8902868032455444, "learning_rate": 9.263333333333334e-05, "loss": 0.1571, "step": 2780 }, { "grad_norm": 0.9304125308990479, "learning_rate": 9.296666666666667e-05, "loss": 0.1501, "step": 2790 }, { "grad_norm": 0.8968132138252258, "learning_rate": 9.33e-05, "loss": 0.1615, "step": 2800 }, { "grad_norm": 1.0155423879623413, "learning_rate": 9.363333333333334e-05, "loss": 0.1553, "step": 2810 }, { "grad_norm": 0.8190322518348694, "learning_rate": 9.396666666666667e-05, "loss": 0.1514, "step": 2820 }, { "grad_norm": 0.9747948050498962, "learning_rate": 9.43e-05, "loss": 0.1612, "step": 2830 }, { "grad_norm": 0.9692564606666565, "learning_rate": 9.463333333333333e-05, "loss": 0.1541, "step": 2840 }, { "grad_norm": 0.9349033832550049, "learning_rate": 9.496666666666667e-05, "loss": 0.1558, "step": 2850 }, { "grad_norm": 0.6970181465148926, "learning_rate": 9.53e-05, "loss": 0.1479, "step": 2860 }, { "grad_norm": 0.6530835628509521, "learning_rate": 9.563333333333334e-05, "loss": 0.1498, "step": 2870 }, { "grad_norm": 0.6375618577003479, "learning_rate": 9.596666666666668e-05, "loss": 0.149, "step": 2880 }, { "grad_norm": 0.87689608335495, "learning_rate": 9.63e-05, "loss": 0.1517, "step": 2890 }, { "grad_norm": 0.8134520649909973, "learning_rate": 9.663333333333334e-05, "loss": 0.1539, "step": 2900 }, { "grad_norm": 0.7463963627815247, "learning_rate": 9.696666666666667e-05, "loss": 0.1663, "step": 2910 }, { "grad_norm": 0.7102210521697998, "learning_rate": 9.730000000000001e-05, "loss": 0.1536, "step": 2920 }, { "grad_norm": 0.7452377676963806, "learning_rate": 9.763333333333334e-05, "loss": 0.1531, "step": 2930 }, { "grad_norm": 0.8446027040481567, "learning_rate": 9.796666666666667e-05, "loss": 0.1495, "step": 2940 }, { "grad_norm": 0.9499046802520752, "learning_rate": 9.83e-05, "loss": 0.1513, "step": 2950 }, { "grad_norm": 0.8787628412246704, "learning_rate": 9.863333333333334e-05, "loss": 0.1632, "step": 2960 }, { "grad_norm": 0.7594059705734253, "learning_rate": 9.896666666666667e-05, "loss": 0.1568, "step": 2970 }, { "grad_norm": 0.9857231974601746, "learning_rate": 9.93e-05, "loss": 0.1541, "step": 2980 }, { "grad_norm": 1.0072860717773438, "learning_rate": 9.963333333333333e-05, "loss": 0.1507, "step": 2990 }, { "grad_norm": 0.9524849057197571, "learning_rate": 9.996666666666668e-05, "loss": 0.1618, "step": 3000 }, { "grad_norm": 0.7813325524330139, "learning_rate": 9.999999384858465e-05, "loss": 0.1517, "step": 3010 }, { "grad_norm": 0.7690659761428833, "learning_rate": 9.999997258443473e-05, "loss": 0.1637, "step": 3020 }, { "grad_norm": 0.7645965814590454, "learning_rate": 9.999993613161331e-05, "loss": 0.152, "step": 3030 }, { "grad_norm": 0.7554789781570435, "learning_rate": 9.999988449013146e-05, "loss": 0.1453, "step": 3040 }, { "grad_norm": 0.7488862872123718, "learning_rate": 9.99998176600049e-05, "loss": 0.1564, "step": 3050 }, { "grad_norm": 0.8536695241928101, "learning_rate": 9.999973564125389e-05, "loss": 0.1552, "step": 3060 }, { "grad_norm": 0.7884150743484497, "learning_rate": 9.999963843390335e-05, "loss": 0.1569, "step": 3070 }, { "grad_norm": 0.6515567898750305, "learning_rate": 9.999952603798282e-05, "loss": 0.1501, "step": 3080 }, { "grad_norm": 0.8387160897254944, "learning_rate": 9.999939845352646e-05, "loss": 0.1478, "step": 3090 }, { "grad_norm": 0.8207934498786926, "learning_rate": 9.999925568057298e-05, "loss": 0.1554, "step": 3100 }, { "grad_norm": 0.647365152835846, "learning_rate": 9.999909771916578e-05, "loss": 0.1627, "step": 3110 }, { "grad_norm": 0.6585726737976074, "learning_rate": 9.999892456935285e-05, "loss": 0.1436, "step": 3120 }, { "grad_norm": 0.667732834815979, "learning_rate": 9.999873623118679e-05, "loss": 0.1586, "step": 3130 }, { "grad_norm": 0.7181947827339172, "learning_rate": 9.999853270472479e-05, "loss": 0.1562, "step": 3140 }, { "grad_norm": 0.9192007780075073, "learning_rate": 9.999831399002871e-05, "loss": 0.1564, "step": 3150 }, { "grad_norm": 0.7717569470405579, "learning_rate": 9.999808008716494e-05, "loss": 0.1539, "step": 3160 }, { "grad_norm": 0.7475244402885437, "learning_rate": 9.999783099620459e-05, "loss": 0.1487, "step": 3170 }, { "grad_norm": 0.7856576442718506, "learning_rate": 9.999756671722328e-05, "loss": 0.1479, "step": 3180 }, { "grad_norm": 0.7158850431442261, "learning_rate": 9.99972872503013e-05, "loss": 0.1536, "step": 3190 }, { "grad_norm": 0.6840832829475403, "learning_rate": 9.999699259552359e-05, "loss": 0.1601, "step": 3200 }, { "grad_norm": 0.6901434659957886, "learning_rate": 9.99966827529796e-05, "loss": 0.1486, "step": 3210 }, { "grad_norm": 0.616156816482544, "learning_rate": 9.999635772276348e-05, "loss": 0.1438, "step": 3220 }, { "grad_norm": 0.607250988483429, "learning_rate": 9.999601750497396e-05, "loss": 0.1454, "step": 3230 }, { "grad_norm": 0.5945655703544617, "learning_rate": 9.99956620997144e-05, "loss": 0.1386, "step": 3240 }, { "grad_norm": 0.6415140628814697, "learning_rate": 9.999529150709275e-05, "loss": 0.1417, "step": 3250 }, { "grad_norm": 0.6460344791412354, "learning_rate": 9.999490572722158e-05, "loss": 0.1531, "step": 3260 }, { "grad_norm": 0.6181751489639282, "learning_rate": 9.99945047602181e-05, "loss": 0.1472, "step": 3270 }, { "grad_norm": 0.7785016298294067, "learning_rate": 9.99940886062041e-05, "loss": 0.1432, "step": 3280 }, { "grad_norm": 0.666128396987915, "learning_rate": 9.999365726530599e-05, "loss": 0.1499, "step": 3290 }, { "grad_norm": 0.7418286204338074, "learning_rate": 9.999321073765481e-05, "loss": 0.1424, "step": 3300 }, { "grad_norm": 1.0093660354614258, "learning_rate": 9.99927490233862e-05, "loss": 0.1388, "step": 3310 }, { "grad_norm": 0.8187978267669678, "learning_rate": 9.999227212264043e-05, "loss": 0.1451, "step": 3320 }, { "grad_norm": 0.6595624089241028, "learning_rate": 9.999178003556236e-05, "loss": 0.1456, "step": 3330 }, { "grad_norm": 0.5967870354652405, "learning_rate": 9.999127276230146e-05, "loss": 0.1439, "step": 3340 }, { "grad_norm": 0.6094669103622437, "learning_rate": 9.999075030301184e-05, "loss": 0.1435, "step": 3350 }, { "grad_norm": 0.6346691250801086, "learning_rate": 9.999021265785221e-05, "loss": 0.1438, "step": 3360 }, { "grad_norm": 0.6825395822525024, "learning_rate": 9.998965982698589e-05, "loss": 0.1553, "step": 3370 }, { "grad_norm": 0.5439192056655884, "learning_rate": 9.998909181058082e-05, "loss": 0.1432, "step": 3380 }, { "grad_norm": 0.8212159872055054, "learning_rate": 9.998850860880953e-05, "loss": 0.1477, "step": 3390 }, { "grad_norm": 0.6422718167304993, "learning_rate": 9.998791022184922e-05, "loss": 0.1557, "step": 3400 }, { "grad_norm": 0.6736795902252197, "learning_rate": 9.99872966498816e-05, "loss": 0.139, "step": 3410 }, { "grad_norm": 0.6558037996292114, "learning_rate": 9.998666789309313e-05, "loss": 0.1539, "step": 3420 }, { "grad_norm": 0.6848629713058472, "learning_rate": 9.998602395167475e-05, "loss": 0.1403, "step": 3430 }, { "grad_norm": 0.6282789707183838, "learning_rate": 9.998536482582213e-05, "loss": 0.1458, "step": 3440 }, { "grad_norm": 0.6881283521652222, "learning_rate": 9.998469051573544e-05, "loss": 0.1414, "step": 3450 }, { "grad_norm": 0.6158250570297241, "learning_rate": 9.998400102161954e-05, "loss": 0.1489, "step": 3460 }, { "grad_norm": 0.6907244324684143, "learning_rate": 9.998329634368388e-05, "loss": 0.1315, "step": 3470 }, { "grad_norm": 0.5803043246269226, "learning_rate": 9.998257648214253e-05, "loss": 0.1451, "step": 3480 }, { "grad_norm": 0.5967079997062683, "learning_rate": 9.998184143721417e-05, "loss": 0.1456, "step": 3490 }, { "grad_norm": 0.6366644501686096, "learning_rate": 9.998109120912206e-05, "loss": 0.1579, "step": 3500 }, { "grad_norm": 0.7676118016242981, "learning_rate": 9.998032579809411e-05, "loss": 0.1479, "step": 3510 }, { "grad_norm": 0.7297714352607727, "learning_rate": 9.997954520436286e-05, "loss": 0.1362, "step": 3520 }, { "grad_norm": 0.7591609954833984, "learning_rate": 9.997874942816538e-05, "loss": 0.1419, "step": 3530 }, { "grad_norm": 0.5345758199691772, "learning_rate": 9.997793846974345e-05, "loss": 0.149, "step": 3540 }, { "grad_norm": 0.7529535293579102, "learning_rate": 9.997711232934341e-05, "loss": 0.1455, "step": 3550 }, { "grad_norm": 0.6677576303482056, "learning_rate": 9.99762710072162e-05, "loss": 0.1508, "step": 3560 }, { "grad_norm": 0.5770912170410156, "learning_rate": 9.997541450361743e-05, "loss": 0.1563, "step": 3570 }, { "grad_norm": 0.6244877576828003, "learning_rate": 9.997454281880723e-05, "loss": 0.1471, "step": 3580 }, { "grad_norm": 0.6427288055419922, "learning_rate": 9.997365595305044e-05, "loss": 0.1418, "step": 3590 }, { "grad_norm": 0.6965516209602356, "learning_rate": 9.997275390661644e-05, "loss": 0.1488, "step": 3600 }, { "grad_norm": 0.6328218579292297, "learning_rate": 9.997183667977926e-05, "loss": 0.1412, "step": 3610 }, { "grad_norm": 0.6907172799110413, "learning_rate": 9.997090427281752e-05, "loss": 0.1491, "step": 3620 }, { "grad_norm": 0.5488272309303284, "learning_rate": 9.996995668601448e-05, "loss": 0.15, "step": 3630 }, { "grad_norm": 0.7100783586502075, "learning_rate": 9.996899391965798e-05, "loss": 0.1522, "step": 3640 }, { "grad_norm": 0.6303949952125549, "learning_rate": 9.996801597404048e-05, "loss": 0.1472, "step": 3650 }, { "grad_norm": 0.6151230931282043, "learning_rate": 9.996702284945905e-05, "loss": 0.1337, "step": 3660 }, { "grad_norm": 0.5047621130943298, "learning_rate": 9.996601454621539e-05, "loss": 0.1469, "step": 3670 }, { "grad_norm": 0.6613233685493469, "learning_rate": 9.996499106461577e-05, "loss": 0.1474, "step": 3680 }, { "grad_norm": 0.6667587757110596, "learning_rate": 9.996395240497112e-05, "loss": 0.1472, "step": 3690 }, { "grad_norm": 0.659233808517456, "learning_rate": 9.996289856759696e-05, "loss": 0.1396, "step": 3700 }, { "grad_norm": 0.6025744080543518, "learning_rate": 9.996182955281342e-05, "loss": 0.1368, "step": 3710 }, { "grad_norm": 0.6080824732780457, "learning_rate": 9.996074536094519e-05, "loss": 0.1469, "step": 3720 }, { "grad_norm": 0.5456026792526245, "learning_rate": 9.995964599232168e-05, "loss": 0.1347, "step": 3730 }, { "grad_norm": 0.5844096541404724, "learning_rate": 9.995853144727683e-05, "loss": 0.1396, "step": 3740 }, { "grad_norm": 0.5861422419548035, "learning_rate": 9.99574017261492e-05, "loss": 0.1428, "step": 3750 }, { "grad_norm": 0.6362485289573669, "learning_rate": 9.995625682928198e-05, "loss": 0.1359, "step": 3760 }, { "grad_norm": 0.6797501444816589, "learning_rate": 9.995509675702295e-05, "loss": 0.1496, "step": 3770 }, { "grad_norm": 0.6267451047897339, "learning_rate": 9.995392150972451e-05, "loss": 0.1443, "step": 3780 }, { "grad_norm": 0.6300370097160339, "learning_rate": 9.995273108774366e-05, "loss": 0.1358, "step": 3790 }, { "grad_norm": 0.5516828894615173, "learning_rate": 9.995152549144205e-05, "loss": 0.1443, "step": 3800 }, { "grad_norm": 0.6190621852874756, "learning_rate": 9.995030472118587e-05, "loss": 0.1339, "step": 3810 }, { "grad_norm": 0.6107175946235657, "learning_rate": 9.9949068777346e-05, "loss": 0.1475, "step": 3820 }, { "grad_norm": 0.5871418714523315, "learning_rate": 9.994781766029786e-05, "loss": 0.1456, "step": 3830 }, { "grad_norm": 0.5415273308753967, "learning_rate": 9.994655137042151e-05, "loss": 0.1442, "step": 3840 }, { "grad_norm": 0.6475327610969543, "learning_rate": 9.99452699081016e-05, "loss": 0.1363, "step": 3850 }, { "grad_norm": 0.5809086561203003, "learning_rate": 9.994397327372743e-05, "loss": 0.1447, "step": 3860 }, { "grad_norm": 0.5726230144500732, "learning_rate": 9.994266146769286e-05, "loss": 0.1424, "step": 3870 }, { "grad_norm": 0.5019162893295288, "learning_rate": 9.994133449039642e-05, "loss": 0.1396, "step": 3880 }, { "grad_norm": 0.5662689805030823, "learning_rate": 9.993999234224118e-05, "loss": 0.1383, "step": 3890 }, { "grad_norm": 0.613377571105957, "learning_rate": 9.993863502363485e-05, "loss": 0.1449, "step": 3900 }, { "grad_norm": 0.551930844783783, "learning_rate": 9.993726253498976e-05, "loss": 0.1509, "step": 3910 }, { "grad_norm": 0.7300756573677063, "learning_rate": 9.993587487672282e-05, "loss": 0.144, "step": 3920 }, { "grad_norm": 0.5569468140602112, "learning_rate": 9.993447204925558e-05, "loss": 0.1486, "step": 3930 }, { "grad_norm": 0.6959659457206726, "learning_rate": 9.993305405301416e-05, "loss": 0.1585, "step": 3940 }, { "grad_norm": 0.5805143117904663, "learning_rate": 9.993162088842935e-05, "loss": 0.152, "step": 3950 }, { "grad_norm": 0.5536757707595825, "learning_rate": 9.993017255593646e-05, "loss": 0.15, "step": 3960 }, { "grad_norm": 0.5313476920127869, "learning_rate": 9.992870905597548e-05, "loss": 0.1461, "step": 3970 }, { "grad_norm": 0.46529653668403625, "learning_rate": 9.9927230388991e-05, "loss": 0.1392, "step": 3980 }, { "grad_norm": 0.6412094831466675, "learning_rate": 9.992573655543215e-05, "loss": 0.137, "step": 3990 }, { "grad_norm": 0.6076364517211914, "learning_rate": 9.992422755575277e-05, "loss": 0.1429, "step": 4000 }, { "grad_norm": 0.5591936707496643, "learning_rate": 9.992270339041123e-05, "loss": 0.1401, "step": 4010 }, { "grad_norm": 0.5743570327758789, "learning_rate": 9.992116405987053e-05, "loss": 0.1424, "step": 4020 }, { "grad_norm": 0.5519633889198303, "learning_rate": 9.991960956459828e-05, "loss": 0.151, "step": 4030 }, { "grad_norm": 0.6113790273666382, "learning_rate": 9.991803990506669e-05, "loss": 0.1389, "step": 4040 }, { "grad_norm": 0.576439619064331, "learning_rate": 9.991645508175258e-05, "loss": 0.1416, "step": 4050 }, { "grad_norm": 0.62945955991745, "learning_rate": 9.99148550951374e-05, "loss": 0.1508, "step": 4060 }, { "grad_norm": 0.6143585443496704, "learning_rate": 9.991323994570716e-05, "loss": 0.1475, "step": 4070 }, { "grad_norm": 0.4971154034137726, "learning_rate": 9.99116096339525e-05, "loss": 0.1403, "step": 4080 }, { "grad_norm": 0.6693809628486633, "learning_rate": 9.990996416036869e-05, "loss": 0.1457, "step": 4090 }, { "grad_norm": 0.535125732421875, "learning_rate": 9.990830352545555e-05, "loss": 0.153, "step": 4100 }, { "grad_norm": 0.5083459615707397, "learning_rate": 9.990662772971756e-05, "loss": 0.1432, "step": 4110 }, { "grad_norm": 0.536979079246521, "learning_rate": 9.990493677366376e-05, "loss": 0.142, "step": 4120 }, { "grad_norm": 0.557867169380188, "learning_rate": 9.990323065780786e-05, "loss": 0.1438, "step": 4130 }, { "grad_norm": 0.465218722820282, "learning_rate": 9.990150938266808e-05, "loss": 0.1439, "step": 4140 }, { "grad_norm": 0.559672474861145, "learning_rate": 9.989977294876733e-05, "loss": 0.1367, "step": 4150 }, { "grad_norm": 0.5833478569984436, "learning_rate": 9.989802135663308e-05, "loss": 0.1306, "step": 4160 }, { "grad_norm": 0.6542783379554749, "learning_rate": 9.989625460679743e-05, "loss": 0.1449, "step": 4170 }, { "grad_norm": 0.6156213879585266, "learning_rate": 9.989447269979706e-05, "loss": 0.1523, "step": 4180 }, { "grad_norm": 0.5757542252540588, "learning_rate": 9.989267563617328e-05, "loss": 0.1429, "step": 4190 }, { "grad_norm": 0.5010551810264587, "learning_rate": 9.989086341647198e-05, "loss": 0.1425, "step": 4200 }, { "grad_norm": 0.4679884910583496, "learning_rate": 9.988903604124366e-05, "loss": 0.1354, "step": 4210 }, { "grad_norm": 0.5174017548561096, "learning_rate": 9.988719351104343e-05, "loss": 0.1443, "step": 4220 }, { "grad_norm": 0.6140590906143188, "learning_rate": 9.9885335826431e-05, "loss": 0.1362, "step": 4230 }, { "grad_norm": 0.479953795671463, "learning_rate": 9.988346298797071e-05, "loss": 0.1468, "step": 4240 }, { "grad_norm": 0.6006399393081665, "learning_rate": 9.988157499623146e-05, "loss": 0.1498, "step": 4250 }, { "grad_norm": 0.48575928807258606, "learning_rate": 9.987967185178677e-05, "loss": 0.1361, "step": 4260 }, { "grad_norm": 0.46315276622772217, "learning_rate": 9.987775355521476e-05, "loss": 0.1321, "step": 4270 }, { "grad_norm": 0.5148758292198181, "learning_rate": 9.987582010709817e-05, "loss": 0.1428, "step": 4280 }, { "grad_norm": 0.49477389454841614, "learning_rate": 9.987387150802431e-05, "loss": 0.1378, "step": 4290 }, { "grad_norm": 0.5054652690887451, "learning_rate": 9.987190775858517e-05, "loss": 0.1375, "step": 4300 }, { "grad_norm": 0.47041720151901245, "learning_rate": 9.98699288593772e-05, "loss": 0.1407, "step": 4310 }, { "grad_norm": 0.5625644326210022, "learning_rate": 9.986793481100161e-05, "loss": 0.1395, "step": 4320 }, { "grad_norm": 0.6859869956970215, "learning_rate": 9.986592561406412e-05, "loss": 0.1359, "step": 4330 }, { "grad_norm": 0.5429591536521912, "learning_rate": 9.986390126917503e-05, "loss": 0.1412, "step": 4340 }, { "grad_norm": 0.6219918727874756, "learning_rate": 9.986186177694933e-05, "loss": 0.1496, "step": 4350 }, { "grad_norm": 0.5983712077140808, "learning_rate": 9.985980713800656e-05, "loss": 0.1419, "step": 4360 }, { "grad_norm": 0.5897524356842041, "learning_rate": 9.985773735297084e-05, "loss": 0.1421, "step": 4370 }, { "grad_norm": 0.499279260635376, "learning_rate": 9.985565242247092e-05, "loss": 0.134, "step": 4380 }, { "grad_norm": 0.6191442012786865, "learning_rate": 9.985355234714016e-05, "loss": 0.1453, "step": 4390 }, { "grad_norm": 0.5990110635757446, "learning_rate": 9.985143712761652e-05, "loss": 0.1554, "step": 4400 }, { "grad_norm": 0.5292013883590698, "learning_rate": 9.984930676454252e-05, "loss": 0.1416, "step": 4410 }, { "grad_norm": 0.561180830001831, "learning_rate": 9.984716125856532e-05, "loss": 0.1389, "step": 4420 }, { "grad_norm": 0.554642379283905, "learning_rate": 9.984500061033667e-05, "loss": 0.1357, "step": 4430 }, { "grad_norm": 0.5794311761856079, "learning_rate": 9.984282482051293e-05, "loss": 0.1493, "step": 4440 }, { "grad_norm": 0.5280740261077881, "learning_rate": 9.9840633889755e-05, "loss": 0.1341, "step": 4450 }, { "grad_norm": 0.4872707426548004, "learning_rate": 9.983842781872848e-05, "loss": 0.1406, "step": 4460 }, { "grad_norm": 0.46390068531036377, "learning_rate": 9.98362066081035e-05, "loss": 0.1482, "step": 4470 }, { "grad_norm": 0.5767057538032532, "learning_rate": 9.983397025855479e-05, "loss": 0.1376, "step": 4480 }, { "grad_norm": 0.5789566040039062, "learning_rate": 9.983171877076171e-05, "loss": 0.1453, "step": 4490 }, { "grad_norm": 0.6008794903755188, "learning_rate": 9.98294521454082e-05, "loss": 0.1383, "step": 4500 }, { "grad_norm": 0.5692183375358582, "learning_rate": 9.98271703831828e-05, "loss": 0.1491, "step": 4510 }, { "grad_norm": 0.5034311413764954, "learning_rate": 9.982487348477865e-05, "loss": 0.1428, "step": 4520 }, { "grad_norm": 0.5322293639183044, "learning_rate": 9.982256145089347e-05, "loss": 0.1473, "step": 4530 }, { "grad_norm": 0.41866546869277954, "learning_rate": 9.982023428222962e-05, "loss": 0.1331, "step": 4540 }, { "grad_norm": 0.5093094110488892, "learning_rate": 9.981789197949403e-05, "loss": 0.1288, "step": 4550 }, { "grad_norm": 0.5471315979957581, "learning_rate": 9.98155345433982e-05, "loss": 0.1355, "step": 4560 }, { "grad_norm": 0.6027163863182068, "learning_rate": 9.981316197465831e-05, "loss": 0.1383, "step": 4570 }, { "grad_norm": 0.5210700631141663, "learning_rate": 9.981077427399504e-05, "loss": 0.134, "step": 4580 }, { "grad_norm": 0.46789318323135376, "learning_rate": 9.980837144213371e-05, "loss": 0.1453, "step": 4590 }, { "grad_norm": 0.5623993277549744, "learning_rate": 9.980595347980426e-05, "loss": 0.139, "step": 4600 }, { "grad_norm": 0.45463430881500244, "learning_rate": 9.980352038774119e-05, "loss": 0.1417, "step": 4610 }, { "grad_norm": 0.4531663954257965, "learning_rate": 9.98010721666836e-05, "loss": 0.1396, "step": 4620 }, { "grad_norm": 0.5103515982627869, "learning_rate": 9.979860881737523e-05, "loss": 0.1422, "step": 4630 }, { "grad_norm": 0.53779536485672, "learning_rate": 9.979613034056434e-05, "loss": 0.1435, "step": 4640 }, { "grad_norm": 0.5294598937034607, "learning_rate": 9.979363673700386e-05, "loss": 0.1441, "step": 4650 }, { "grad_norm": 0.5357514023780823, "learning_rate": 9.979112800745124e-05, "loss": 0.1548, "step": 4660 }, { "grad_norm": 0.5186597108840942, "learning_rate": 9.978860415266861e-05, "loss": 0.1476, "step": 4670 }, { "grad_norm": 0.5539357662200928, "learning_rate": 9.978606517342262e-05, "loss": 0.1311, "step": 4680 }, { "grad_norm": 0.5448322296142578, "learning_rate": 9.978351107048456e-05, "loss": 0.1452, "step": 4690 }, { "grad_norm": 0.45783865451812744, "learning_rate": 9.978094184463029e-05, "loss": 0.1451, "step": 4700 }, { "grad_norm": 0.5482490062713623, "learning_rate": 9.977835749664029e-05, "loss": 0.1442, "step": 4710 }, { "grad_norm": 0.43092983961105347, "learning_rate": 9.97757580272996e-05, "loss": 0.1471, "step": 4720 }, { "grad_norm": 0.49640461802482605, "learning_rate": 9.977314343739786e-05, "loss": 0.144, "step": 4730 }, { "grad_norm": 0.44661471247673035, "learning_rate": 9.977051372772934e-05, "loss": 0.1366, "step": 4740 }, { "grad_norm": 0.5270684361457825, "learning_rate": 9.976786889909286e-05, "loss": 0.1406, "step": 4750 }, { "grad_norm": 0.4969487190246582, "learning_rate": 9.976520895229185e-05, "loss": 0.132, "step": 4760 }, { "grad_norm": 0.5004627108573914, "learning_rate": 9.976253388813433e-05, "loss": 0.1424, "step": 4770 }, { "grad_norm": 0.5238275527954102, "learning_rate": 9.975984370743293e-05, "loss": 0.1412, "step": 4780 }, { "grad_norm": 0.5349233150482178, "learning_rate": 9.975713841100485e-05, "loss": 0.1389, "step": 4790 }, { "grad_norm": 0.550405740737915, "learning_rate": 9.975441799967187e-05, "loss": 0.1401, "step": 4800 }, { "grad_norm": 0.5389921069145203, "learning_rate": 9.975168247426039e-05, "loss": 0.1366, "step": 4810 }, { "grad_norm": 0.4930681884288788, "learning_rate": 9.974893183560139e-05, "loss": 0.1355, "step": 4820 }, { "grad_norm": 0.5214522480964661, "learning_rate": 9.974616608453045e-05, "loss": 0.1475, "step": 4830 }, { "grad_norm": 0.5381361246109009, "learning_rate": 9.974338522188772e-05, "loss": 0.1453, "step": 4840 }, { "grad_norm": 0.5632324814796448, "learning_rate": 9.974058924851797e-05, "loss": 0.1362, "step": 4850 }, { "grad_norm": 0.4934181571006775, "learning_rate": 9.973777816527051e-05, "loss": 0.1394, "step": 4860 }, { "grad_norm": 0.552261233329773, "learning_rate": 9.973495197299931e-05, "loss": 0.1395, "step": 4870 }, { "grad_norm": 0.4902271032333374, "learning_rate": 9.973211067256287e-05, "loss": 0.1416, "step": 4880 }, { "grad_norm": 0.4934186041355133, "learning_rate": 9.97292542648243e-05, "loss": 0.1446, "step": 4890 }, { "grad_norm": 0.5303343534469604, "learning_rate": 9.972638275065131e-05, "loss": 0.1426, "step": 4900 }, { "grad_norm": 0.4065636098384857, "learning_rate": 9.972349613091621e-05, "loss": 0.1324, "step": 4910 }, { "grad_norm": 0.42144787311553955, "learning_rate": 9.972059440649584e-05, "loss": 0.1395, "step": 4920 }, { "grad_norm": 0.5409848690032959, "learning_rate": 9.971767757827168e-05, "loss": 0.1483, "step": 4930 }, { "grad_norm": 0.5989275574684143, "learning_rate": 9.971474564712982e-05, "loss": 0.1372, "step": 4940 }, { "grad_norm": 0.5155283808708191, "learning_rate": 9.971179861396084e-05, "loss": 0.1391, "step": 4950 }, { "grad_norm": 0.5182726383209229, "learning_rate": 9.970883647966003e-05, "loss": 0.133, "step": 4960 }, { "grad_norm": 0.5171785354614258, "learning_rate": 9.970585924512717e-05, "loss": 0.1443, "step": 4970 }, { "grad_norm": 0.4884302318096161, "learning_rate": 9.970286691126669e-05, "loss": 0.1446, "step": 4980 }, { "grad_norm": 0.5560006499290466, "learning_rate": 9.969985947898756e-05, "loss": 0.1389, "step": 4990 }, { "grad_norm": 0.40711626410484314, "learning_rate": 9.969683694920337e-05, "loss": 0.1411, "step": 5000 }, { "grad_norm": 0.5138547420501709, "learning_rate": 9.969379932283228e-05, "loss": 0.1427, "step": 5010 }, { "grad_norm": 0.4788358509540558, "learning_rate": 9.969074660079704e-05, "loss": 0.1443, "step": 5020 }, { "grad_norm": 0.4543544054031372, "learning_rate": 9.968767878402501e-05, "loss": 0.1364, "step": 5030 }, { "grad_norm": 0.5334954857826233, "learning_rate": 9.968459587344808e-05, "loss": 0.136, "step": 5040 }, { "grad_norm": 0.4588620364665985, "learning_rate": 9.968149787000278e-05, "loss": 0.1434, "step": 5050 }, { "grad_norm": 0.4293912947177887, "learning_rate": 9.967838477463018e-05, "loss": 0.1391, "step": 5060 }, { "grad_norm": 0.48696577548980713, "learning_rate": 9.967525658827597e-05, "loss": 0.1274, "step": 5070 }, { "grad_norm": 0.47174540162086487, "learning_rate": 9.967211331189042e-05, "loss": 0.1363, "step": 5080 }, { "grad_norm": 0.452678382396698, "learning_rate": 9.966895494642834e-05, "loss": 0.1411, "step": 5090 }, { "grad_norm": 0.4657643139362335, "learning_rate": 9.96657814928492e-05, "loss": 0.1426, "step": 5100 }, { "grad_norm": 0.4748019278049469, "learning_rate": 9.966259295211697e-05, "loss": 0.1361, "step": 5110 }, { "grad_norm": 0.5965593457221985, "learning_rate": 9.965938932520028e-05, "loss": 0.1443, "step": 5120 }, { "grad_norm": 0.46972230076789856, "learning_rate": 9.965617061307229e-05, "loss": 0.1336, "step": 5130 }, { "grad_norm": 0.5282124876976013, "learning_rate": 9.965293681671077e-05, "loss": 0.1345, "step": 5140 }, { "grad_norm": 0.4792264401912689, "learning_rate": 9.964968793709804e-05, "loss": 0.1386, "step": 5150 }, { "grad_norm": 0.558594286441803, "learning_rate": 9.964642397522106e-05, "loss": 0.1428, "step": 5160 }, { "grad_norm": 0.4767739176750183, "learning_rate": 9.96431449320713e-05, "loss": 0.119, "step": 5170 }, { "grad_norm": 0.4772011935710907, "learning_rate": 9.963985080864486e-05, "loss": 0.1363, "step": 5180 }, { "grad_norm": 0.4949113428592682, "learning_rate": 9.96365416059424e-05, "loss": 0.1439, "step": 5190 }, { "grad_norm": 0.5492094159126282, "learning_rate": 9.963321732496919e-05, "loss": 0.1428, "step": 5200 }, { "grad_norm": 0.4634268283843994, "learning_rate": 9.962987796673506e-05, "loss": 0.1448, "step": 5210 }, { "grad_norm": 0.49758923053741455, "learning_rate": 9.962652353225438e-05, "loss": 0.1375, "step": 5220 }, { "grad_norm": 0.5527560114860535, "learning_rate": 9.962315402254619e-05, "loss": 0.1371, "step": 5230 }, { "grad_norm": 0.4759748876094818, "learning_rate": 9.9619769438634e-05, "loss": 0.1346, "step": 5240 }, { "grad_norm": 0.5119355916976929, "learning_rate": 9.9616369781546e-05, "loss": 0.135, "step": 5250 }, { "grad_norm": 0.48071447014808655, "learning_rate": 9.961295505231491e-05, "loss": 0.1407, "step": 5260 }, { "grad_norm": 0.559238851070404, "learning_rate": 9.960952525197804e-05, "loss": 0.142, "step": 5270 }, { "grad_norm": 0.4261573553085327, "learning_rate": 9.960608038157724e-05, "loss": 0.1506, "step": 5280 }, { "grad_norm": 0.4233832061290741, "learning_rate": 9.960262044215901e-05, "loss": 0.142, "step": 5290 }, { "grad_norm": 0.41432151198387146, "learning_rate": 9.959914543477435e-05, "loss": 0.1387, "step": 5300 }, { "grad_norm": 0.431253582239151, "learning_rate": 9.959565536047892e-05, "loss": 0.1426, "step": 5310 }, { "grad_norm": 0.5333501696586609, "learning_rate": 9.959215022033288e-05, "loss": 0.1413, "step": 5320 }, { "grad_norm": 0.37507107853889465, "learning_rate": 9.9588630015401e-05, "loss": 0.1344, "step": 5330 }, { "grad_norm": 0.4434981048107147, "learning_rate": 9.958509474675264e-05, "loss": 0.1345, "step": 5340 }, { "grad_norm": 0.42062005400657654, "learning_rate": 9.958154441546171e-05, "loss": 0.1367, "step": 5350 }, { "grad_norm": 0.4051445722579956, "learning_rate": 9.957797902260673e-05, "loss": 0.1371, "step": 5360 }, { "grad_norm": 0.39248764514923096, "learning_rate": 9.957439856927073e-05, "loss": 0.1437, "step": 5370 }, { "grad_norm": 0.4879933297634125, "learning_rate": 9.957080305654139e-05, "loss": 0.1353, "step": 5380 }, { "grad_norm": 0.6130387187004089, "learning_rate": 9.956719248551092e-05, "loss": 0.1417, "step": 5390 }, { "grad_norm": 0.5806025266647339, "learning_rate": 9.956356685727612e-05, "loss": 0.1417, "step": 5400 }, { "grad_norm": 0.4402570128440857, "learning_rate": 9.955992617293836e-05, "loss": 0.1433, "step": 5410 }, { "grad_norm": 0.40143147110939026, "learning_rate": 9.955627043360358e-05, "loss": 0.1387, "step": 5420 }, { "grad_norm": 0.45679888129234314, "learning_rate": 9.955259964038231e-05, "loss": 0.1409, "step": 5430 }, { "grad_norm": 0.5114424824714661, "learning_rate": 9.954891379438962e-05, "loss": 0.1354, "step": 5440 }, { "grad_norm": 0.5070980787277222, "learning_rate": 9.954521289674519e-05, "loss": 0.1432, "step": 5450 }, { "grad_norm": 0.4133518636226654, "learning_rate": 9.954149694857325e-05, "loss": 0.1307, "step": 5460 }, { "grad_norm": 0.4685085415840149, "learning_rate": 9.953776595100258e-05, "loss": 0.1345, "step": 5470 }, { "grad_norm": 0.4816305339336395, "learning_rate": 9.95340199051666e-05, "loss": 0.133, "step": 5480 }, { "grad_norm": 0.48426228761672974, "learning_rate": 9.953025881220325e-05, "loss": 0.1365, "step": 5490 }, { "grad_norm": 0.4099941551685333, "learning_rate": 9.952648267325504e-05, "loss": 0.1301, "step": 5500 }, { "grad_norm": 0.3765447437763214, "learning_rate": 9.952269148946905e-05, "loss": 0.133, "step": 5510 }, { "grad_norm": 0.362537145614624, "learning_rate": 9.951888526199697e-05, "loss": 0.1235, "step": 5520 }, { "grad_norm": 0.49974897503852844, "learning_rate": 9.951506399199501e-05, "loss": 0.1456, "step": 5530 }, { "grad_norm": 0.4511882960796356, "learning_rate": 9.951122768062399e-05, "loss": 0.1359, "step": 5540 }, { "grad_norm": 0.4629729390144348, "learning_rate": 9.950737632904927e-05, "loss": 0.1313, "step": 5550 }, { "grad_norm": 0.4166831374168396, "learning_rate": 9.950350993844077e-05, "loss": 0.1446, "step": 5560 }, { "grad_norm": 0.40150338411331177, "learning_rate": 9.949962850997303e-05, "loss": 0.1306, "step": 5570 }, { "grad_norm": 0.4656769931316376, "learning_rate": 9.949573204482512e-05, "loss": 0.1371, "step": 5580 }, { "grad_norm": 0.4709421396255493, "learning_rate": 9.949182054418064e-05, "loss": 0.1339, "step": 5590 }, { "grad_norm": 0.5441690683364868, "learning_rate": 9.948789400922787e-05, "loss": 0.139, "step": 5600 }, { "grad_norm": 0.5484582185745239, "learning_rate": 9.948395244115953e-05, "loss": 0.1485, "step": 5610 }, { "grad_norm": 0.45324668288230896, "learning_rate": 9.9479995841173e-05, "loss": 0.1434, "step": 5620 }, { "grad_norm": 0.5182957053184509, "learning_rate": 9.947602421047017e-05, "loss": 0.1292, "step": 5630 }, { "grad_norm": 0.4839521050453186, "learning_rate": 9.947203755025753e-05, "loss": 0.1369, "step": 5640 }, { "grad_norm": 0.49228841066360474, "learning_rate": 9.946803586174611e-05, "loss": 0.13, "step": 5650 }, { "grad_norm": 0.4834129512310028, "learning_rate": 9.946401914615151e-05, "loss": 0.1425, "step": 5660 }, { "grad_norm": 0.4313392639160156, "learning_rate": 9.945998740469394e-05, "loss": 0.1397, "step": 5670 }, { "grad_norm": 0.37955883145332336, "learning_rate": 9.945594063859809e-05, "loss": 0.1332, "step": 5680 }, { "grad_norm": 0.45557424426078796, "learning_rate": 9.94518788490933e-05, "loss": 0.1454, "step": 5690 }, { "grad_norm": 0.48666197061538696, "learning_rate": 9.944780203741341e-05, "loss": 0.1277, "step": 5700 }, { "grad_norm": 0.4130806028842926, "learning_rate": 9.944371020479686e-05, "loss": 0.1445, "step": 5710 }, { "grad_norm": 0.45043376088142395, "learning_rate": 9.943960335248662e-05, "loss": 0.145, "step": 5720 }, { "grad_norm": 0.4799914062023163, "learning_rate": 9.943548148173027e-05, "loss": 0.1485, "step": 5730 }, { "grad_norm": 0.41299816966056824, "learning_rate": 9.943134459377992e-05, "loss": 0.1382, "step": 5740 }, { "grad_norm": 0.3468102514743805, "learning_rate": 9.942719268989222e-05, "loss": 0.1356, "step": 5750 }, { "grad_norm": 0.4302981197834015, "learning_rate": 9.942302577132844e-05, "loss": 0.1374, "step": 5760 }, { "grad_norm": 0.38997378945350647, "learning_rate": 9.941884383935438e-05, "loss": 0.1333, "step": 5770 }, { "grad_norm": 0.5445230007171631, "learning_rate": 9.941464689524039e-05, "loss": 0.1407, "step": 5780 }, { "grad_norm": 0.4738694727420807, "learning_rate": 9.941043494026139e-05, "loss": 0.1293, "step": 5790 }, { "grad_norm": 0.5055897831916809, "learning_rate": 9.940620797569685e-05, "loss": 0.1346, "step": 5800 }, { "grad_norm": 0.4383603632450104, "learning_rate": 9.940196600283082e-05, "loss": 0.132, "step": 5810 }, { "grad_norm": 0.49540090560913086, "learning_rate": 9.939770902295192e-05, "loss": 0.1242, "step": 5820 }, { "grad_norm": 0.5625064373016357, "learning_rate": 9.939343703735329e-05, "loss": 0.1415, "step": 5830 }, { "grad_norm": 0.4444628059864044, "learning_rate": 9.938915004733264e-05, "loss": 0.1377, "step": 5840 }, { "grad_norm": 0.42299216985702515, "learning_rate": 9.938484805419224e-05, "loss": 0.1425, "step": 5850 }, { "grad_norm": 0.4657228887081146, "learning_rate": 9.938053105923894e-05, "loss": 0.1365, "step": 5860 }, { "grad_norm": 0.45642316341400146, "learning_rate": 9.937619906378413e-05, "loss": 0.1357, "step": 5870 }, { "grad_norm": 0.4460170567035675, "learning_rate": 9.937185206914374e-05, "loss": 0.1442, "step": 5880 }, { "grad_norm": 0.4177013635635376, "learning_rate": 9.936749007663829e-05, "loss": 0.1401, "step": 5890 }, { "grad_norm": 0.5017956495285034, "learning_rate": 9.93631130875928e-05, "loss": 0.1376, "step": 5900 }, { "grad_norm": 0.4445359706878662, "learning_rate": 9.935872110333692e-05, "loss": 0.1344, "step": 5910 }, { "grad_norm": 0.4287000000476837, "learning_rate": 9.935431412520484e-05, "loss": 0.1375, "step": 5920 }, { "grad_norm": 0.42407968640327454, "learning_rate": 9.934989215453523e-05, "loss": 0.1353, "step": 5930 }, { "grad_norm": 0.5444915890693665, "learning_rate": 9.934545519267139e-05, "loss": 0.1333, "step": 5940 }, { "grad_norm": 0.47427916526794434, "learning_rate": 9.934100324096117e-05, "loss": 0.1373, "step": 5950 }, { "grad_norm": 0.5107167959213257, "learning_rate": 9.933653630075692e-05, "loss": 0.1379, "step": 5960 }, { "grad_norm": 0.4962834417819977, "learning_rate": 9.93320543734156e-05, "loss": 0.1332, "step": 5970 }, { "grad_norm": 0.4528677463531494, "learning_rate": 9.932755746029871e-05, "loss": 0.1382, "step": 5980 }, { "grad_norm": 0.5168883800506592, "learning_rate": 9.932304556277228e-05, "loss": 0.1333, "step": 5990 }, { "grad_norm": 0.5443991422653198, "learning_rate": 9.93185186822069e-05, "loss": 0.1413, "step": 6000 }, { "grad_norm": 0.45979002118110657, "learning_rate": 9.931397681997773e-05, "loss": 0.1278, "step": 6010 }, { "grad_norm": 0.5402315258979797, "learning_rate": 9.930941997746446e-05, "loss": 0.1341, "step": 6020 }, { "grad_norm": 0.4471725821495056, "learning_rate": 9.930484815605134e-05, "loss": 0.1306, "step": 6030 }, { "grad_norm": 0.6298156380653381, "learning_rate": 9.930026135712717e-05, "loss": 0.1346, "step": 6040 }, { "grad_norm": 0.4520268738269806, "learning_rate": 9.92956595820853e-05, "loss": 0.1315, "step": 6050 }, { "grad_norm": 0.4074367880821228, "learning_rate": 9.929104283232362e-05, "loss": 0.1333, "step": 6060 }, { "grad_norm": 0.35179224610328674, "learning_rate": 9.92864111092446e-05, "loss": 0.1379, "step": 6070 }, { "grad_norm": 0.4298937916755676, "learning_rate": 9.92817644142552e-05, "loss": 0.1425, "step": 6080 }, { "grad_norm": 0.3779122531414032, "learning_rate": 9.927710274876698e-05, "loss": 0.1325, "step": 6090 }, { "grad_norm": 0.48163917660713196, "learning_rate": 9.927242611419603e-05, "loss": 0.1286, "step": 6100 }, { "grad_norm": 0.4466235637664795, "learning_rate": 9.926773451196301e-05, "loss": 0.1298, "step": 6110 }, { "grad_norm": 0.4247318506240845, "learning_rate": 9.926302794349306e-05, "loss": 0.1348, "step": 6120 }, { "grad_norm": 0.4024914503097534, "learning_rate": 9.925830641021594e-05, "loss": 0.1387, "step": 6130 }, { "grad_norm": 0.41416728496551514, "learning_rate": 9.925356991356593e-05, "loss": 0.128, "step": 6140 }, { "grad_norm": 0.3477926254272461, "learning_rate": 9.924881845498184e-05, "loss": 0.1236, "step": 6150 }, { "grad_norm": 0.35065746307373047, "learning_rate": 9.924405203590705e-05, "loss": 0.1379, "step": 6160 }, { "grad_norm": 0.4023871123790741, "learning_rate": 9.923927065778946e-05, "loss": 0.1386, "step": 6170 }, { "grad_norm": 0.3970889747142792, "learning_rate": 9.923447432208154e-05, "loss": 0.1374, "step": 6180 }, { "grad_norm": 0.47978776693344116, "learning_rate": 9.922966303024027e-05, "loss": 0.1457, "step": 6190 }, { "grad_norm": 0.35430338978767395, "learning_rate": 9.922483678372721e-05, "loss": 0.1343, "step": 6200 }, { "grad_norm": 0.3754441440105438, "learning_rate": 9.921999558400845e-05, "loss": 0.1366, "step": 6210 }, { "grad_norm": 0.39648669958114624, "learning_rate": 9.92151394325546e-05, "loss": 0.1292, "step": 6220 }, { "grad_norm": 0.41935357451438904, "learning_rate": 9.921026833084084e-05, "loss": 0.1384, "step": 6230 }, { "grad_norm": 0.3920626938343048, "learning_rate": 9.920538228034689e-05, "loss": 0.1333, "step": 6240 }, { "grad_norm": 0.5305835008621216, "learning_rate": 9.920048128255699e-05, "loss": 0.1394, "step": 6250 }, { "grad_norm": 0.40946295857429504, "learning_rate": 9.919556533895995e-05, "loss": 0.1401, "step": 6260 }, { "grad_norm": 0.4209093153476715, "learning_rate": 9.919063445104907e-05, "loss": 0.1352, "step": 6270 }, { "grad_norm": 0.37849947810173035, "learning_rate": 9.918568862032227e-05, "loss": 0.1419, "step": 6280 }, { "grad_norm": 0.3919341564178467, "learning_rate": 9.918072784828194e-05, "loss": 0.1449, "step": 6290 }, { "grad_norm": 0.41727909445762634, "learning_rate": 9.917575213643501e-05, "loss": 0.1276, "step": 6300 }, { "grad_norm": 0.47153499722480774, "learning_rate": 9.917076148629302e-05, "loss": 0.1377, "step": 6310 }, { "grad_norm": 0.4465515911579132, "learning_rate": 9.916575589937196e-05, "loss": 0.1332, "step": 6320 }, { "grad_norm": 0.41123640537261963, "learning_rate": 9.916073537719239e-05, "loss": 0.1378, "step": 6330 }, { "grad_norm": 0.4478461444377899, "learning_rate": 9.915569992127944e-05, "loss": 0.133, "step": 6340 }, { "grad_norm": 0.44156864285469055, "learning_rate": 9.915064953316273e-05, "loss": 0.1265, "step": 6350 }, { "grad_norm": 0.4072710871696472, "learning_rate": 9.914558421437645e-05, "loss": 0.1397, "step": 6360 }, { "grad_norm": 0.3542092442512512, "learning_rate": 9.914050396645929e-05, "loss": 0.1321, "step": 6370 }, { "grad_norm": 0.45624053478240967, "learning_rate": 9.913540879095452e-05, "loss": 0.129, "step": 6380 }, { "grad_norm": 0.45659908652305603, "learning_rate": 9.913029868940987e-05, "loss": 0.1352, "step": 6390 }, { "grad_norm": 0.44748812913894653, "learning_rate": 9.912517366337772e-05, "loss": 0.1346, "step": 6400 }, { "grad_norm": 0.41509905457496643, "learning_rate": 9.912003371441487e-05, "loss": 0.142, "step": 6410 }, { "grad_norm": 0.398860901594162, "learning_rate": 9.911487884408271e-05, "loss": 0.1361, "step": 6420 }, { "grad_norm": 0.4707973301410675, "learning_rate": 9.910970905394719e-05, "loss": 0.1313, "step": 6430 }, { "grad_norm": 0.4058988690376282, "learning_rate": 9.91045243455787e-05, "loss": 0.1379, "step": 6440 }, { "grad_norm": 0.3881915509700775, "learning_rate": 9.909932472055225e-05, "loss": 0.129, "step": 6450 }, { "grad_norm": 0.45429548621177673, "learning_rate": 9.909411018044734e-05, "loss": 0.1339, "step": 6460 }, { "grad_norm": 0.41240882873535156, "learning_rate": 9.908888072684802e-05, "loss": 0.1418, "step": 6470 }, { "grad_norm": 0.4298354387283325, "learning_rate": 9.908363636134285e-05, "loss": 0.1284, "step": 6480 }, { "grad_norm": 0.4326455295085907, "learning_rate": 9.907837708552493e-05, "loss": 0.1333, "step": 6490 }, { "grad_norm": 0.440611869096756, "learning_rate": 9.90731029009919e-05, "loss": 0.1311, "step": 6500 }, { "grad_norm": 0.4402589201927185, "learning_rate": 9.906781380934589e-05, "loss": 0.1338, "step": 6510 }, { "grad_norm": 0.47367823123931885, "learning_rate": 9.906250981219362e-05, "loss": 0.139, "step": 6520 }, { "grad_norm": 0.3862268924713135, "learning_rate": 9.905719091114628e-05, "loss": 0.1406, "step": 6530 }, { "grad_norm": 0.36541518568992615, "learning_rate": 9.905185710781964e-05, "loss": 0.1307, "step": 6540 }, { "grad_norm": 0.49940797686576843, "learning_rate": 9.904650840383392e-05, "loss": 0.1371, "step": 6550 }, { "grad_norm": 0.4498700201511383, "learning_rate": 9.904114480081397e-05, "loss": 0.1417, "step": 6560 }, { "grad_norm": 0.36927640438079834, "learning_rate": 9.903576630038906e-05, "loss": 0.1355, "step": 6570 }, { "grad_norm": 0.4612352252006531, "learning_rate": 9.903037290419309e-05, "loss": 0.1445, "step": 6580 }, { "grad_norm": 0.3819347620010376, "learning_rate": 9.902496461386439e-05, "loss": 0.1335, "step": 6590 }, { "grad_norm": 0.4694919288158417, "learning_rate": 9.901954143104588e-05, "loss": 0.1286, "step": 6600 }, { "grad_norm": 0.4181078374385834, "learning_rate": 9.901410335738496e-05, "loss": 0.14, "step": 6610 }, { "grad_norm": 0.4565073847770691, "learning_rate": 9.900865039453358e-05, "loss": 0.1256, "step": 6620 }, { "grad_norm": 0.4246467649936676, "learning_rate": 9.900318254414821e-05, "loss": 0.1444, "step": 6630 }, { "grad_norm": 0.4689114987850189, "learning_rate": 9.899769980788985e-05, "loss": 0.1402, "step": 6640 }, { "grad_norm": 0.5084611177444458, "learning_rate": 9.899220218742398e-05, "loss": 0.1297, "step": 6650 }, { "grad_norm": 0.4983733296394348, "learning_rate": 9.898668968442066e-05, "loss": 0.1416, "step": 6660 }, { "grad_norm": 0.3813593089580536, "learning_rate": 9.898116230055443e-05, "loss": 0.132, "step": 6670 }, { "grad_norm": 0.4345841109752655, "learning_rate": 9.897562003750437e-05, "loss": 0.1368, "step": 6680 }, { "grad_norm": 0.3658589720726013, "learning_rate": 9.897006289695407e-05, "loss": 0.1337, "step": 6690 }, { "grad_norm": 0.45629751682281494, "learning_rate": 9.896449088059164e-05, "loss": 0.1366, "step": 6700 }, { "grad_norm": 0.39063742756843567, "learning_rate": 9.89589039901097e-05, "loss": 0.1308, "step": 6710 }, { "grad_norm": 0.4056045711040497, "learning_rate": 9.895330222720542e-05, "loss": 0.1317, "step": 6720 }, { "grad_norm": 0.40053221583366394, "learning_rate": 9.894768559358047e-05, "loss": 0.1341, "step": 6730 }, { "grad_norm": 0.38423675298690796, "learning_rate": 9.894205409094101e-05, "loss": 0.1414, "step": 6740 }, { "grad_norm": 0.40026578307151794, "learning_rate": 9.893640772099777e-05, "loss": 0.1291, "step": 6750 }, { "grad_norm": 0.41017231345176697, "learning_rate": 9.893074648546595e-05, "loss": 0.1306, "step": 6760 }, { "grad_norm": 0.3866899609565735, "learning_rate": 9.892507038606528e-05, "loss": 0.1241, "step": 6770 }, { "grad_norm": 0.4114864468574524, "learning_rate": 9.891937942452003e-05, "loss": 0.1322, "step": 6780 }, { "grad_norm": 0.4775182008743286, "learning_rate": 9.891367360255895e-05, "loss": 0.1372, "step": 6790 }, { "grad_norm": 0.42295676469802856, "learning_rate": 9.890795292191532e-05, "loss": 0.144, "step": 6800 }, { "grad_norm": 0.3922048509120941, "learning_rate": 9.890221738432694e-05, "loss": 0.1322, "step": 6810 }, { "grad_norm": 0.41857898235321045, "learning_rate": 9.88964669915361e-05, "loss": 0.1265, "step": 6820 }, { "grad_norm": 0.3518364727497101, "learning_rate": 9.889070174528963e-05, "loss": 0.1222, "step": 6830 }, { "grad_norm": 0.38894563913345337, "learning_rate": 9.888492164733883e-05, "loss": 0.123, "step": 6840 }, { "grad_norm": 0.37243103981018066, "learning_rate": 9.88791266994396e-05, "loss": 0.1265, "step": 6850 }, { "grad_norm": 0.4592823088169098, "learning_rate": 9.887331690335223e-05, "loss": 0.1422, "step": 6860 }, { "grad_norm": 0.4232751131057739, "learning_rate": 9.886749226084163e-05, "loss": 0.1365, "step": 6870 }, { "grad_norm": 0.4649706184864044, "learning_rate": 9.886165277367714e-05, "loss": 0.141, "step": 6880 }, { "grad_norm": 0.3763732314109802, "learning_rate": 9.885579844363265e-05, "loss": 0.1391, "step": 6890 }, { "grad_norm": 0.42191097140312195, "learning_rate": 9.884992927248656e-05, "loss": 0.1327, "step": 6900 }, { "grad_norm": 0.4020983576774597, "learning_rate": 9.884404526202178e-05, "loss": 0.137, "step": 6910 }, { "grad_norm": 0.42852073907852173, "learning_rate": 9.883814641402568e-05, "loss": 0.1454, "step": 6920 }, { "grad_norm": 0.3854748010635376, "learning_rate": 9.88322327302902e-05, "loss": 0.1332, "step": 6930 }, { "grad_norm": 0.36088693141937256, "learning_rate": 9.882630421261176e-05, "loss": 0.1346, "step": 6940 }, { "grad_norm": 0.40957170724868774, "learning_rate": 9.88203608627913e-05, "loss": 0.1437, "step": 6950 }, { "grad_norm": 0.4592682719230652, "learning_rate": 9.881440268263422e-05, "loss": 0.1385, "step": 6960 }, { "grad_norm": 0.4838898777961731, "learning_rate": 9.880842967395048e-05, "loss": 0.1275, "step": 6970 }, { "grad_norm": 0.4072883129119873, "learning_rate": 9.880244183855452e-05, "loss": 0.1333, "step": 6980 }, { "grad_norm": 0.37305229902267456, "learning_rate": 9.879643917826527e-05, "loss": 0.1324, "step": 6990 }, { "grad_norm": 0.4067302644252777, "learning_rate": 9.87904216949062e-05, "loss": 0.1326, "step": 7000 }, { "grad_norm": 0.429580420255661, "learning_rate": 9.878438939030526e-05, "loss": 0.1396, "step": 7010 }, { "grad_norm": 0.43490907549858093, "learning_rate": 9.877834226629489e-05, "loss": 0.1276, "step": 7020 }, { "grad_norm": 0.3280240595340729, "learning_rate": 9.877228032471206e-05, "loss": 0.1366, "step": 7030 }, { "grad_norm": 0.39454054832458496, "learning_rate": 9.876620356739823e-05, "loss": 0.1371, "step": 7040 }, { "grad_norm": 0.5117334127426147, "learning_rate": 9.876011199619935e-05, "loss": 0.1414, "step": 7050 }, { "grad_norm": 0.4382916986942291, "learning_rate": 9.875400561296589e-05, "loss": 0.1304, "step": 7060 }, { "grad_norm": 0.4350440204143524, "learning_rate": 9.874788441955278e-05, "loss": 0.141, "step": 7070 }, { "grad_norm": 0.39775732159614563, "learning_rate": 9.874174841781951e-05, "loss": 0.1289, "step": 7080 }, { "grad_norm": 0.43150463700294495, "learning_rate": 9.873559760963003e-05, "loss": 0.134, "step": 7090 }, { "grad_norm": 0.35491904616355896, "learning_rate": 9.872943199685278e-05, "loss": 0.1345, "step": 7100 }, { "grad_norm": 0.37826886773109436, "learning_rate": 9.872325158136071e-05, "loss": 0.138, "step": 7110 }, { "grad_norm": 0.39623647928237915, "learning_rate": 9.871705636503128e-05, "loss": 0.1346, "step": 7120 }, { "grad_norm": 0.34657230973243713, "learning_rate": 9.871084634974641e-05, "loss": 0.1276, "step": 7130 }, { "grad_norm": 0.40298405289649963, "learning_rate": 9.870462153739257e-05, "loss": 0.1353, "step": 7140 }, { "grad_norm": 0.47610050439834595, "learning_rate": 9.869838192986067e-05, "loss": 0.1378, "step": 7150 }, { "grad_norm": 0.32137930393218994, "learning_rate": 9.869212752904616e-05, "loss": 0.1373, "step": 7160 }, { "grad_norm": 0.41327807307243347, "learning_rate": 9.868585833684894e-05, "loss": 0.1448, "step": 7170 }, { "grad_norm": 0.37675443291664124, "learning_rate": 9.867957435517342e-05, "loss": 0.133, "step": 7180 }, { "grad_norm": 0.38815614581108093, "learning_rate": 9.867327558592854e-05, "loss": 0.1393, "step": 7190 }, { "grad_norm": 0.4883882403373718, "learning_rate": 9.866696203102766e-05, "loss": 0.1346, "step": 7200 }, { "grad_norm": 0.45251908898353577, "learning_rate": 9.86606336923887e-05, "loss": 0.1293, "step": 7210 }, { "grad_norm": 0.4814627468585968, "learning_rate": 9.865429057193403e-05, "loss": 0.1439, "step": 7220 }, { "grad_norm": 0.4666111171245575, "learning_rate": 9.864793267159053e-05, "loss": 0.1328, "step": 7230 }, { "grad_norm": 0.40319934487342834, "learning_rate": 9.864155999328957e-05, "loss": 0.1271, "step": 7240 }, { "grad_norm": 0.3774356544017792, "learning_rate": 9.8635172538967e-05, "loss": 0.1292, "step": 7250 }, { "grad_norm": 0.40391770005226135, "learning_rate": 9.862877031056312e-05, "loss": 0.1332, "step": 7260 }, { "grad_norm": 0.42309853434562683, "learning_rate": 9.862235331002279e-05, "loss": 0.14, "step": 7270 }, { "grad_norm": 0.3761102557182312, "learning_rate": 9.861592153929533e-05, "loss": 0.123, "step": 7280 }, { "grad_norm": 0.39720410108566284, "learning_rate": 9.860947500033455e-05, "loss": 0.1286, "step": 7290 }, { "grad_norm": 0.38693612813949585, "learning_rate": 9.86030136950987e-05, "loss": 0.1381, "step": 7300 }, { "grad_norm": 0.5055922865867615, "learning_rate": 9.85965376255506e-05, "loss": 0.133, "step": 7310 }, { "grad_norm": 0.5110734105110168, "learning_rate": 9.859004679365747e-05, "loss": 0.1353, "step": 7320 }, { "grad_norm": 0.48958995938301086, "learning_rate": 9.858354120139108e-05, "loss": 0.1365, "step": 7330 }, { "grad_norm": 0.35940003395080566, "learning_rate": 9.857702085072764e-05, "loss": 0.1317, "step": 7340 }, { "grad_norm": 0.4137304723262787, "learning_rate": 9.857048574364787e-05, "loss": 0.1321, "step": 7350 }, { "grad_norm": 0.35695379972457886, "learning_rate": 9.856393588213698e-05, "loss": 0.1294, "step": 7360 }, { "grad_norm": 0.4323517680168152, "learning_rate": 9.855737126818458e-05, "loss": 0.1326, "step": 7370 }, { "grad_norm": 0.3562936782836914, "learning_rate": 9.855079190378491e-05, "loss": 0.138, "step": 7380 }, { "grad_norm": 0.3762814402580261, "learning_rate": 9.854419779093655e-05, "loss": 0.1359, "step": 7390 }, { "grad_norm": 0.42037901282310486, "learning_rate": 9.853758893164264e-05, "loss": 0.1346, "step": 7400 }, { "grad_norm": 0.3680761456489563, "learning_rate": 9.853096532791078e-05, "loss": 0.122, "step": 7410 }, { "grad_norm": 0.4733975827693939, "learning_rate": 9.852432698175304e-05, "loss": 0.1366, "step": 7420 }, { "grad_norm": 0.43601343035697937, "learning_rate": 9.851767389518597e-05, "loss": 0.1324, "step": 7430 }, { "grad_norm": 0.42050036787986755, "learning_rate": 9.85110060702306e-05, "loss": 0.1318, "step": 7440 }, { "grad_norm": 0.4034005403518677, "learning_rate": 9.850432350891245e-05, "loss": 0.127, "step": 7450 }, { "grad_norm": 0.4962591826915741, "learning_rate": 9.84976262132615e-05, "loss": 0.1289, "step": 7460 }, { "grad_norm": 0.40297120809555054, "learning_rate": 9.849091418531222e-05, "loss": 0.1359, "step": 7470 }, { "grad_norm": 0.4140458405017853, "learning_rate": 9.848418742710353e-05, "loss": 0.132, "step": 7480 }, { "grad_norm": 0.4097139537334442, "learning_rate": 9.847744594067885e-05, "loss": 0.1424, "step": 7490 }, { "grad_norm": 0.3610565960407257, "learning_rate": 9.847068972808607e-05, "loss": 0.1388, "step": 7500 }, { "grad_norm": 0.4280611574649811, "learning_rate": 9.846391879137756e-05, "loss": 0.1262, "step": 7510 }, { "grad_norm": 0.3936038613319397, "learning_rate": 9.845713313261012e-05, "loss": 0.1375, "step": 7520 }, { "grad_norm": 0.4520220160484314, "learning_rate": 9.845033275384505e-05, "loss": 0.138, "step": 7530 }, { "grad_norm": 0.39146456122398376, "learning_rate": 9.844351765714818e-05, "loss": 0.1304, "step": 7540 }, { "grad_norm": 0.39461207389831543, "learning_rate": 9.843668784458971e-05, "loss": 0.1349, "step": 7550 }, { "grad_norm": 0.38022711873054504, "learning_rate": 9.842984331824437e-05, "loss": 0.1317, "step": 7560 }, { "grad_norm": 0.3599965274333954, "learning_rate": 9.842298408019133e-05, "loss": 0.1286, "step": 7570 }, { "grad_norm": 0.43216219544410706, "learning_rate": 9.841611013251429e-05, "loss": 0.143, "step": 7580 }, { "grad_norm": 0.3409128785133362, "learning_rate": 9.840922147730133e-05, "loss": 0.1357, "step": 7590 }, { "grad_norm": 0.38440558314323425, "learning_rate": 9.840231811664506e-05, "loss": 0.1363, "step": 7600 }, { "grad_norm": 0.39884820580482483, "learning_rate": 9.839540005264252e-05, "loss": 0.1267, "step": 7610 }, { "grad_norm": 0.3652859032154083, "learning_rate": 9.838846728739527e-05, "loss": 0.1239, "step": 7620 }, { "grad_norm": 0.3632954955101013, "learning_rate": 9.838151982300927e-05, "loss": 0.1332, "step": 7630 }, { "grad_norm": 0.41360828280448914, "learning_rate": 9.8374557661595e-05, "loss": 0.1336, "step": 7640 }, { "grad_norm": 0.38145214319229126, "learning_rate": 9.836758080526735e-05, "loss": 0.1316, "step": 7650 }, { "grad_norm": 0.32458579540252686, "learning_rate": 9.836058925614575e-05, "loss": 0.1277, "step": 7660 }, { "grad_norm": 0.31706973910331726, "learning_rate": 9.8353583016354e-05, "loss": 0.1302, "step": 7670 }, { "grad_norm": 0.41098108887672424, "learning_rate": 9.834656208802044e-05, "loss": 0.1292, "step": 7680 }, { "grad_norm": 0.409057080745697, "learning_rate": 9.833952647327784e-05, "loss": 0.1316, "step": 7690 }, { "grad_norm": 0.37828367948532104, "learning_rate": 9.833247617426342e-05, "loss": 0.1307, "step": 7700 }, { "grad_norm": 0.3578576445579529, "learning_rate": 9.832541119311889e-05, "loss": 0.1327, "step": 7710 }, { "grad_norm": 0.41793736815452576, "learning_rate": 9.83183315319904e-05, "loss": 0.1279, "step": 7720 }, { "grad_norm": 0.547339916229248, "learning_rate": 9.831123719302855e-05, "loss": 0.143, "step": 7730 }, { "grad_norm": 0.42102688550949097, "learning_rate": 9.830412817838842e-05, "loss": 0.1253, "step": 7740 }, { "grad_norm": 0.3313559591770172, "learning_rate": 9.829700449022956e-05, "loss": 0.1339, "step": 7750 }, { "grad_norm": 0.4125280976295471, "learning_rate": 9.828986613071593e-05, "loss": 0.1304, "step": 7760 }, { "grad_norm": 0.3795121908187866, "learning_rate": 9.828271310201601e-05, "loss": 0.1241, "step": 7770 }, { "grad_norm": 0.4020049571990967, "learning_rate": 9.827554540630268e-05, "loss": 0.1382, "step": 7780 }, { "grad_norm": 0.3320959806442261, "learning_rate": 9.826836304575329e-05, "loss": 0.1268, "step": 7790 }, { "grad_norm": 0.38785287737846375, "learning_rate": 9.826116602254966e-05, "loss": 0.1308, "step": 7800 }, { "grad_norm": 0.3842443525791168, "learning_rate": 9.825395433887805e-05, "loss": 0.1327, "step": 7810 }, { "grad_norm": 0.47622159123420715, "learning_rate": 9.824672799692917e-05, "loss": 0.1382, "step": 7820 }, { "grad_norm": 0.42947232723236084, "learning_rate": 9.823948699889823e-05, "loss": 0.1427, "step": 7830 }, { "grad_norm": 0.39656299352645874, "learning_rate": 9.823223134698483e-05, "loss": 0.1355, "step": 7840 }, { "grad_norm": 0.4565046727657318, "learning_rate": 9.822496104339303e-05, "loss": 0.1318, "step": 7850 }, { "grad_norm": 0.3323585093021393, "learning_rate": 9.821767609033138e-05, "loss": 0.1273, "step": 7860 }, { "grad_norm": 0.3392324149608612, "learning_rate": 9.821037649001284e-05, "loss": 0.1283, "step": 7870 }, { "grad_norm": 0.3118092119693756, "learning_rate": 9.820306224465486e-05, "loss": 0.1384, "step": 7880 }, { "grad_norm": 0.437872052192688, "learning_rate": 9.819573335647928e-05, "loss": 0.1444, "step": 7890 }, { "grad_norm": 0.40620875358581543, "learning_rate": 9.818838982771246e-05, "loss": 0.1299, "step": 7900 }, { "grad_norm": 0.3711297810077667, "learning_rate": 9.818103166058514e-05, "loss": 0.1407, "step": 7910 }, { "grad_norm": 0.3042111098766327, "learning_rate": 9.817365885733254e-05, "loss": 0.1345, "step": 7920 }, { "grad_norm": 0.36083120107650757, "learning_rate": 9.816627142019434e-05, "loss": 0.1346, "step": 7930 }, { "grad_norm": 0.4722733199596405, "learning_rate": 9.815886935141463e-05, "loss": 0.1352, "step": 7940 }, { "grad_norm": 0.4296022355556488, "learning_rate": 9.8151452653242e-05, "loss": 0.143, "step": 7950 }, { "grad_norm": 0.43014857172966003, "learning_rate": 9.814402132792939e-05, "loss": 0.1491, "step": 7960 }, { "grad_norm": 0.3806188702583313, "learning_rate": 9.813657537773428e-05, "loss": 0.1381, "step": 7970 }, { "grad_norm": 0.45538878440856934, "learning_rate": 9.812911480491854e-05, "loss": 0.1521, "step": 7980 }, { "grad_norm": 0.44739651679992676, "learning_rate": 9.81216396117485e-05, "loss": 0.138, "step": 7990 }, { "grad_norm": 0.4153222143650055, "learning_rate": 9.811414980049491e-05, "loss": 0.1295, "step": 8000 }, { "grad_norm": 0.4008978307247162, "learning_rate": 9.810664537343301e-05, "loss": 0.134, "step": 8010 }, { "grad_norm": 0.3187969923019409, "learning_rate": 9.809912633284243e-05, "loss": 0.1357, "step": 8020 }, { "grad_norm": 0.3554292917251587, "learning_rate": 9.809159268100725e-05, "loss": 0.137, "step": 8030 }, { "grad_norm": 0.371548056602478, "learning_rate": 9.808404442021599e-05, "loss": 0.1297, "step": 8040 }, { "grad_norm": 0.3973073363304138, "learning_rate": 9.807648155276163e-05, "loss": 0.1293, "step": 8050 }, { "grad_norm": 0.3304101526737213, "learning_rate": 9.806890408094156e-05, "loss": 0.1324, "step": 8060 }, { "grad_norm": 0.31236952543258667, "learning_rate": 9.806131200705761e-05, "loss": 0.128, "step": 8070 }, { "grad_norm": 0.35327327251434326, "learning_rate": 9.805370533341605e-05, "loss": 0.1302, "step": 8080 }, { "grad_norm": 0.4064542353153229, "learning_rate": 9.804608406232762e-05, "loss": 0.1265, "step": 8090 }, { "grad_norm": 0.3965306878089905, "learning_rate": 9.803844819610741e-05, "loss": 0.1229, "step": 8100 }, { "grad_norm": 0.37889862060546875, "learning_rate": 9.803079773707504e-05, "loss": 0.1266, "step": 8110 }, { "grad_norm": 0.4663085341453552, "learning_rate": 9.802313268755447e-05, "loss": 0.1309, "step": 8120 }, { "grad_norm": 0.41875162720680237, "learning_rate": 9.801545304987419e-05, "loss": 0.1245, "step": 8130 }, { "grad_norm": 0.40439876914024353, "learning_rate": 9.800775882636704e-05, "loss": 0.1357, "step": 8140 }, { "grad_norm": 0.36148759722709656, "learning_rate": 9.800005001937034e-05, "loss": 0.1322, "step": 8150 }, { "grad_norm": 0.3449189066886902, "learning_rate": 9.79923266312258e-05, "loss": 0.1277, "step": 8160 }, { "grad_norm": 0.31682834029197693, "learning_rate": 9.79845886642796e-05, "loss": 0.13, "step": 8170 }, { "grad_norm": 0.33806726336479187, "learning_rate": 9.797683612088233e-05, "loss": 0.1331, "step": 8180 }, { "grad_norm": 0.41041299700737, "learning_rate": 9.796906900338898e-05, "loss": 0.1376, "step": 8190 }, { "grad_norm": 0.3327403664588928, "learning_rate": 9.796128731415903e-05, "loss": 0.1334, "step": 8200 }, { "grad_norm": 0.330401211977005, "learning_rate": 9.795349105555634e-05, "loss": 0.1273, "step": 8210 }, { "grad_norm": 0.45245984196662903, "learning_rate": 9.794568022994922e-05, "loss": 0.1405, "step": 8220 }, { "grad_norm": 0.4603584408760071, "learning_rate": 9.793785483971034e-05, "loss": 0.1341, "step": 8230 }, { "grad_norm": 0.30700212717056274, "learning_rate": 9.793001488721691e-05, "loss": 0.1311, "step": 8240 }, { "grad_norm": 0.3658927381038666, "learning_rate": 9.792216037485047e-05, "loss": 0.1293, "step": 8250 }, { "grad_norm": 0.4336036443710327, "learning_rate": 9.791429130499704e-05, "loss": 0.1276, "step": 8260 }, { "grad_norm": 0.39817896485328674, "learning_rate": 9.790640768004698e-05, "loss": 0.1446, "step": 8270 }, { "grad_norm": 0.40063443779945374, "learning_rate": 9.789850950239518e-05, "loss": 0.1329, "step": 8280 }, { "grad_norm": 0.41768211126327515, "learning_rate": 9.789059677444089e-05, "loss": 0.1307, "step": 8290 }, { "grad_norm": 0.3359839916229248, "learning_rate": 9.788266949858776e-05, "loss": 0.126, "step": 8300 }, { "grad_norm": 0.45048996806144714, "learning_rate": 9.787472767724392e-05, "loss": 0.1383, "step": 8310 }, { "grad_norm": 0.3782585859298706, "learning_rate": 9.786677131282185e-05, "loss": 0.1391, "step": 8320 }, { "grad_norm": 0.4029935598373413, "learning_rate": 9.785880040773853e-05, "loss": 0.1379, "step": 8330 }, { "grad_norm": 0.33339035511016846, "learning_rate": 9.785081496441527e-05, "loss": 0.1303, "step": 8340 }, { "grad_norm": 0.3543514609336853, "learning_rate": 9.784281498527785e-05, "loss": 0.1357, "step": 8350 }, { "grad_norm": 0.37670281529426575, "learning_rate": 9.783480047275646e-05, "loss": 0.1348, "step": 8360 }, { "grad_norm": 0.31259188055992126, "learning_rate": 9.78267714292857e-05, "loss": 0.1237, "step": 8370 }, { "grad_norm": 0.33421602845191956, "learning_rate": 9.781872785730454e-05, "loss": 0.1302, "step": 8380 }, { "grad_norm": 0.34695661067962646, "learning_rate": 9.781066975925646e-05, "loss": 0.1309, "step": 8390 }, { "grad_norm": 0.3264489769935608, "learning_rate": 9.780259713758928e-05, "loss": 0.1173, "step": 8400 }, { "grad_norm": 0.3473994731903076, "learning_rate": 9.779450999475524e-05, "loss": 0.1341, "step": 8410 }, { "grad_norm": 0.30938631296157837, "learning_rate": 9.7786408333211e-05, "loss": 0.1225, "step": 8420 }, { "grad_norm": 0.38088855147361755, "learning_rate": 9.777829215541764e-05, "loss": 0.1316, "step": 8430 }, { "grad_norm": 0.42539358139038086, "learning_rate": 9.777016146384064e-05, "loss": 0.1202, "step": 8440 }, { "grad_norm": 0.3648991584777832, "learning_rate": 9.776201626094988e-05, "loss": 0.1339, "step": 8450 }, { "grad_norm": 0.3570597469806671, "learning_rate": 9.775385654921965e-05, "loss": 0.1187, "step": 8460 }, { "grad_norm": 0.3204605281352997, "learning_rate": 9.774568233112868e-05, "loss": 0.134, "step": 8470 }, { "grad_norm": 0.4219425916671753, "learning_rate": 9.773749360916007e-05, "loss": 0.1392, "step": 8480 }, { "grad_norm": 0.431801438331604, "learning_rate": 9.772929038580134e-05, "loss": 0.1342, "step": 8490 }, { "grad_norm": 0.3701893389225006, "learning_rate": 9.772107266354439e-05, "loss": 0.1294, "step": 8500 }, { "grad_norm": 0.3782850205898285, "learning_rate": 9.77128404448856e-05, "loss": 0.136, "step": 8510 }, { "grad_norm": 0.3741302788257599, "learning_rate": 9.770459373232565e-05, "loss": 0.1347, "step": 8520 }, { "grad_norm": 0.382390558719635, "learning_rate": 9.769633252836969e-05, "loss": 0.1294, "step": 8530 }, { "grad_norm": 0.3627473711967468, "learning_rate": 9.768805683552724e-05, "loss": 0.1257, "step": 8540 }, { "grad_norm": 0.4382565915584564, "learning_rate": 9.767976665631228e-05, "loss": 0.1406, "step": 8550 }, { "grad_norm": 0.32819637656211853, "learning_rate": 9.767146199324311e-05, "loss": 0.1268, "step": 8560 }, { "grad_norm": 0.42989882826805115, "learning_rate": 9.766314284884249e-05, "loss": 0.1385, "step": 8570 }, { "grad_norm": 0.33435162901878357, "learning_rate": 9.765480922563752e-05, "loss": 0.1279, "step": 8580 }, { "grad_norm": 0.3961484730243683, "learning_rate": 9.764646112615978e-05, "loss": 0.1274, "step": 8590 }, { "grad_norm": 0.3705799877643585, "learning_rate": 9.763809855294517e-05, "loss": 0.139, "step": 8600 }, { "grad_norm": 0.42214998602867126, "learning_rate": 9.762972150853404e-05, "loss": 0.1296, "step": 8610 }, { "grad_norm": 0.43912991881370544, "learning_rate": 9.762132999547111e-05, "loss": 0.1317, "step": 8620 }, { "grad_norm": 0.3250165581703186, "learning_rate": 9.761292401630549e-05, "loss": 0.1285, "step": 8630 }, { "grad_norm": 0.3328067362308502, "learning_rate": 9.76045035735907e-05, "loss": 0.1314, "step": 8640 }, { "grad_norm": 0.3378582000732422, "learning_rate": 9.759606866988464e-05, "loss": 0.1276, "step": 8650 }, { "grad_norm": 0.3415006101131439, "learning_rate": 9.758761930774963e-05, "loss": 0.1308, "step": 8660 }, { "grad_norm": 0.37848585844039917, "learning_rate": 9.757915548975235e-05, "loss": 0.1301, "step": 8670 }, { "grad_norm": 0.352772980928421, "learning_rate": 9.757067721846389e-05, "loss": 0.1413, "step": 8680 }, { "grad_norm": 0.4145978093147278, "learning_rate": 9.756218449645971e-05, "loss": 0.1355, "step": 8690 }, { "grad_norm": 0.34688809514045715, "learning_rate": 9.75536773263197e-05, "loss": 0.1282, "step": 8700 }, { "grad_norm": 0.4476308226585388, "learning_rate": 9.75451557106281e-05, "loss": 0.1274, "step": 8710 }, { "grad_norm": 0.39853185415267944, "learning_rate": 9.753661965197354e-05, "loss": 0.1356, "step": 8720 }, { "grad_norm": 0.3333134055137634, "learning_rate": 9.752806915294908e-05, "loss": 0.134, "step": 8730 }, { "grad_norm": 0.3658578097820282, "learning_rate": 9.75195042161521e-05, "loss": 0.1288, "step": 8740 }, { "grad_norm": 0.3801271915435791, "learning_rate": 9.751092484418442e-05, "loss": 0.1332, "step": 8750 }, { "grad_norm": 0.3924911618232727, "learning_rate": 9.750233103965224e-05, "loss": 0.1394, "step": 8760 }, { "grad_norm": 0.3945900797843933, "learning_rate": 9.749372280516611e-05, "loss": 0.1308, "step": 8770 }, { "grad_norm": 0.40277355909347534, "learning_rate": 9.748510014334097e-05, "loss": 0.1386, "step": 8780 }, { "grad_norm": 0.38338884711265564, "learning_rate": 9.747646305679621e-05, "loss": 0.1408, "step": 8790 }, { "grad_norm": 0.4442824721336365, "learning_rate": 9.74678115481555e-05, "loss": 0.1383, "step": 8800 }, { "grad_norm": 0.3492079973220825, "learning_rate": 9.745914562004696e-05, "loss": 0.1336, "step": 8810 }, { "grad_norm": 0.34821411967277527, "learning_rate": 9.745046527510307e-05, "loss": 0.1395, "step": 8820 }, { "grad_norm": 0.35938242077827454, "learning_rate": 9.744177051596068e-05, "loss": 0.1344, "step": 8830 }, { "grad_norm": 0.320134699344635, "learning_rate": 9.743306134526105e-05, "loss": 0.1293, "step": 8840 }, { "grad_norm": 0.3270711898803711, "learning_rate": 9.742433776564977e-05, "loss": 0.1349, "step": 8850 }, { "grad_norm": 0.31497740745544434, "learning_rate": 9.741559977977683e-05, "loss": 0.1332, "step": 8860 }, { "grad_norm": 0.3886284828186035, "learning_rate": 9.740684739029661e-05, "loss": 0.1258, "step": 8870 }, { "grad_norm": 0.36187541484832764, "learning_rate": 9.739808059986789e-05, "loss": 0.1283, "step": 8880 }, { "grad_norm": 0.40137043595314026, "learning_rate": 9.738929941115373e-05, "loss": 0.1243, "step": 8890 }, { "grad_norm": 0.4446832537651062, "learning_rate": 9.738050382682167e-05, "loss": 0.1359, "step": 8900 }, { "grad_norm": 0.3694598376750946, "learning_rate": 9.737169384954355e-05, "loss": 0.1284, "step": 8910 }, { "grad_norm": 0.38429373502731323, "learning_rate": 9.736286948199562e-05, "loss": 0.1309, "step": 8920 }, { "grad_norm": 0.317697674036026, "learning_rate": 9.735403072685848e-05, "loss": 0.1254, "step": 8930 }, { "grad_norm": 0.35445141792297363, "learning_rate": 9.734517758681712e-05, "loss": 0.1343, "step": 8940 }, { "grad_norm": 0.36197853088378906, "learning_rate": 9.733631006456088e-05, "loss": 0.1296, "step": 8950 }, { "grad_norm": 0.37931200861930847, "learning_rate": 9.732742816278348e-05, "loss": 0.1397, "step": 8960 }, { "grad_norm": 0.2826651632785797, "learning_rate": 9.731853188418302e-05, "loss": 0.1269, "step": 8970 }, { "grad_norm": 0.36439716815948486, "learning_rate": 9.730962123146194e-05, "loss": 0.1316, "step": 8980 }, { "grad_norm": 0.3712165951728821, "learning_rate": 9.730069620732709e-05, "loss": 0.1347, "step": 8990 }, { "grad_norm": 0.36409667134284973, "learning_rate": 9.72917568144896e-05, "loss": 0.1415, "step": 9000 }, { "grad_norm": 0.3789047300815582, "learning_rate": 9.728280305566509e-05, "loss": 0.1223, "step": 9010 }, { "grad_norm": 0.39512237906455994, "learning_rate": 9.727383493357343e-05, "loss": 0.1278, "step": 9020 }, { "grad_norm": 0.3798912465572357, "learning_rate": 9.726485245093891e-05, "loss": 0.1245, "step": 9030 }, { "grad_norm": 0.3494974374771118, "learning_rate": 9.725585561049018e-05, "loss": 0.1284, "step": 9040 }, { "grad_norm": 0.36517009139060974, "learning_rate": 9.724684441496022e-05, "loss": 0.1271, "step": 9050 }, { "grad_norm": 0.37539201974868774, "learning_rate": 9.72378188670864e-05, "loss": 0.1241, "step": 9060 }, { "grad_norm": 0.3499176800251007, "learning_rate": 9.722877896961047e-05, "loss": 0.1305, "step": 9070 }, { "grad_norm": 0.31028130650520325, "learning_rate": 9.721972472527848e-05, "loss": 0.1242, "step": 9080 }, { "grad_norm": 0.3782140016555786, "learning_rate": 9.721065613684089e-05, "loss": 0.1285, "step": 9090 }, { "grad_norm": 0.4470145106315613, "learning_rate": 9.72015732070525e-05, "loss": 0.1313, "step": 9100 }, { "grad_norm": 0.3804374039173126, "learning_rate": 9.719247593867244e-05, "loss": 0.1338, "step": 9110 }, { "grad_norm": 0.2840443551540375, "learning_rate": 9.718336433446423e-05, "loss": 0.1335, "step": 9120 }, { "grad_norm": 0.3921661972999573, "learning_rate": 9.717423839719574e-05, "loss": 0.1208, "step": 9130 }, { "grad_norm": 0.3516930043697357, "learning_rate": 9.71650981296392e-05, "loss": 0.1337, "step": 9140 }, { "grad_norm": 0.38630032539367676, "learning_rate": 9.715594353457118e-05, "loss": 0.1368, "step": 9150 }, { "grad_norm": 0.34154242277145386, "learning_rate": 9.714677461477257e-05, "loss": 0.1278, "step": 9160 }, { "grad_norm": 0.35769912600517273, "learning_rate": 9.713759137302869e-05, "loss": 0.1361, "step": 9170 }, { "grad_norm": 0.3462125360965729, "learning_rate": 9.712839381212914e-05, "loss": 0.131, "step": 9180 }, { "grad_norm": 0.32320478558540344, "learning_rate": 9.71191819348679e-05, "loss": 0.1226, "step": 9190 }, { "grad_norm": 0.3338281810283661, "learning_rate": 9.710995574404331e-05, "loss": 0.1335, "step": 9200 }, { "grad_norm": 0.34193333983421326, "learning_rate": 9.710071524245802e-05, "loss": 0.1295, "step": 9210 }, { "grad_norm": 0.3335099220275879, "learning_rate": 9.709146043291906e-05, "loss": 0.1334, "step": 9220 }, { "grad_norm": 0.34337425231933594, "learning_rate": 9.70821913182378e-05, "loss": 0.1321, "step": 9230 }, { "grad_norm": 0.4176008403301239, "learning_rate": 9.707290790122995e-05, "loss": 0.1266, "step": 9240 }, { "grad_norm": 0.4091542363166809, "learning_rate": 9.706361018471557e-05, "loss": 0.1306, "step": 9250 }, { "grad_norm": 0.33607152104377747, "learning_rate": 9.705429817151906e-05, "loss": 0.129, "step": 9260 }, { "grad_norm": 0.3525104522705078, "learning_rate": 9.704497186446917e-05, "loss": 0.1277, "step": 9270 }, { "grad_norm": 0.30817627906799316, "learning_rate": 9.703563126639896e-05, "loss": 0.1238, "step": 9280 }, { "grad_norm": 0.3278917968273163, "learning_rate": 9.70262763801459e-05, "loss": 0.1199, "step": 9290 }, { "grad_norm": 0.34303516149520874, "learning_rate": 9.701690720855171e-05, "loss": 0.1324, "step": 9300 }, { "grad_norm": 0.3366715610027313, "learning_rate": 9.700752375446253e-05, "loss": 0.1301, "step": 9310 }, { "grad_norm": 0.3605670928955078, "learning_rate": 9.69981260207288e-05, "loss": 0.1294, "step": 9320 }, { "grad_norm": 0.3729088306427002, "learning_rate": 9.698871401020529e-05, "loss": 0.1435, "step": 9330 }, { "grad_norm": 0.36285871267318726, "learning_rate": 9.697928772575112e-05, "loss": 0.1389, "step": 9340 }, { "grad_norm": 0.3540070354938507, "learning_rate": 9.696984717022976e-05, "loss": 0.128, "step": 9350 }, { "grad_norm": 0.3615584373474121, "learning_rate": 9.6960392346509e-05, "loss": 0.1408, "step": 9360 }, { "grad_norm": 0.33678144216537476, "learning_rate": 9.695092325746097e-05, "loss": 0.1439, "step": 9370 }, { "grad_norm": 0.3999866545200348, "learning_rate": 9.694143990596211e-05, "loss": 0.1392, "step": 9380 }, { "grad_norm": 0.37307825684547424, "learning_rate": 9.693194229489325e-05, "loss": 0.1365, "step": 9390 }, { "grad_norm": 0.33543193340301514, "learning_rate": 9.692243042713944e-05, "loss": 0.1275, "step": 9400 }, { "grad_norm": 0.43644312024116516, "learning_rate": 9.691290430559022e-05, "loss": 0.1276, "step": 9410 }, { "grad_norm": 0.34602591395378113, "learning_rate": 9.690336393313932e-05, "loss": 0.1267, "step": 9420 }, { "grad_norm": 0.33215829730033875, "learning_rate": 9.689380931268487e-05, "loss": 0.128, "step": 9430 }, { "grad_norm": 0.43807342648506165, "learning_rate": 9.688424044712932e-05, "loss": 0.134, "step": 9440 }, { "grad_norm": 0.38757970929145813, "learning_rate": 9.687465733937942e-05, "loss": 0.1287, "step": 9450 }, { "grad_norm": 0.2447117418050766, "learning_rate": 9.686505999234627e-05, "loss": 0.1337, "step": 9460 }, { "grad_norm": 0.37281665205955505, "learning_rate": 9.685544840894529e-05, "loss": 0.134, "step": 9470 }, { "grad_norm": 0.36159253120422363, "learning_rate": 9.684582259209624e-05, "loss": 0.1305, "step": 9480 }, { "grad_norm": 0.36241865158081055, "learning_rate": 9.683618254472317e-05, "loss": 0.1434, "step": 9490 }, { "grad_norm": 0.64483243227005, "learning_rate": 9.682652826975449e-05, "loss": 0.1345, "step": 9500 }, { "grad_norm": 0.47117140889167786, "learning_rate": 9.681685977012291e-05, "loss": 0.1307, "step": 9510 }, { "grad_norm": 0.35344764590263367, "learning_rate": 9.680717704876546e-05, "loss": 0.1403, "step": 9520 }, { "grad_norm": 0.3901916444301605, "learning_rate": 9.679748010862349e-05, "loss": 0.1405, "step": 9530 }, { "grad_norm": 0.3348620533943176, "learning_rate": 9.678776895264267e-05, "loss": 0.1368, "step": 9540 }, { "grad_norm": 0.29511454701423645, "learning_rate": 9.6778043583773e-05, "loss": 0.1334, "step": 9550 }, { "grad_norm": 0.29998213052749634, "learning_rate": 9.67683040049688e-05, "loss": 0.1344, "step": 9560 }, { "grad_norm": 0.3819141685962677, "learning_rate": 9.675855021918869e-05, "loss": 0.1348, "step": 9570 }, { "grad_norm": 0.33444663882255554, "learning_rate": 9.674878222939561e-05, "loss": 0.1322, "step": 9580 }, { "grad_norm": 0.34059688448905945, "learning_rate": 9.673900003855681e-05, "loss": 0.1202, "step": 9590 }, { "grad_norm": 0.3320016860961914, "learning_rate": 9.672920364964389e-05, "loss": 0.1278, "step": 9600 }, { "grad_norm": 0.40544652938842773, "learning_rate": 9.671939306563269e-05, "loss": 0.1333, "step": 9610 }, { "grad_norm": 0.373435914516449, "learning_rate": 9.670956828950345e-05, "loss": 0.1371, "step": 9620 }, { "grad_norm": 0.3835541009902954, "learning_rate": 9.669972932424065e-05, "loss": 0.1378, "step": 9630 }, { "grad_norm": 0.339152991771698, "learning_rate": 9.668987617283312e-05, "loss": 0.1261, "step": 9640 }, { "grad_norm": 0.38632020354270935, "learning_rate": 9.668000883827397e-05, "loss": 0.132, "step": 9650 }, { "grad_norm": 0.33425554633140564, "learning_rate": 9.667012732356067e-05, "loss": 0.1328, "step": 9660 }, { "grad_norm": 0.3536755442619324, "learning_rate": 9.666023163169493e-05, "loss": 0.1305, "step": 9670 }, { "grad_norm": 0.3848486542701721, "learning_rate": 9.665032176568281e-05, "loss": 0.1318, "step": 9680 }, { "grad_norm": 0.37929752469062805, "learning_rate": 9.664039772853469e-05, "loss": 0.1282, "step": 9690 }, { "grad_norm": 0.3441700041294098, "learning_rate": 9.663045952326518e-05, "loss": 0.1225, "step": 9700 }, { "grad_norm": 0.29336032271385193, "learning_rate": 9.662050715289328e-05, "loss": 0.1367, "step": 9710 }, { "grad_norm": 0.2951778173446655, "learning_rate": 9.661054062044226e-05, "loss": 0.1264, "step": 9720 }, { "grad_norm": 0.3327368199825287, "learning_rate": 9.660055992893968e-05, "loss": 0.1327, "step": 9730 }, { "grad_norm": 0.31659480929374695, "learning_rate": 9.659056508141739e-05, "loss": 0.1373, "step": 9740 }, { "grad_norm": 0.33631283044815063, "learning_rate": 9.658055608091161e-05, "loss": 0.1297, "step": 9750 }, { "grad_norm": 0.3233247399330139, "learning_rate": 9.657053293046276e-05, "loss": 0.135, "step": 9760 }, { "grad_norm": 0.3601404130458832, "learning_rate": 9.656049563311564e-05, "loss": 0.1347, "step": 9770 }, { "grad_norm": 0.3431384861469269, "learning_rate": 9.655044419191929e-05, "loss": 0.1291, "step": 9780 }, { "grad_norm": 0.3720654547214508, "learning_rate": 9.654037860992711e-05, "loss": 0.1307, "step": 9790 }, { "grad_norm": 0.39615464210510254, "learning_rate": 9.653029889019672e-05, "loss": 0.1294, "step": 9800 }, { "grad_norm": 0.3289942741394043, "learning_rate": 9.65202050357901e-05, "loss": 0.1368, "step": 9810 }, { "grad_norm": 0.37713348865509033, "learning_rate": 9.651009704977347e-05, "loss": 0.1282, "step": 9820 }, { "grad_norm": 0.4161076545715332, "learning_rate": 9.649997493521738e-05, "loss": 0.1454, "step": 9830 }, { "grad_norm": 0.379183292388916, "learning_rate": 9.64898386951967e-05, "loss": 0.1297, "step": 9840 }, { "grad_norm": 0.36354300379753113, "learning_rate": 9.647968833279049e-05, "loss": 0.139, "step": 9850 }, { "grad_norm": 0.36591920256614685, "learning_rate": 9.646952385108218e-05, "loss": 0.1356, "step": 9860 }, { "grad_norm": 0.33530858159065247, "learning_rate": 9.645934525315951e-05, "loss": 0.1234, "step": 9870 }, { "grad_norm": 0.3572870194911957, "learning_rate": 9.644915254211442e-05, "loss": 0.1291, "step": 9880 }, { "grad_norm": 0.33728888630867004, "learning_rate": 9.643894572104321e-05, "loss": 0.1337, "step": 9890 }, { "grad_norm": 0.2824889123439789, "learning_rate": 9.642872479304644e-05, "loss": 0.1235, "step": 9900 }, { "grad_norm": 0.31668320298194885, "learning_rate": 9.641848976122895e-05, "loss": 0.128, "step": 9910 }, { "grad_norm": 0.32627207040786743, "learning_rate": 9.64082406286999e-05, "loss": 0.1308, "step": 9920 }, { "grad_norm": 0.34298175573349, "learning_rate": 9.639797739857269e-05, "loss": 0.125, "step": 9930 }, { "grad_norm": 0.3460008203983307, "learning_rate": 9.638770007396498e-05, "loss": 0.1422, "step": 9940 }, { "grad_norm": 0.3155359625816345, "learning_rate": 9.63774086579988e-05, "loss": 0.141, "step": 9950 }, { "grad_norm": 0.42203858494758606, "learning_rate": 9.63671031538004e-05, "loss": 0.1358, "step": 9960 }, { "grad_norm": 0.3243013620376587, "learning_rate": 9.635678356450031e-05, "loss": 0.1284, "step": 9970 }, { "grad_norm": 0.3150555193424225, "learning_rate": 9.634644989323336e-05, "loss": 0.1268, "step": 9980 }, { "grad_norm": 0.254977822303772, "learning_rate": 9.633610214313861e-05, "loss": 0.1299, "step": 9990 }, { "grad_norm": 0.31873393058776855, "learning_rate": 9.632574031735951e-05, "loss": 0.1324, "step": 10000 }, { "grad_norm": 0.31590405106544495, "learning_rate": 9.631536441904364e-05, "loss": 0.1315, "step": 10010 }, { "grad_norm": 0.3730684220790863, "learning_rate": 9.630497445134293e-05, "loss": 0.1348, "step": 10020 }, { "grad_norm": 0.3024005889892578, "learning_rate": 9.62945704174136e-05, "loss": 0.1244, "step": 10030 }, { "grad_norm": 0.37194666266441345, "learning_rate": 9.628415232041612e-05, "loss": 0.1369, "step": 10040 }, { "grad_norm": 0.3064044415950775, "learning_rate": 9.627372016351524e-05, "loss": 0.1246, "step": 10050 }, { "grad_norm": 0.3191378712654114, "learning_rate": 9.626327394987995e-05, "loss": 0.134, "step": 10060 }, { "grad_norm": 0.2929452657699585, "learning_rate": 9.625281368268355e-05, "loss": 0.1303, "step": 10070 }, { "grad_norm": 0.3416345715522766, "learning_rate": 9.624233936510357e-05, "loss": 0.1296, "step": 10080 }, { "grad_norm": 0.3686096668243408, "learning_rate": 9.623185100032187e-05, "loss": 0.1262, "step": 10090 }, { "grad_norm": 0.3233985900878906, "learning_rate": 9.62213485915245e-05, "loss": 0.1341, "step": 10100 }, { "grad_norm": 0.3270803987979889, "learning_rate": 9.621083214190186e-05, "loss": 0.1334, "step": 10110 }, { "grad_norm": 0.30118656158447266, "learning_rate": 9.62003016546485e-05, "loss": 0.1335, "step": 10120 }, { "grad_norm": 0.3151547908782959, "learning_rate": 9.618975713296339e-05, "loss": 0.1286, "step": 10130 }, { "grad_norm": 0.3296704888343811, "learning_rate": 9.61791985800496e-05, "loss": 0.1312, "step": 10140 }, { "grad_norm": 0.3134455680847168, "learning_rate": 9.616862599911458e-05, "loss": 0.1282, "step": 10150 }, { "grad_norm": 0.29882004857063293, "learning_rate": 9.615803939337e-05, "loss": 0.1289, "step": 10160 }, { "grad_norm": 0.3027501702308655, "learning_rate": 9.614743876603178e-05, "loss": 0.1315, "step": 10170 }, { "grad_norm": 0.3756309151649475, "learning_rate": 9.613682412032013e-05, "loss": 0.1323, "step": 10180 }, { "grad_norm": 0.3751760423183441, "learning_rate": 9.612619545945947e-05, "loss": 0.1388, "step": 10190 }, { "grad_norm": 0.33841514587402344, "learning_rate": 9.611555278667852e-05, "loss": 0.1311, "step": 10200 }, { "grad_norm": 0.3292877972126007, "learning_rate": 9.610489610521024e-05, "loss": 0.1372, "step": 10210 }, { "grad_norm": 0.38449808955192566, "learning_rate": 9.609422541829187e-05, "loss": 0.1292, "step": 10220 }, { "grad_norm": 0.2748781442642212, "learning_rate": 9.608354072916486e-05, "loss": 0.1356, "step": 10230 }, { "grad_norm": 0.3453880548477173, "learning_rate": 9.607284204107493e-05, "loss": 0.1315, "step": 10240 }, { "grad_norm": 0.36652693152427673, "learning_rate": 9.606212935727208e-05, "loss": 0.1333, "step": 10250 }, { "grad_norm": 0.3344210386276245, "learning_rate": 9.605140268101052e-05, "loss": 0.1305, "step": 10260 }, { "grad_norm": 0.32534441351890564, "learning_rate": 9.604066201554875e-05, "loss": 0.1349, "step": 10270 }, { "grad_norm": 0.38395243883132935, "learning_rate": 9.60299073641495e-05, "loss": 0.1443, "step": 10280 }, { "grad_norm": 0.3346421420574188, "learning_rate": 9.601913873007974e-05, "loss": 0.1358, "step": 10290 }, { "grad_norm": 0.3096528649330139, "learning_rate": 9.60083561166107e-05, "loss": 0.1283, "step": 10300 }, { "grad_norm": 0.3481723964214325, "learning_rate": 9.599755952701783e-05, "loss": 0.1283, "step": 10310 }, { "grad_norm": 0.320924311876297, "learning_rate": 9.598674896458089e-05, "loss": 0.1313, "step": 10320 }, { "grad_norm": 0.3042280972003937, "learning_rate": 9.597592443258383e-05, "loss": 0.1325, "step": 10330 }, { "grad_norm": 0.314481258392334, "learning_rate": 9.596508593431483e-05, "loss": 0.1271, "step": 10340 }, { "grad_norm": 0.38249626755714417, "learning_rate": 9.59542334730664e-05, "loss": 0.1315, "step": 10350 }, { "grad_norm": 0.3826186954975128, "learning_rate": 9.594336705213516e-05, "loss": 0.124, "step": 10360 }, { "grad_norm": 0.3776364028453827, "learning_rate": 9.593248667482208e-05, "loss": 0.1427, "step": 10370 }, { "grad_norm": 0.2744872570037842, "learning_rate": 9.592159234443233e-05, "loss": 0.1314, "step": 10380 }, { "grad_norm": 0.3309490382671356, "learning_rate": 9.59106840642753e-05, "loss": 0.1284, "step": 10390 }, { "grad_norm": 0.3201298415660858, "learning_rate": 9.589976183766467e-05, "loss": 0.1306, "step": 10400 }, { "grad_norm": 0.3541224002838135, "learning_rate": 9.58888256679183e-05, "loss": 0.1216, "step": 10410 }, { "grad_norm": 0.3522472679615021, "learning_rate": 9.587787555835832e-05, "loss": 0.1274, "step": 10420 }, { "grad_norm": 0.34869423508644104, "learning_rate": 9.586691151231107e-05, "loss": 0.1231, "step": 10430 }, { "grad_norm": 0.27899467945098877, "learning_rate": 9.585593353310715e-05, "loss": 0.1332, "step": 10440 }, { "grad_norm": 0.37211787700653076, "learning_rate": 9.58449416240814e-05, "loss": 0.1326, "step": 10450 }, { "grad_norm": 0.3025646209716797, "learning_rate": 9.583393578857283e-05, "loss": 0.1269, "step": 10460 }, { "grad_norm": 0.365829735994339, "learning_rate": 9.582291602992474e-05, "loss": 0.1278, "step": 10470 }, { "grad_norm": 0.31622496247291565, "learning_rate": 9.581188235148466e-05, "loss": 0.1256, "step": 10480 }, { "grad_norm": 0.3107435405254364, "learning_rate": 9.58008347566043e-05, "loss": 0.1236, "step": 10490 }, { "grad_norm": 0.3216341733932495, "learning_rate": 9.578977324863965e-05, "loss": 0.128, "step": 10500 }, { "grad_norm": 0.3454084098339081, "learning_rate": 9.577869783095089e-05, "loss": 0.1355, "step": 10510 }, { "grad_norm": 0.3570747673511505, "learning_rate": 9.576760850690245e-05, "loss": 0.1374, "step": 10520 }, { "grad_norm": 0.35040804743766785, "learning_rate": 9.575650527986298e-05, "loss": 0.1342, "step": 10530 }, { "grad_norm": 0.3233104646205902, "learning_rate": 9.574538815320531e-05, "loss": 0.1267, "step": 10540 }, { "grad_norm": 0.32742923498153687, "learning_rate": 9.573425713030656e-05, "loss": 0.1227, "step": 10550 }, { "grad_norm": 0.302083820104599, "learning_rate": 9.572311221454806e-05, "loss": 0.1263, "step": 10560 }, { "grad_norm": 0.348944753408432, "learning_rate": 9.57119534093153e-05, "loss": 0.1277, "step": 10570 }, { "grad_norm": 0.32439514994621277, "learning_rate": 9.570078071799806e-05, "loss": 0.1239, "step": 10580 }, { "grad_norm": 0.33256134390830994, "learning_rate": 9.568959414399028e-05, "loss": 0.133, "step": 10590 }, { "grad_norm": 0.4132240116596222, "learning_rate": 9.567839369069018e-05, "loss": 0.1294, "step": 10600 }, { "grad_norm": 0.3565572202205658, "learning_rate": 9.566717936150013e-05, "loss": 0.1299, "step": 10610 }, { "grad_norm": 0.3811361491680145, "learning_rate": 9.565595115982678e-05, "loss": 0.1256, "step": 10620 }, { "grad_norm": 0.3138488531112671, "learning_rate": 9.564470908908094e-05, "loss": 0.1338, "step": 10630 }, { "grad_norm": 0.360230416059494, "learning_rate": 9.563345315267764e-05, "loss": 0.1285, "step": 10640 }, { "grad_norm": 0.31939834356307983, "learning_rate": 9.562218335403616e-05, "loss": 0.1184, "step": 10650 }, { "grad_norm": 0.37134599685668945, "learning_rate": 9.561089969657999e-05, "loss": 0.1312, "step": 10660 }, { "grad_norm": 0.35381877422332764, "learning_rate": 9.559960218373673e-05, "loss": 0.1305, "step": 10670 }, { "grad_norm": 0.2901255190372467, "learning_rate": 9.558829081893836e-05, "loss": 0.1277, "step": 10680 }, { "grad_norm": 0.28846219182014465, "learning_rate": 9.55769656056209e-05, "loss": 0.1252, "step": 10690 }, { "grad_norm": 0.3747487962245941, "learning_rate": 9.556562654722469e-05, "loss": 0.1311, "step": 10700 }, { "grad_norm": 0.3296245038509369, "learning_rate": 9.555427364719422e-05, "loss": 0.1367, "step": 10710 }, { "grad_norm": 0.34656617045402527, "learning_rate": 9.55429069089782e-05, "loss": 0.1222, "step": 10720 }, { "grad_norm": 0.33748090267181396, "learning_rate": 9.553152633602956e-05, "loss": 0.1296, "step": 10730 }, { "grad_norm": 0.2887110412120819, "learning_rate": 9.552013193180543e-05, "loss": 0.134, "step": 10740 }, { "grad_norm": 0.36477306485176086, "learning_rate": 9.550872369976707e-05, "loss": 0.1312, "step": 10750 }, { "grad_norm": 0.33073267340660095, "learning_rate": 9.549730164338007e-05, "loss": 0.1407, "step": 10760 }, { "grad_norm": 0.34060895442962646, "learning_rate": 9.548586576611408e-05, "loss": 0.1335, "step": 10770 }, { "grad_norm": 0.29605743288993835, "learning_rate": 9.54744160714431e-05, "loss": 0.1357, "step": 10780 }, { "grad_norm": 0.30060678720474243, "learning_rate": 9.546295256284516e-05, "loss": 0.1325, "step": 10790 }, { "grad_norm": 0.33311787247657776, "learning_rate": 9.545147524380265e-05, "loss": 0.1312, "step": 10800 }, { "grad_norm": 0.3306635916233063, "learning_rate": 9.543998411780201e-05, "loss": 0.1297, "step": 10810 }, { "grad_norm": 0.339439332485199, "learning_rate": 9.542847918833397e-05, "loss": 0.1333, "step": 10820 }, { "grad_norm": 0.35006749629974365, "learning_rate": 9.541696045889343e-05, "loss": 0.1398, "step": 10830 }, { "grad_norm": 0.36180445551872253, "learning_rate": 9.540542793297947e-05, "loss": 0.1368, "step": 10840 }, { "grad_norm": 0.31919705867767334, "learning_rate": 9.539388161409537e-05, "loss": 0.1266, "step": 10850 }, { "grad_norm": 0.3524421453475952, "learning_rate": 9.538232150574857e-05, "loss": 0.1248, "step": 10860 }, { "grad_norm": 0.3013788163661957, "learning_rate": 9.537074761145076e-05, "loss": 0.1291, "step": 10870 }, { "grad_norm": 0.31887274980545044, "learning_rate": 9.535915993471778e-05, "loss": 0.1274, "step": 10880 }, { "grad_norm": 0.3360661566257477, "learning_rate": 9.534755847906964e-05, "loss": 0.1281, "step": 10890 }, { "grad_norm": 0.37384939193725586, "learning_rate": 9.533594324803057e-05, "loss": 0.1297, "step": 10900 }, { "grad_norm": 0.33655279874801636, "learning_rate": 9.532431424512895e-05, "loss": 0.1311, "step": 10910 }, { "grad_norm": 0.37829580903053284, "learning_rate": 9.531267147389741e-05, "loss": 0.1228, "step": 10920 }, { "grad_norm": 0.2667408585548401, "learning_rate": 9.530101493787266e-05, "loss": 0.1355, "step": 10930 }, { "grad_norm": 0.3691570460796356, "learning_rate": 9.528934464059571e-05, "loss": 0.1326, "step": 10940 }, { "grad_norm": 0.3245776295661926, "learning_rate": 9.527766058561163e-05, "loss": 0.1303, "step": 10950 }, { "grad_norm": 0.36199885606765747, "learning_rate": 9.526596277646976e-05, "loss": 0.1289, "step": 10960 }, { "grad_norm": 0.3360731899738312, "learning_rate": 9.525425121672358e-05, "loss": 0.13, "step": 10970 }, { "grad_norm": 0.31335198879241943, "learning_rate": 9.524252590993074e-05, "loss": 0.1325, "step": 10980 }, { "grad_norm": 0.31162229180336, "learning_rate": 9.523078685965309e-05, "loss": 0.1262, "step": 10990 }, { "grad_norm": 0.36216840147972107, "learning_rate": 9.521903406945664e-05, "loss": 0.1387, "step": 11000 }, { "grad_norm": 0.3258671164512634, "learning_rate": 9.520726754291158e-05, "loss": 0.1287, "step": 11010 }, { "grad_norm": 0.365253210067749, "learning_rate": 9.519548728359227e-05, "loss": 0.1365, "step": 11020 }, { "grad_norm": 0.38892191648483276, "learning_rate": 9.518369329507726e-05, "loss": 0.1458, "step": 11030 }, { "grad_norm": 0.26771169900894165, "learning_rate": 9.51718855809492e-05, "loss": 0.1198, "step": 11040 }, { "grad_norm": 0.28159672021865845, "learning_rate": 9.516006414479502e-05, "loss": 0.1318, "step": 11050 }, { "grad_norm": 0.34165605902671814, "learning_rate": 9.514822899020572e-05, "loss": 0.1242, "step": 11060 }, { "grad_norm": 0.37607356905937195, "learning_rate": 9.513638012077654e-05, "loss": 0.1366, "step": 11070 }, { "grad_norm": 0.3063693046569824, "learning_rate": 9.512451754010683e-05, "loss": 0.1354, "step": 11080 }, { "grad_norm": 0.3196542263031006, "learning_rate": 9.511264125180013e-05, "loss": 0.1283, "step": 11090 }, { "grad_norm": 0.3885175287723541, "learning_rate": 9.510075125946414e-05, "loss": 0.1353, "step": 11100 }, { "grad_norm": 0.34113413095474243, "learning_rate": 9.508884756671075e-05, "loss": 0.1411, "step": 11110 }, { "grad_norm": 0.44202545285224915, "learning_rate": 9.507693017715596e-05, "loss": 0.128, "step": 11120 }, { "grad_norm": 0.3471803069114685, "learning_rate": 9.506499909441997e-05, "loss": 0.129, "step": 11130 }, { "grad_norm": 0.3050287961959839, "learning_rate": 9.505305432212713e-05, "loss": 0.1288, "step": 11140 }, { "grad_norm": 0.3962189555168152, "learning_rate": 9.504109586390595e-05, "loss": 0.1388, "step": 11150 }, { "grad_norm": 0.33261388540267944, "learning_rate": 9.502912372338908e-05, "loss": 0.1363, "step": 11160 }, { "grad_norm": 0.34948739409446716, "learning_rate": 9.501713790421335e-05, "loss": 0.136, "step": 11170 }, { "grad_norm": 0.2411874383687973, "learning_rate": 9.500513841001974e-05, "loss": 0.1265, "step": 11180 }, { "grad_norm": 0.29685622453689575, "learning_rate": 9.499312524445336e-05, "loss": 0.1306, "step": 11190 }, { "grad_norm": 0.32845261693000793, "learning_rate": 9.498109841116351e-05, "loss": 0.135, "step": 11200 }, { "grad_norm": 0.3450257182121277, "learning_rate": 9.496905791380363e-05, "loss": 0.1375, "step": 11210 }, { "grad_norm": 0.3769330382347107, "learning_rate": 9.495700375603129e-05, "loss": 0.1318, "step": 11220 }, { "grad_norm": 0.3394305109977722, "learning_rate": 9.494493594150822e-05, "loss": 0.1402, "step": 11230 }, { "grad_norm": 0.32367056608200073, "learning_rate": 9.493285447390032e-05, "loss": 0.1236, "step": 11240 }, { "grad_norm": 0.28303542733192444, "learning_rate": 9.492075935687761e-05, "loss": 0.1364, "step": 11250 }, { "grad_norm": 0.33244189620018005, "learning_rate": 9.490865059411427e-05, "loss": 0.133, "step": 11260 }, { "grad_norm": 0.35815247893333435, "learning_rate": 9.489652818928863e-05, "loss": 0.1313, "step": 11270 }, { "grad_norm": 0.2808220684528351, "learning_rate": 9.488439214608315e-05, "loss": 0.1277, "step": 11280 }, { "grad_norm": 0.28458377718925476, "learning_rate": 9.487224246818444e-05, "loss": 0.1381, "step": 11290 }, { "grad_norm": 0.3892328143119812, "learning_rate": 9.486007915928325e-05, "loss": 0.1332, "step": 11300 }, { "grad_norm": 0.3341919779777527, "learning_rate": 9.484790222307448e-05, "loss": 0.1249, "step": 11310 }, { "grad_norm": 0.3162122368812561, "learning_rate": 9.483571166325716e-05, "loss": 0.125, "step": 11320 }, { "grad_norm": 0.26926925778388977, "learning_rate": 9.482350748353444e-05, "loss": 0.1279, "step": 11330 }, { "grad_norm": 0.3485700190067291, "learning_rate": 9.481128968761363e-05, "loss": 0.138, "step": 11340 }, { "grad_norm": 0.30314651131629944, "learning_rate": 9.479905827920621e-05, "loss": 0.119, "step": 11350 }, { "grad_norm": 0.3927820026874542, "learning_rate": 9.478681326202773e-05, "loss": 0.1222, "step": 11360 }, { "grad_norm": 0.32472798228263855, "learning_rate": 9.477455463979791e-05, "loss": 0.1298, "step": 11370 }, { "grad_norm": 0.30529189109802246, "learning_rate": 9.476228241624059e-05, "loss": 0.1252, "step": 11380 }, { "grad_norm": 0.2866649627685547, "learning_rate": 9.474999659508374e-05, "loss": 0.1221, "step": 11390 }, { "grad_norm": 0.28946518898010254, "learning_rate": 9.47376971800595e-05, "loss": 0.1192, "step": 11400 }, { "grad_norm": 0.3191961646080017, "learning_rate": 9.472538417490409e-05, "loss": 0.1341, "step": 11410 }, { "grad_norm": 0.32917723059654236, "learning_rate": 9.471305758335784e-05, "loss": 0.1309, "step": 11420 }, { "grad_norm": 0.2871326506137848, "learning_rate": 9.47007174091653e-05, "loss": 0.1274, "step": 11430 }, { "grad_norm": 0.26825740933418274, "learning_rate": 9.468836365607507e-05, "loss": 0.1284, "step": 11440 }, { "grad_norm": 0.3543224334716797, "learning_rate": 9.467599632783988e-05, "loss": 0.132, "step": 11450 }, { "grad_norm": 0.33025145530700684, "learning_rate": 9.466361542821662e-05, "loss": 0.1458, "step": 11460 }, { "grad_norm": 0.31839966773986816, "learning_rate": 9.465122096096625e-05, "loss": 0.137, "step": 11470 }, { "grad_norm": 0.3025517165660858, "learning_rate": 9.463881292985391e-05, "loss": 0.1269, "step": 11480 }, { "grad_norm": 0.3939780592918396, "learning_rate": 9.462639133864881e-05, "loss": 0.1276, "step": 11490 }, { "grad_norm": 0.309218168258667, "learning_rate": 9.461395619112432e-05, "loss": 0.1313, "step": 11500 }, { "grad_norm": 0.3290044963359833, "learning_rate": 9.460150749105791e-05, "loss": 0.1282, "step": 11510 }, { "grad_norm": 0.2925330698490143, "learning_rate": 9.458904524223116e-05, "loss": 0.1276, "step": 11520 }, { "grad_norm": 0.33191248774528503, "learning_rate": 9.457656944842976e-05, "loss": 0.1329, "step": 11530 }, { "grad_norm": 0.29449182748794556, "learning_rate": 9.456408011344353e-05, "loss": 0.1262, "step": 11540 }, { "grad_norm": 0.37353643774986267, "learning_rate": 9.455157724106643e-05, "loss": 0.1345, "step": 11550 }, { "grad_norm": 0.29533693194389343, "learning_rate": 9.453906083509647e-05, "loss": 0.1316, "step": 11560 }, { "grad_norm": 0.32115137577056885, "learning_rate": 9.45265308993358e-05, "loss": 0.1269, "step": 11570 }, { "grad_norm": 0.273895263671875, "learning_rate": 9.451398743759071e-05, "loss": 0.1223, "step": 11580 }, { "grad_norm": 0.27834591269493103, "learning_rate": 9.450143045367156e-05, "loss": 0.1288, "step": 11590 }, { "grad_norm": 0.33914366364479065, "learning_rate": 9.448885995139283e-05, "loss": 0.1324, "step": 11600 }, { "grad_norm": 0.2735610902309418, "learning_rate": 9.44762759345731e-05, "loss": 0.1272, "step": 11610 }, { "grad_norm": 0.34272438287734985, "learning_rate": 9.446367840703509e-05, "loss": 0.1345, "step": 11620 }, { "grad_norm": 0.36046016216278076, "learning_rate": 9.445106737260556e-05, "loss": 0.1238, "step": 11630 }, { "grad_norm": 0.23035132884979248, "learning_rate": 9.443844283511543e-05, "loss": 0.1275, "step": 11640 }, { "grad_norm": 0.3053935170173645, "learning_rate": 9.442580479839968e-05, "loss": 0.1296, "step": 11650 }, { "grad_norm": 0.30858367681503296, "learning_rate": 9.441315326629745e-05, "loss": 0.1296, "step": 11660 }, { "grad_norm": 0.31621983647346497, "learning_rate": 9.44004882426519e-05, "loss": 0.1287, "step": 11670 }, { "grad_norm": 0.28442487120628357, "learning_rate": 9.438780973131037e-05, "loss": 0.1263, "step": 11680 }, { "grad_norm": 0.32811474800109863, "learning_rate": 9.437511773612423e-05, "loss": 0.1308, "step": 11690 }, { "grad_norm": 0.24620065093040466, "learning_rate": 9.436241226094896e-05, "loss": 0.1252, "step": 11700 }, { "grad_norm": 0.2759955823421478, "learning_rate": 9.434969330964418e-05, "loss": 0.1313, "step": 11710 }, { "grad_norm": 0.3162033259868622, "learning_rate": 9.433696088607356e-05, "loss": 0.1252, "step": 11720 }, { "grad_norm": 0.2896290719509125, "learning_rate": 9.432421499410486e-05, "loss": 0.1214, "step": 11730 }, { "grad_norm": 0.2812982499599457, "learning_rate": 9.431145563760998e-05, "loss": 0.1247, "step": 11740 }, { "grad_norm": 0.2938506007194519, "learning_rate": 9.429868282046484e-05, "loss": 0.1306, "step": 11750 }, { "grad_norm": 0.30270916223526, "learning_rate": 9.428589654654951e-05, "loss": 0.1297, "step": 11760 }, { "grad_norm": 0.2975110113620758, "learning_rate": 9.42730968197481e-05, "loss": 0.124, "step": 11770 }, { "grad_norm": 0.32422947883605957, "learning_rate": 9.426028364394883e-05, "loss": 0.1284, "step": 11780 }, { "grad_norm": 0.34691718220710754, "learning_rate": 9.424745702304402e-05, "loss": 0.1298, "step": 11790 }, { "grad_norm": 0.266572505235672, "learning_rate": 9.423461696093006e-05, "loss": 0.1286, "step": 11800 }, { "grad_norm": 0.2611488699913025, "learning_rate": 9.422176346150741e-05, "loss": 0.1227, "step": 11810 }, { "grad_norm": 0.2938764989376068, "learning_rate": 9.420889652868063e-05, "loss": 0.1322, "step": 11820 }, { "grad_norm": 0.3806636929512024, "learning_rate": 9.419601616635836e-05, "loss": 0.1346, "step": 11830 }, { "grad_norm": 0.28184962272644043, "learning_rate": 9.418312237845331e-05, "loss": 0.1233, "step": 11840 }, { "grad_norm": 0.34219223260879517, "learning_rate": 9.417021516888225e-05, "loss": 0.122, "step": 11850 }, { "grad_norm": 0.36344897747039795, "learning_rate": 9.415729454156608e-05, "loss": 0.1386, "step": 11860 }, { "grad_norm": 0.3534705936908722, "learning_rate": 9.414436050042973e-05, "loss": 0.1351, "step": 11870 }, { "grad_norm": 0.3341466188430786, "learning_rate": 9.413141304940223e-05, "loss": 0.1262, "step": 11880 }, { "grad_norm": 0.2802118957042694, "learning_rate": 9.411845219241666e-05, "loss": 0.1292, "step": 11890 }, { "grad_norm": 0.3122096657752991, "learning_rate": 9.410547793341021e-05, "loss": 0.134, "step": 11900 }, { "grad_norm": 0.379457950592041, "learning_rate": 9.409249027632408e-05, "loss": 0.1287, "step": 11910 }, { "grad_norm": 0.34322696924209595, "learning_rate": 9.407948922510362e-05, "loss": 0.1284, "step": 11920 }, { "grad_norm": 0.27544525265693665, "learning_rate": 9.406647478369817e-05, "loss": 0.1258, "step": 11930 }, { "grad_norm": 0.3497299551963806, "learning_rate": 9.405344695606118e-05, "loss": 0.1326, "step": 11940 }, { "grad_norm": 0.33997318148612976, "learning_rate": 9.404040574615018e-05, "loss": 0.1244, "step": 11950 }, { "grad_norm": 0.29540687799453735, "learning_rate": 9.402735115792674e-05, "loss": 0.1236, "step": 11960 }, { "grad_norm": 0.3258680999279022, "learning_rate": 9.401428319535649e-05, "loss": 0.1319, "step": 11970 }, { "grad_norm": 0.3405366837978363, "learning_rate": 9.400120186240912e-05, "loss": 0.1289, "step": 11980 }, { "grad_norm": 0.3020670711994171, "learning_rate": 9.398810716305844e-05, "loss": 0.1263, "step": 11990 }, { "grad_norm": 0.3281182646751404, "learning_rate": 9.397499910128222e-05, "loss": 0.1235, "step": 12000 }, { "grad_norm": 0.4057546555995941, "learning_rate": 9.396187768106237e-05, "loss": 0.1313, "step": 12010 }, { "grad_norm": 0.3086274266242981, "learning_rate": 9.394874290638482e-05, "loss": 0.1287, "step": 12020 }, { "grad_norm": 0.3035244941711426, "learning_rate": 9.393559478123959e-05, "loss": 0.1311, "step": 12030 }, { "grad_norm": 0.33254295587539673, "learning_rate": 9.39224333096207e-05, "loss": 0.1255, "step": 12040 }, { "grad_norm": 0.283105731010437, "learning_rate": 9.390925849552629e-05, "loss": 0.1347, "step": 12050 }, { "grad_norm": 0.3245055079460144, "learning_rate": 9.389607034295849e-05, "loss": 0.1319, "step": 12060 }, { "grad_norm": 0.3181493282318115, "learning_rate": 9.388286885592355e-05, "loss": 0.126, "step": 12070 }, { "grad_norm": 0.34306174516677856, "learning_rate": 9.386965403843168e-05, "loss": 0.1299, "step": 12080 }, { "grad_norm": 0.2827678322792053, "learning_rate": 9.385642589449726e-05, "loss": 0.1162, "step": 12090 }, { "grad_norm": 0.2898194193840027, "learning_rate": 9.38431844281386e-05, "loss": 0.1298, "step": 12100 }, { "grad_norm": 0.31883367896080017, "learning_rate": 9.38299296433781e-05, "loss": 0.1272, "step": 12110 }, { "grad_norm": 0.331766277551651, "learning_rate": 9.381666154424226e-05, "loss": 0.1372, "step": 12120 }, { "grad_norm": 0.2865717113018036, "learning_rate": 9.380338013476157e-05, "loss": 0.1273, "step": 12130 }, { "grad_norm": 0.29766467213630676, "learning_rate": 9.379008541897054e-05, "loss": 0.1286, "step": 12140 }, { "grad_norm": 0.33174097537994385, "learning_rate": 9.377677740090777e-05, "loss": 0.1298, "step": 12150 }, { "grad_norm": 0.31999510526657104, "learning_rate": 9.376345608461588e-05, "loss": 0.1416, "step": 12160 }, { "grad_norm": 0.31338945031166077, "learning_rate": 9.375012147414155e-05, "loss": 0.1274, "step": 12170 }, { "grad_norm": 0.33366337418556213, "learning_rate": 9.373677357353545e-05, "loss": 0.1302, "step": 12180 }, { "grad_norm": 0.30788782238960266, "learning_rate": 9.372341238685237e-05, "loss": 0.1357, "step": 12190 }, { "grad_norm": 0.2827083468437195, "learning_rate": 9.371003791815102e-05, "loss": 0.126, "step": 12200 }, { "grad_norm": 0.3326703608036041, "learning_rate": 9.369665017149429e-05, "loss": 0.1314, "step": 12210 }, { "grad_norm": 0.3028552234172821, "learning_rate": 9.368324915094895e-05, "loss": 0.1297, "step": 12220 }, { "grad_norm": 0.3410350978374481, "learning_rate": 9.366983486058591e-05, "loss": 0.1427, "step": 12230 }, { "grad_norm": 0.2804310619831085, "learning_rate": 9.365640730448009e-05, "loss": 0.1286, "step": 12240 }, { "grad_norm": 0.33056148886680603, "learning_rate": 9.36429664867104e-05, "loss": 0.1253, "step": 12250 }, { "grad_norm": 0.30735161900520325, "learning_rate": 9.362951241135982e-05, "loss": 0.13, "step": 12260 }, { "grad_norm": 0.2976866066455841, "learning_rate": 9.361604508251534e-05, "loss": 0.1298, "step": 12270 }, { "grad_norm": 0.29208558797836304, "learning_rate": 9.360256450426799e-05, "loss": 0.122, "step": 12280 }, { "grad_norm": 0.2832409739494324, "learning_rate": 9.358907068071279e-05, "loss": 0.1254, "step": 12290 }, { "grad_norm": 0.2549532353878021, "learning_rate": 9.357556361594882e-05, "loss": 0.122, "step": 12300 }, { "grad_norm": 0.2852008640766144, "learning_rate": 9.356204331407917e-05, "loss": 0.121, "step": 12310 }, { "grad_norm": 0.3040929138660431, "learning_rate": 9.354850977921094e-05, "loss": 0.1274, "step": 12320 }, { "grad_norm": 0.30981945991516113, "learning_rate": 9.353496301545529e-05, "loss": 0.1237, "step": 12330 }, { "grad_norm": 0.2616605758666992, "learning_rate": 9.352140302692733e-05, "loss": 0.1273, "step": 12340 }, { "grad_norm": 0.33022671937942505, "learning_rate": 9.350782981774627e-05, "loss": 0.1284, "step": 12350 }, { "grad_norm": 0.2975847125053406, "learning_rate": 9.349424339203526e-05, "loss": 0.1262, "step": 12360 }, { "grad_norm": 0.27562806010246277, "learning_rate": 9.34806437539215e-05, "loss": 0.1209, "step": 12370 }, { "grad_norm": 0.30111366510391235, "learning_rate": 9.346703090753622e-05, "loss": 0.1333, "step": 12380 }, { "grad_norm": 0.3787344694137573, "learning_rate": 9.345340485701461e-05, "loss": 0.1232, "step": 12390 }, { "grad_norm": 0.26700422167778015, "learning_rate": 9.343976560649595e-05, "loss": 0.1375, "step": 12400 }, { "grad_norm": 0.38808777928352356, "learning_rate": 9.342611316012344e-05, "loss": 0.1338, "step": 12410 }, { "grad_norm": 0.3147278130054474, "learning_rate": 9.341244752204437e-05, "loss": 0.1274, "step": 12420 }, { "grad_norm": 0.30224496126174927, "learning_rate": 9.339876869640995e-05, "loss": 0.1302, "step": 12430 }, { "grad_norm": 0.2981621325016022, "learning_rate": 9.33850766873755e-05, "loss": 0.127, "step": 12440 }, { "grad_norm": 0.3031899034976959, "learning_rate": 9.337137149910028e-05, "loss": 0.1248, "step": 12450 }, { "grad_norm": 0.3072209656238556, "learning_rate": 9.335765313574753e-05, "loss": 0.1308, "step": 12460 }, { "grad_norm": 0.2972380220890045, "learning_rate": 9.334392160148457e-05, "loss": 0.125, "step": 12470 }, { "grad_norm": 0.30416953563690186, "learning_rate": 9.333017690048264e-05, "loss": 0.1356, "step": 12480 }, { "grad_norm": 0.26805275678634644, "learning_rate": 9.331641903691706e-05, "loss": 0.1242, "step": 12490 }, { "grad_norm": 0.3036195933818817, "learning_rate": 9.330264801496707e-05, "loss": 0.1276, "step": 12500 }, { "grad_norm": 0.2958117425441742, "learning_rate": 9.328886383881594e-05, "loss": 0.1263, "step": 12510 }, { "grad_norm": 0.3709637224674225, "learning_rate": 9.327506651265095e-05, "loss": 0.1239, "step": 12520 }, { "grad_norm": 0.31534281373023987, "learning_rate": 9.326125604066338e-05, "loss": 0.1186, "step": 12530 }, { "grad_norm": 0.3155990540981293, "learning_rate": 9.324743242704847e-05, "loss": 0.1239, "step": 12540 }, { "grad_norm": 0.3603028655052185, "learning_rate": 9.323359567600546e-05, "loss": 0.129, "step": 12550 }, { "grad_norm": 0.29103511571884155, "learning_rate": 9.321974579173761e-05, "loss": 0.1218, "step": 12560 }, { "grad_norm": 0.2653474509716034, "learning_rate": 9.320588277845213e-05, "loss": 0.1289, "step": 12570 }, { "grad_norm": 0.30757027864456177, "learning_rate": 9.319200664036026e-05, "loss": 0.1333, "step": 12580 }, { "grad_norm": 0.3390921354293823, "learning_rate": 9.31781173816772e-05, "loss": 0.1251, "step": 12590 }, { "grad_norm": 0.2692849934101105, "learning_rate": 9.316421500662212e-05, "loss": 0.1326, "step": 12600 }, { "grad_norm": 0.2699835002422333, "learning_rate": 9.31502995194182e-05, "loss": 0.1194, "step": 12610 }, { "grad_norm": 0.312071830034256, "learning_rate": 9.31363709242926e-05, "loss": 0.1286, "step": 12620 }, { "grad_norm": 0.2625146806240082, "learning_rate": 9.312242922547647e-05, "loss": 0.1244, "step": 12630 }, { "grad_norm": 0.2924734354019165, "learning_rate": 9.310847442720492e-05, "loss": 0.128, "step": 12640 }, { "grad_norm": 0.32739391922950745, "learning_rate": 9.309450653371706e-05, "loss": 0.129, "step": 12650 }, { "grad_norm": 0.33928531408309937, "learning_rate": 9.308052554925595e-05, "loss": 0.1271, "step": 12660 }, { "grad_norm": 0.27748793363571167, "learning_rate": 9.306653147806867e-05, "loss": 0.1318, "step": 12670 }, { "grad_norm": 0.26464399695396423, "learning_rate": 9.305252432440622e-05, "loss": 0.125, "step": 12680 }, { "grad_norm": 0.33170631527900696, "learning_rate": 9.303850409252361e-05, "loss": 0.1363, "step": 12690 }, { "grad_norm": 0.29023703932762146, "learning_rate": 9.302447078667985e-05, "loss": 0.1272, "step": 12700 }, { "grad_norm": 0.3491586148738861, "learning_rate": 9.301042441113783e-05, "loss": 0.1227, "step": 12710 }, { "grad_norm": 0.2889450490474701, "learning_rate": 9.299636497016451e-05, "loss": 0.133, "step": 12720 }, { "grad_norm": 0.2785283923149109, "learning_rate": 9.298229246803076e-05, "loss": 0.1259, "step": 12730 }, { "grad_norm": 0.33528774976730347, "learning_rate": 9.296820690901144e-05, "loss": 0.1258, "step": 12740 }, { "grad_norm": 0.2913602888584137, "learning_rate": 9.295410829738539e-05, "loss": 0.1239, "step": 12750 }, { "grad_norm": 0.26225602626800537, "learning_rate": 9.293999663743535e-05, "loss": 0.1275, "step": 12760 }, { "grad_norm": 0.31315451860427856, "learning_rate": 9.292587193344813e-05, "loss": 0.1187, "step": 12770 }, { "grad_norm": 0.30628693103790283, "learning_rate": 9.291173418971437e-05, "loss": 0.1321, "step": 12780 }, { "grad_norm": 0.37860745191574097, "learning_rate": 9.28975834105288e-05, "loss": 0.1323, "step": 12790 }, { "grad_norm": 0.39365965127944946, "learning_rate": 9.288341960019004e-05, "loss": 0.133, "step": 12800 }, { "grad_norm": 0.2865135073661804, "learning_rate": 9.286924276300067e-05, "loss": 0.1448, "step": 12810 }, { "grad_norm": 0.2828204333782196, "learning_rate": 9.285505290326726e-05, "loss": 0.1351, "step": 12820 }, { "grad_norm": 0.32213130593299866, "learning_rate": 9.284085002530027e-05, "loss": 0.1288, "step": 12830 }, { "grad_norm": 0.31136810779571533, "learning_rate": 9.282663413341422e-05, "loss": 0.1312, "step": 12840 }, { "grad_norm": 0.30831101536750793, "learning_rate": 9.281240523192747e-05, "loss": 0.1264, "step": 12850 }, { "grad_norm": 0.2874046564102173, "learning_rate": 9.279816332516242e-05, "loss": 0.1298, "step": 12860 }, { "grad_norm": 0.2998248040676117, "learning_rate": 9.278390841744536e-05, "loss": 0.1274, "step": 12870 }, { "grad_norm": 0.3065885901451111, "learning_rate": 9.276964051310658e-05, "loss": 0.1234, "step": 12880 }, { "grad_norm": 0.27366045117378235, "learning_rate": 9.275535961648027e-05, "loss": 0.132, "step": 12890 }, { "grad_norm": 0.2532235085964203, "learning_rate": 9.274106573190459e-05, "loss": 0.1231, "step": 12900 }, { "grad_norm": 0.2434072643518448, "learning_rate": 9.272675886372168e-05, "loss": 0.1255, "step": 12910 }, { "grad_norm": 0.3309999108314514, "learning_rate": 9.271243901627754e-05, "loss": 0.1373, "step": 12920 }, { "grad_norm": 0.29932430386543274, "learning_rate": 9.269810619392219e-05, "loss": 0.1281, "step": 12930 }, { "grad_norm": 0.3104681372642517, "learning_rate": 9.268376040100955e-05, "loss": 0.1215, "step": 12940 }, { "grad_norm": 0.3074192702770233, "learning_rate": 9.266940164189752e-05, "loss": 0.1308, "step": 12950 }, { "grad_norm": 0.30669012665748596, "learning_rate": 9.265502992094787e-05, "loss": 0.1252, "step": 12960 }, { "grad_norm": 0.26333141326904297, "learning_rate": 9.264064524252638e-05, "loss": 0.1278, "step": 12970 }, { "grad_norm": 0.29311761260032654, "learning_rate": 9.262624761100271e-05, "loss": 0.1228, "step": 12980 }, { "grad_norm": 0.33328643441200256, "learning_rate": 9.261183703075051e-05, "loss": 0.1276, "step": 12990 }, { "grad_norm": 0.31741878390312195, "learning_rate": 9.259741350614733e-05, "loss": 0.1333, "step": 13000 }, { "grad_norm": 0.2867833971977234, "learning_rate": 9.258297704157464e-05, "loss": 0.1356, "step": 13010 }, { "grad_norm": 0.2766583263874054, "learning_rate": 9.256852764141786e-05, "loss": 0.1296, "step": 13020 }, { "grad_norm": 0.3627084493637085, "learning_rate": 9.255406531006634e-05, "loss": 0.1313, "step": 13030 }, { "grad_norm": 0.2779291570186615, "learning_rate": 9.253959005191335e-05, "loss": 0.125, "step": 13040 }, { "grad_norm": 0.3172273635864258, "learning_rate": 9.25251018713561e-05, "loss": 0.1296, "step": 13050 }, { "grad_norm": 0.2975636422634125, "learning_rate": 9.251060077279571e-05, "loss": 0.1313, "step": 13060 }, { "grad_norm": 0.26280155777931213, "learning_rate": 9.249608676063724e-05, "loss": 0.1159, "step": 13070 }, { "grad_norm": 0.312924861907959, "learning_rate": 9.248155983928964e-05, "loss": 0.1315, "step": 13080 }, { "grad_norm": 0.27666470408439636, "learning_rate": 9.246702001316583e-05, "loss": 0.1252, "step": 13090 }, { "grad_norm": 0.29422834515571594, "learning_rate": 9.245246728668262e-05, "loss": 0.1325, "step": 13100 }, { "grad_norm": 0.24457168579101562, "learning_rate": 9.243790166426073e-05, "loss": 0.1285, "step": 13110 }, { "grad_norm": 0.3353123962879181, "learning_rate": 9.242332315032484e-05, "loss": 0.134, "step": 13120 }, { "grad_norm": 0.33665454387664795, "learning_rate": 9.240873174930349e-05, "loss": 0.132, "step": 13130 }, { "grad_norm": 0.3073258101940155, "learning_rate": 9.239412746562917e-05, "loss": 0.1264, "step": 13140 }, { "grad_norm": 0.29888930916786194, "learning_rate": 9.237951030373828e-05, "loss": 0.1215, "step": 13150 }, { "grad_norm": 0.26076775789260864, "learning_rate": 9.236488026807113e-05, "loss": 0.1184, "step": 13160 }, { "grad_norm": 0.2967705726623535, "learning_rate": 9.235023736307193e-05, "loss": 0.1315, "step": 13170 }, { "grad_norm": 0.34692147374153137, "learning_rate": 9.233558159318881e-05, "loss": 0.1258, "step": 13180 }, { "grad_norm": 0.3018060326576233, "learning_rate": 9.232091296287382e-05, "loss": 0.1182, "step": 13190 }, { "grad_norm": 0.32571470737457275, "learning_rate": 9.230623147658288e-05, "loss": 0.1368, "step": 13200 }, { "grad_norm": 0.28576526045799255, "learning_rate": 9.229153713877586e-05, "loss": 0.1331, "step": 13210 }, { "grad_norm": 0.23621097207069397, "learning_rate": 9.227682995391649e-05, "loss": 0.1311, "step": 13220 }, { "grad_norm": 0.27857062220573425, "learning_rate": 9.226210992647243e-05, "loss": 0.1318, "step": 13230 }, { "grad_norm": 0.32927361130714417, "learning_rate": 9.224737706091525e-05, "loss": 0.1269, "step": 13240 }, { "grad_norm": 0.30610236525535583, "learning_rate": 9.223263136172039e-05, "loss": 0.1341, "step": 13250 }, { "grad_norm": 0.3569173812866211, "learning_rate": 9.22178728333672e-05, "loss": 0.1355, "step": 13260 }, { "grad_norm": 0.3935225307941437, "learning_rate": 9.220310148033897e-05, "loss": 0.1413, "step": 13270 }, { "grad_norm": 0.3176920711994171, "learning_rate": 9.21883173071228e-05, "loss": 0.1284, "step": 13280 }, { "grad_norm": 0.35914942622184753, "learning_rate": 9.217352031820976e-05, "loss": 0.1365, "step": 13290 }, { "grad_norm": 0.32142728567123413, "learning_rate": 9.215871051809477e-05, "loss": 0.1341, "step": 13300 }, { "grad_norm": 0.2584364414215088, "learning_rate": 9.214388791127666e-05, "loss": 0.1156, "step": 13310 }, { "grad_norm": 0.3061908781528473, "learning_rate": 9.212905250225814e-05, "loss": 0.1269, "step": 13320 }, { "grad_norm": 0.3706182837486267, "learning_rate": 9.211420429554583e-05, "loss": 0.1326, "step": 13330 }, { "grad_norm": 0.3502230644226074, "learning_rate": 9.209934329565022e-05, "loss": 0.1294, "step": 13340 }, { "grad_norm": 0.29722607135772705, "learning_rate": 9.208446950708568e-05, "loss": 0.1251, "step": 13350 }, { "grad_norm": 0.3207128047943115, "learning_rate": 9.20695829343705e-05, "loss": 0.1258, "step": 13360 }, { "grad_norm": 0.24632152915000916, "learning_rate": 9.205468358202678e-05, "loss": 0.1213, "step": 13370 }, { "grad_norm": 0.32952794432640076, "learning_rate": 9.203977145458059e-05, "loss": 0.1291, "step": 13380 }, { "grad_norm": 0.29482313990592957, "learning_rate": 9.202484655656182e-05, "loss": 0.1266, "step": 13390 }, { "grad_norm": 0.28886792063713074, "learning_rate": 9.200990889250427e-05, "loss": 0.1214, "step": 13400 }, { "grad_norm": 0.2504325211048126, "learning_rate": 9.19949584669456e-05, "loss": 0.133, "step": 13410 }, { "grad_norm": 0.2541996240615845, "learning_rate": 9.197999528442738e-05, "loss": 0.1349, "step": 13420 }, { "grad_norm": 0.312604159116745, "learning_rate": 9.196501934949499e-05, "loss": 0.138, "step": 13430 }, { "grad_norm": 0.2600819170475006, "learning_rate": 9.195003066669776e-05, "loss": 0.1343, "step": 13440 }, { "grad_norm": 0.3280467689037323, "learning_rate": 9.193502924058884e-05, "loss": 0.1383, "step": 13450 }, { "grad_norm": 0.29311782121658325, "learning_rate": 9.192001507572526e-05, "loss": 0.1313, "step": 13460 }, { "grad_norm": 0.28863903880119324, "learning_rate": 9.190498817666793e-05, "loss": 0.1301, "step": 13470 }, { "grad_norm": 0.36048975586891174, "learning_rate": 9.188994854798163e-05, "loss": 0.1195, "step": 13480 }, { "grad_norm": 0.29396873712539673, "learning_rate": 9.187489619423499e-05, "loss": 0.1375, "step": 13490 }, { "grad_norm": 0.2879737913608551, "learning_rate": 9.185983112000056e-05, "loss": 0.1308, "step": 13500 }, { "grad_norm": 0.4250825345516205, "learning_rate": 9.184475332985464e-05, "loss": 0.1362, "step": 13510 }, { "grad_norm": 0.37242794036865234, "learning_rate": 9.182966282837754e-05, "loss": 0.122, "step": 13520 }, { "grad_norm": 0.29610610008239746, "learning_rate": 9.18145596201533e-05, "loss": 0.1273, "step": 13530 }, { "grad_norm": 0.3186778128147125, "learning_rate": 9.179944370976991e-05, "loss": 0.1332, "step": 13540 }, { "grad_norm": 0.34201616048812866, "learning_rate": 9.178431510181918e-05, "loss": 0.1338, "step": 13550 }, { "grad_norm": 0.3186924457550049, "learning_rate": 9.176917380089675e-05, "loss": 0.1411, "step": 13560 }, { "grad_norm": 0.26496657729148865, "learning_rate": 9.175401981160219e-05, "loss": 0.1231, "step": 13570 }, { "grad_norm": 0.30657583475112915, "learning_rate": 9.173885313853885e-05, "loss": 0.133, "step": 13580 }, { "grad_norm": 0.33321326971054077, "learning_rate": 9.172367378631398e-05, "loss": 0.1283, "step": 13590 }, { "grad_norm": 0.2908218801021576, "learning_rate": 9.170848175953866e-05, "loss": 0.1267, "step": 13600 }, { "grad_norm": 0.3213663399219513, "learning_rate": 9.169327706282784e-05, "loss": 0.1315, "step": 13610 }, { "grad_norm": 0.26476824283599854, "learning_rate": 9.167805970080029e-05, "loss": 0.1344, "step": 13620 }, { "grad_norm": 0.29299041628837585, "learning_rate": 9.166282967807864e-05, "loss": 0.1402, "step": 13630 }, { "grad_norm": 0.3240971267223358, "learning_rate": 9.16475869992894e-05, "loss": 0.1277, "step": 13640 }, { "grad_norm": 0.26759111881256104, "learning_rate": 9.163233166906284e-05, "loss": 0.1271, "step": 13650 }, { "grad_norm": 0.310306578874588, "learning_rate": 9.161706369203317e-05, "loss": 0.1287, "step": 13660 }, { "grad_norm": 0.3072526454925537, "learning_rate": 9.16017830728384e-05, "loss": 0.127, "step": 13670 }, { "grad_norm": 0.2846507132053375, "learning_rate": 9.158648981612035e-05, "loss": 0.1283, "step": 13680 }, { "grad_norm": 0.3316996991634369, "learning_rate": 9.157118392652472e-05, "loss": 0.1317, "step": 13690 }, { "grad_norm": 0.2871340215206146, "learning_rate": 9.155586540870104e-05, "loss": 0.1412, "step": 13700 }, { "grad_norm": 0.3569892346858978, "learning_rate": 9.154053426730267e-05, "loss": 0.1304, "step": 13710 }, { "grad_norm": 0.29916274547576904, "learning_rate": 9.15251905069868e-05, "loss": 0.1254, "step": 13720 }, { "grad_norm": 0.3165000379085541, "learning_rate": 9.150983413241446e-05, "loss": 0.1295, "step": 13730 }, { "grad_norm": 0.27698925137519836, "learning_rate": 9.149446514825051e-05, "loss": 0.1248, "step": 13740 }, { "grad_norm": 0.2845411002635956, "learning_rate": 9.147908355916365e-05, "loss": 0.1305, "step": 13750 }, { "grad_norm": 0.21297085285186768, "learning_rate": 9.146368936982642e-05, "loss": 0.1234, "step": 13760 }, { "grad_norm": 0.27577143907546997, "learning_rate": 9.144828258491511e-05, "loss": 0.1315, "step": 13770 }, { "grad_norm": 0.3197077214717865, "learning_rate": 9.143286320910996e-05, "loss": 0.122, "step": 13780 }, { "grad_norm": 0.3084214925765991, "learning_rate": 9.141743124709491e-05, "loss": 0.1317, "step": 13790 }, { "grad_norm": 0.3160024583339691, "learning_rate": 9.140198670355784e-05, "loss": 0.1297, "step": 13800 }, { "grad_norm": 0.2929399609565735, "learning_rate": 9.138652958319034e-05, "loss": 0.1348, "step": 13810 }, { "grad_norm": 0.3119374215602875, "learning_rate": 9.137105989068791e-05, "loss": 0.1327, "step": 13820 }, { "grad_norm": 0.29381540417671204, "learning_rate": 9.135557763074983e-05, "loss": 0.1245, "step": 13830 }, { "grad_norm": 0.3119882643222809, "learning_rate": 9.13400828080792e-05, "loss": 0.1305, "step": 13840 }, { "grad_norm": 0.24394437670707703, "learning_rate": 9.132457542738292e-05, "loss": 0.1215, "step": 13850 }, { "grad_norm": 0.3728210926055908, "learning_rate": 9.130905549337174e-05, "loss": 0.122, "step": 13860 }, { "grad_norm": 0.2807961702346802, "learning_rate": 9.129352301076021e-05, "loss": 0.1218, "step": 13870 }, { "grad_norm": 0.37488648295402527, "learning_rate": 9.127797798426668e-05, "loss": 0.1247, "step": 13880 }, { "grad_norm": 0.2578642964363098, "learning_rate": 9.126242041861333e-05, "loss": 0.1371, "step": 13890 }, { "grad_norm": 0.3214207887649536, "learning_rate": 9.124685031852611e-05, "loss": 0.1286, "step": 13900 }, { "grad_norm": 0.2786400020122528, "learning_rate": 9.123126768873482e-05, "loss": 0.1254, "step": 13910 }, { "grad_norm": 0.2833665609359741, "learning_rate": 9.121567253397308e-05, "loss": 0.1362, "step": 13920 }, { "grad_norm": 0.2975747883319855, "learning_rate": 9.120006485897824e-05, "loss": 0.1298, "step": 13930 }, { "grad_norm": 0.32333308458328247, "learning_rate": 9.118444466849152e-05, "loss": 0.1243, "step": 13940 }, { "grad_norm": 0.302156537771225, "learning_rate": 9.116881196725793e-05, "loss": 0.1336, "step": 13950 }, { "grad_norm": 0.29863518476486206, "learning_rate": 9.115316676002627e-05, "loss": 0.121, "step": 13960 }, { "grad_norm": 0.36087101697921753, "learning_rate": 9.113750905154911e-05, "loss": 0.1293, "step": 13970 }, { "grad_norm": 0.2945076525211334, "learning_rate": 9.112183884658289e-05, "loss": 0.1227, "step": 13980 }, { "grad_norm": 0.3306335508823395, "learning_rate": 9.11061561498878e-05, "loss": 0.1291, "step": 13990 }, { "grad_norm": 0.29154643416404724, "learning_rate": 9.109046096622779e-05, "loss": 0.1189, "step": 14000 }, { "grad_norm": 0.29471197724342346, "learning_rate": 9.107475330037069e-05, "loss": 0.1265, "step": 14010 }, { "grad_norm": 0.3010765016078949, "learning_rate": 9.105903315708806e-05, "loss": 0.1297, "step": 14020 }, { "grad_norm": 0.3730803430080414, "learning_rate": 9.104330054115524e-05, "loss": 0.1285, "step": 14030 }, { "grad_norm": 0.2642109990119934, "learning_rate": 9.102755545735141e-05, "loss": 0.1225, "step": 14040 }, { "grad_norm": 0.26687681674957275, "learning_rate": 9.10117979104595e-05, "loss": 0.1185, "step": 14050 }, { "grad_norm": 0.24286814033985138, "learning_rate": 9.099602790526624e-05, "loss": 0.1327, "step": 14060 }, { "grad_norm": 0.323750376701355, "learning_rate": 9.098024544656212e-05, "loss": 0.1286, "step": 14070 }, { "grad_norm": 0.275012731552124, "learning_rate": 9.096445053914148e-05, "loss": 0.1236, "step": 14080 }, { "grad_norm": 0.31415802240371704, "learning_rate": 9.094864318780236e-05, "loss": 0.1252, "step": 14090 }, { "grad_norm": 0.285910427570343, "learning_rate": 9.093282339734663e-05, "loss": 0.1203, "step": 14100 }, { "grad_norm": 0.29623082280158997, "learning_rate": 9.091699117257992e-05, "loss": 0.1233, "step": 14110 }, { "grad_norm": 0.2962034046649933, "learning_rate": 9.090114651831163e-05, "loss": 0.1239, "step": 14120 }, { "grad_norm": 0.3591517210006714, "learning_rate": 9.088528943935497e-05, "loss": 0.1258, "step": 14130 }, { "grad_norm": 0.33198484778404236, "learning_rate": 9.086941994052689e-05, "loss": 0.1308, "step": 14140 }, { "grad_norm": 0.3276165723800659, "learning_rate": 9.085353802664813e-05, "loss": 0.1203, "step": 14150 }, { "grad_norm": 0.32476094365119934, "learning_rate": 9.08376437025432e-05, "loss": 0.1253, "step": 14160 }, { "grad_norm": 0.3055660128593445, "learning_rate": 9.082173697304035e-05, "loss": 0.1276, "step": 14170 }, { "grad_norm": 0.2891024649143219, "learning_rate": 9.080581784297166e-05, "loss": 0.1262, "step": 14180 }, { "grad_norm": 0.2740764021873474, "learning_rate": 9.078988631717291e-05, "loss": 0.1301, "step": 14190 }, { "grad_norm": 0.27378523349761963, "learning_rate": 9.077394240048369e-05, "loss": 0.1294, "step": 14200 }, { "grad_norm": 0.35268059372901917, "learning_rate": 9.075798609774736e-05, "loss": 0.1301, "step": 14210 }, { "grad_norm": 0.37336456775665283, "learning_rate": 9.0742017413811e-05, "loss": 0.1286, "step": 14220 }, { "grad_norm": 0.32647955417633057, "learning_rate": 9.072603635352548e-05, "loss": 0.1382, "step": 14230 }, { "grad_norm": 0.28603503108024597, "learning_rate": 9.071004292174541e-05, "loss": 0.1223, "step": 14240 }, { "grad_norm": 0.29747846722602844, "learning_rate": 9.06940371233292e-05, "loss": 0.1367, "step": 14250 }, { "grad_norm": 0.2692274749279022, "learning_rate": 9.067801896313898e-05, "loss": 0.1309, "step": 14260 }, { "grad_norm": 0.2795654833316803, "learning_rate": 9.066198844604064e-05, "loss": 0.1237, "step": 14270 }, { "grad_norm": 0.28152376413345337, "learning_rate": 9.06459455769038e-05, "loss": 0.1285, "step": 14280 }, { "grad_norm": 0.27824416756629944, "learning_rate": 9.062989036060193e-05, "loss": 0.1151, "step": 14290 }, { "grad_norm": 0.27786579728126526, "learning_rate": 9.061382280201212e-05, "loss": 0.1233, "step": 14300 }, { "grad_norm": 0.27620550990104675, "learning_rate": 9.059774290601528e-05, "loss": 0.1201, "step": 14310 }, { "grad_norm": 0.2851296067237854, "learning_rate": 9.058165067749606e-05, "loss": 0.1312, "step": 14320 }, { "grad_norm": 0.33078381419181824, "learning_rate": 9.056554612134288e-05, "loss": 0.1354, "step": 14330 }, { "grad_norm": 0.3228641152381897, "learning_rate": 9.054942924244785e-05, "loss": 0.1313, "step": 14340 }, { "grad_norm": 0.31025466322898865, "learning_rate": 9.053330004570686e-05, "loss": 0.1259, "step": 14350 }, { "grad_norm": 0.27157220244407654, "learning_rate": 9.051715853601955e-05, "loss": 0.13, "step": 14360 }, { "grad_norm": 0.3009704351425171, "learning_rate": 9.050100471828926e-05, "loss": 0.1304, "step": 14370 }, { "grad_norm": 0.3447461724281311, "learning_rate": 9.048483859742311e-05, "loss": 0.1319, "step": 14380 }, { "grad_norm": 0.19661946594715118, "learning_rate": 9.046866017833193e-05, "loss": 0.1197, "step": 14390 }, { "grad_norm": 0.3032904267311096, "learning_rate": 9.045246946593029e-05, "loss": 0.1287, "step": 14400 }, { "grad_norm": 0.2998185157775879, "learning_rate": 9.043626646513652e-05, "loss": 0.1271, "step": 14410 }, { "grad_norm": 0.25280508399009705, "learning_rate": 9.042005118087267e-05, "loss": 0.1268, "step": 14420 }, { "grad_norm": 0.2902035117149353, "learning_rate": 9.040382361806448e-05, "loss": 0.1187, "step": 14430 }, { "grad_norm": 0.23965607583522797, "learning_rate": 9.038758378164148e-05, "loss": 0.1168, "step": 14440 }, { "grad_norm": 0.3321903645992279, "learning_rate": 9.037133167653691e-05, "loss": 0.1333, "step": 14450 }, { "grad_norm": 0.29135969281196594, "learning_rate": 9.035506730768771e-05, "loss": 0.1273, "step": 14460 }, { "grad_norm": 0.32131659984588623, "learning_rate": 9.033879068003458e-05, "loss": 0.1312, "step": 14470 }, { "grad_norm": 0.36486443877220154, "learning_rate": 9.032250179852193e-05, "loss": 0.1349, "step": 14480 }, { "grad_norm": 0.33246707916259766, "learning_rate": 9.030620066809787e-05, "loss": 0.1288, "step": 14490 }, { "grad_norm": 0.2942383587360382, "learning_rate": 9.028988729371428e-05, "loss": 0.1171, "step": 14500 }, { "grad_norm": 0.2897149920463562, "learning_rate": 9.027356168032673e-05, "loss": 0.131, "step": 14510 }, { "grad_norm": 0.2804950177669525, "learning_rate": 9.02572238328945e-05, "loss": 0.1205, "step": 14520 }, { "grad_norm": 0.30579274892807007, "learning_rate": 9.02408737563806e-05, "loss": 0.1203, "step": 14530 }, { "grad_norm": 0.3110488951206207, "learning_rate": 9.022451145575174e-05, "loss": 0.1396, "step": 14540 }, { "grad_norm": 0.23837697505950928, "learning_rate": 9.02081369359784e-05, "loss": 0.1297, "step": 14550 }, { "grad_norm": 0.273840069770813, "learning_rate": 9.019175020203465e-05, "loss": 0.1254, "step": 14560 }, { "grad_norm": 0.24638792872428894, "learning_rate": 9.017535125889842e-05, "loss": 0.1289, "step": 14570 }, { "grad_norm": 0.2680298388004303, "learning_rate": 9.015894011155124e-05, "loss": 0.1267, "step": 14580 }, { "grad_norm": 0.2589370012283325, "learning_rate": 9.014251676497838e-05, "loss": 0.1218, "step": 14590 }, { "grad_norm": 0.3070414066314697, "learning_rate": 9.012608122416884e-05, "loss": 0.1248, "step": 14600 }, { "grad_norm": 0.31435897946357727, "learning_rate": 9.010963349411529e-05, "loss": 0.1285, "step": 14610 }, { "grad_norm": 0.3338169455528259, "learning_rate": 9.00931735798141e-05, "loss": 0.1242, "step": 14620 }, { "grad_norm": 0.3339804410934448, "learning_rate": 9.00767014862654e-05, "loss": 0.1329, "step": 14630 }, { "grad_norm": 0.2822965979576111, "learning_rate": 9.006021721847295e-05, "loss": 0.1252, "step": 14640 }, { "grad_norm": 0.29756662249565125, "learning_rate": 9.004372078144423e-05, "loss": 0.1291, "step": 14650 }, { "grad_norm": 0.27702274918556213, "learning_rate": 9.002721218019043e-05, "loss": 0.1272, "step": 14660 }, { "grad_norm": 0.21284490823745728, "learning_rate": 9.001069141972642e-05, "loss": 0.125, "step": 14670 }, { "grad_norm": 0.2486388236284256, "learning_rate": 8.99941585050708e-05, "loss": 0.1279, "step": 14680 }, { "grad_norm": 0.26636579632759094, "learning_rate": 8.997761344124578e-05, "loss": 0.1259, "step": 14690 }, { "grad_norm": 0.33123883605003357, "learning_rate": 8.996105623327737e-05, "loss": 0.1301, "step": 14700 }, { "grad_norm": 0.3249562978744507, "learning_rate": 8.994448688619517e-05, "loss": 0.1261, "step": 14710 }, { "grad_norm": 0.2786236107349396, "learning_rate": 8.992790540503253e-05, "loss": 0.1394, "step": 14720 }, { "grad_norm": 0.2753862142562866, "learning_rate": 8.991131179482648e-05, "loss": 0.1292, "step": 14730 }, { "grad_norm": 0.2553400993347168, "learning_rate": 8.989470606061768e-05, "loss": 0.1377, "step": 14740 }, { "grad_norm": 0.3118550777435303, "learning_rate": 8.987808820745056e-05, "loss": 0.1303, "step": 14750 }, { "grad_norm": 0.29668474197387695, "learning_rate": 8.986145824037315e-05, "loss": 0.1226, "step": 14760 }, { "grad_norm": 0.2481732815504074, "learning_rate": 8.984481616443721e-05, "loss": 0.1383, "step": 14770 }, { "grad_norm": 0.30324798822402954, "learning_rate": 8.982816198469815e-05, "loss": 0.1257, "step": 14780 }, { "grad_norm": 0.26865580677986145, "learning_rate": 8.98114957062151e-05, "loss": 0.1309, "step": 14790 }, { "grad_norm": 0.29565343260765076, "learning_rate": 8.97948173340508e-05, "loss": 0.1183, "step": 14800 }, { "grad_norm": 0.243218332529068, "learning_rate": 8.977812687327172e-05, "loss": 0.1228, "step": 14810 }, { "grad_norm": 0.25527116656303406, "learning_rate": 8.976142432894798e-05, "loss": 0.1279, "step": 14820 }, { "grad_norm": 0.2360631674528122, "learning_rate": 8.974470970615336e-05, "loss": 0.1287, "step": 14830 }, { "grad_norm": 0.253194123506546, "learning_rate": 8.972798300996534e-05, "loss": 0.1279, "step": 14840 }, { "grad_norm": 0.2642633318901062, "learning_rate": 8.971124424546504e-05, "loss": 0.1227, "step": 14850 }, { "grad_norm": 0.29487672448158264, "learning_rate": 8.969449341773724e-05, "loss": 0.1271, "step": 14860 }, { "grad_norm": 0.3021458387374878, "learning_rate": 8.967773053187042e-05, "loss": 0.1273, "step": 14870 }, { "grad_norm": 0.28867530822753906, "learning_rate": 8.966095559295668e-05, "loss": 0.1253, "step": 14880 }, { "grad_norm": 0.2508962154388428, "learning_rate": 8.964416860609184e-05, "loss": 0.1245, "step": 14890 }, { "grad_norm": 0.2882542014122009, "learning_rate": 8.962736957637532e-05, "loss": 0.121, "step": 14900 }, { "grad_norm": 0.26893192529678345, "learning_rate": 8.96105585089102e-05, "loss": 0.1239, "step": 14910 }, { "grad_norm": 0.3383790850639343, "learning_rate": 8.959373540880329e-05, "loss": 0.1464, "step": 14920 }, { "grad_norm": 0.30084261298179626, "learning_rate": 8.957690028116495e-05, "loss": 0.1209, "step": 14930 }, { "grad_norm": 0.27357789874076843, "learning_rate": 8.956005313110928e-05, "loss": 0.1379, "step": 14940 }, { "grad_norm": 0.2968611419200897, "learning_rate": 8.9543193963754e-05, "loss": 0.1292, "step": 14950 }, { "grad_norm": 0.2851259708404541, "learning_rate": 8.952632278422048e-05, "loss": 0.1206, "step": 14960 }, { "grad_norm": 0.2513560950756073, "learning_rate": 8.95094395976337e-05, "loss": 0.1204, "step": 14970 }, { "grad_norm": 0.24546073377132416, "learning_rate": 8.949254440912239e-05, "loss": 0.1208, "step": 14980 }, { "grad_norm": 0.3311712145805359, "learning_rate": 8.94756372238188e-05, "loss": 0.1335, "step": 14990 }, { "grad_norm": 0.2877499461174011, "learning_rate": 8.945871804685892e-05, "loss": 0.1252, "step": 15000 }, { "grad_norm": 0.2616111636161804, "learning_rate": 8.944178688338236e-05, "loss": 0.1254, "step": 15010 }, { "grad_norm": 0.24171856045722961, "learning_rate": 8.942484373853233e-05, "loss": 0.1196, "step": 15020 }, { "grad_norm": 0.274126261472702, "learning_rate": 8.940788861745572e-05, "loss": 0.1304, "step": 15030 }, { "grad_norm": 0.2772763669490814, "learning_rate": 8.939092152530308e-05, "loss": 0.1245, "step": 15040 }, { "grad_norm": 0.38441020250320435, "learning_rate": 8.937394246722853e-05, "loss": 0.1277, "step": 15050 }, { "grad_norm": 0.30156901478767395, "learning_rate": 8.935695144838984e-05, "loss": 0.1297, "step": 15060 }, { "grad_norm": 0.27551916241645813, "learning_rate": 8.933994847394849e-05, "loss": 0.1295, "step": 15070 }, { "grad_norm": 0.26955142617225647, "learning_rate": 8.932293354906949e-05, "loss": 0.1226, "step": 15080 }, { "grad_norm": 0.36736199259757996, "learning_rate": 8.930590667892153e-05, "loss": 0.127, "step": 15090 }, { "grad_norm": 0.3258891701698303, "learning_rate": 8.928886786867696e-05, "loss": 0.1261, "step": 15100 }, { "grad_norm": 0.2999826967716217, "learning_rate": 8.927181712351168e-05, "loss": 0.1326, "step": 15110 }, { "grad_norm": 0.4149426519870758, "learning_rate": 8.925475444860527e-05, "loss": 0.1352, "step": 15120 }, { "grad_norm": 0.3260814845561981, "learning_rate": 8.923767984914092e-05, "loss": 0.1331, "step": 15130 }, { "grad_norm": 0.3054690957069397, "learning_rate": 8.922059333030545e-05, "loss": 0.1247, "step": 15140 }, { "grad_norm": 0.27958205342292786, "learning_rate": 8.920349489728928e-05, "loss": 0.1241, "step": 15150 }, { "grad_norm": 0.24338267743587494, "learning_rate": 8.918638455528646e-05, "loss": 0.1268, "step": 15160 }, { "grad_norm": 0.2563036382198334, "learning_rate": 8.916926230949468e-05, "loss": 0.1261, "step": 15170 }, { "grad_norm": 0.22778844833374023, "learning_rate": 8.915212816511522e-05, "loss": 0.1244, "step": 15180 }, { "grad_norm": 0.2976309061050415, "learning_rate": 8.913498212735296e-05, "loss": 0.1226, "step": 15190 }, { "grad_norm": 0.34004002809524536, "learning_rate": 8.911782420141643e-05, "loss": 0.1321, "step": 15200 }, { "grad_norm": 0.26955491304397583, "learning_rate": 8.910065439251775e-05, "loss": 0.1252, "step": 15210 }, { "grad_norm": 0.25894850492477417, "learning_rate": 8.908347270587268e-05, "loss": 0.125, "step": 15220 }, { "grad_norm": 0.24287161231040955, "learning_rate": 8.906627914670054e-05, "loss": 0.1274, "step": 15230 }, { "grad_norm": 0.3055875897407532, "learning_rate": 8.904907372022427e-05, "loss": 0.1278, "step": 15240 }, { "grad_norm": 0.2933570146560669, "learning_rate": 8.903185643167042e-05, "loss": 0.1312, "step": 15250 }, { "grad_norm": 0.2838204503059387, "learning_rate": 8.901462728626919e-05, "loss": 0.1232, "step": 15260 }, { "grad_norm": 0.2718684673309326, "learning_rate": 8.899738628925429e-05, "loss": 0.1327, "step": 15270 }, { "grad_norm": 0.28645241260528564, "learning_rate": 8.898013344586312e-05, "loss": 0.1286, "step": 15280 }, { "grad_norm": 0.29919666051864624, "learning_rate": 8.896286876133661e-05, "loss": 0.1264, "step": 15290 }, { "grad_norm": 0.2858762741088867, "learning_rate": 8.894559224091933e-05, "loss": 0.1255, "step": 15300 }, { "grad_norm": 0.32557201385498047, "learning_rate": 8.892830388985942e-05, "loss": 0.1334, "step": 15310 }, { "grad_norm": 0.2708100974559784, "learning_rate": 8.891100371340864e-05, "loss": 0.1427, "step": 15320 }, { "grad_norm": 0.25422701239585876, "learning_rate": 8.889369171682231e-05, "loss": 0.1319, "step": 15330 }, { "grad_norm": 0.2965254783630371, "learning_rate": 8.887636790535936e-05, "loss": 0.1336, "step": 15340 }, { "grad_norm": 0.2579832375049591, "learning_rate": 8.885903228428231e-05, "loss": 0.1207, "step": 15350 }, { "grad_norm": 0.2835141718387604, "learning_rate": 8.884168485885727e-05, "loss": 0.1216, "step": 15360 }, { "grad_norm": 0.2390967607498169, "learning_rate": 8.882432563435393e-05, "loss": 0.1218, "step": 15370 }, { "grad_norm": 0.24433554708957672, "learning_rate": 8.880695461604556e-05, "loss": 0.1219, "step": 15380 }, { "grad_norm": 0.28973448276519775, "learning_rate": 8.878957180920901e-05, "loss": 0.1343, "step": 15390 }, { "grad_norm": 0.3653233051300049, "learning_rate": 8.877217721912473e-05, "loss": 0.1347, "step": 15400 }, { "grad_norm": 0.3071911334991455, "learning_rate": 8.875477085107673e-05, "loss": 0.1248, "step": 15410 }, { "grad_norm": 0.28349268436431885, "learning_rate": 8.87373527103526e-05, "loss": 0.1282, "step": 15420 }, { "grad_norm": 0.32094496488571167, "learning_rate": 8.871992280224353e-05, "loss": 0.1248, "step": 15430 }, { "grad_norm": 0.21982359886169434, "learning_rate": 8.870248113204422e-05, "loss": 0.122, "step": 15440 }, { "grad_norm": 0.26957619190216064, "learning_rate": 8.868502770505306e-05, "loss": 0.128, "step": 15450 }, { "grad_norm": 0.2647901773452759, "learning_rate": 8.86675625265719e-05, "loss": 0.125, "step": 15460 }, { "grad_norm": 0.31282150745391846, "learning_rate": 8.865008560190618e-05, "loss": 0.1279, "step": 15470 }, { "grad_norm": 0.28653839230537415, "learning_rate": 8.863259693636496e-05, "loss": 0.1183, "step": 15480 }, { "grad_norm": 0.30264657735824585, "learning_rate": 8.861509653526083e-05, "loss": 0.1299, "step": 15490 }, { "grad_norm": 0.25017818808555603, "learning_rate": 8.859758440390993e-05, "loss": 0.1239, "step": 15500 }, { "grad_norm": 0.2757086455821991, "learning_rate": 8.858006054763202e-05, "loss": 0.1256, "step": 15510 }, { "grad_norm": 0.27890744805336, "learning_rate": 8.856252497175035e-05, "loss": 0.1265, "step": 15520 }, { "grad_norm": 0.4253179430961609, "learning_rate": 8.854497768159178e-05, "loss": 0.1438, "step": 15530 }, { "grad_norm": 0.31170719861984253, "learning_rate": 8.852741868248671e-05, "loss": 0.1338, "step": 15540 }, { "grad_norm": 0.32920363545417786, "learning_rate": 8.85098479797691e-05, "loss": 0.1269, "step": 15550 }, { "grad_norm": 0.22813035547733307, "learning_rate": 8.849226557877646e-05, "loss": 0.1271, "step": 15560 }, { "grad_norm": 0.3636399507522583, "learning_rate": 8.84746714848499e-05, "loss": 0.1266, "step": 15570 }, { "grad_norm": 0.33987191319465637, "learning_rate": 8.845706570333397e-05, "loss": 0.1314, "step": 15580 }, { "grad_norm": 0.23091217875480652, "learning_rate": 8.84394482395769e-05, "loss": 0.1323, "step": 15590 }, { "grad_norm": 0.2777831256389618, "learning_rate": 8.842181909893038e-05, "loss": 0.1219, "step": 15600 }, { "grad_norm": 0.27256715297698975, "learning_rate": 8.840417828674969e-05, "loss": 0.1255, "step": 15610 }, { "grad_norm": 0.23515456914901733, "learning_rate": 8.838652580839364e-05, "loss": 0.1261, "step": 15620 }, { "grad_norm": 0.24589143693447113, "learning_rate": 8.836886166922458e-05, "loss": 0.1162, "step": 15630 }, { "grad_norm": 0.32786640524864197, "learning_rate": 8.835118587460844e-05, "loss": 0.1236, "step": 15640 }, { "grad_norm": 0.31755173206329346, "learning_rate": 8.83334984299146e-05, "loss": 0.1321, "step": 15650 }, { "grad_norm": 0.22319136559963226, "learning_rate": 8.83157993405161e-05, "loss": 0.1234, "step": 15660 }, { "grad_norm": 0.33525505661964417, "learning_rate": 8.829808861178943e-05, "loss": 0.1307, "step": 15670 }, { "grad_norm": 0.2938670814037323, "learning_rate": 8.828036624911464e-05, "loss": 0.1236, "step": 15680 }, { "grad_norm": 0.27329957485198975, "learning_rate": 8.826263225787532e-05, "loss": 0.1228, "step": 15690 }, { "grad_norm": 0.31105369329452515, "learning_rate": 8.824488664345858e-05, "loss": 0.1251, "step": 15700 }, { "grad_norm": 0.26315924525260925, "learning_rate": 8.822712941125508e-05, "loss": 0.1211, "step": 15710 }, { "grad_norm": 0.3121950924396515, "learning_rate": 8.820936056665898e-05, "loss": 0.1254, "step": 15720 }, { "grad_norm": 0.2302861362695694, "learning_rate": 8.819158011506801e-05, "loss": 0.12, "step": 15730 }, { "grad_norm": 0.29972606897354126, "learning_rate": 8.81737880618834e-05, "loss": 0.1292, "step": 15740 }, { "grad_norm": 0.32431352138519287, "learning_rate": 8.815598441250987e-05, "loss": 0.1303, "step": 15750 }, { "grad_norm": 0.33772480487823486, "learning_rate": 8.813816917235576e-05, "loss": 0.1268, "step": 15760 }, { "grad_norm": 0.31005990505218506, "learning_rate": 8.812034234683282e-05, "loss": 0.1374, "step": 15770 }, { "grad_norm": 0.29767611622810364, "learning_rate": 8.810250394135637e-05, "loss": 0.1338, "step": 15780 }, { "grad_norm": 0.26568296551704407, "learning_rate": 8.808465396134529e-05, "loss": 0.1237, "step": 15790 }, { "grad_norm": 0.29500898718833923, "learning_rate": 8.806679241222189e-05, "loss": 0.1291, "step": 15800 }, { "grad_norm": 0.27832135558128357, "learning_rate": 8.804891929941203e-05, "loss": 0.122, "step": 15810 }, { "grad_norm": 0.2568560242652893, "learning_rate": 8.803103462834514e-05, "loss": 0.129, "step": 15820 }, { "grad_norm": 0.2888723313808441, "learning_rate": 8.801313840445408e-05, "loss": 0.1275, "step": 15830 }, { "grad_norm": 0.28866487741470337, "learning_rate": 8.799523063317524e-05, "loss": 0.1208, "step": 15840 }, { "grad_norm": 0.28881216049194336, "learning_rate": 8.797731131994854e-05, "loss": 0.1309, "step": 15850 }, { "grad_norm": 0.24368815124034882, "learning_rate": 8.795938047021739e-05, "loss": 0.1182, "step": 15860 }, { "grad_norm": 0.25352969765663147, "learning_rate": 8.794143808942872e-05, "loss": 0.123, "step": 15870 }, { "grad_norm": 0.27119138836860657, "learning_rate": 8.792348418303296e-05, "loss": 0.137, "step": 15880 }, { "grad_norm": 0.2508065700531006, "learning_rate": 8.790551875648398e-05, "loss": 0.1222, "step": 15890 }, { "grad_norm": 0.28941473364830017, "learning_rate": 8.788754181523926e-05, "loss": 0.1256, "step": 15900 }, { "grad_norm": 0.30244889855384827, "learning_rate": 8.78695533647597e-05, "loss": 0.1321, "step": 15910 }, { "grad_norm": 0.24554072320461273, "learning_rate": 8.785155341050972e-05, "loss": 0.1266, "step": 15920 }, { "grad_norm": 0.3762190043926239, "learning_rate": 8.783354195795721e-05, "loss": 0.145, "step": 15930 }, { "grad_norm": 0.33338144421577454, "learning_rate": 8.78155190125736e-05, "loss": 0.1281, "step": 15940 }, { "grad_norm": 0.23712500929832458, "learning_rate": 8.779748457983378e-05, "loss": 0.1316, "step": 15950 }, { "grad_norm": 0.2533131241798401, "learning_rate": 8.777943866521612e-05, "loss": 0.1328, "step": 15960 }, { "grad_norm": 0.23704120516777039, "learning_rate": 8.77613812742025e-05, "loss": 0.1318, "step": 15970 }, { "grad_norm": 0.22541005909442902, "learning_rate": 8.774331241227829e-05, "loss": 0.1258, "step": 15980 }, { "grad_norm": 0.3548915982246399, "learning_rate": 8.772523208493232e-05, "loss": 0.1236, "step": 15990 }, { "grad_norm": 0.34171169996261597, "learning_rate": 8.770714029765692e-05, "loss": 0.1272, "step": 16000 }, { "grad_norm": 0.26977625489234924, "learning_rate": 8.768903705594789e-05, "loss": 0.1281, "step": 16010 }, { "grad_norm": 0.3298180401325226, "learning_rate": 8.767092236530453e-05, "loss": 0.1217, "step": 16020 }, { "grad_norm": 0.30929476022720337, "learning_rate": 8.76527962312296e-05, "loss": 0.1264, "step": 16030 }, { "grad_norm": 0.28304240107536316, "learning_rate": 8.763465865922934e-05, "loss": 0.1272, "step": 16040 }, { "grad_norm": 0.3046930134296417, "learning_rate": 8.761650965481347e-05, "loss": 0.1321, "step": 16050 }, { "grad_norm": 0.2903406023979187, "learning_rate": 8.759834922349516e-05, "loss": 0.1214, "step": 16060 }, { "grad_norm": 0.2785981595516205, "learning_rate": 8.758017737079108e-05, "loss": 0.1231, "step": 16070 }, { "grad_norm": 0.287216454744339, "learning_rate": 8.756199410222137e-05, "loss": 0.1299, "step": 16080 }, { "grad_norm": 0.23692484200000763, "learning_rate": 8.754379942330963e-05, "loss": 0.129, "step": 16090 }, { "grad_norm": 0.3203966021537781, "learning_rate": 8.75255933395829e-05, "loss": 0.1253, "step": 16100 }, { "grad_norm": 0.3742621839046478, "learning_rate": 8.750737585657171e-05, "loss": 0.1295, "step": 16110 }, { "grad_norm": 0.3587304949760437, "learning_rate": 8.748914697981008e-05, "loss": 0.1321, "step": 16120 }, { "grad_norm": 0.24418532848358154, "learning_rate": 8.747090671483542e-05, "loss": 0.1213, "step": 16130 }, { "grad_norm": 0.27832576632499695, "learning_rate": 8.745265506718869e-05, "loss": 0.1289, "step": 16140 }, { "grad_norm": 0.26338350772857666, "learning_rate": 8.74343920424142e-05, "loss": 0.1236, "step": 16150 }, { "grad_norm": 0.2602940797805786, "learning_rate": 8.741611764605982e-05, "loss": 0.1265, "step": 16160 }, { "grad_norm": 0.2725672423839569, "learning_rate": 8.739783188367682e-05, "loss": 0.1199, "step": 16170 }, { "grad_norm": 0.29606184363365173, "learning_rate": 8.737953476081991e-05, "loss": 0.1315, "step": 16180 }, { "grad_norm": 0.2900511920452118, "learning_rate": 8.73612262830473e-05, "loss": 0.1298, "step": 16190 }, { "grad_norm": 0.2619999647140503, "learning_rate": 8.734290645592061e-05, "loss": 0.1279, "step": 16200 }, { "grad_norm": 0.2441129982471466, "learning_rate": 8.732457528500493e-05, "loss": 0.1184, "step": 16210 }, { "grad_norm": 0.27749672532081604, "learning_rate": 8.730623277586875e-05, "loss": 0.1287, "step": 16220 }, { "grad_norm": 0.25860559940338135, "learning_rate": 8.72878789340841e-05, "loss": 0.1266, "step": 16230 }, { "grad_norm": 0.280528724193573, "learning_rate": 8.726951376522635e-05, "loss": 0.1279, "step": 16240 }, { "grad_norm": 0.2572590410709381, "learning_rate": 8.725113727487435e-05, "loss": 0.1245, "step": 16250 }, { "grad_norm": 0.2535024881362915, "learning_rate": 8.723274946861042e-05, "loss": 0.1317, "step": 16260 }, { "grad_norm": 0.27947109937667847, "learning_rate": 8.721435035202026e-05, "loss": 0.1295, "step": 16270 }, { "grad_norm": 0.29159337282180786, "learning_rate": 8.719593993069306e-05, "loss": 0.1166, "step": 16280 }, { "grad_norm": 0.24901017546653748, "learning_rate": 8.717751821022139e-05, "loss": 0.1292, "step": 16290 }, { "grad_norm": 0.24991284310817719, "learning_rate": 8.715908519620134e-05, "loss": 0.1274, "step": 16300 }, { "grad_norm": 0.32816120982170105, "learning_rate": 8.71406408942323e-05, "loss": 0.1254, "step": 16310 }, { "grad_norm": 0.29680779576301575, "learning_rate": 8.712218530991723e-05, "loss": 0.1238, "step": 16320 }, { "grad_norm": 0.28955569863319397, "learning_rate": 8.710371844886241e-05, "loss": 0.1304, "step": 16330 }, { "grad_norm": 0.27562928199768066, "learning_rate": 8.708524031667758e-05, "loss": 0.1292, "step": 16340 }, { "grad_norm": 0.2668900489807129, "learning_rate": 8.706675091897592e-05, "loss": 0.1229, "step": 16350 }, { "grad_norm": 0.2953263223171234, "learning_rate": 8.704825026137404e-05, "loss": 0.1277, "step": 16360 }, { "grad_norm": 0.2816394865512848, "learning_rate": 8.702973834949192e-05, "loss": 0.129, "step": 16370 }, { "grad_norm": 0.2636459767818451, "learning_rate": 8.701121518895301e-05, "loss": 0.1322, "step": 16380 }, { "grad_norm": 0.32786381244659424, "learning_rate": 8.699268078538414e-05, "loss": 0.1227, "step": 16390 }, { "grad_norm": 0.27138230204582214, "learning_rate": 8.69741351444156e-05, "loss": 0.1279, "step": 16400 }, { "grad_norm": 0.32200509309768677, "learning_rate": 8.695557827168101e-05, "loss": 0.1236, "step": 16410 }, { "grad_norm": 0.28591814637184143, "learning_rate": 8.693701017281753e-05, "loss": 0.1201, "step": 16420 }, { "grad_norm": 0.26724517345428467, "learning_rate": 8.691843085346563e-05, "loss": 0.1211, "step": 16430 }, { "grad_norm": 0.24139435589313507, "learning_rate": 8.689984031926919e-05, "loss": 0.1315, "step": 16440 }, { "grad_norm": 0.22475580871105194, "learning_rate": 8.688123857587555e-05, "loss": 0.1177, "step": 16450 }, { "grad_norm": 0.29075899720191956, "learning_rate": 8.686262562893544e-05, "loss": 0.1272, "step": 16460 }, { "grad_norm": 0.28042498230934143, "learning_rate": 8.684400148410294e-05, "loss": 0.1205, "step": 16470 }, { "grad_norm": 0.27921658754348755, "learning_rate": 8.682536614703562e-05, "loss": 0.1266, "step": 16480 }, { "grad_norm": 0.28643232583999634, "learning_rate": 8.680671962339437e-05, "loss": 0.1286, "step": 16490 }, { "grad_norm": 0.2834594249725342, "learning_rate": 8.678806191884352e-05, "loss": 0.1234, "step": 16500 }, { "grad_norm": 0.276192843914032, "learning_rate": 8.67693930390508e-05, "loss": 0.1199, "step": 16510 }, { "grad_norm": 0.2999782860279083, "learning_rate": 8.67507129896873e-05, "loss": 0.1281, "step": 16520 }, { "grad_norm": 0.22441941499710083, "learning_rate": 8.673202177642757e-05, "loss": 0.1227, "step": 16530 }, { "grad_norm": 0.2314467877149582, "learning_rate": 8.671331940494945e-05, "loss": 0.1252, "step": 16540 }, { "grad_norm": 0.2407207489013672, "learning_rate": 8.669460588093427e-05, "loss": 0.1343, "step": 16550 }, { "grad_norm": 0.25705868005752563, "learning_rate": 8.667588121006667e-05, "loss": 0.1206, "step": 16560 }, { "grad_norm": 0.2451791763305664, "learning_rate": 8.665714539803475e-05, "loss": 0.1251, "step": 16570 }, { "grad_norm": 0.2933942377567291, "learning_rate": 8.663839845052993e-05, "loss": 0.1304, "step": 16580 }, { "grad_norm": 0.2795334756374359, "learning_rate": 8.661964037324703e-05, "loss": 0.1239, "step": 16590 }, { "grad_norm": 0.27465739846229553, "learning_rate": 8.660087117188427e-05, "loss": 0.1301, "step": 16600 }, { "grad_norm": 0.25407522916793823, "learning_rate": 8.658209085214325e-05, "loss": 0.1215, "step": 16610 }, { "grad_norm": 0.31540805101394653, "learning_rate": 8.656329941972891e-05, "loss": 0.1309, "step": 16620 }, { "grad_norm": 0.26984477043151855, "learning_rate": 8.654449688034963e-05, "loss": 0.1244, "step": 16630 }, { "grad_norm": 0.32415053248405457, "learning_rate": 8.652568323971706e-05, "loss": 0.128, "step": 16640 }, { "grad_norm": 0.30382657051086426, "learning_rate": 8.650685850354636e-05, "loss": 0.1249, "step": 16650 }, { "grad_norm": 0.30038443207740784, "learning_rate": 8.648802267755593e-05, "loss": 0.122, "step": 16660 }, { "grad_norm": 0.26028376817703247, "learning_rate": 8.646917576746764e-05, "loss": 0.1245, "step": 16670 }, { "grad_norm": 0.2475600391626358, "learning_rate": 8.645031777900666e-05, "loss": 0.1234, "step": 16680 }, { "grad_norm": 0.24492883682250977, "learning_rate": 8.643144871790154e-05, "loss": 0.1185, "step": 16690 }, { "grad_norm": 0.2858632802963257, "learning_rate": 8.641256858988424e-05, "loss": 0.1276, "step": 16700 }, { "grad_norm": 0.27459463477134705, "learning_rate": 8.639367740069e-05, "loss": 0.1191, "step": 16710 }, { "grad_norm": 0.3606293797492981, "learning_rate": 8.63747751560575e-05, "loss": 0.13, "step": 16720 }, { "grad_norm": 0.2615557610988617, "learning_rate": 8.635586186172871e-05, "loss": 0.119, "step": 16730 }, { "grad_norm": 0.25067368149757385, "learning_rate": 8.633693752344902e-05, "loss": 0.1162, "step": 16740 }, { "grad_norm": 0.21265621483325958, "learning_rate": 8.631800214696713e-05, "loss": 0.1208, "step": 16750 }, { "grad_norm": 0.2508781850337982, "learning_rate": 8.629905573803511e-05, "loss": 0.1278, "step": 16760 }, { "grad_norm": 0.282697468996048, "learning_rate": 8.628009830240839e-05, "loss": 0.1326, "step": 16770 }, { "grad_norm": 0.2952059805393219, "learning_rate": 8.626112984584571e-05, "loss": 0.1266, "step": 16780 }, { "grad_norm": 0.23396600782871246, "learning_rate": 8.62421503741092e-05, "loss": 0.1263, "step": 16790 }, { "grad_norm": 0.25664636492729187, "learning_rate": 8.622315989296432e-05, "loss": 0.1232, "step": 16800 }, { "grad_norm": 0.28119006752967834, "learning_rate": 8.62041584081799e-05, "loss": 0.1304, "step": 16810 }, { "grad_norm": 0.31082120537757874, "learning_rate": 8.618514592552807e-05, "loss": 0.1274, "step": 16820 }, { "grad_norm": 0.27369821071624756, "learning_rate": 8.616612245078431e-05, "loss": 0.1233, "step": 16830 }, { "grad_norm": 0.2953101396560669, "learning_rate": 8.614708798972746e-05, "loss": 0.1315, "step": 16840 }, { "grad_norm": 0.3141539692878723, "learning_rate": 8.61280425481397e-05, "loss": 0.1239, "step": 16850 }, { "grad_norm": 0.2465040534734726, "learning_rate": 8.61089861318065e-05, "loss": 0.1224, "step": 16860 }, { "grad_norm": 0.29539090394973755, "learning_rate": 8.608991874651673e-05, "loss": 0.1247, "step": 16870 }, { "grad_norm": 0.29395049810409546, "learning_rate": 8.607084039806255e-05, "loss": 0.1329, "step": 16880 }, { "grad_norm": 0.2797222435474396, "learning_rate": 8.605175109223944e-05, "loss": 0.1271, "step": 16890 }, { "grad_norm": 0.3269157111644745, "learning_rate": 8.603265083484624e-05, "loss": 0.1397, "step": 16900 }, { "grad_norm": 0.3020494878292084, "learning_rate": 8.60135396316851e-05, "loss": 0.1279, "step": 16910 }, { "grad_norm": 0.25974661111831665, "learning_rate": 8.599441748856152e-05, "loss": 0.121, "step": 16920 }, { "grad_norm": 0.30389654636383057, "learning_rate": 8.597528441128427e-05, "loss": 0.126, "step": 16930 }, { "grad_norm": 0.2619440257549286, "learning_rate": 8.595614040566549e-05, "loss": 0.1236, "step": 16940 }, { "grad_norm": 0.29167866706848145, "learning_rate": 8.593698547752063e-05, "loss": 0.1224, "step": 16950 }, { "grad_norm": 0.30215346813201904, "learning_rate": 8.591781963266843e-05, "loss": 0.128, "step": 16960 }, { "grad_norm": 0.3000149428844452, "learning_rate": 8.5898642876931e-05, "loss": 0.133, "step": 16970 }, { "grad_norm": 0.281843900680542, "learning_rate": 8.587945521613369e-05, "loss": 0.1267, "step": 16980 }, { "grad_norm": 0.22893819212913513, "learning_rate": 8.586025665610524e-05, "loss": 0.1298, "step": 16990 }, { "grad_norm": 0.3066019117832184, "learning_rate": 8.584104720267765e-05, "loss": 0.1344, "step": 17000 }, { "grad_norm": 0.2574676275253296, "learning_rate": 8.582182686168625e-05, "loss": 0.1257, "step": 17010 }, { "grad_norm": 0.27616578340530396, "learning_rate": 8.580259563896967e-05, "loss": 0.1279, "step": 17020 }, { "grad_norm": 0.2802002429962158, "learning_rate": 8.578335354036983e-05, "loss": 0.1299, "step": 17030 }, { "grad_norm": 0.24639491736888885, "learning_rate": 8.576410057173201e-05, "loss": 0.1207, "step": 17040 }, { "grad_norm": 0.2272539883852005, "learning_rate": 8.574483673890474e-05, "loss": 0.1216, "step": 17050 }, { "grad_norm": 0.3256906569004059, "learning_rate": 8.572556204773983e-05, "loss": 0.1285, "step": 17060 }, { "grad_norm": 0.29650959372520447, "learning_rate": 8.570627650409246e-05, "loss": 0.1299, "step": 17070 }, { "grad_norm": 0.28281307220458984, "learning_rate": 8.568698011382107e-05, "loss": 0.1352, "step": 17080 }, { "grad_norm": 0.23952870070934296, "learning_rate": 8.566767288278738e-05, "loss": 0.1217, "step": 17090 }, { "grad_norm": 0.2660794258117676, "learning_rate": 8.56483548168564e-05, "loss": 0.1235, "step": 17100 }, { "grad_norm": 0.308492511510849, "learning_rate": 8.562902592189648e-05, "loss": 0.1262, "step": 17110 }, { "grad_norm": 0.2673039436340332, "learning_rate": 8.560968620377921e-05, "loss": 0.1348, "step": 17120 }, { "grad_norm": 0.2908955216407776, "learning_rate": 8.559033566837951e-05, "loss": 0.1387, "step": 17130 }, { "grad_norm": 0.2903021275997162, "learning_rate": 8.557097432157551e-05, "loss": 0.1277, "step": 17140 }, { "grad_norm": 0.289599746465683, "learning_rate": 8.555160216924872e-05, "loss": 0.1328, "step": 17150 }, { "grad_norm": 0.27344027161598206, "learning_rate": 8.55322192172839e-05, "loss": 0.1292, "step": 17160 }, { "grad_norm": 0.23435774445533752, "learning_rate": 8.551282547156902e-05, "loss": 0.1198, "step": 17170 }, { "grad_norm": 0.31853875517845154, "learning_rate": 8.549342093799544e-05, "loss": 0.129, "step": 17180 }, { "grad_norm": 0.25438588857650757, "learning_rate": 8.547400562245773e-05, "loss": 0.1221, "step": 17190 }, { "grad_norm": 0.31259992718696594, "learning_rate": 8.545457953085374e-05, "loss": 0.1283, "step": 17200 }, { "grad_norm": 0.25788649916648865, "learning_rate": 8.543514266908463e-05, "loss": 0.1285, "step": 17210 }, { "grad_norm": 0.23599275946617126, "learning_rate": 8.541569504305478e-05, "loss": 0.1266, "step": 17220 }, { "grad_norm": 0.2788534164428711, "learning_rate": 8.539623665867187e-05, "loss": 0.1252, "step": 17230 }, { "grad_norm": 0.2949940264225006, "learning_rate": 8.537676752184685e-05, "loss": 0.1247, "step": 17240 }, { "grad_norm": 0.28944703936576843, "learning_rate": 8.53572876384939e-05, "loss": 0.1285, "step": 17250 }, { "grad_norm": 0.3101396858692169, "learning_rate": 8.533779701453056e-05, "loss": 0.1309, "step": 17260 }, { "grad_norm": 0.29687315225601196, "learning_rate": 8.53182956558775e-05, "loss": 0.1327, "step": 17270 }, { "grad_norm": 0.31000423431396484, "learning_rate": 8.529878356845877e-05, "loss": 0.1314, "step": 17280 }, { "grad_norm": 0.2521118223667145, "learning_rate": 8.527926075820158e-05, "loss": 0.1275, "step": 17290 }, { "grad_norm": 0.22128832340240479, "learning_rate": 8.525972723103648e-05, "loss": 0.1325, "step": 17300 }, { "grad_norm": 0.3226397931575775, "learning_rate": 8.524018299289722e-05, "loss": 0.1274, "step": 17310 }, { "grad_norm": 0.24156993627548218, "learning_rate": 8.522062804972083e-05, "loss": 0.1214, "step": 17320 }, { "grad_norm": 0.2892138659954071, "learning_rate": 8.520106240744759e-05, "loss": 0.1308, "step": 17330 }, { "grad_norm": 0.2338598668575287, "learning_rate": 8.518148607202102e-05, "loss": 0.1282, "step": 17340 }, { "grad_norm": 0.2953154444694519, "learning_rate": 8.51618990493879e-05, "loss": 0.1298, "step": 17350 }, { "grad_norm": 0.2681247591972351, "learning_rate": 8.514230134549823e-05, "loss": 0.1292, "step": 17360 }, { "grad_norm": 0.27737316489219666, "learning_rate": 8.51226929663053e-05, "loss": 0.1191, "step": 17370 }, { "grad_norm": 0.2746802866458893, "learning_rate": 8.51030739177656e-05, "loss": 0.119, "step": 17380 }, { "grad_norm": 0.2376800775527954, "learning_rate": 8.508344420583889e-05, "loss": 0.1204, "step": 17390 }, { "grad_norm": 0.2443649023771286, "learning_rate": 8.506380383648816e-05, "loss": 0.1153, "step": 17400 }, { "grad_norm": 0.2838618755340576, "learning_rate": 8.504415281567963e-05, "loss": 0.1283, "step": 17410 }, { "grad_norm": 0.31306520104408264, "learning_rate": 8.502449114938275e-05, "loss": 0.1314, "step": 17420 }, { "grad_norm": 0.30404555797576904, "learning_rate": 8.500481884357025e-05, "loss": 0.1321, "step": 17430 }, { "grad_norm": 0.2649962306022644, "learning_rate": 8.498513590421801e-05, "loss": 0.122, "step": 17440 }, { "grad_norm": 0.3012303113937378, "learning_rate": 8.496544233730522e-05, "loss": 0.1247, "step": 17450 }, { "grad_norm": 0.3166891038417816, "learning_rate": 8.494573814881426e-05, "loss": 0.128, "step": 17460 }, { "grad_norm": 0.2913958430290222, "learning_rate": 8.492602334473074e-05, "loss": 0.1242, "step": 17470 }, { "grad_norm": 0.2488388568162918, "learning_rate": 8.49062979310435e-05, "loss": 0.1302, "step": 17480 }, { "grad_norm": 0.23041598498821259, "learning_rate": 8.488656191374458e-05, "loss": 0.1304, "step": 17490 }, { "grad_norm": 0.2669263482093811, "learning_rate": 8.48668152988293e-05, "loss": 0.1135, "step": 17500 }, { "grad_norm": 0.2951256334781647, "learning_rate": 8.484705809229612e-05, "loss": 0.1187, "step": 17510 }, { "grad_norm": 0.26689404249191284, "learning_rate": 8.482729030014677e-05, "loss": 0.1302, "step": 17520 }, { "grad_norm": 0.26604294776916504, "learning_rate": 8.48075119283862e-05, "loss": 0.128, "step": 17530 }, { "grad_norm": 0.2556823492050171, "learning_rate": 8.478772298302254e-05, "loss": 0.1339, "step": 17540 }, { "grad_norm": 0.3222276568412781, "learning_rate": 8.476792347006716e-05, "loss": 0.129, "step": 17550 }, { "grad_norm": 0.32761988043785095, "learning_rate": 8.474811339553462e-05, "loss": 0.1321, "step": 17560 }, { "grad_norm": 0.28853365778923035, "learning_rate": 8.47282927654427e-05, "loss": 0.1237, "step": 17570 }, { "grad_norm": 0.31980523467063904, "learning_rate": 8.470846158581238e-05, "loss": 0.1273, "step": 17580 }, { "grad_norm": 0.2728740870952606, "learning_rate": 8.468861986266787e-05, "loss": 0.1191, "step": 17590 }, { "grad_norm": 0.2578241527080536, "learning_rate": 8.466876760203654e-05, "loss": 0.1226, "step": 17600 }, { "grad_norm": 0.23067522048950195, "learning_rate": 8.464890480994898e-05, "loss": 0.1133, "step": 17610 }, { "grad_norm": 0.19930341839790344, "learning_rate": 8.462903149243899e-05, "loss": 0.1235, "step": 17620 }, { "grad_norm": 0.27296164631843567, "learning_rate": 8.460914765554357e-05, "loss": 0.1268, "step": 17630 }, { "grad_norm": 0.23537208139896393, "learning_rate": 8.458925330530288e-05, "loss": 0.131, "step": 17640 }, { "grad_norm": 0.3079794943332672, "learning_rate": 8.456934844776032e-05, "loss": 0.127, "step": 17650 }, { "grad_norm": 0.3182278871536255, "learning_rate": 8.454943308896246e-05, "loss": 0.1286, "step": 17660 }, { "grad_norm": 0.31765151023864746, "learning_rate": 8.452950723495905e-05, "loss": 0.133, "step": 17670 }, { "grad_norm": 0.30637285113334656, "learning_rate": 8.450957089180303e-05, "loss": 0.1209, "step": 17680 }, { "grad_norm": 0.2854776084423065, "learning_rate": 8.448962406555055e-05, "loss": 0.1226, "step": 17690 }, { "grad_norm": 0.3212285041809082, "learning_rate": 8.446966676226093e-05, "loss": 0.1419, "step": 17700 }, { "grad_norm": 0.2634446322917938, "learning_rate": 8.444969898799667e-05, "loss": 0.1253, "step": 17710 }, { "grad_norm": 0.30002227425575256, "learning_rate": 8.442972074882343e-05, "loss": 0.1308, "step": 17720 }, { "grad_norm": 0.242011159658432, "learning_rate": 8.44097320508101e-05, "loss": 0.1287, "step": 17730 }, { "grad_norm": 0.26766982674598694, "learning_rate": 8.43897329000287e-05, "loss": 0.1295, "step": 17740 }, { "grad_norm": 0.2408442199230194, "learning_rate": 8.436972330255448e-05, "loss": 0.1149, "step": 17750 }, { "grad_norm": 0.2915785312652588, "learning_rate": 8.434970326446579e-05, "loss": 0.1289, "step": 17760 }, { "grad_norm": 0.2613019347190857, "learning_rate": 8.432967279184418e-05, "loss": 0.1246, "step": 17770 }, { "grad_norm": 0.24557523429393768, "learning_rate": 8.430963189077441e-05, "loss": 0.121, "step": 17780 }, { "grad_norm": 0.2319786697626114, "learning_rate": 8.428958056734437e-05, "loss": 0.1206, "step": 17790 }, { "grad_norm": 0.2645212709903717, "learning_rate": 8.426951882764513e-05, "loss": 0.1223, "step": 17800 }, { "grad_norm": 0.2845107316970825, "learning_rate": 8.424944667777089e-05, "loss": 0.1293, "step": 17810 }, { "grad_norm": 0.2543324828147888, "learning_rate": 8.422936412381905e-05, "loss": 0.1289, "step": 17820 }, { "grad_norm": 0.26997172832489014, "learning_rate": 8.420927117189017e-05, "loss": 0.1272, "step": 17830 }, { "grad_norm": 0.2605656087398529, "learning_rate": 8.418916782808795e-05, "loss": 0.1285, "step": 17840 }, { "grad_norm": 0.3164089322090149, "learning_rate": 8.416905409851926e-05, "loss": 0.1408, "step": 17850 }, { "grad_norm": 0.2815982401371002, "learning_rate": 8.41489299892941e-05, "loss": 0.1214, "step": 17860 }, { "grad_norm": 0.2594541609287262, "learning_rate": 8.412879550652566e-05, "loss": 0.1297, "step": 17870 }, { "grad_norm": 0.30944469571113586, "learning_rate": 8.410865065633029e-05, "loss": 0.139, "step": 17880 }, { "grad_norm": 0.2765669524669647, "learning_rate": 8.408849544482742e-05, "loss": 0.1287, "step": 17890 }, { "grad_norm": 0.30865123867988586, "learning_rate": 8.406832987813968e-05, "loss": 0.1288, "step": 17900 }, { "grad_norm": 0.2801450192928314, "learning_rate": 8.404815396239286e-05, "loss": 0.1293, "step": 17910 }, { "grad_norm": 0.35082104802131653, "learning_rate": 8.402796770371587e-05, "loss": 0.1299, "step": 17920 }, { "grad_norm": 0.3482170104980469, "learning_rate": 8.400777110824071e-05, "loss": 0.1256, "step": 17930 }, { "grad_norm": 0.3065292537212372, "learning_rate": 8.398756418210263e-05, "loss": 0.1339, "step": 17940 }, { "grad_norm": 0.2968234717845917, "learning_rate": 8.396734693143993e-05, "loss": 0.1285, "step": 17950 }, { "grad_norm": 0.2707216739654541, "learning_rate": 8.39471193623941e-05, "loss": 0.1298, "step": 17960 }, { "grad_norm": 0.25346097350120544, "learning_rate": 8.392688148110974e-05, "loss": 0.1264, "step": 17970 }, { "grad_norm": 0.27555450797080994, "learning_rate": 8.390663329373456e-05, "loss": 0.1227, "step": 17980 }, { "grad_norm": 0.24866358935832977, "learning_rate": 8.388637480641944e-05, "loss": 0.1248, "step": 17990 }, { "grad_norm": 0.2685653567314148, "learning_rate": 8.386610602531837e-05, "loss": 0.1287, "step": 18000 }, { "grad_norm": 0.25323083996772766, "learning_rate": 8.384582695658847e-05, "loss": 0.1224, "step": 18010 }, { "grad_norm": 0.24528340995311737, "learning_rate": 8.382553760638999e-05, "loss": 0.1303, "step": 18020 }, { "grad_norm": 0.3095386028289795, "learning_rate": 8.380523798088631e-05, "loss": 0.1282, "step": 18030 }, { "grad_norm": 0.2757023572921753, "learning_rate": 8.378492808624389e-05, "loss": 0.1203, "step": 18040 }, { "grad_norm": 0.2596069276332855, "learning_rate": 8.376460792863237e-05, "loss": 0.1295, "step": 18050 }, { "grad_norm": 0.4091825485229492, "learning_rate": 8.374427751422444e-05, "loss": 0.129, "step": 18060 }, { "grad_norm": 0.2825661599636078, "learning_rate": 8.3723936849196e-05, "loss": 0.1257, "step": 18070 }, { "grad_norm": 0.29069802165031433, "learning_rate": 8.370358593972595e-05, "loss": 0.1304, "step": 18080 }, { "grad_norm": 0.23459963500499725, "learning_rate": 8.36832247919964e-05, "loss": 0.1193, "step": 18090 }, { "grad_norm": 0.25299325585365295, "learning_rate": 8.36628534121925e-05, "loss": 0.1223, "step": 18100 }, { "grad_norm": 0.23739129304885864, "learning_rate": 8.364247180650254e-05, "loss": 0.1313, "step": 18110 }, { "grad_norm": 0.2909586429595947, "learning_rate": 8.362207998111794e-05, "loss": 0.1311, "step": 18120 }, { "grad_norm": 0.29886409640312195, "learning_rate": 8.360167794223318e-05, "loss": 0.1174, "step": 18130 }, { "grad_norm": 0.29531192779541016, "learning_rate": 8.358126569604586e-05, "loss": 0.1329, "step": 18140 }, { "grad_norm": 0.287529319524765, "learning_rate": 8.356084324875668e-05, "loss": 0.118, "step": 18150 }, { "grad_norm": 0.26492995023727417, "learning_rate": 8.354041060656945e-05, "loss": 0.1332, "step": 18160 }, { "grad_norm": 0.2426668405532837, "learning_rate": 8.351996777569106e-05, "loss": 0.1188, "step": 18170 }, { "grad_norm": 0.2564946711063385, "learning_rate": 8.349951476233148e-05, "loss": 0.1283, "step": 18180 }, { "grad_norm": 0.23010297119617462, "learning_rate": 8.347905157270386e-05, "loss": 0.124, "step": 18190 }, { "grad_norm": 0.25399914383888245, "learning_rate": 8.345857821302432e-05, "loss": 0.1154, "step": 18200 }, { "grad_norm": 0.2052605003118515, "learning_rate": 8.343809468951213e-05, "loss": 0.1253, "step": 18210 }, { "grad_norm": 0.24641215801239014, "learning_rate": 8.341760100838965e-05, "loss": 0.1253, "step": 18220 }, { "grad_norm": 0.2650318741798401, "learning_rate": 8.339709717588233e-05, "loss": 0.1264, "step": 18230 }, { "grad_norm": 0.30910101532936096, "learning_rate": 8.33765831982187e-05, "loss": 0.1282, "step": 18240 }, { "grad_norm": 0.3414677381515503, "learning_rate": 8.335605908163035e-05, "loss": 0.1358, "step": 18250 }, { "grad_norm": 0.2909446954727173, "learning_rate": 8.333552483235196e-05, "loss": 0.1189, "step": 18260 }, { "grad_norm": 0.24775145947933197, "learning_rate": 8.33149804566213e-05, "loss": 0.1319, "step": 18270 }, { "grad_norm": 0.2715279161930084, "learning_rate": 8.329442596067921e-05, "loss": 0.118, "step": 18280 }, { "grad_norm": 0.30143994092941284, "learning_rate": 8.32738613507696e-05, "loss": 0.1294, "step": 18290 }, { "grad_norm": 0.25050559639930725, "learning_rate": 8.325328663313946e-05, "loss": 0.1328, "step": 18300 }, { "grad_norm": 0.28249233961105347, "learning_rate": 8.323270181403884e-05, "loss": 0.1261, "step": 18310 }, { "grad_norm": 0.25894448161125183, "learning_rate": 8.321210689972086e-05, "loss": 0.1173, "step": 18320 }, { "grad_norm": 0.2925615906715393, "learning_rate": 8.319150189644174e-05, "loss": 0.1278, "step": 18330 }, { "grad_norm": 0.24684704840183258, "learning_rate": 8.31708868104607e-05, "loss": 0.1242, "step": 18340 }, { "grad_norm": 0.2912921607494354, "learning_rate": 8.315026164804007e-05, "loss": 0.1185, "step": 18350 }, { "grad_norm": 0.2554190456867218, "learning_rate": 8.312962641544524e-05, "loss": 0.1279, "step": 18360 }, { "grad_norm": 0.2866176664829254, "learning_rate": 8.310898111894465e-05, "loss": 0.1364, "step": 18370 }, { "grad_norm": 0.30980417132377625, "learning_rate": 8.308832576480977e-05, "loss": 0.1283, "step": 18380 }, { "grad_norm": 0.26470914483070374, "learning_rate": 8.306766035931519e-05, "loss": 0.1211, "step": 18390 }, { "grad_norm": 0.29536691308021545, "learning_rate": 8.304698490873847e-05, "loss": 0.1289, "step": 18400 }, { "grad_norm": 0.25909584760665894, "learning_rate": 8.30262994193603e-05, "loss": 0.1334, "step": 18410 }, { "grad_norm": 0.31326887011528015, "learning_rate": 8.300560389746438e-05, "loss": 0.1248, "step": 18420 }, { "grad_norm": 0.24796034395694733, "learning_rate": 8.298489834933745e-05, "loss": 0.1273, "step": 18430 }, { "grad_norm": 0.3009352684020996, "learning_rate": 8.296418278126934e-05, "loss": 0.1303, "step": 18440 }, { "grad_norm": 0.3068403899669647, "learning_rate": 8.294345719955284e-05, "loss": 0.1265, "step": 18450 }, { "grad_norm": 0.2815842628479004, "learning_rate": 8.29227216104839e-05, "loss": 0.1288, "step": 18460 }, { "grad_norm": 0.2819996178150177, "learning_rate": 8.290197602036137e-05, "loss": 0.1275, "step": 18470 }, { "grad_norm": 0.22038598358631134, "learning_rate": 8.288122043548725e-05, "loss": 0.1199, "step": 18480 }, { "grad_norm": 0.2565299868583679, "learning_rate": 8.286045486216657e-05, "loss": 0.1266, "step": 18490 }, { "grad_norm": 0.257168173789978, "learning_rate": 8.283967930670733e-05, "loss": 0.1204, "step": 18500 }, { "grad_norm": 0.33537527918815613, "learning_rate": 8.281889377542058e-05, "loss": 0.1273, "step": 18510 }, { "grad_norm": 0.39308062195777893, "learning_rate": 8.279809827462045e-05, "loss": 0.121, "step": 18520 }, { "grad_norm": 0.309737890958786, "learning_rate": 8.277729281062402e-05, "loss": 0.1269, "step": 18530 }, { "grad_norm": 0.27239057421684265, "learning_rate": 8.27564773897515e-05, "loss": 0.134, "step": 18540 }, { "grad_norm": 0.25145938992500305, "learning_rate": 8.273565201832602e-05, "loss": 0.1203, "step": 18550 }, { "grad_norm": 0.31423047184944153, "learning_rate": 8.27148167026738e-05, "loss": 0.1327, "step": 18560 }, { "grad_norm": 0.2886013090610504, "learning_rate": 8.269397144912405e-05, "loss": 0.1283, "step": 18570 }, { "grad_norm": 0.25630301237106323, "learning_rate": 8.267311626400899e-05, "loss": 0.1178, "step": 18580 }, { "grad_norm": 0.23033927381038666, "learning_rate": 8.26522511536639e-05, "loss": 0.1276, "step": 18590 }, { "grad_norm": 0.2986776828765869, "learning_rate": 8.263137612442706e-05, "loss": 0.1286, "step": 18600 }, { "grad_norm": 0.25144535303115845, "learning_rate": 8.261049118263971e-05, "loss": 0.1222, "step": 18610 }, { "grad_norm": 0.20706219971179962, "learning_rate": 8.258959633464619e-05, "loss": 0.1162, "step": 18620 }, { "grad_norm": 0.2534739673137665, "learning_rate": 8.256869158679377e-05, "loss": 0.1276, "step": 18630 }, { "grad_norm": 0.2472379356622696, "learning_rate": 8.254777694543278e-05, "loss": 0.1224, "step": 18640 }, { "grad_norm": 0.28797799348831177, "learning_rate": 8.252685241691651e-05, "loss": 0.1263, "step": 18650 }, { "grad_norm": 0.2273532748222351, "learning_rate": 8.250591800760133e-05, "loss": 0.1207, "step": 18660 }, { "grad_norm": 0.31892919540405273, "learning_rate": 8.248497372384649e-05, "loss": 0.1407, "step": 18670 }, { "grad_norm": 0.24288274347782135, "learning_rate": 8.246401957201437e-05, "loss": 0.1228, "step": 18680 }, { "grad_norm": 0.30536961555480957, "learning_rate": 8.244305555847027e-05, "loss": 0.1298, "step": 18690 }, { "grad_norm": 0.28901171684265137, "learning_rate": 8.24220816895825e-05, "loss": 0.1225, "step": 18700 }, { "grad_norm": 0.272087424993515, "learning_rate": 8.240109797172237e-05, "loss": 0.1281, "step": 18710 }, { "grad_norm": 0.2943830192089081, "learning_rate": 8.238010441126416e-05, "loss": 0.1257, "step": 18720 }, { "grad_norm": 0.23872986435890198, "learning_rate": 8.23591010145852e-05, "loss": 0.1176, "step": 18730 }, { "grad_norm": 0.2187395542860031, "learning_rate": 8.233808778806571e-05, "loss": 0.1187, "step": 18740 }, { "grad_norm": 0.2149970531463623, "learning_rate": 8.231706473808903e-05, "loss": 0.1166, "step": 18750 }, { "grad_norm": 0.20907169580459595, "learning_rate": 8.229603187104133e-05, "loss": 0.1218, "step": 18760 }, { "grad_norm": 0.2738320231437683, "learning_rate": 8.22749891933119e-05, "loss": 0.1279, "step": 18770 }, { "grad_norm": 0.27614444494247437, "learning_rate": 8.225393671129291e-05, "loss": 0.1232, "step": 18780 }, { "grad_norm": 0.2801656723022461, "learning_rate": 8.223287443137957e-05, "loss": 0.124, "step": 18790 }, { "grad_norm": 0.21237830817699432, "learning_rate": 8.221180235997004e-05, "loss": 0.1241, "step": 18800 }, { "grad_norm": 0.27586594223976135, "learning_rate": 8.219072050346544e-05, "loss": 0.1343, "step": 18810 }, { "grad_norm": 0.27271223068237305, "learning_rate": 8.216962886826992e-05, "loss": 0.1245, "step": 18820 }, { "grad_norm": 0.29055309295654297, "learning_rate": 8.214852746079054e-05, "loss": 0.128, "step": 18830 }, { "grad_norm": 0.27278244495391846, "learning_rate": 8.212741628743732e-05, "loss": 0.1221, "step": 18840 }, { "grad_norm": 0.23890136182308197, "learning_rate": 8.210629535462333e-05, "loss": 0.1242, "step": 18850 }, { "grad_norm": 0.2709961533546448, "learning_rate": 8.208516466876453e-05, "loss": 0.1242, "step": 18860 }, { "grad_norm": 0.2484947293996811, "learning_rate": 8.206402423627986e-05, "loss": 0.1221, "step": 18870 }, { "grad_norm": 0.29786181449890137, "learning_rate": 8.204287406359124e-05, "loss": 0.1273, "step": 18880 }, { "grad_norm": 0.2689514458179474, "learning_rate": 8.20217141571235e-05, "loss": 0.1238, "step": 18890 }, { "grad_norm": 0.28273293375968933, "learning_rate": 8.200054452330449e-05, "loss": 0.1245, "step": 18900 }, { "grad_norm": 0.22190678119659424, "learning_rate": 8.197936516856499e-05, "loss": 0.1201, "step": 18910 }, { "grad_norm": 0.30852359533309937, "learning_rate": 8.195817609933871e-05, "loss": 0.1317, "step": 18920 }, { "grad_norm": 0.25391560792922974, "learning_rate": 8.193697732206233e-05, "loss": 0.1225, "step": 18930 }, { "grad_norm": 0.2727567255496979, "learning_rate": 8.19157688431755e-05, "loss": 0.1318, "step": 18940 }, { "grad_norm": 0.24515973031520844, "learning_rate": 8.189455066912077e-05, "loss": 0.1173, "step": 18950 }, { "grad_norm": 0.2513839304447174, "learning_rate": 8.187332280634369e-05, "loss": 0.1374, "step": 18960 }, { "grad_norm": 0.21820731461048126, "learning_rate": 8.18520852612927e-05, "loss": 0.122, "step": 18970 }, { "grad_norm": 0.22641150653362274, "learning_rate": 8.183083804041921e-05, "loss": 0.1252, "step": 18980 }, { "grad_norm": 0.25765684247016907, "learning_rate": 8.180958115017757e-05, "loss": 0.1167, "step": 18990 }, { "grad_norm": 0.23104067146778107, "learning_rate": 8.178831459702505e-05, "loss": 0.1201, "step": 19000 }, { "grad_norm": 0.29543182253837585, "learning_rate": 8.17670383874219e-05, "loss": 0.1264, "step": 19010 }, { "grad_norm": 0.25790029764175415, "learning_rate": 8.174575252783124e-05, "loss": 0.1266, "step": 19020 }, { "grad_norm": 0.27553048729896545, "learning_rate": 8.172445702471914e-05, "loss": 0.1341, "step": 19030 }, { "grad_norm": 0.2499500811100006, "learning_rate": 8.170315188455466e-05, "loss": 0.1259, "step": 19040 }, { "grad_norm": 0.25881245732307434, "learning_rate": 8.168183711380969e-05, "loss": 0.1178, "step": 19050 }, { "grad_norm": 0.2873832881450653, "learning_rate": 8.166051271895913e-05, "loss": 0.1165, "step": 19060 }, { "grad_norm": 0.2576196491718292, "learning_rate": 8.163917870648075e-05, "loss": 0.1228, "step": 19070 }, { "grad_norm": 0.32185599207878113, "learning_rate": 8.161783508285526e-05, "loss": 0.1187, "step": 19080 }, { "grad_norm": 0.2847517728805542, "learning_rate": 8.159648185456628e-05, "loss": 0.1216, "step": 19090 }, { "grad_norm": 0.24850137531757355, "learning_rate": 8.157511902810038e-05, "loss": 0.1267, "step": 19100 }, { "grad_norm": 0.2520488202571869, "learning_rate": 8.155374660994701e-05, "loss": 0.1268, "step": 19110 }, { "grad_norm": 0.21024805307388306, "learning_rate": 8.153236460659857e-05, "loss": 0.1186, "step": 19120 }, { "grad_norm": 0.2294238954782486, "learning_rate": 8.151097302455031e-05, "loss": 0.1273, "step": 19130 }, { "grad_norm": 0.2845477759838104, "learning_rate": 8.148957187030044e-05, "loss": 0.115, "step": 19140 }, { "grad_norm": 0.2560008764266968, "learning_rate": 8.146816115035006e-05, "loss": 0.1278, "step": 19150 }, { "grad_norm": 0.2648065388202667, "learning_rate": 8.14467408712032e-05, "loss": 0.1308, "step": 19160 }, { "grad_norm": 0.25570160150527954, "learning_rate": 8.142531103936678e-05, "loss": 0.1234, "step": 19170 }, { "grad_norm": 0.26421767473220825, "learning_rate": 8.14038716613506e-05, "loss": 0.1238, "step": 19180 }, { "grad_norm": 0.23783424496650696, "learning_rate": 8.138242274366736e-05, "loss": 0.1289, "step": 19190 }, { "grad_norm": 0.23070575296878815, "learning_rate": 8.136096429283271e-05, "loss": 0.1219, "step": 19200 }, { "grad_norm": 0.2287486493587494, "learning_rate": 8.133949631536515e-05, "loss": 0.1205, "step": 19210 }, { "grad_norm": 0.2364719808101654, "learning_rate": 8.131801881778607e-05, "loss": 0.126, "step": 19220 }, { "grad_norm": 0.29436272382736206, "learning_rate": 8.129653180661978e-05, "loss": 0.1357, "step": 19230 }, { "grad_norm": 0.2508828938007355, "learning_rate": 8.127503528839346e-05, "loss": 0.121, "step": 19240 }, { "grad_norm": 0.21536362171173096, "learning_rate": 8.125352926963721e-05, "loss": 0.1258, "step": 19250 }, { "grad_norm": 0.2347584217786789, "learning_rate": 8.123201375688395e-05, "loss": 0.1248, "step": 19260 }, { "grad_norm": 0.2741236388683319, "learning_rate": 8.121048875666954e-05, "loss": 0.1299, "step": 19270 }, { "grad_norm": 0.28803750872612, "learning_rate": 8.118895427553274e-05, "loss": 0.1328, "step": 19280 }, { "grad_norm": 0.23639988899230957, "learning_rate": 8.116741032001511e-05, "loss": 0.137, "step": 19290 }, { "grad_norm": 0.21322670578956604, "learning_rate": 8.114585689666114e-05, "loss": 0.1259, "step": 19300 }, { "grad_norm": 0.2842141389846802, "learning_rate": 8.112429401201821e-05, "loss": 0.1282, "step": 19310 }, { "grad_norm": 0.28551822900772095, "learning_rate": 8.110272167263656e-05, "loss": 0.1213, "step": 19320 }, { "grad_norm": 0.21302318572998047, "learning_rate": 8.108113988506929e-05, "loss": 0.1212, "step": 19330 }, { "grad_norm": 0.2173224240541458, "learning_rate": 8.105954865587235e-05, "loss": 0.1243, "step": 19340 }, { "grad_norm": 0.28795284032821655, "learning_rate": 8.103794799160463e-05, "loss": 0.1249, "step": 19350 }, { "grad_norm": 0.2607155442237854, "learning_rate": 8.101633789882781e-05, "loss": 0.1188, "step": 19360 }, { "grad_norm": 0.26948410272598267, "learning_rate": 8.099471838410648e-05, "loss": 0.1261, "step": 19370 }, { "grad_norm": 0.25303030014038086, "learning_rate": 8.097308945400806e-05, "loss": 0.1271, "step": 19380 }, { "grad_norm": 0.3304736316204071, "learning_rate": 8.095145111510288e-05, "loss": 0.1347, "step": 19390 }, { "grad_norm": 0.30230140686035156, "learning_rate": 8.092980337396406e-05, "loss": 0.1302, "step": 19400 }, { "grad_norm": 0.29070281982421875, "learning_rate": 8.090814623716763e-05, "loss": 0.1281, "step": 19410 }, { "grad_norm": 0.22818787395954132, "learning_rate": 8.088647971129246e-05, "loss": 0.1246, "step": 19420 }, { "grad_norm": 0.2729281485080719, "learning_rate": 8.086480380292026e-05, "loss": 0.1258, "step": 19430 }, { "grad_norm": 0.23250681161880493, "learning_rate": 8.084311851863562e-05, "loss": 0.1297, "step": 19440 }, { "grad_norm": 0.22825656831264496, "learning_rate": 8.082142386502591e-05, "loss": 0.1285, "step": 19450 }, { "grad_norm": 0.2441476434469223, "learning_rate": 8.079971984868145e-05, "loss": 0.1158, "step": 19460 }, { "grad_norm": 0.2425069957971573, "learning_rate": 8.077800647619532e-05, "loss": 0.1275, "step": 19470 }, { "grad_norm": 0.24679157137870789, "learning_rate": 8.075628375416345e-05, "loss": 0.1323, "step": 19480 }, { "grad_norm": 0.26697203516960144, "learning_rate": 8.073455168918464e-05, "loss": 0.1343, "step": 19490 }, { "grad_norm": 0.24249690771102905, "learning_rate": 8.071281028786055e-05, "loss": 0.1185, "step": 19500 }, { "grad_norm": 0.26691219210624695, "learning_rate": 8.069105955679562e-05, "loss": 0.1223, "step": 19510 }, { "grad_norm": 0.26624053716659546, "learning_rate": 8.066929950259713e-05, "loss": 0.1247, "step": 19520 }, { "grad_norm": 0.2494584321975708, "learning_rate": 8.064753013187522e-05, "loss": 0.1255, "step": 19530 }, { "grad_norm": 0.2889845371246338, "learning_rate": 8.062575145124289e-05, "loss": 0.123, "step": 19540 }, { "grad_norm": 0.26822352409362793, "learning_rate": 8.060396346731587e-05, "loss": 0.1274, "step": 19550 }, { "grad_norm": 0.2609923779964447, "learning_rate": 8.058216618671281e-05, "loss": 0.1267, "step": 19560 }, { "grad_norm": 0.2566622793674469, "learning_rate": 8.056035961605514e-05, "loss": 0.1317, "step": 19570 }, { "grad_norm": 0.31375858187675476, "learning_rate": 8.05385437619671e-05, "loss": 0.1293, "step": 19580 }, { "grad_norm": 0.22285392880439758, "learning_rate": 8.05167186310758e-05, "loss": 0.1176, "step": 19590 }, { "grad_norm": 0.26315703988075256, "learning_rate": 8.049488423001113e-05, "loss": 0.1198, "step": 19600 }, { "grad_norm": 0.26536262035369873, "learning_rate": 8.047304056540581e-05, "loss": 0.1145, "step": 19610 }, { "grad_norm": 0.25108522176742554, "learning_rate": 8.045118764389534e-05, "loss": 0.1283, "step": 19620 }, { "grad_norm": 0.27305471897125244, "learning_rate": 8.042932547211809e-05, "loss": 0.125, "step": 19630 }, { "grad_norm": 0.2510153353214264, "learning_rate": 8.04074540567152e-05, "loss": 0.1231, "step": 19640 }, { "grad_norm": 0.24795694649219513, "learning_rate": 8.038557340433063e-05, "loss": 0.1217, "step": 19650 }, { "grad_norm": 0.24790911376476288, "learning_rate": 8.036368352161115e-05, "loss": 0.1371, "step": 19660 }, { "grad_norm": 0.2314600646495819, "learning_rate": 8.034178441520633e-05, "loss": 0.1278, "step": 19670 }, { "grad_norm": 0.27455854415893555, "learning_rate": 8.031987609176852e-05, "loss": 0.124, "step": 19680 }, { "grad_norm": 0.27049073576927185, "learning_rate": 8.02979585579529e-05, "loss": 0.1224, "step": 19690 }, { "grad_norm": 0.2911930978298187, "learning_rate": 8.027603182041745e-05, "loss": 0.1315, "step": 19700 }, { "grad_norm": 0.276785284280777, "learning_rate": 8.025409588582292e-05, "loss": 0.1225, "step": 19710 }, { "grad_norm": 0.35288721323013306, "learning_rate": 8.023215076083288e-05, "loss": 0.1233, "step": 19720 }, { "grad_norm": 0.3080448508262634, "learning_rate": 8.021019645211367e-05, "loss": 0.1268, "step": 19730 }, { "grad_norm": 0.29504817724227905, "learning_rate": 8.018823296633441e-05, "loss": 0.1295, "step": 19740 }, { "grad_norm": 0.26463356614112854, "learning_rate": 8.016626031016708e-05, "loss": 0.1319, "step": 19750 }, { "grad_norm": 0.24397949874401093, "learning_rate": 8.014427849028636e-05, "loss": 0.1225, "step": 19760 }, { "grad_norm": 0.2511431574821472, "learning_rate": 8.012228751336974e-05, "loss": 0.1275, "step": 19770 }, { "grad_norm": 0.2660979926586151, "learning_rate": 8.01002873860975e-05, "loss": 0.1241, "step": 19780 }, { "grad_norm": 0.24973149597644806, "learning_rate": 8.00782781151527e-05, "loss": 0.1255, "step": 19790 }, { "grad_norm": 0.2312939316034317, "learning_rate": 8.005625970722119e-05, "loss": 0.1211, "step": 19800 }, { "grad_norm": 0.2675303518772125, "learning_rate": 8.003423216899158e-05, "loss": 0.1313, "step": 19810 }, { "grad_norm": 0.2516126036643982, "learning_rate": 8.001219550715522e-05, "loss": 0.1176, "step": 19820 }, { "grad_norm": 0.29442915320396423, "learning_rate": 7.999014972840632e-05, "loss": 0.1245, "step": 19830 }, { "grad_norm": 0.22972266376018524, "learning_rate": 7.996809483944174e-05, "loss": 0.13, "step": 19840 }, { "grad_norm": 0.23615314066410065, "learning_rate": 7.994603084696124e-05, "loss": 0.1324, "step": 19850 }, { "grad_norm": 0.274357408285141, "learning_rate": 7.992395775766724e-05, "loss": 0.1322, "step": 19860 }, { "grad_norm": 0.23572176694869995, "learning_rate": 7.990187557826497e-05, "loss": 0.1282, "step": 19870 }, { "grad_norm": 0.2543247938156128, "learning_rate": 7.987978431546242e-05, "loss": 0.1253, "step": 19880 }, { "grad_norm": 0.27155762910842896, "learning_rate": 7.985768397597031e-05, "loss": 0.1246, "step": 19890 }, { "grad_norm": 0.23069658875465393, "learning_rate": 7.983557456650216e-05, "loss": 0.124, "step": 19900 }, { "grad_norm": 0.26079389452934265, "learning_rate": 7.981345609377422e-05, "loss": 0.1196, "step": 19910 }, { "grad_norm": 0.1863243132829666, "learning_rate": 7.97913285645055e-05, "loss": 0.123, "step": 19920 }, { "grad_norm": 0.2211555689573288, "learning_rate": 7.976919198541776e-05, "loss": 0.1193, "step": 19930 }, { "grad_norm": 0.23963850736618042, "learning_rate": 7.974704636323548e-05, "loss": 0.1244, "step": 19940 }, { "grad_norm": 0.35094305872917175, "learning_rate": 7.972489170468597e-05, "loss": 0.1304, "step": 19950 }, { "grad_norm": 0.2960309386253357, "learning_rate": 7.970272801649918e-05, "loss": 0.1181, "step": 19960 }, { "grad_norm": 0.24527306854724884, "learning_rate": 7.96805553054079e-05, "loss": 0.1239, "step": 19970 }, { "grad_norm": 0.24253244698047638, "learning_rate": 7.965837357814756e-05, "loss": 0.1203, "step": 19980 }, { "grad_norm": 0.28297358751296997, "learning_rate": 7.963618284145643e-05, "loss": 0.129, "step": 19990 }, { "grad_norm": 0.20604784786701202, "learning_rate": 7.961398310207544e-05, "loss": 0.1202, "step": 20000 }, { "grad_norm": 0.2285463660955429, "learning_rate": 7.95917743667483e-05, "loss": 0.1225, "step": 20010 }, { "grad_norm": 0.27482813596725464, "learning_rate": 7.956955664222144e-05, "loss": 0.1261, "step": 20020 }, { "grad_norm": 0.2243138551712036, "learning_rate": 7.954732993524399e-05, "loss": 0.1235, "step": 20030 }, { "grad_norm": 0.2532173991203308, "learning_rate": 7.952509425256786e-05, "loss": 0.1263, "step": 20040 }, { "grad_norm": 0.1908102184534073, "learning_rate": 7.950284960094767e-05, "loss": 0.1173, "step": 20050 }, { "grad_norm": 0.2825027108192444, "learning_rate": 7.948059598714076e-05, "loss": 0.1303, "step": 20060 }, { "grad_norm": 0.22984720766544342, "learning_rate": 7.945833341790717e-05, "loss": 0.129, "step": 20070 }, { "grad_norm": 0.2469748556613922, "learning_rate": 7.94360619000097e-05, "loss": 0.1166, "step": 20080 }, { "grad_norm": 0.28847235441207886, "learning_rate": 7.941378144021381e-05, "loss": 0.1283, "step": 20090 }, { "grad_norm": 0.25102823972702026, "learning_rate": 7.939149204528777e-05, "loss": 0.1236, "step": 20100 }, { "grad_norm": 0.253803551197052, "learning_rate": 7.936919372200246e-05, "loss": 0.1111, "step": 20110 }, { "grad_norm": 0.18369901180267334, "learning_rate": 7.934688647713158e-05, "loss": 0.1196, "step": 20120 }, { "grad_norm": 0.24937689304351807, "learning_rate": 7.932457031745143e-05, "loss": 0.119, "step": 20130 }, { "grad_norm": 0.2693967819213867, "learning_rate": 7.930224524974108e-05, "loss": 0.1276, "step": 20140 }, { "grad_norm": 0.2562641203403473, "learning_rate": 7.927991128078232e-05, "loss": 0.1231, "step": 20150 }, { "grad_norm": 0.2622627913951874, "learning_rate": 7.925756841735958e-05, "loss": 0.1305, "step": 20160 }, { "grad_norm": 0.17786403000354767, "learning_rate": 7.923521666626008e-05, "loss": 0.1161, "step": 20170 }, { "grad_norm": 0.23375153541564941, "learning_rate": 7.921285603427366e-05, "loss": 0.1215, "step": 20180 }, { "grad_norm": 0.19849200546741486, "learning_rate": 7.91904865281929e-05, "loss": 0.1241, "step": 20190 }, { "grad_norm": 0.3679966330528259, "learning_rate": 7.916810815481307e-05, "loss": 0.1318, "step": 20200 }, { "grad_norm": 0.26763781905174255, "learning_rate": 7.914572092093211e-05, "loss": 0.1291, "step": 20210 }, { "grad_norm": 0.2753245532512665, "learning_rate": 7.912332483335068e-05, "loss": 0.1277, "step": 20220 }, { "grad_norm": 0.18683531880378723, "learning_rate": 7.910091989887213e-05, "loss": 0.1138, "step": 20230 }, { "grad_norm": 0.2525083124637604, "learning_rate": 7.907850612430248e-05, "loss": 0.1281, "step": 20240 }, { "grad_norm": 0.22764591872692108, "learning_rate": 7.905608351645044e-05, "loss": 0.1145, "step": 20250 }, { "grad_norm": 0.2621179521083832, "learning_rate": 7.90336520821274e-05, "loss": 0.1221, "step": 20260 }, { "grad_norm": 0.2596755027770996, "learning_rate": 7.901121182814746e-05, "loss": 0.1252, "step": 20270 }, { "grad_norm": 0.2924700677394867, "learning_rate": 7.898876276132736e-05, "loss": 0.1251, "step": 20280 }, { "grad_norm": 0.29952454566955566, "learning_rate": 7.896630488848654e-05, "loss": 0.1155, "step": 20290 }, { "grad_norm": 0.21339131891727448, "learning_rate": 7.89438382164471e-05, "loss": 0.1185, "step": 20300 }, { "grad_norm": 0.25772953033447266, "learning_rate": 7.892136275203383e-05, "loss": 0.1186, "step": 20310 }, { "grad_norm": 0.252332866191864, "learning_rate": 7.889887850207418e-05, "loss": 0.1215, "step": 20320 }, { "grad_norm": 0.2512720823287964, "learning_rate": 7.887638547339827e-05, "loss": 0.1224, "step": 20330 }, { "grad_norm": 0.2753334641456604, "learning_rate": 7.885388367283891e-05, "loss": 0.1321, "step": 20340 }, { "grad_norm": 0.24452225863933563, "learning_rate": 7.88313731072315e-05, "loss": 0.1254, "step": 20350 }, { "grad_norm": 0.2529919743537903, "learning_rate": 7.88088537834142e-05, "loss": 0.1286, "step": 20360 }, { "grad_norm": 0.28041696548461914, "learning_rate": 7.878632570822778e-05, "loss": 0.1216, "step": 20370 }, { "grad_norm": 0.20420750975608826, "learning_rate": 7.876378888851567e-05, "loss": 0.1198, "step": 20380 }, { "grad_norm": 0.224908247590065, "learning_rate": 7.874124333112396e-05, "loss": 0.1234, "step": 20390 }, { "grad_norm": 0.22657336294651031, "learning_rate": 7.871868904290138e-05, "loss": 0.1152, "step": 20400 }, { "grad_norm": 0.25116270780563354, "learning_rate": 7.869612603069935e-05, "loss": 0.1257, "step": 20410 }, { "grad_norm": 0.24244016408920288, "learning_rate": 7.867355430137192e-05, "loss": 0.1192, "step": 20420 }, { "grad_norm": 0.3047901391983032, "learning_rate": 7.865097386177577e-05, "loss": 0.1252, "step": 20430 }, { "grad_norm": 0.30224305391311646, "learning_rate": 7.862838471877023e-05, "loss": 0.127, "step": 20440 }, { "grad_norm": 0.2842647433280945, "learning_rate": 7.860578687921731e-05, "loss": 0.116, "step": 20450 }, { "grad_norm": 0.19007238745689392, "learning_rate": 7.858318034998164e-05, "loss": 0.118, "step": 20460 }, { "grad_norm": 0.2735355496406555, "learning_rate": 7.856056513793046e-05, "loss": 0.1204, "step": 20470 }, { "grad_norm": 0.2480243593454361, "learning_rate": 7.85379412499337e-05, "loss": 0.1178, "step": 20480 }, { "grad_norm": 0.2415713220834732, "learning_rate": 7.851530869286389e-05, "loss": 0.1234, "step": 20490 }, { "grad_norm": 0.2258894443511963, "learning_rate": 7.849266747359619e-05, "loss": 0.1183, "step": 20500 }, { "grad_norm": 0.24577753245830536, "learning_rate": 7.847001759900843e-05, "loss": 0.1262, "step": 20510 }, { "grad_norm": 0.24416179955005646, "learning_rate": 7.844735907598102e-05, "loss": 0.1246, "step": 20520 }, { "grad_norm": 0.3230130672454834, "learning_rate": 7.842469191139703e-05, "loss": 0.12, "step": 20530 }, { "grad_norm": 0.2616064250469208, "learning_rate": 7.840201611214215e-05, "loss": 0.1273, "step": 20540 }, { "grad_norm": 0.3288092613220215, "learning_rate": 7.837933168510469e-05, "loss": 0.1179, "step": 20550 }, { "grad_norm": 0.2832246422767639, "learning_rate": 7.835663863717559e-05, "loss": 0.1246, "step": 20560 }, { "grad_norm": 0.23859864473342896, "learning_rate": 7.833393697524838e-05, "loss": 0.1238, "step": 20570 }, { "grad_norm": 0.3289240002632141, "learning_rate": 7.831122670621922e-05, "loss": 0.1297, "step": 20580 }, { "grad_norm": 0.29447004199028015, "learning_rate": 7.82885078369869e-05, "loss": 0.1218, "step": 20590 }, { "grad_norm": 0.29019561409950256, "learning_rate": 7.826578037445283e-05, "loss": 0.131, "step": 20600 }, { "grad_norm": 0.23311269283294678, "learning_rate": 7.824304432552097e-05, "loss": 0.1301, "step": 20610 }, { "grad_norm": 0.25753164291381836, "learning_rate": 7.822029969709798e-05, "loss": 0.1212, "step": 20620 }, { "grad_norm": 0.2690712511539459, "learning_rate": 7.819754649609306e-05, "loss": 0.1208, "step": 20630 }, { "grad_norm": 0.2724335193634033, "learning_rate": 7.817478472941802e-05, "loss": 0.1208, "step": 20640 }, { "grad_norm": 0.2522333264350891, "learning_rate": 7.815201440398727e-05, "loss": 0.1268, "step": 20650 }, { "grad_norm": 0.2872611880302429, "learning_rate": 7.812923552671789e-05, "loss": 0.122, "step": 20660 }, { "grad_norm": 0.27797284722328186, "learning_rate": 7.810644810452945e-05, "loss": 0.1255, "step": 20670 }, { "grad_norm": 0.24844162166118622, "learning_rate": 7.808365214434417e-05, "loss": 0.126, "step": 20680 }, { "grad_norm": 0.27011311054229736, "learning_rate": 7.80608476530869e-05, "loss": 0.1273, "step": 20690 }, { "grad_norm": 0.2833697199821472, "learning_rate": 7.8038034637685e-05, "loss": 0.127, "step": 20700 }, { "grad_norm": 0.2562924921512604, "learning_rate": 7.801521310506848e-05, "loss": 0.128, "step": 20710 }, { "grad_norm": 0.23434901237487793, "learning_rate": 7.799238306216994e-05, "loss": 0.1253, "step": 20720 }, { "grad_norm": 0.258905827999115, "learning_rate": 7.796954451592448e-05, "loss": 0.1269, "step": 20730 }, { "grad_norm": 0.2089558094739914, "learning_rate": 7.794669747326992e-05, "loss": 0.1176, "step": 20740 }, { "grad_norm": 0.203962504863739, "learning_rate": 7.792384194114654e-05, "loss": 0.1198, "step": 20750 }, { "grad_norm": 0.20684786140918732, "learning_rate": 7.790097792649729e-05, "loss": 0.1201, "step": 20760 }, { "grad_norm": 0.23743799328804016, "learning_rate": 7.787810543626762e-05, "loss": 0.1252, "step": 20770 }, { "grad_norm": 0.23900370299816132, "learning_rate": 7.785522447740558e-05, "loss": 0.1304, "step": 20780 }, { "grad_norm": 0.239165261387825, "learning_rate": 7.783233505686182e-05, "loss": 0.1265, "step": 20790 }, { "grad_norm": 0.23411408066749573, "learning_rate": 7.780943718158955e-05, "loss": 0.1287, "step": 20800 }, { "grad_norm": 0.24717307090759277, "learning_rate": 7.778653085854453e-05, "loss": 0.118, "step": 20810 }, { "grad_norm": 0.29065772891044617, "learning_rate": 7.77636160946851e-05, "loss": 0.1237, "step": 20820 }, { "grad_norm": 0.2377616912126541, "learning_rate": 7.774069289697215e-05, "loss": 0.121, "step": 20830 }, { "grad_norm": 0.324375182390213, "learning_rate": 7.771776127236913e-05, "loss": 0.1204, "step": 20840 }, { "grad_norm": 0.28439566493034363, "learning_rate": 7.769482122784212e-05, "loss": 0.1253, "step": 20850 }, { "grad_norm": 0.2584039270877838, "learning_rate": 7.767187277035963e-05, "loss": 0.1174, "step": 20860 }, { "grad_norm": 0.19799774885177612, "learning_rate": 7.764891590689285e-05, "loss": 0.1172, "step": 20870 }, { "grad_norm": 0.26828625798225403, "learning_rate": 7.762595064441542e-05, "loss": 0.1257, "step": 20880 }, { "grad_norm": 0.24233794212341309, "learning_rate": 7.760297698990362e-05, "loss": 0.1188, "step": 20890 }, { "grad_norm": 0.2625802159309387, "learning_rate": 7.757999495033623e-05, "loss": 0.1254, "step": 20900 }, { "grad_norm": 0.2359815090894699, "learning_rate": 7.755700453269456e-05, "loss": 0.1165, "step": 20910 }, { "grad_norm": 0.26551565527915955, "learning_rate": 7.753400574396254e-05, "loss": 0.1283, "step": 20920 }, { "grad_norm": 0.23548206686973572, "learning_rate": 7.751099859112655e-05, "loss": 0.1142, "step": 20930 }, { "grad_norm": 0.25290191173553467, "learning_rate": 7.748798308117557e-05, "loss": 0.129, "step": 20940 }, { "grad_norm": 0.24140048027038574, "learning_rate": 7.746495922110112e-05, "loss": 0.1185, "step": 20950 }, { "grad_norm": 0.292361319065094, "learning_rate": 7.744192701789723e-05, "loss": 0.1244, "step": 20960 }, { "grad_norm": 0.20942509174346924, "learning_rate": 7.741888647856046e-05, "loss": 0.1298, "step": 20970 }, { "grad_norm": 0.23870603740215302, "learning_rate": 7.739583761008994e-05, "loss": 0.1305, "step": 20980 }, { "grad_norm": 0.2409277707338333, "learning_rate": 7.73727804194873e-05, "loss": 0.1333, "step": 20990 }, { "grad_norm": 0.36771681904792786, "learning_rate": 7.734971491375671e-05, "loss": 0.1364, "step": 21000 }, { "grad_norm": 0.29761937260627747, "learning_rate": 7.732664109990485e-05, "loss": 0.1201, "step": 21010 }, { "grad_norm": 0.25431689620018005, "learning_rate": 7.730355898494095e-05, "loss": 0.1268, "step": 21020 }, { "grad_norm": 0.2020004540681839, "learning_rate": 7.728046857587673e-05, "loss": 0.1204, "step": 21030 }, { "grad_norm": 0.24152064323425293, "learning_rate": 7.725736987972647e-05, "loss": 0.12, "step": 21040 }, { "grad_norm": 0.22347506880760193, "learning_rate": 7.723426290350691e-05, "loss": 0.1285, "step": 21050 }, { "grad_norm": 0.30625322461128235, "learning_rate": 7.721114765423736e-05, "loss": 0.123, "step": 21060 }, { "grad_norm": 0.27449241280555725, "learning_rate": 7.718802413893963e-05, "loss": 0.1246, "step": 21070 }, { "grad_norm": 0.23771044611930847, "learning_rate": 7.716489236463802e-05, "loss": 0.1285, "step": 21080 }, { "grad_norm": 0.19274596869945526, "learning_rate": 7.714175233835936e-05, "loss": 0.121, "step": 21090 }, { "grad_norm": 0.2839961349964142, "learning_rate": 7.711860406713299e-05, "loss": 0.1321, "step": 21100 }, { "grad_norm": 0.25929147005081177, "learning_rate": 7.70954475579907e-05, "loss": 0.121, "step": 21110 }, { "grad_norm": 0.25485992431640625, "learning_rate": 7.707228281796688e-05, "loss": 0.1204, "step": 21120 }, { "grad_norm": 0.2484835833311081, "learning_rate": 7.704910985409833e-05, "loss": 0.1149, "step": 21130 }, { "grad_norm": 0.24396760761737823, "learning_rate": 7.702592867342439e-05, "loss": 0.1243, "step": 21140 }, { "grad_norm": 0.22539806365966797, "learning_rate": 7.700273928298691e-05, "loss": 0.1296, "step": 21150 }, { "grad_norm": 0.17010800540447235, "learning_rate": 7.697954168983021e-05, "loss": 0.1202, "step": 21160 }, { "grad_norm": 0.2467518150806427, "learning_rate": 7.695633590100109e-05, "loss": 0.1294, "step": 21170 }, { "grad_norm": 0.22100606560707092, "learning_rate": 7.693312192354886e-05, "loss": 0.1152, "step": 21180 }, { "grad_norm": 0.2431665062904358, "learning_rate": 7.690989976452532e-05, "loss": 0.1144, "step": 21190 }, { "grad_norm": 0.27771425247192383, "learning_rate": 7.688666943098475e-05, "loss": 0.1196, "step": 21200 }, { "grad_norm": 0.22753696143627167, "learning_rate": 7.686343092998389e-05, "loss": 0.1199, "step": 21210 }, { "grad_norm": 0.21626894176006317, "learning_rate": 7.684018426858202e-05, "loss": 0.1257, "step": 21220 }, { "grad_norm": 0.27973827719688416, "learning_rate": 7.681692945384084e-05, "loss": 0.1263, "step": 21230 }, { "grad_norm": 0.2475142776966095, "learning_rate": 7.679366649282456e-05, "loss": 0.1274, "step": 21240 }, { "grad_norm": 0.24796482920646667, "learning_rate": 7.677039539259983e-05, "loss": 0.1243, "step": 21250 }, { "grad_norm": 0.2917155921459198, "learning_rate": 7.674711616023581e-05, "loss": 0.1305, "step": 21260 }, { "grad_norm": 0.2684248089790344, "learning_rate": 7.672382880280413e-05, "loss": 0.1283, "step": 21270 }, { "grad_norm": 0.21961405873298645, "learning_rate": 7.670053332737885e-05, "loss": 0.1278, "step": 21280 }, { "grad_norm": 0.2513824999332428, "learning_rate": 7.667722974103654e-05, "loss": 0.1265, "step": 21290 }, { "grad_norm": 0.22961348295211792, "learning_rate": 7.66539180508562e-05, "loss": 0.128, "step": 21300 }, { "grad_norm": 0.265445351600647, "learning_rate": 7.663059826391932e-05, "loss": 0.1137, "step": 21310 }, { "grad_norm": 0.22765392065048218, "learning_rate": 7.660727038730981e-05, "loss": 0.1139, "step": 21320 }, { "grad_norm": 0.22798199951648712, "learning_rate": 7.65839344281141e-05, "loss": 0.1168, "step": 21330 }, { "grad_norm": 0.2614861726760864, "learning_rate": 7.656059039342101e-05, "loss": 0.1236, "step": 21340 }, { "grad_norm": 0.21885082125663757, "learning_rate": 7.653723829032187e-05, "loss": 0.1177, "step": 21350 }, { "grad_norm": 0.2958770990371704, "learning_rate": 7.65138781259104e-05, "loss": 0.1301, "step": 21360 }, { "grad_norm": 0.2652629315853119, "learning_rate": 7.649050990728279e-05, "loss": 0.1187, "step": 21370 }, { "grad_norm": 0.2755635380744934, "learning_rate": 7.646713364153774e-05, "loss": 0.1228, "step": 21380 }, { "grad_norm": 0.28715425729751587, "learning_rate": 7.64437493357763e-05, "loss": 0.1221, "step": 21390 }, { "grad_norm": 0.26315781474113464, "learning_rate": 7.642035699710202e-05, "loss": 0.1287, "step": 21400 }, { "grad_norm": 0.2512977421283722, "learning_rate": 7.639695663262089e-05, "loss": 0.1228, "step": 21410 }, { "grad_norm": 0.22320997714996338, "learning_rate": 7.637354824944128e-05, "loss": 0.1211, "step": 21420 }, { "grad_norm": 0.2386360764503479, "learning_rate": 7.635013185467408e-05, "loss": 0.1287, "step": 21430 }, { "grad_norm": 0.26995953917503357, "learning_rate": 7.632670745543256e-05, "loss": 0.1288, "step": 21440 }, { "grad_norm": 0.24095968902111053, "learning_rate": 7.630327505883242e-05, "loss": 0.1328, "step": 21450 }, { "grad_norm": 0.23304258286952972, "learning_rate": 7.627983467199182e-05, "loss": 0.1234, "step": 21460 }, { "grad_norm": 0.2553110122680664, "learning_rate": 7.625638630203132e-05, "loss": 0.1253, "step": 21470 }, { "grad_norm": 0.2405133843421936, "learning_rate": 7.623292995607394e-05, "loss": 0.1256, "step": 21480 }, { "grad_norm": 0.24728451669216156, "learning_rate": 7.620946564124507e-05, "loss": 0.1292, "step": 21490 }, { "grad_norm": 0.2701030671596527, "learning_rate": 7.618599336467256e-05, "loss": 0.126, "step": 21500 }, { "grad_norm": 0.23957860469818115, "learning_rate": 7.616251313348666e-05, "loss": 0.1255, "step": 21510 }, { "grad_norm": 0.296708345413208, "learning_rate": 7.613902495482005e-05, "loss": 0.1219, "step": 21520 }, { "grad_norm": 0.25912022590637207, "learning_rate": 7.611552883580784e-05, "loss": 0.116, "step": 21530 }, { "grad_norm": 0.2567426562309265, "learning_rate": 7.609202478358748e-05, "loss": 0.1249, "step": 21540 }, { "grad_norm": 0.30936533212661743, "learning_rate": 7.606851280529895e-05, "loss": 0.1128, "step": 21550 }, { "grad_norm": 0.2524036765098572, "learning_rate": 7.604499290808449e-05, "loss": 0.1211, "step": 21560 }, { "grad_norm": 0.2522145211696625, "learning_rate": 7.602146509908888e-05, "loss": 0.122, "step": 21570 }, { "grad_norm": 0.26722991466522217, "learning_rate": 7.599792938545921e-05, "loss": 0.1202, "step": 21580 }, { "grad_norm": 0.25139757990837097, "learning_rate": 7.597438577434506e-05, "loss": 0.1382, "step": 21590 }, { "grad_norm": 0.28154200315475464, "learning_rate": 7.595083427289831e-05, "loss": 0.1232, "step": 21600 }, { "grad_norm": 0.2651607394218445, "learning_rate": 7.59272748882733e-05, "loss": 0.1327, "step": 21610 }, { "grad_norm": 0.24120360612869263, "learning_rate": 7.590370762762675e-05, "loss": 0.1226, "step": 21620 }, { "grad_norm": 0.3010866940021515, "learning_rate": 7.588013249811777e-05, "loss": 0.1211, "step": 21630 }, { "grad_norm": 0.21329191327095032, "learning_rate": 7.585654950690786e-05, "loss": 0.1266, "step": 21640 }, { "grad_norm": 0.22933566570281982, "learning_rate": 7.583295866116091e-05, "loss": 0.122, "step": 21650 }, { "grad_norm": 0.29299214482307434, "learning_rate": 7.580935996804321e-05, "loss": 0.1255, "step": 21660 }, { "grad_norm": 0.24278993904590607, "learning_rate": 7.57857534347234e-05, "loss": 0.124, "step": 21670 }, { "grad_norm": 0.22075185179710388, "learning_rate": 7.576213906837254e-05, "loss": 0.1272, "step": 21680 }, { "grad_norm": 0.23727452754974365, "learning_rate": 7.573851687616403e-05, "loss": 0.1257, "step": 21690 }, { "grad_norm": 0.2702335715293884, "learning_rate": 7.571488686527368e-05, "loss": 0.1272, "step": 21700 }, { "grad_norm": 0.2807042896747589, "learning_rate": 7.569124904287968e-05, "loss": 0.1226, "step": 21710 }, { "grad_norm": 0.30665791034698486, "learning_rate": 7.566760341616254e-05, "loss": 0.1286, "step": 21720 }, { "grad_norm": 0.229291170835495, "learning_rate": 7.564394999230519e-05, "loss": 0.1266, "step": 21730 }, { "grad_norm": 0.2973242402076721, "learning_rate": 7.562028877849294e-05, "loss": 0.1182, "step": 21740 }, { "grad_norm": 0.2623154819011688, "learning_rate": 7.559661978191341e-05, "loss": 0.124, "step": 21750 }, { "grad_norm": 0.2828432023525238, "learning_rate": 7.557294300975664e-05, "loss": 0.1323, "step": 21760 }, { "grad_norm": 0.29984748363494873, "learning_rate": 7.554925846921499e-05, "loss": 0.1192, "step": 21770 }, { "grad_norm": 0.2788054347038269, "learning_rate": 7.552556616748321e-05, "loss": 0.1176, "step": 21780 }, { "grad_norm": 0.2547699511051178, "learning_rate": 7.550186611175838e-05, "loss": 0.124, "step": 21790 }, { "grad_norm": 0.22770950198173523, "learning_rate": 7.547815830923998e-05, "loss": 0.115, "step": 21800 }, { "grad_norm": 0.23701925575733185, "learning_rate": 7.54544427671298e-05, "loss": 0.1268, "step": 21810 }, { "grad_norm": 0.27294665575027466, "learning_rate": 7.543071949263198e-05, "loss": 0.1147, "step": 21820 }, { "grad_norm": 0.1820135861635208, "learning_rate": 7.540698849295305e-05, "loss": 0.1234, "step": 21830 }, { "grad_norm": 0.2170444279909134, "learning_rate": 7.538324977530183e-05, "loss": 0.119, "step": 21840 }, { "grad_norm": 0.25351670384407043, "learning_rate": 7.535950334688955e-05, "loss": 0.1302, "step": 21850 }, { "grad_norm": 0.26369383931159973, "learning_rate": 7.533574921492972e-05, "loss": 0.1195, "step": 21860 }, { "grad_norm": 0.2690143883228302, "learning_rate": 7.531198738663824e-05, "loss": 0.1282, "step": 21870 }, { "grad_norm": 0.26589709520339966, "learning_rate": 7.528821786923333e-05, "loss": 0.1305, "step": 21880 }, { "grad_norm": 0.23374196887016296, "learning_rate": 7.52644406699355e-05, "loss": 0.1209, "step": 21890 }, { "grad_norm": 0.2650601863861084, "learning_rate": 7.524065579596766e-05, "loss": 0.1203, "step": 21900 }, { "grad_norm": 0.2830134332180023, "learning_rate": 7.521686325455506e-05, "loss": 0.1256, "step": 21910 }, { "grad_norm": 0.22650514543056488, "learning_rate": 7.51930630529252e-05, "loss": 0.1167, "step": 21920 }, { "grad_norm": 0.23550406098365784, "learning_rate": 7.516925519830797e-05, "loss": 0.1319, "step": 21930 }, { "grad_norm": 0.31531548500061035, "learning_rate": 7.514543969793557e-05, "loss": 0.1307, "step": 21940 }, { "grad_norm": 0.29601147770881653, "learning_rate": 7.512161655904251e-05, "loss": 0.1189, "step": 21950 }, { "grad_norm": 0.20688803493976593, "learning_rate": 7.509778578886563e-05, "loss": 0.1175, "step": 21960 }, { "grad_norm": 0.22022737562656403, "learning_rate": 7.507394739464412e-05, "loss": 0.1176, "step": 21970 }, { "grad_norm": 0.2958274781703949, "learning_rate": 7.50501013836194e-05, "loss": 0.1376, "step": 21980 }, { "grad_norm": 0.235473170876503, "learning_rate": 7.50262477630353e-05, "loss": 0.1198, "step": 21990 }, { "grad_norm": 0.2569681406021118, "learning_rate": 7.500238654013794e-05, "loss": 0.1237, "step": 22000 }, { "grad_norm": 0.22936925292015076, "learning_rate": 7.497851772217566e-05, "loss": 0.1262, "step": 22010 }, { "grad_norm": 0.2032928168773651, "learning_rate": 7.495464131639924e-05, "loss": 0.1205, "step": 22020 }, { "grad_norm": 0.22534644603729248, "learning_rate": 7.493075733006166e-05, "loss": 0.1215, "step": 22030 }, { "grad_norm": 0.24698428809642792, "learning_rate": 7.490686577041828e-05, "loss": 0.1229, "step": 22040 }, { "grad_norm": 0.2678113877773285, "learning_rate": 7.488296664472668e-05, "loss": 0.1188, "step": 22050 }, { "grad_norm": 0.24403052031993866, "learning_rate": 7.485905996024682e-05, "loss": 0.1172, "step": 22060 }, { "grad_norm": 0.2754824459552765, "learning_rate": 7.483514572424093e-05, "loss": 0.1204, "step": 22070 }, { "grad_norm": 0.2722407579421997, "learning_rate": 7.481122394397349e-05, "loss": 0.1228, "step": 22080 }, { "grad_norm": 0.25212907791137695, "learning_rate": 7.478729462671131e-05, "loss": 0.1227, "step": 22090 }, { "grad_norm": 0.21711964905261993, "learning_rate": 7.47633577797235e-05, "loss": 0.1221, "step": 22100 }, { "grad_norm": 0.22849781811237335, "learning_rate": 7.473941341028144e-05, "loss": 0.125, "step": 22110 }, { "grad_norm": 0.2825089991092682, "learning_rate": 7.471546152565879e-05, "loss": 0.1178, "step": 22120 }, { "grad_norm": 0.22423961758613586, "learning_rate": 7.46915021331315e-05, "loss": 0.1198, "step": 22130 }, { "grad_norm": 0.25249606370925903, "learning_rate": 7.466753523997778e-05, "loss": 0.1205, "step": 22140 }, { "grad_norm": 0.2681425213813782, "learning_rate": 7.464356085347819e-05, "loss": 0.1278, "step": 22150 }, { "grad_norm": 0.32585984468460083, "learning_rate": 7.461957898091548e-05, "loss": 0.1289, "step": 22160 }, { "grad_norm": 0.29421624541282654, "learning_rate": 7.459558962957473e-05, "loss": 0.1141, "step": 22170 }, { "grad_norm": 0.23492902517318726, "learning_rate": 7.457159280674326e-05, "loss": 0.1285, "step": 22180 }, { "grad_norm": 0.25647205114364624, "learning_rate": 7.454758851971066e-05, "loss": 0.1239, "step": 22190 }, { "grad_norm": 0.22741936147212982, "learning_rate": 7.45235767757688e-05, "loss": 0.1226, "step": 22200 }, { "grad_norm": 0.2175936996936798, "learning_rate": 7.449955758221183e-05, "loss": 0.1123, "step": 22210 }, { "grad_norm": 0.2483232021331787, "learning_rate": 7.447553094633615e-05, "loss": 0.1358, "step": 22220 }, { "grad_norm": 0.2738252580165863, "learning_rate": 7.445149687544039e-05, "loss": 0.1222, "step": 22230 }, { "grad_norm": 0.22060978412628174, "learning_rate": 7.44274553768255e-05, "loss": 0.1211, "step": 22240 }, { "grad_norm": 0.2849811911582947, "learning_rate": 7.440340645779464e-05, "loss": 0.1336, "step": 22250 }, { "grad_norm": 0.23312047123908997, "learning_rate": 7.437935012565322e-05, "loss": 0.1267, "step": 22260 }, { "grad_norm": 0.2040143758058548, "learning_rate": 7.435528638770893e-05, "loss": 0.1216, "step": 22270 }, { "grad_norm": 0.21217623353004456, "learning_rate": 7.433121525127171e-05, "loss": 0.1265, "step": 22280 }, { "grad_norm": 0.20969659090042114, "learning_rate": 7.430713672365371e-05, "loss": 0.1122, "step": 22290 }, { "grad_norm": 0.21047237515449524, "learning_rate": 7.428305081216938e-05, "loss": 0.1193, "step": 22300 }, { "grad_norm": 0.19743144512176514, "learning_rate": 7.425895752413536e-05, "loss": 0.1272, "step": 22310 }, { "grad_norm": 0.25046199560165405, "learning_rate": 7.423485686687057e-05, "loss": 0.1316, "step": 22320 }, { "grad_norm": 0.2046230435371399, "learning_rate": 7.421074884769616e-05, "loss": 0.1264, "step": 22330 }, { "grad_norm": 0.2812708616256714, "learning_rate": 7.418663347393548e-05, "loss": 0.1253, "step": 22340 }, { "grad_norm": 0.2192666381597519, "learning_rate": 7.416251075291418e-05, "loss": 0.1196, "step": 22350 }, { "grad_norm": 0.20530743896961212, "learning_rate": 7.413838069196007e-05, "loss": 0.1188, "step": 22360 }, { "grad_norm": 0.23605045676231384, "learning_rate": 7.411424329840324e-05, "loss": 0.1323, "step": 22370 }, { "grad_norm": 0.23844093084335327, "learning_rate": 7.409009857957601e-05, "loss": 0.1281, "step": 22380 }, { "grad_norm": 0.23764850199222565, "learning_rate": 7.40659465428129e-05, "loss": 0.1177, "step": 22390 }, { "grad_norm": 0.22555147111415863, "learning_rate": 7.404178719545063e-05, "loss": 0.1281, "step": 22400 }, { "grad_norm": 0.32817384600639343, "learning_rate": 7.401762054482822e-05, "loss": 0.1358, "step": 22410 }, { "grad_norm": 0.284814715385437, "learning_rate": 7.39934465982868e-05, "loss": 0.1218, "step": 22420 }, { "grad_norm": 0.2143716961145401, "learning_rate": 7.396926536316984e-05, "loss": 0.1224, "step": 22430 }, { "grad_norm": 0.22513079643249512, "learning_rate": 7.394507684682293e-05, "loss": 0.1152, "step": 22440 }, { "grad_norm": 0.23156271874904633, "learning_rate": 7.392088105659393e-05, "loss": 0.1288, "step": 22450 }, { "grad_norm": 0.28899097442626953, "learning_rate": 7.389667799983284e-05, "loss": 0.1222, "step": 22460 }, { "grad_norm": 0.26793017983436584, "learning_rate": 7.387246768389193e-05, "loss": 0.1236, "step": 22470 }, { "grad_norm": 0.25848910212516785, "learning_rate": 7.384825011612563e-05, "loss": 0.1233, "step": 22480 }, { "grad_norm": 0.2799314260482788, "learning_rate": 7.382402530389066e-05, "loss": 0.1281, "step": 22490 }, { "grad_norm": 0.21766015887260437, "learning_rate": 7.379979325454582e-05, "loss": 0.1197, "step": 22500 }, { "grad_norm": 0.23679354786872864, "learning_rate": 7.37755539754522e-05, "loss": 0.1164, "step": 22510 }, { "grad_norm": 0.25562015175819397, "learning_rate": 7.375130747397302e-05, "loss": 0.122, "step": 22520 }, { "grad_norm": 0.20384295284748077, "learning_rate": 7.372705375747377e-05, "loss": 0.1233, "step": 22530 }, { "grad_norm": 0.2206346094608307, "learning_rate": 7.370279283332205e-05, "loss": 0.1199, "step": 22540 }, { "grad_norm": 0.26480355858802795, "learning_rate": 7.36785247088877e-05, "loss": 0.1216, "step": 22550 }, { "grad_norm": 0.2582545578479767, "learning_rate": 7.365424939154275e-05, "loss": 0.1193, "step": 22560 }, { "grad_norm": 0.2749618589878082, "learning_rate": 7.362996688866138e-05, "loss": 0.1236, "step": 22570 }, { "grad_norm": 0.258772611618042, "learning_rate": 7.360567720761999e-05, "loss": 0.1186, "step": 22580 }, { "grad_norm": 0.1656259149312973, "learning_rate": 7.358138035579711e-05, "loss": 0.1127, "step": 22590 }, { "grad_norm": 0.2415679544210434, "learning_rate": 7.355707634057354e-05, "loss": 0.1227, "step": 22600 }, { "grad_norm": 0.23471403121948242, "learning_rate": 7.353276516933215e-05, "loss": 0.1208, "step": 22610 }, { "grad_norm": 0.2311570942401886, "learning_rate": 7.350844684945806e-05, "loss": 0.1172, "step": 22620 }, { "grad_norm": 0.21518582105636597, "learning_rate": 7.348412138833851e-05, "loss": 0.1165, "step": 22630 }, { "grad_norm": 0.24096111953258514, "learning_rate": 7.345978879336295e-05, "loss": 0.1238, "step": 22640 }, { "grad_norm": 0.26519063115119934, "learning_rate": 7.343544907192296e-05, "loss": 0.1195, "step": 22650 }, { "grad_norm": 0.24633987247943878, "learning_rate": 7.341110223141235e-05, "loss": 0.1231, "step": 22660 }, { "grad_norm": 0.25480085611343384, "learning_rate": 7.3386748279227e-05, "loss": 0.1177, "step": 22670 }, { "grad_norm": 0.289723664522171, "learning_rate": 7.336238722276501e-05, "loss": 0.1233, "step": 22680 }, { "grad_norm": 0.311495304107666, "learning_rate": 7.333801906942663e-05, "loss": 0.1367, "step": 22690 }, { "grad_norm": 0.3157345652580261, "learning_rate": 7.331364382661428e-05, "loss": 0.1255, "step": 22700 }, { "grad_norm": 0.2462521493434906, "learning_rate": 7.328926150173248e-05, "loss": 0.1214, "step": 22710 }, { "grad_norm": 0.2592034339904785, "learning_rate": 7.326487210218795e-05, "loss": 0.1235, "step": 22720 }, { "grad_norm": 0.237786203622818, "learning_rate": 7.324047563538955e-05, "loss": 0.1273, "step": 22730 }, { "grad_norm": 0.2563726007938385, "learning_rate": 7.321607210874828e-05, "loss": 0.1217, "step": 22740 }, { "grad_norm": 0.2242862582206726, "learning_rate": 7.31916615296773e-05, "loss": 0.1287, "step": 22750 }, { "grad_norm": 0.25993403792381287, "learning_rate": 7.316724390559188e-05, "loss": 0.1241, "step": 22760 }, { "grad_norm": 0.29735347628593445, "learning_rate": 7.314281924390946e-05, "loss": 0.1215, "step": 22770 }, { "grad_norm": 0.2855720520019531, "learning_rate": 7.311838755204959e-05, "loss": 0.1273, "step": 22780 }, { "grad_norm": 0.24212980270385742, "learning_rate": 7.3093948837434e-05, "loss": 0.1281, "step": 22790 }, { "grad_norm": 0.23907433450222015, "learning_rate": 7.306950310748651e-05, "loss": 0.1275, "step": 22800 }, { "grad_norm": 0.27046680450439453, "learning_rate": 7.304505036963311e-05, "loss": 0.1239, "step": 22810 }, { "grad_norm": 0.2460085153579712, "learning_rate": 7.302059063130186e-05, "loss": 0.1221, "step": 22820 }, { "grad_norm": 0.21285827457904816, "learning_rate": 7.2996123899923e-05, "loss": 0.1155, "step": 22830 }, { "grad_norm": 0.2548177242279053, "learning_rate": 7.297165018292886e-05, "loss": 0.1208, "step": 22840 }, { "grad_norm": 0.22115401923656464, "learning_rate": 7.294716948775396e-05, "loss": 0.1226, "step": 22850 }, { "grad_norm": 0.24555499851703644, "learning_rate": 7.292268182183484e-05, "loss": 0.1262, "step": 22860 }, { "grad_norm": 0.22180147469043732, "learning_rate": 7.28981871926102e-05, "loss": 0.113, "step": 22870 }, { "grad_norm": 0.2291213423013687, "learning_rate": 7.28736856075209e-05, "loss": 0.1206, "step": 22880 }, { "grad_norm": 0.2316693812608719, "learning_rate": 7.284917707400985e-05, "loss": 0.1264, "step": 22890 }, { "grad_norm": 0.23663455247879028, "learning_rate": 7.282466159952212e-05, "loss": 0.1178, "step": 22900 }, { "grad_norm": 0.2052682787179947, "learning_rate": 7.280013919150483e-05, "loss": 0.118, "step": 22910 }, { "grad_norm": 0.21571500599384308, "learning_rate": 7.277560985740728e-05, "loss": 0.1247, "step": 22920 }, { "grad_norm": 0.21568149328231812, "learning_rate": 7.275107360468079e-05, "loss": 0.1224, "step": 22930 }, { "grad_norm": 0.2148251235485077, "learning_rate": 7.272653044077885e-05, "loss": 0.115, "step": 22940 }, { "grad_norm": 0.21857009828090668, "learning_rate": 7.270198037315703e-05, "loss": 0.1264, "step": 22950 }, { "grad_norm": 0.21989509463310242, "learning_rate": 7.267742340927297e-05, "loss": 0.1219, "step": 22960 }, { "grad_norm": 0.23333540558815002, "learning_rate": 7.265285955658645e-05, "loss": 0.1169, "step": 22970 }, { "grad_norm": 0.25444135069847107, "learning_rate": 7.26282888225593e-05, "loss": 0.1279, "step": 22980 }, { "grad_norm": 0.24085843563079834, "learning_rate": 7.260371121465548e-05, "loss": 0.1181, "step": 22990 }, { "grad_norm": 0.25117027759552, "learning_rate": 7.2579126740341e-05, "loss": 0.1164, "step": 23000 }, { "grad_norm": 0.27878537774086, "learning_rate": 7.2554535407084e-05, "loss": 0.1219, "step": 23010 }, { "grad_norm": 0.2309470921754837, "learning_rate": 7.252993722235464e-05, "loss": 0.1215, "step": 23020 }, { "grad_norm": 0.24292033910751343, "learning_rate": 7.250533219362523e-05, "loss": 0.1272, "step": 23030 }, { "grad_norm": 0.22491030395030975, "learning_rate": 7.248072032837012e-05, "loss": 0.1283, "step": 23040 }, { "grad_norm": 0.22105810046195984, "learning_rate": 7.245610163406575e-05, "loss": 0.1332, "step": 23050 }, { "grad_norm": 0.293468177318573, "learning_rate": 7.243147611819061e-05, "loss": 0.1225, "step": 23060 }, { "grad_norm": 0.25838202238082886, "learning_rate": 7.240684378822531e-05, "loss": 0.1281, "step": 23070 }, { "grad_norm": 0.28623420000076294, "learning_rate": 7.238220465165248e-05, "loss": 0.1278, "step": 23080 }, { "grad_norm": 0.2664668560028076, "learning_rate": 7.235755871595684e-05, "loss": 0.1271, "step": 23090 }, { "grad_norm": 0.2260747104883194, "learning_rate": 7.233290598862517e-05, "loss": 0.1269, "step": 23100 }, { "grad_norm": 0.19311347603797913, "learning_rate": 7.230824647714635e-05, "loss": 0.1217, "step": 23110 }, { "grad_norm": 0.21658842265605927, "learning_rate": 7.228358018901124e-05, "loss": 0.1243, "step": 23120 }, { "grad_norm": 0.2504076659679413, "learning_rate": 7.225890713171286e-05, "loss": 0.1232, "step": 23130 }, { "grad_norm": 0.22429265081882477, "learning_rate": 7.223422731274618e-05, "loss": 0.1225, "step": 23140 }, { "grad_norm": 0.24093416333198547, "learning_rate": 7.220954073960832e-05, "loss": 0.1268, "step": 23150 }, { "grad_norm": 0.29563790559768677, "learning_rate": 7.218484741979838e-05, "loss": 0.1295, "step": 23160 }, { "grad_norm": 0.2813178598880768, "learning_rate": 7.216014736081756e-05, "loss": 0.1208, "step": 23170 }, { "grad_norm": 0.33162519335746765, "learning_rate": 7.213544057016906e-05, "loss": 0.1264, "step": 23180 }, { "grad_norm": 0.21231909096240997, "learning_rate": 7.211072705535819e-05, "loss": 0.1147, "step": 23190 }, { "grad_norm": 0.26265573501586914, "learning_rate": 7.208600682389224e-05, "loss": 0.1248, "step": 23200 }, { "grad_norm": 0.26910820603370667, "learning_rate": 7.206127988328055e-05, "loss": 0.133, "step": 23210 }, { "grad_norm": 0.20220543444156647, "learning_rate": 7.203654624103453e-05, "loss": 0.1253, "step": 23220 }, { "grad_norm": 0.21369610726833344, "learning_rate": 7.201180590466761e-05, "loss": 0.1298, "step": 23230 }, { "grad_norm": 0.20313909649848938, "learning_rate": 7.198705888169523e-05, "loss": 0.121, "step": 23240 }, { "grad_norm": 0.2969514727592468, "learning_rate": 7.196230517963491e-05, "loss": 0.1353, "step": 23250 }, { "grad_norm": 0.23404042422771454, "learning_rate": 7.193754480600615e-05, "loss": 0.1324, "step": 23260 }, { "grad_norm": 0.2553820312023163, "learning_rate": 7.19127777683305e-05, "loss": 0.1252, "step": 23270 }, { "grad_norm": 0.22932961583137512, "learning_rate": 7.188800407413156e-05, "loss": 0.1204, "step": 23280 }, { "grad_norm": 0.23824943602085114, "learning_rate": 7.186322373093489e-05, "loss": 0.1217, "step": 23290 }, { "grad_norm": 0.2481316477060318, "learning_rate": 7.18384367462681e-05, "loss": 0.1242, "step": 23300 }, { "grad_norm": 0.2586043179035187, "learning_rate": 7.181364312766085e-05, "loss": 0.1215, "step": 23310 }, { "grad_norm": 0.18542149662971497, "learning_rate": 7.178884288264477e-05, "loss": 0.1234, "step": 23320 }, { "grad_norm": 0.2707739770412445, "learning_rate": 7.176403601875353e-05, "loss": 0.1265, "step": 23330 }, { "grad_norm": 0.26363855600357056, "learning_rate": 7.173922254352279e-05, "loss": 0.1225, "step": 23340 }, { "grad_norm": 0.23294472694396973, "learning_rate": 7.171440246449024e-05, "loss": 0.1268, "step": 23350 }, { "grad_norm": 0.24466565251350403, "learning_rate": 7.168957578919555e-05, "loss": 0.1095, "step": 23360 }, { "grad_norm": 0.24017030000686646, "learning_rate": 7.16647425251804e-05, "loss": 0.1206, "step": 23370 }, { "grad_norm": 0.24282918870449066, "learning_rate": 7.163990267998852e-05, "loss": 0.1238, "step": 23380 }, { "grad_norm": 0.21834105253219604, "learning_rate": 7.161505626116556e-05, "loss": 0.1158, "step": 23390 }, { "grad_norm": 0.24783293902873993, "learning_rate": 7.159020327625923e-05, "loss": 0.1197, "step": 23400 }, { "grad_norm": 0.22164225578308105, "learning_rate": 7.15653437328192e-05, "loss": 0.1212, "step": 23410 }, { "grad_norm": 0.2456619143486023, "learning_rate": 7.154047763839713e-05, "loss": 0.1265, "step": 23420 }, { "grad_norm": 0.20326583087444305, "learning_rate": 7.15156050005467e-05, "loss": 0.124, "step": 23430 }, { "grad_norm": 0.29573217034339905, "learning_rate": 7.149072582682357e-05, "loss": 0.1225, "step": 23440 }, { "grad_norm": 0.2344193309545517, "learning_rate": 7.146584012478535e-05, "loss": 0.1243, "step": 23450 }, { "grad_norm": 0.17490282654762268, "learning_rate": 7.144094790199169e-05, "loss": 0.1213, "step": 23460 }, { "grad_norm": 0.22879239916801453, "learning_rate": 7.141604916600415e-05, "loss": 0.1212, "step": 23470 }, { "grad_norm": 0.23900899291038513, "learning_rate": 7.139114392438635e-05, "loss": 0.1237, "step": 23480 }, { "grad_norm": 0.23573137819766998, "learning_rate": 7.136623218470382e-05, "loss": 0.1263, "step": 23490 }, { "grad_norm": 0.25898921489715576, "learning_rate": 7.13413139545241e-05, "loss": 0.1242, "step": 23500 }, { "grad_norm": 0.23272179067134857, "learning_rate": 7.131638924141668e-05, "loss": 0.1171, "step": 23510 }, { "grad_norm": 0.2155197262763977, "learning_rate": 7.129145805295304e-05, "loss": 0.1256, "step": 23520 }, { "grad_norm": 0.215163454413414, "learning_rate": 7.126652039670661e-05, "loss": 0.1129, "step": 23530 }, { "grad_norm": 0.20020483434200287, "learning_rate": 7.124157628025278e-05, "loss": 0.1235, "step": 23540 }, { "grad_norm": 0.2476043850183487, "learning_rate": 7.121662571116894e-05, "loss": 0.1126, "step": 23550 }, { "grad_norm": 0.1885933130979538, "learning_rate": 7.119166869703441e-05, "loss": 0.1183, "step": 23560 }, { "grad_norm": 0.18972350656986237, "learning_rate": 7.116670524543044e-05, "loss": 0.1261, "step": 23570 }, { "grad_norm": 0.22416579723358154, "learning_rate": 7.114173536394032e-05, "loss": 0.1168, "step": 23580 }, { "grad_norm": 0.29723915457725525, "learning_rate": 7.111675906014917e-05, "loss": 0.1248, "step": 23590 }, { "grad_norm": 0.2602030038833618, "learning_rate": 7.109177634164421e-05, "loss": 0.1258, "step": 23600 }, { "grad_norm": 0.22813090682029724, "learning_rate": 7.106678721601449e-05, "loss": 0.1212, "step": 23610 }, { "grad_norm": 0.24110344052314758, "learning_rate": 7.104179169085103e-05, "loss": 0.1214, "step": 23620 }, { "grad_norm": 0.22736522555351257, "learning_rate": 7.101678977374683e-05, "loss": 0.1207, "step": 23630 }, { "grad_norm": 0.2320585548877716, "learning_rate": 7.099178147229685e-05, "loss": 0.1294, "step": 23640 }, { "grad_norm": 0.2145485281944275, "learning_rate": 7.096676679409789e-05, "loss": 0.1178, "step": 23650 }, { "grad_norm": 0.24771851301193237, "learning_rate": 7.094174574674877e-05, "loss": 0.1168, "step": 23660 }, { "grad_norm": 0.2668149173259735, "learning_rate": 7.091671833785025e-05, "loss": 0.1293, "step": 23670 }, { "grad_norm": 0.32392236590385437, "learning_rate": 7.089168457500493e-05, "loss": 0.1281, "step": 23680 }, { "grad_norm": 0.2332274168729782, "learning_rate": 7.086664446581747e-05, "loss": 0.1252, "step": 23690 }, { "grad_norm": 0.22704057395458221, "learning_rate": 7.084159801789438e-05, "loss": 0.1175, "step": 23700 }, { "grad_norm": 0.2736653685569763, "learning_rate": 7.081654523884411e-05, "loss": 0.1197, "step": 23710 }, { "grad_norm": 0.26064422726631165, "learning_rate": 7.0791486136277e-05, "loss": 0.1217, "step": 23720 }, { "grad_norm": 0.2750549614429474, "learning_rate": 7.07664207178054e-05, "loss": 0.1223, "step": 23730 }, { "grad_norm": 0.24296651780605316, "learning_rate": 7.074134899104345e-05, "loss": 0.1154, "step": 23740 }, { "grad_norm": 0.29290860891342163, "learning_rate": 7.071627096360735e-05, "loss": 0.1266, "step": 23750 }, { "grad_norm": 0.233255997300148, "learning_rate": 7.069118664311511e-05, "loss": 0.124, "step": 23760 }, { "grad_norm": 0.2798751890659332, "learning_rate": 7.06660960371867e-05, "loss": 0.1179, "step": 23770 }, { "grad_norm": 0.21119734644889832, "learning_rate": 7.064099915344396e-05, "loss": 0.1254, "step": 23780 }, { "grad_norm": 0.30011945962905884, "learning_rate": 7.061589599951066e-05, "loss": 0.127, "step": 23790 }, { "grad_norm": 0.23078349232673645, "learning_rate": 7.05907865830125e-05, "loss": 0.1169, "step": 23800 }, { "grad_norm": 0.21471525728702545, "learning_rate": 7.056567091157703e-05, "loss": 0.1179, "step": 23810 }, { "grad_norm": 0.2105414867401123, "learning_rate": 7.054054899283375e-05, "loss": 0.115, "step": 23820 }, { "grad_norm": 0.21429288387298584, "learning_rate": 7.051542083441403e-05, "loss": 0.1303, "step": 23830 }, { "grad_norm": 0.23645329475402832, "learning_rate": 7.049028644395113e-05, "loss": 0.1159, "step": 23840 }, { "grad_norm": 0.2486206442117691, "learning_rate": 7.046514582908024e-05, "loss": 0.134, "step": 23850 }, { "grad_norm": 0.22759117186069489, "learning_rate": 7.043999899743838e-05, "loss": 0.1305, "step": 23860 }, { "grad_norm": 0.20574800670146942, "learning_rate": 7.041484595666451e-05, "loss": 0.127, "step": 23870 }, { "grad_norm": 0.2892090976238251, "learning_rate": 7.038968671439948e-05, "loss": 0.1327, "step": 23880 }, { "grad_norm": 0.24161198735237122, "learning_rate": 7.036452127828596e-05, "loss": 0.1283, "step": 23890 }, { "grad_norm": 0.21048963069915771, "learning_rate": 7.033934965596859e-05, "loss": 0.1215, "step": 23900 }, { "grad_norm": 0.22682452201843262, "learning_rate": 7.031417185509381e-05, "loss": 0.1183, "step": 23910 }, { "grad_norm": 0.2701947093009949, "learning_rate": 7.028898788331e-05, "loss": 0.1276, "step": 23920 }, { "grad_norm": 0.19710934162139893, "learning_rate": 7.026379774826736e-05, "loss": 0.1172, "step": 23930 }, { "grad_norm": 0.2164735645055771, "learning_rate": 7.0238601457618e-05, "loss": 0.1169, "step": 23940 }, { "grad_norm": 0.23931537568569183, "learning_rate": 7.02133990190159e-05, "loss": 0.124, "step": 23950 }, { "grad_norm": 0.22911743819713593, "learning_rate": 7.018819044011687e-05, "loss": 0.1205, "step": 23960 }, { "grad_norm": 0.21697790920734406, "learning_rate": 7.016297572857863e-05, "loss": 0.1163, "step": 23970 }, { "grad_norm": 0.2249331772327423, "learning_rate": 7.013775489206072e-05, "loss": 0.1217, "step": 23980 }, { "grad_norm": 0.2591222822666168, "learning_rate": 7.01125279382246e-05, "loss": 0.1289, "step": 23990 }, { "grad_norm": 0.2548219561576843, "learning_rate": 7.008729487473351e-05, "loss": 0.1316, "step": 24000 }, { "grad_norm": 0.20647983253002167, "learning_rate": 7.006205570925263e-05, "loss": 0.1242, "step": 24010 }, { "grad_norm": 0.2287038117647171, "learning_rate": 7.003681044944892e-05, "loss": 0.1312, "step": 24020 }, { "grad_norm": 0.19871526956558228, "learning_rate": 7.001155910299126e-05, "loss": 0.1134, "step": 24030 }, { "grad_norm": 0.37156662344932556, "learning_rate": 6.99863016775503e-05, "loss": 0.1254, "step": 24040 }, { "grad_norm": 0.28319376707077026, "learning_rate": 6.996103818079859e-05, "loss": 0.1261, "step": 24050 }, { "grad_norm": 0.2246025651693344, "learning_rate": 6.993576862041054e-05, "loss": 0.1201, "step": 24060 }, { "grad_norm": 0.21290063858032227, "learning_rate": 6.991049300406235e-05, "loss": 0.1198, "step": 24070 }, { "grad_norm": 0.18965956568717957, "learning_rate": 6.988521133943209e-05, "loss": 0.1201, "step": 24080 }, { "grad_norm": 0.23663313686847687, "learning_rate": 6.985992363419966e-05, "loss": 0.1314, "step": 24090 }, { "grad_norm": 0.1737031191587448, "learning_rate": 6.983462989604682e-05, "loss": 0.1147, "step": 24100 }, { "grad_norm": 0.25447335839271545, "learning_rate": 6.980933013265709e-05, "loss": 0.1306, "step": 24110 }, { "grad_norm": 0.20746631920337677, "learning_rate": 6.978402435171592e-05, "loss": 0.1232, "step": 24120 }, { "grad_norm": 0.2454143762588501, "learning_rate": 6.975871256091052e-05, "loss": 0.1197, "step": 24130 }, { "grad_norm": 0.22878286242485046, "learning_rate": 6.973339476792995e-05, "loss": 0.1272, "step": 24140 }, { "grad_norm": 0.2621552646160126, "learning_rate": 6.970807098046505e-05, "loss": 0.1165, "step": 24150 }, { "grad_norm": 0.2007000893354416, "learning_rate": 6.968274120620858e-05, "loss": 0.1158, "step": 24160 }, { "grad_norm": 0.2782788872718811, "learning_rate": 6.965740545285499e-05, "loss": 0.1245, "step": 24170 }, { "grad_norm": 0.27781006693840027, "learning_rate": 6.963206372810068e-05, "loss": 0.1191, "step": 24180 }, { "grad_norm": 0.2673066556453705, "learning_rate": 6.960671603964375e-05, "loss": 0.1315, "step": 24190 }, { "grad_norm": 0.22311973571777344, "learning_rate": 6.958136239518418e-05, "loss": 0.1215, "step": 24200 }, { "grad_norm": 0.23206201195716858, "learning_rate": 6.955600280242371e-05, "loss": 0.1213, "step": 24210 }, { "grad_norm": 0.2413286417722702, "learning_rate": 6.953063726906596e-05, "loss": 0.122, "step": 24220 }, { "grad_norm": 0.21014466881752014, "learning_rate": 6.950526580281626e-05, "loss": 0.1191, "step": 24230 }, { "grad_norm": 0.24389982223510742, "learning_rate": 6.947988841138184e-05, "loss": 0.1244, "step": 24240 }, { "grad_norm": 0.23471033573150635, "learning_rate": 6.945450510247165e-05, "loss": 0.1172, "step": 24250 }, { "grad_norm": 0.2408541738986969, "learning_rate": 6.942911588379647e-05, "loss": 0.1287, "step": 24260 }, { "grad_norm": 0.23014914989471436, "learning_rate": 6.940372076306888e-05, "loss": 0.1158, "step": 24270 }, { "grad_norm": 0.2569168210029602, "learning_rate": 6.937831974800326e-05, "loss": 0.1153, "step": 24280 }, { "grad_norm": 0.2127721905708313, "learning_rate": 6.935291284631574e-05, "loss": 0.1203, "step": 24290 }, { "grad_norm": 0.24308659136295319, "learning_rate": 6.932750006572428e-05, "loss": 0.124, "step": 24300 }, { "grad_norm": 0.2753639817237854, "learning_rate": 6.930208141394863e-05, "loss": 0.1247, "step": 24310 }, { "grad_norm": 0.25042101740837097, "learning_rate": 6.927665689871026e-05, "loss": 0.1278, "step": 24320 }, { "grad_norm": 0.23339998722076416, "learning_rate": 6.925122652773253e-05, "loss": 0.1281, "step": 24330 }, { "grad_norm": 0.2716617286205292, "learning_rate": 6.922579030874046e-05, "loss": 0.1241, "step": 24340 }, { "grad_norm": 0.2700761556625366, "learning_rate": 6.920034824946093e-05, "loss": 0.118, "step": 24350 }, { "grad_norm": 0.2783305048942566, "learning_rate": 6.917490035762255e-05, "loss": 0.133, "step": 24360 }, { "grad_norm": 0.2740013301372528, "learning_rate": 6.914944664095573e-05, "loss": 0.122, "step": 24370 }, { "grad_norm": 0.22650843858718872, "learning_rate": 6.912398710719264e-05, "loss": 0.1213, "step": 24380 }, { "grad_norm": 0.2680566906929016, "learning_rate": 6.90985217640672e-05, "loss": 0.1327, "step": 24390 }, { "grad_norm": 0.2280191034078598, "learning_rate": 6.90730506193151e-05, "loss": 0.1233, "step": 24400 }, { "grad_norm": 0.22150997817516327, "learning_rate": 6.904757368067384e-05, "loss": 0.1278, "step": 24410 }, { "grad_norm": 0.2294817417860031, "learning_rate": 6.90220909558826e-05, "loss": 0.1267, "step": 24420 }, { "grad_norm": 0.2635347247123718, "learning_rate": 6.899660245268237e-05, "loss": 0.1185, "step": 24430 }, { "grad_norm": 0.21158309280872345, "learning_rate": 6.897110817881592e-05, "loss": 0.1224, "step": 24440 }, { "grad_norm": 0.2568514347076416, "learning_rate": 6.894560814202769e-05, "loss": 0.1309, "step": 24450 }, { "grad_norm": 0.23237508535385132, "learning_rate": 6.892010235006394e-05, "loss": 0.1122, "step": 24460 }, { "grad_norm": 0.197264164686203, "learning_rate": 6.889459081067264e-05, "loss": 0.1293, "step": 24470 }, { "grad_norm": 0.2396577149629593, "learning_rate": 6.886907353160356e-05, "loss": 0.1223, "step": 24480 }, { "grad_norm": 0.26733043789863586, "learning_rate": 6.884355052060814e-05, "loss": 0.1257, "step": 24490 }, { "grad_norm": 0.24427376687526703, "learning_rate": 6.88180217854396e-05, "loss": 0.1187, "step": 24500 }, { "grad_norm": 0.19939926266670227, "learning_rate": 6.87924873338529e-05, "loss": 0.1181, "step": 24510 }, { "grad_norm": 0.2168911248445511, "learning_rate": 6.876694717360475e-05, "loss": 0.1221, "step": 24520 }, { "grad_norm": 0.22912247478961945, "learning_rate": 6.874140131245355e-05, "loss": 0.1243, "step": 24530 }, { "grad_norm": 0.2957187592983246, "learning_rate": 6.871584975815948e-05, "loss": 0.126, "step": 24540 }, { "grad_norm": 0.20954734086990356, "learning_rate": 6.86902925184844e-05, "loss": 0.122, "step": 24550 }, { "grad_norm": 0.22349269688129425, "learning_rate": 6.866472960119195e-05, "loss": 0.1129, "step": 24560 }, { "grad_norm": 0.1972552239894867, "learning_rate": 6.863916101404748e-05, "loss": 0.1183, "step": 24570 }, { "grad_norm": 0.2239305078983307, "learning_rate": 6.8613586764818e-05, "loss": 0.1119, "step": 24580 }, { "grad_norm": 0.19683562219142914, "learning_rate": 6.858800686127233e-05, "loss": 0.1285, "step": 24590 }, { "grad_norm": 0.23564749956130981, "learning_rate": 6.856242131118097e-05, "loss": 0.126, "step": 24600 }, { "grad_norm": 0.2067396491765976, "learning_rate": 6.853683012231614e-05, "loss": 0.1185, "step": 24610 }, { "grad_norm": 0.26654252409935, "learning_rate": 6.851123330245173e-05, "loss": 0.1266, "step": 24620 }, { "grad_norm": 0.2649032175540924, "learning_rate": 6.848563085936343e-05, "loss": 0.1227, "step": 24630 }, { "grad_norm": 0.2268669605255127, "learning_rate": 6.846002280082853e-05, "loss": 0.1135, "step": 24640 }, { "grad_norm": 0.24117986857891083, "learning_rate": 6.843440913462614e-05, "loss": 0.1244, "step": 24650 }, { "grad_norm": 0.23372767865657806, "learning_rate": 6.840878986853698e-05, "loss": 0.1191, "step": 24660 }, { "grad_norm": 0.23829570412635803, "learning_rate": 6.838316501034352e-05, "loss": 0.1276, "step": 24670 }, { "grad_norm": 0.24103505909442902, "learning_rate": 6.83575345678299e-05, "loss": 0.1217, "step": 24680 }, { "grad_norm": 0.24099986255168915, "learning_rate": 6.833189854878196e-05, "loss": 0.1321, "step": 24690 }, { "grad_norm": 0.2091635912656784, "learning_rate": 6.83062569609873e-05, "loss": 0.1262, "step": 24700 }, { "grad_norm": 0.19402530789375305, "learning_rate": 6.828060981223512e-05, "loss": 0.1094, "step": 24710 }, { "grad_norm": 0.2655376195907593, "learning_rate": 6.825495711031634e-05, "loss": 0.1202, "step": 24720 }, { "grad_norm": 0.2416241317987442, "learning_rate": 6.822929886302359e-05, "loss": 0.1243, "step": 24730 }, { "grad_norm": 0.21178939938545227, "learning_rate": 6.820363507815116e-05, "loss": 0.1115, "step": 24740 }, { "grad_norm": 0.21232517063617706, "learning_rate": 6.817796576349501e-05, "loss": 0.1236, "step": 24750 }, { "grad_norm": 0.2080630213022232, "learning_rate": 6.815229092685285e-05, "loss": 0.1129, "step": 24760 }, { "grad_norm": 0.2767654061317444, "learning_rate": 6.812661057602399e-05, "loss": 0.118, "step": 24770 }, { "grad_norm": 0.2491123527288437, "learning_rate": 6.810092471880943e-05, "loss": 0.1183, "step": 24780 }, { "grad_norm": 0.2231786847114563, "learning_rate": 6.807523336301187e-05, "loss": 0.1252, "step": 24790 }, { "grad_norm": 0.2067578285932541, "learning_rate": 6.804953651643566e-05, "loss": 0.1222, "step": 24800 }, { "grad_norm": 0.29216763377189636, "learning_rate": 6.802383418688685e-05, "loss": 0.123, "step": 24810 }, { "grad_norm": 0.2620583772659302, "learning_rate": 6.799812638217309e-05, "loss": 0.1129, "step": 24820 }, { "grad_norm": 0.22167016565799713, "learning_rate": 6.797241311010373e-05, "loss": 0.1142, "step": 24830 }, { "grad_norm": 0.25446486473083496, "learning_rate": 6.794669437848982e-05, "loss": 0.1219, "step": 24840 }, { "grad_norm": 0.25310078263282776, "learning_rate": 6.792097019514402e-05, "loss": 0.1324, "step": 24850 }, { "grad_norm": 0.23814812302589417, "learning_rate": 6.789524056788064e-05, "loss": 0.1235, "step": 24860 }, { "grad_norm": 0.22566744685173035, "learning_rate": 6.786950550451567e-05, "loss": 0.109, "step": 24870 }, { "grad_norm": 0.257314532995224, "learning_rate": 6.784376501286676e-05, "loss": 0.123, "step": 24880 }, { "grad_norm": 0.25791576504707336, "learning_rate": 6.781801910075316e-05, "loss": 0.1232, "step": 24890 }, { "grad_norm": 0.2260763794183731, "learning_rate": 6.779226777599581e-05, "loss": 0.1264, "step": 24900 }, { "grad_norm": 0.18901586532592773, "learning_rate": 6.776651104641729e-05, "loss": 0.1188, "step": 24910 }, { "grad_norm": 0.1919073760509491, "learning_rate": 6.774074891984183e-05, "loss": 0.1123, "step": 24920 }, { "grad_norm": 0.25234347581863403, "learning_rate": 6.771498140409526e-05, "loss": 0.1241, "step": 24930 }, { "grad_norm": 0.23490117490291595, "learning_rate": 6.768920850700506e-05, "loss": 0.1302, "step": 24940 }, { "grad_norm": 0.21017657220363617, "learning_rate": 6.766343023640039e-05, "loss": 0.1179, "step": 24950 }, { "grad_norm": 0.2099086195230484, "learning_rate": 6.763764660011198e-05, "loss": 0.1219, "step": 24960 }, { "grad_norm": 0.29650789499282837, "learning_rate": 6.761185760597223e-05, "loss": 0.1222, "step": 24970 }, { "grad_norm": 0.23890365660190582, "learning_rate": 6.758606326181515e-05, "loss": 0.1236, "step": 24980 }, { "grad_norm": 0.2814948260784149, "learning_rate": 6.75602635754764e-05, "loss": 0.1245, "step": 24990 }, { "grad_norm": 0.2420649528503418, "learning_rate": 6.75344585547932e-05, "loss": 0.12, "step": 25000 }, { "grad_norm": 0.22273243963718414, "learning_rate": 6.750864820760449e-05, "loss": 0.1251, "step": 25010 }, { "grad_norm": 0.24559536576271057, "learning_rate": 6.748283254175072e-05, "loss": 0.1228, "step": 25020 }, { "grad_norm": 0.2651984989643097, "learning_rate": 6.745701156507404e-05, "loss": 0.1216, "step": 25030 }, { "grad_norm": 0.21007201075553894, "learning_rate": 6.743118528541818e-05, "loss": 0.1146, "step": 25040 }, { "grad_norm": 0.22821299731731415, "learning_rate": 6.740535371062846e-05, "loss": 0.1181, "step": 25050 }, { "grad_norm": 0.2995266318321228, "learning_rate": 6.737951684855185e-05, "loss": 0.1239, "step": 25060 }, { "grad_norm": 0.2621530294418335, "learning_rate": 6.735367470703691e-05, "loss": 0.1177, "step": 25070 }, { "grad_norm": 0.24430875480175018, "learning_rate": 6.732782729393379e-05, "loss": 0.1172, "step": 25080 }, { "grad_norm": 0.1723787784576416, "learning_rate": 6.730197461709425e-05, "loss": 0.1137, "step": 25090 }, { "grad_norm": 0.23315095901489258, "learning_rate": 6.727611668437164e-05, "loss": 0.1232, "step": 25100 }, { "grad_norm": 0.23666758835315704, "learning_rate": 6.725025350362094e-05, "loss": 0.1271, "step": 25110 }, { "grad_norm": 0.23118610680103302, "learning_rate": 6.72243850826987e-05, "loss": 0.1263, "step": 25120 }, { "grad_norm": 0.2046862244606018, "learning_rate": 6.719851142946305e-05, "loss": 0.117, "step": 25130 }, { "grad_norm": 0.22567251324653625, "learning_rate": 6.717263255177372e-05, "loss": 0.1181, "step": 25140 }, { "grad_norm": 0.27445900440216064, "learning_rate": 6.714674845749205e-05, "loss": 0.1303, "step": 25150 }, { "grad_norm": 0.2670195996761322, "learning_rate": 6.712085915448092e-05, "loss": 0.1233, "step": 25160 }, { "grad_norm": 0.23646564781665802, "learning_rate": 6.709496465060486e-05, "loss": 0.1259, "step": 25170 }, { "grad_norm": 0.2563851773738861, "learning_rate": 6.706906495372987e-05, "loss": 0.1223, "step": 25180 }, { "grad_norm": 0.2611478865146637, "learning_rate": 6.704316007172365e-05, "loss": 0.1182, "step": 25190 }, { "grad_norm": 0.2617427706718445, "learning_rate": 6.701725001245539e-05, "loss": 0.1211, "step": 25200 }, { "grad_norm": 0.28732502460479736, "learning_rate": 6.699133478379588e-05, "loss": 0.1157, "step": 25210 }, { "grad_norm": 0.26319944858551025, "learning_rate": 6.69654143936175e-05, "loss": 0.1245, "step": 25220 }, { "grad_norm": 0.2577567994594574, "learning_rate": 6.693948884979419e-05, "loss": 0.1246, "step": 25230 }, { "grad_norm": 0.22937551140785217, "learning_rate": 6.691355816020142e-05, "loss": 0.1173, "step": 25240 }, { "grad_norm": 0.2183106243610382, "learning_rate": 6.688762233271624e-05, "loss": 0.127, "step": 25250 }, { "grad_norm": 0.2419993281364441, "learning_rate": 6.68616813752173e-05, "loss": 0.1172, "step": 25260 }, { "grad_norm": 0.23851308226585388, "learning_rate": 6.683573529558477e-05, "loss": 0.128, "step": 25270 }, { "grad_norm": 0.2285805493593216, "learning_rate": 6.680978410170037e-05, "loss": 0.1238, "step": 25280 }, { "grad_norm": 0.20875990390777588, "learning_rate": 6.678382780144741e-05, "loss": 0.1137, "step": 25290 }, { "grad_norm": 0.19193261861801147, "learning_rate": 6.675786640271071e-05, "loss": 0.119, "step": 25300 }, { "grad_norm": 0.21945641934871674, "learning_rate": 6.673189991337665e-05, "loss": 0.1275, "step": 25310 }, { "grad_norm": 0.20765914022922516, "learning_rate": 6.670592834133317e-05, "loss": 0.1184, "step": 25320 }, { "grad_norm": 0.23644085228443146, "learning_rate": 6.667995169446979e-05, "loss": 0.1211, "step": 25330 }, { "grad_norm": 0.21543952822685242, "learning_rate": 6.665396998067747e-05, "loss": 0.1151, "step": 25340 }, { "grad_norm": 0.24522477388381958, "learning_rate": 6.66279832078488e-05, "loss": 0.1195, "step": 25350 }, { "grad_norm": 0.21242007613182068, "learning_rate": 6.660199138387786e-05, "loss": 0.1198, "step": 25360 }, { "grad_norm": 0.24883703887462616, "learning_rate": 6.65759945166603e-05, "loss": 0.1268, "step": 25370 }, { "grad_norm": 0.2123064249753952, "learning_rate": 6.654999261409326e-05, "loss": 0.1198, "step": 25380 }, { "grad_norm": 0.2593260109424591, "learning_rate": 6.652398568407544e-05, "loss": 0.1189, "step": 25390 }, { "grad_norm": 0.19302581250667572, "learning_rate": 6.649797373450707e-05, "loss": 0.1148, "step": 25400 }, { "grad_norm": 0.27679216861724854, "learning_rate": 6.647195677328988e-05, "loss": 0.1178, "step": 25410 }, { "grad_norm": 0.25009679794311523, "learning_rate": 6.644593480832712e-05, "loss": 0.1267, "step": 25420 }, { "grad_norm": 0.265118271112442, "learning_rate": 6.641990784752363e-05, "loss": 0.126, "step": 25430 }, { "grad_norm": 0.22169695794582367, "learning_rate": 6.639387589878566e-05, "loss": 0.131, "step": 25440 }, { "grad_norm": 0.22122111916542053, "learning_rate": 6.636783897002103e-05, "loss": 0.125, "step": 25450 }, { "grad_norm": 0.23684746026992798, "learning_rate": 6.63417970691391e-05, "loss": 0.1246, "step": 25460 }, { "grad_norm": 0.23137938976287842, "learning_rate": 6.63157502040507e-05, "loss": 0.1239, "step": 25470 }, { "grad_norm": 0.2710244953632355, "learning_rate": 6.628969838266819e-05, "loss": 0.1203, "step": 25480 }, { "grad_norm": 0.23066097497940063, "learning_rate": 6.626364161290541e-05, "loss": 0.1233, "step": 25490 }, { "grad_norm": 0.2231416404247284, "learning_rate": 6.623757990267774e-05, "loss": 0.1204, "step": 25500 }, { "grad_norm": 0.24663828313350677, "learning_rate": 6.621151325990201e-05, "loss": 0.1262, "step": 25510 }, { "grad_norm": 0.24397534132003784, "learning_rate": 6.618544169249657e-05, "loss": 0.1268, "step": 25520 }, { "grad_norm": 0.21880799531936646, "learning_rate": 6.615936520838133e-05, "loss": 0.1335, "step": 25530 }, { "grad_norm": 0.26476743817329407, "learning_rate": 6.613328381547759e-05, "loss": 0.132, "step": 25540 }, { "grad_norm": 0.23189902305603027, "learning_rate": 6.610719752170821e-05, "loss": 0.1214, "step": 25550 }, { "grad_norm": 0.20647422969341278, "learning_rate": 6.60811063349975e-05, "loss": 0.1256, "step": 25560 }, { "grad_norm": 0.24295471608638763, "learning_rate": 6.605501026327127e-05, "loss": 0.129, "step": 25570 }, { "grad_norm": 0.20206041634082794, "learning_rate": 6.602890931445685e-05, "loss": 0.1275, "step": 25580 }, { "grad_norm": 0.19997821748256683, "learning_rate": 6.6002803496483e-05, "loss": 0.118, "step": 25590 }, { "grad_norm": 0.16591276228427887, "learning_rate": 6.597669281727997e-05, "loss": 0.1198, "step": 25600 }, { "grad_norm": 0.23177145421504974, "learning_rate": 6.595057728477949e-05, "loss": 0.1211, "step": 25610 }, { "grad_norm": 0.18973404169082642, "learning_rate": 6.59244569069148e-05, "loss": 0.1232, "step": 25620 }, { "grad_norm": 0.28522539138793945, "learning_rate": 6.589833169162054e-05, "loss": 0.1246, "step": 25630 }, { "grad_norm": 0.23741215467453003, "learning_rate": 6.587220164683291e-05, "loss": 0.1223, "step": 25640 }, { "grad_norm": 0.23017188906669617, "learning_rate": 6.58460667804895e-05, "loss": 0.1204, "step": 25650 }, { "grad_norm": 0.2643693685531616, "learning_rate": 6.581992710052938e-05, "loss": 0.1255, "step": 25660 }, { "grad_norm": 0.2422526776790619, "learning_rate": 6.579378261489311e-05, "loss": 0.1205, "step": 25670 }, { "grad_norm": 0.2583813965320587, "learning_rate": 6.576763333152268e-05, "loss": 0.1222, "step": 25680 }, { "grad_norm": 0.2252214252948761, "learning_rate": 6.574147925836159e-05, "loss": 0.121, "step": 25690 }, { "grad_norm": 0.23055008053779602, "learning_rate": 6.571532040335472e-05, "loss": 0.1276, "step": 25700 }, { "grad_norm": 0.19751614332199097, "learning_rate": 6.568915677444845e-05, "loss": 0.1209, "step": 25710 }, { "grad_norm": 0.21689213812351227, "learning_rate": 6.56629883795906e-05, "loss": 0.1234, "step": 25720 }, { "grad_norm": 0.23021046817302704, "learning_rate": 6.563681522673043e-05, "loss": 0.1197, "step": 25730 }, { "grad_norm": 0.23002701997756958, "learning_rate": 6.561063732381867e-05, "loss": 0.1221, "step": 25740 }, { "grad_norm": 0.1946128010749817, "learning_rate": 6.558445467880745e-05, "loss": 0.116, "step": 25750 }, { "grad_norm": 0.21732771396636963, "learning_rate": 6.55582672996504e-05, "loss": 0.1204, "step": 25760 }, { "grad_norm": 0.22542989253997803, "learning_rate": 6.553207519430253e-05, "loss": 0.1234, "step": 25770 }, { "grad_norm": 0.2760174572467804, "learning_rate": 6.550587837072032e-05, "loss": 0.1312, "step": 25780 }, { "grad_norm": 0.2306980937719345, "learning_rate": 6.547967683686166e-05, "loss": 0.1252, "step": 25790 }, { "grad_norm": 0.23286324739456177, "learning_rate": 6.545347060068591e-05, "loss": 0.1209, "step": 25800 }, { "grad_norm": 0.17623017728328705, "learning_rate": 6.542725967015382e-05, "loss": 0.1184, "step": 25810 }, { "grad_norm": 0.2272224873304367, "learning_rate": 6.540104405322757e-05, "loss": 0.1162, "step": 25820 }, { "grad_norm": 0.22553974390029907, "learning_rate": 6.537482375787077e-05, "loss": 0.1192, "step": 25830 }, { "grad_norm": 0.2520228624343872, "learning_rate": 6.534859879204845e-05, "loss": 0.1248, "step": 25840 }, { "grad_norm": 0.2682705223560333, "learning_rate": 6.532236916372709e-05, "loss": 0.1351, "step": 25850 }, { "grad_norm": 0.24878503382205963, "learning_rate": 6.529613488087454e-05, "loss": 0.1286, "step": 25860 }, { "grad_norm": 0.2696751356124878, "learning_rate": 6.526989595146009e-05, "loss": 0.1211, "step": 25870 }, { "grad_norm": 0.23076100647449493, "learning_rate": 6.524365238345441e-05, "loss": 0.1255, "step": 25880 }, { "grad_norm": 0.2452211081981659, "learning_rate": 6.521740418482964e-05, "loss": 0.122, "step": 25890 }, { "grad_norm": 0.23588047921657562, "learning_rate": 6.519115136355925e-05, "loss": 0.1195, "step": 25900 }, { "grad_norm": 0.20429617166519165, "learning_rate": 6.51648939276182e-05, "loss": 0.1268, "step": 25910 }, { "grad_norm": 0.2384493350982666, "learning_rate": 6.513863188498277e-05, "loss": 0.1151, "step": 25920 }, { "grad_norm": 0.24601198732852936, "learning_rate": 6.511236524363068e-05, "loss": 0.1218, "step": 25930 }, { "grad_norm": 0.21387836337089539, "learning_rate": 6.508609401154104e-05, "loss": 0.1227, "step": 25940 }, { "grad_norm": 0.26093149185180664, "learning_rate": 6.505981819669439e-05, "loss": 0.1228, "step": 25950 }, { "grad_norm": 0.18408767879009247, "learning_rate": 6.503353780707258e-05, "loss": 0.1163, "step": 25960 }, { "grad_norm": 0.24868489801883698, "learning_rate": 6.500725285065895e-05, "loss": 0.1264, "step": 25970 }, { "grad_norm": 0.2325180619955063, "learning_rate": 6.498096333543813e-05, "loss": 0.1164, "step": 25980 }, { "grad_norm": 0.2600410580635071, "learning_rate": 6.49546692693962e-05, "loss": 0.1193, "step": 25990 }, { "grad_norm": 0.24175089597702026, "learning_rate": 6.492837066052059e-05, "loss": 0.1208, "step": 26000 }, { "grad_norm": 0.20394015312194824, "learning_rate": 6.490206751680014e-05, "loss": 0.117, "step": 26010 }, { "grad_norm": 0.22190478444099426, "learning_rate": 6.487575984622505e-05, "loss": 0.126, "step": 26020 }, { "grad_norm": 0.22166535258293152, "learning_rate": 6.484944765678689e-05, "loss": 0.1232, "step": 26030 }, { "grad_norm": 0.27126434445381165, "learning_rate": 6.482313095647861e-05, "loss": 0.1174, "step": 26040 }, { "grad_norm": 0.2621332108974457, "learning_rate": 6.479680975329451e-05, "loss": 0.1128, "step": 26050 }, { "grad_norm": 0.29802119731903076, "learning_rate": 6.477048405523031e-05, "loss": 0.1225, "step": 26060 }, { "grad_norm": 0.24775844812393188, "learning_rate": 6.474415387028304e-05, "loss": 0.1127, "step": 26070 }, { "grad_norm": 0.2679702937602997, "learning_rate": 6.471781920645114e-05, "loss": 0.1259, "step": 26080 }, { "grad_norm": 0.23463183641433716, "learning_rate": 6.469148007173434e-05, "loss": 0.1214, "step": 26090 }, { "grad_norm": 0.23981556296348572, "learning_rate": 6.466513647413381e-05, "loss": 0.1223, "step": 26100 }, { "grad_norm": 0.2631443440914154, "learning_rate": 6.463878842165203e-05, "loss": 0.12, "step": 26110 }, { "grad_norm": 0.2644517719745636, "learning_rate": 6.461243592229286e-05, "loss": 0.1218, "step": 26120 }, { "grad_norm": 0.19272609055042267, "learning_rate": 6.458607898406146e-05, "loss": 0.12, "step": 26130 }, { "grad_norm": 0.22696934640407562, "learning_rate": 6.455971761496439e-05, "loss": 0.1121, "step": 26140 }, { "grad_norm": 0.22403226792812347, "learning_rate": 6.453335182300953e-05, "loss": 0.1273, "step": 26150 }, { "grad_norm": 0.2625650465488434, "learning_rate": 6.450698161620612e-05, "loss": 0.1299, "step": 26160 }, { "grad_norm": 0.22603318095207214, "learning_rate": 6.448060700256473e-05, "loss": 0.1202, "step": 26170 }, { "grad_norm": 0.241298645734787, "learning_rate": 6.445422799009726e-05, "loss": 0.1291, "step": 26180 }, { "grad_norm": 0.22611857950687408, "learning_rate": 6.442784458681699e-05, "loss": 0.118, "step": 26190 }, { "grad_norm": 0.2773529291152954, "learning_rate": 6.440145680073847e-05, "loss": 0.1231, "step": 26200 }, { "grad_norm": 0.2274520993232727, "learning_rate": 6.437506463987762e-05, "loss": 0.1229, "step": 26210 }, { "grad_norm": 0.18991225957870483, "learning_rate": 6.434866811225168e-05, "loss": 0.1219, "step": 26220 }, { "grad_norm": 0.19626450538635254, "learning_rate": 6.432226722587923e-05, "loss": 0.1254, "step": 26230 }, { "grad_norm": 0.2332887500524521, "learning_rate": 6.429586198878015e-05, "loss": 0.1173, "step": 26240 }, { "grad_norm": 0.263633131980896, "learning_rate": 6.426945240897566e-05, "loss": 0.1202, "step": 26250 }, { "grad_norm": 0.20052117109298706, "learning_rate": 6.424303849448829e-05, "loss": 0.1268, "step": 26260 }, { "grad_norm": 0.2345976084470749, "learning_rate": 6.42166202533419e-05, "loss": 0.1168, "step": 26270 }, { "grad_norm": 0.20970329642295837, "learning_rate": 6.419019769356164e-05, "loss": 0.1204, "step": 26280 }, { "grad_norm": 0.2433854341506958, "learning_rate": 6.416377082317398e-05, "loss": 0.1259, "step": 26290 }, { "grad_norm": 0.2652387022972107, "learning_rate": 6.413733965020674e-05, "loss": 0.1241, "step": 26300 }, { "grad_norm": 0.21043671667575836, "learning_rate": 6.411090418268896e-05, "loss": 0.1245, "step": 26310 }, { "grad_norm": 0.23409688472747803, "learning_rate": 6.408446442865109e-05, "loss": 0.1171, "step": 26320 }, { "grad_norm": 0.18927544355392456, "learning_rate": 6.405802039612479e-05, "loss": 0.1371, "step": 26330 }, { "grad_norm": 0.2336805760860443, "learning_rate": 6.403157209314308e-05, "loss": 0.1191, "step": 26340 }, { "grad_norm": 0.25978586077690125, "learning_rate": 6.400511952774024e-05, "loss": 0.1298, "step": 26350 }, { "grad_norm": 0.2560664713382721, "learning_rate": 6.397866270795187e-05, "loss": 0.1166, "step": 26360 }, { "grad_norm": 0.27185893058776855, "learning_rate": 6.395220164181489e-05, "loss": 0.1321, "step": 26370 }, { "grad_norm": 0.25743570923805237, "learning_rate": 6.39257363373674e-05, "loss": 0.1262, "step": 26380 }, { "grad_norm": 0.18401604890823364, "learning_rate": 6.389926680264892e-05, "loss": 0.1118, "step": 26390 }, { "grad_norm": 0.2211686372756958, "learning_rate": 6.387279304570017e-05, "loss": 0.1252, "step": 26400 }, { "grad_norm": 0.21929079294204712, "learning_rate": 6.384631507456319e-05, "loss": 0.1241, "step": 26410 }, { "grad_norm": 0.22444862127304077, "learning_rate": 6.381983289728126e-05, "loss": 0.117, "step": 26420 }, { "grad_norm": 0.3152218461036682, "learning_rate": 6.3793346521899e-05, "loss": 0.1213, "step": 26430 }, { "grad_norm": 0.2648904323577881, "learning_rate": 6.376685595646226e-05, "loss": 0.1213, "step": 26440 }, { "grad_norm": 0.21723268926143646, "learning_rate": 6.374036120901816e-05, "loss": 0.1248, "step": 26450 }, { "grad_norm": 0.214776873588562, "learning_rate": 6.371386228761514e-05, "loss": 0.1229, "step": 26460 }, { "grad_norm": 0.19782470166683197, "learning_rate": 6.368735920030283e-05, "loss": 0.1126, "step": 26470 }, { "grad_norm": 0.1861705780029297, "learning_rate": 6.366085195513218e-05, "loss": 0.1256, "step": 26480 }, { "grad_norm": 0.20715521275997162, "learning_rate": 6.363434056015543e-05, "loss": 0.1135, "step": 26490 }, { "grad_norm": 0.25420743227005005, "learning_rate": 6.360782502342599e-05, "loss": 0.1304, "step": 26500 }, { "grad_norm": 0.23506073653697968, "learning_rate": 6.358130535299862e-05, "loss": 0.1182, "step": 26510 }, { "grad_norm": 0.21360711753368378, "learning_rate": 6.355478155692926e-05, "loss": 0.1221, "step": 26520 }, { "grad_norm": 0.20659518241882324, "learning_rate": 6.352825364327517e-05, "loss": 0.1088, "step": 26530 }, { "grad_norm": 0.2127317637205124, "learning_rate": 6.350172162009482e-05, "loss": 0.125, "step": 26540 }, { "grad_norm": 0.24084682762622833, "learning_rate": 6.347518549544793e-05, "loss": 0.1245, "step": 26550 }, { "grad_norm": 0.22406236827373505, "learning_rate": 6.344864527739547e-05, "loss": 0.113, "step": 26560 }, { "grad_norm": 0.22753243148326874, "learning_rate": 6.342210097399966e-05, "loss": 0.1182, "step": 26570 }, { "grad_norm": 0.2424263209104538, "learning_rate": 6.339555259332398e-05, "loss": 0.1234, "step": 26580 }, { "grad_norm": 0.17347848415374756, "learning_rate": 6.33690001434331e-05, "loss": 0.1176, "step": 26590 }, { "grad_norm": 0.22385303676128387, "learning_rate": 6.334244363239296e-05, "loss": 0.1211, "step": 26600 }, { "grad_norm": 0.23857125639915466, "learning_rate": 6.331588306827073e-05, "loss": 0.1222, "step": 26610 }, { "grad_norm": 0.21603301167488098, "learning_rate": 6.328931845913483e-05, "loss": 0.1163, "step": 26620 }, { "grad_norm": 0.226329043507576, "learning_rate": 6.326274981305484e-05, "loss": 0.1289, "step": 26630 }, { "grad_norm": 0.23930440843105316, "learning_rate": 6.323617713810166e-05, "loss": 0.1268, "step": 26640 }, { "grad_norm": 0.23209978640079498, "learning_rate": 6.320960044234734e-05, "loss": 0.1297, "step": 26650 }, { "grad_norm": 0.23244401812553406, "learning_rate": 6.318301973386518e-05, "loss": 0.1203, "step": 26660 }, { "grad_norm": 0.25351542234420776, "learning_rate": 6.315643502072971e-05, "loss": 0.1182, "step": 26670 }, { "grad_norm": 0.2359689176082611, "learning_rate": 6.312984631101667e-05, "loss": 0.1169, "step": 26680 }, { "grad_norm": 0.17424046993255615, "learning_rate": 6.310325361280297e-05, "loss": 0.1201, "step": 26690 }, { "grad_norm": 0.213556706905365, "learning_rate": 6.30766569341668e-05, "loss": 0.1267, "step": 26700 }, { "grad_norm": 0.20737050473690033, "learning_rate": 6.305005628318753e-05, "loss": 0.1252, "step": 26710 }, { "grad_norm": 0.2673301696777344, "learning_rate": 6.302345166794572e-05, "loss": 0.1308, "step": 26720 }, { "grad_norm": 0.24520498514175415, "learning_rate": 6.299684309652316e-05, "loss": 0.1089, "step": 26730 }, { "grad_norm": 0.19212433695793152, "learning_rate": 6.297023057700283e-05, "loss": 0.1133, "step": 26740 }, { "grad_norm": 0.21121077239513397, "learning_rate": 6.294361411746891e-05, "loss": 0.1212, "step": 26750 }, { "grad_norm": 0.24586960673332214, "learning_rate": 6.291699372600677e-05, "loss": 0.1224, "step": 26760 }, { "grad_norm": 0.22181138396263123, "learning_rate": 6.2890369410703e-05, "loss": 0.1201, "step": 26770 }, { "grad_norm": 0.2551491856575012, "learning_rate": 6.286374117964534e-05, "loss": 0.1184, "step": 26780 }, { "grad_norm": 0.23753106594085693, "learning_rate": 6.283710904092277e-05, "loss": 0.1224, "step": 26790 }, { "grad_norm": 0.21950456500053406, "learning_rate": 6.281047300262542e-05, "loss": 0.1248, "step": 26800 }, { "grad_norm": 0.2349110245704651, "learning_rate": 6.278383307284461e-05, "loss": 0.123, "step": 26810 }, { "grad_norm": 0.2546871602535248, "learning_rate": 6.275718925967284e-05, "loss": 0.1225, "step": 26820 }, { "grad_norm": 0.20171210169792175, "learning_rate": 6.273054157120382e-05, "loss": 0.1268, "step": 26830 }, { "grad_norm": 0.2665509879589081, "learning_rate": 6.270389001553238e-05, "loss": 0.1304, "step": 26840 }, { "grad_norm": 0.2734447717666626, "learning_rate": 6.26772346007546e-05, "loss": 0.1225, "step": 26850 }, { "grad_norm": 0.2757488191127777, "learning_rate": 6.265057533496767e-05, "loss": 0.1297, "step": 26860 }, { "grad_norm": 0.23835457861423492, "learning_rate": 6.262391222626997e-05, "loss": 0.124, "step": 26870 }, { "grad_norm": 0.24909034371376038, "learning_rate": 6.259724528276106e-05, "loss": 0.1275, "step": 26880 }, { "grad_norm": 0.21995224058628082, "learning_rate": 6.257057451254162e-05, "loss": 0.1172, "step": 26890 }, { "grad_norm": 0.23422689735889435, "learning_rate": 6.254389992371357e-05, "loss": 0.1223, "step": 26900 }, { "grad_norm": 0.25657087564468384, "learning_rate": 6.25172215243799e-05, "loss": 0.1219, "step": 26910 }, { "grad_norm": 0.23183126747608185, "learning_rate": 6.249053932264486e-05, "loss": 0.1145, "step": 26920 }, { "grad_norm": 0.20701061189174652, "learning_rate": 6.246385332661376e-05, "loss": 0.1105, "step": 26930 }, { "grad_norm": 0.20481546223163605, "learning_rate": 6.24371635443931e-05, "loss": 0.1253, "step": 26940 }, { "grad_norm": 0.19194015860557556, "learning_rate": 6.241046998409054e-05, "loss": 0.1206, "step": 26950 }, { "grad_norm": 0.1948658674955368, "learning_rate": 6.238377265381489e-05, "loss": 0.1205, "step": 26960 }, { "grad_norm": 0.19921568036079407, "learning_rate": 6.235707156167607e-05, "loss": 0.1223, "step": 26970 }, { "grad_norm": 0.21785780787467957, "learning_rate": 6.233036671578519e-05, "loss": 0.1137, "step": 26980 }, { "grad_norm": 0.2728109061717987, "learning_rate": 6.230365812425445e-05, "loss": 0.1281, "step": 26990 }, { "grad_norm": 0.2640842795372009, "learning_rate": 6.227694579519724e-05, "loss": 0.1255, "step": 27000 }, { "grad_norm": 0.27194106578826904, "learning_rate": 6.225022973672805e-05, "loss": 0.1314, "step": 27010 }, { "grad_norm": 0.24590753018856049, "learning_rate": 6.222350995696253e-05, "loss": 0.1216, "step": 27020 }, { "grad_norm": 0.29150527715682983, "learning_rate": 6.21967864640174e-05, "loss": 0.1212, "step": 27030 }, { "grad_norm": 0.24670661985874176, "learning_rate": 6.217005926601059e-05, "loss": 0.1182, "step": 27040 }, { "grad_norm": 0.2185191810131073, "learning_rate": 6.214332837106111e-05, "loss": 0.1203, "step": 27050 }, { "grad_norm": 0.20172417163848877, "learning_rate": 6.21165937872891e-05, "loss": 0.1219, "step": 27060 }, { "grad_norm": 0.24788078665733337, "learning_rate": 6.208985552281582e-05, "loss": 0.1266, "step": 27070 }, { "grad_norm": 0.28709298372268677, "learning_rate": 6.206311358576364e-05, "loss": 0.1214, "step": 27080 }, { "grad_norm": 0.3616798222064972, "learning_rate": 6.203636798425608e-05, "loss": 0.1264, "step": 27090 }, { "grad_norm": 0.2002987265586853, "learning_rate": 6.20096187264177e-05, "loss": 0.1203, "step": 27100 }, { "grad_norm": 0.21617400646209717, "learning_rate": 6.198286582037425e-05, "loss": 0.1239, "step": 27110 }, { "grad_norm": 0.27099311351776123, "learning_rate": 6.195610927425256e-05, "loss": 0.1262, "step": 27120 }, { "grad_norm": 0.24615715444087982, "learning_rate": 6.192934909618056e-05, "loss": 0.1243, "step": 27130 }, { "grad_norm": 0.21233464777469635, "learning_rate": 6.190258529428728e-05, "loss": 0.1215, "step": 27140 }, { "grad_norm": 0.1972608119249344, "learning_rate": 6.187581787670285e-05, "loss": 0.1263, "step": 27150 }, { "grad_norm": 0.2493678778409958, "learning_rate": 6.184904685155852e-05, "loss": 0.1239, "step": 27160 }, { "grad_norm": 0.1965753585100174, "learning_rate": 6.18222722269866e-05, "loss": 0.1144, "step": 27170 }, { "grad_norm": 0.224775031208992, "learning_rate": 6.179549401112053e-05, "loss": 0.1323, "step": 27180 }, { "grad_norm": 0.2594658434391022, "learning_rate": 6.176871221209482e-05, "loss": 0.1235, "step": 27190 }, { "grad_norm": 0.2957926392555237, "learning_rate": 6.174192683804508e-05, "loss": 0.121, "step": 27200 }, { "grad_norm": 0.26021435856819153, "learning_rate": 6.1715137897108e-05, "loss": 0.1171, "step": 27210 }, { "grad_norm": 0.2113140970468521, "learning_rate": 6.168834539742134e-05, "loss": 0.1206, "step": 27220 }, { "grad_norm": 0.1805797964334488, "learning_rate": 6.166154934712397e-05, "loss": 0.1199, "step": 27230 }, { "grad_norm": 0.2541646957397461, "learning_rate": 6.163474975435581e-05, "loss": 0.1221, "step": 27240 }, { "grad_norm": 0.21097256243228912, "learning_rate": 6.160794662725787e-05, "loss": 0.1126, "step": 27250 }, { "grad_norm": 0.19528445601463318, "learning_rate": 6.158113997397222e-05, "loss": 0.1193, "step": 27260 }, { "grad_norm": 0.21380648016929626, "learning_rate": 6.155432980264205e-05, "loss": 0.1151, "step": 27270 }, { "grad_norm": 0.2305394858121872, "learning_rate": 6.152751612141156e-05, "loss": 0.1244, "step": 27280 }, { "grad_norm": 0.1928005963563919, "learning_rate": 6.150069893842602e-05, "loss": 0.1206, "step": 27290 }, { "grad_norm": 0.2183910757303238, "learning_rate": 6.147387826183182e-05, "loss": 0.1195, "step": 27300 }, { "grad_norm": 0.17758993804454803, "learning_rate": 6.144705409977635e-05, "loss": 0.1153, "step": 27310 }, { "grad_norm": 0.19506637752056122, "learning_rate": 6.142022646040808e-05, "loss": 0.1211, "step": 27320 }, { "grad_norm": 0.20190028846263885, "learning_rate": 6.139339535187653e-05, "loss": 0.1239, "step": 27330 }, { "grad_norm": 0.16555771231651306, "learning_rate": 6.136656078233232e-05, "loss": 0.1222, "step": 27340 }, { "grad_norm": 0.2068955898284912, "learning_rate": 6.133972275992707e-05, "loss": 0.1276, "step": 27350 }, { "grad_norm": 0.3056567907333374, "learning_rate": 6.131288129281342e-05, "loss": 0.1313, "step": 27360 }, { "grad_norm": 0.19490312039852142, "learning_rate": 6.128603638914516e-05, "loss": 0.1254, "step": 27370 }, { "grad_norm": 0.22589226067066193, "learning_rate": 6.125918805707704e-05, "loss": 0.1142, "step": 27380 }, { "grad_norm": 0.1852133721113205, "learning_rate": 6.123233630476485e-05, "loss": 0.1175, "step": 27390 }, { "grad_norm": 0.20638121664524078, "learning_rate": 6.120548114036547e-05, "loss": 0.125, "step": 27400 }, { "grad_norm": 0.18511492013931274, "learning_rate": 6.117862257203679e-05, "loss": 0.1207, "step": 27410 }, { "grad_norm": 0.2180042713880539, "learning_rate": 6.115176060793771e-05, "loss": 0.1251, "step": 27420 }, { "grad_norm": 0.22962686419487, "learning_rate": 6.112489525622822e-05, "loss": 0.1205, "step": 27430 }, { "grad_norm": 0.2034856081008911, "learning_rate": 6.109802652506928e-05, "loss": 0.1153, "step": 27440 }, { "grad_norm": 0.23472356796264648, "learning_rate": 6.107115442262291e-05, "loss": 0.1164, "step": 27450 }, { "grad_norm": 0.2728124260902405, "learning_rate": 6.104427895705214e-05, "loss": 0.1226, "step": 27460 }, { "grad_norm": 0.2371789664030075, "learning_rate": 6.101740013652103e-05, "loss": 0.122, "step": 27470 }, { "grad_norm": 0.21993690729141235, "learning_rate": 6.099051796919465e-05, "loss": 0.1192, "step": 27480 }, { "grad_norm": 0.2600933909416199, "learning_rate": 6.096363246323911e-05, "loss": 0.1282, "step": 27490 }, { "grad_norm": 0.2415778487920761, "learning_rate": 6.0936743626821504e-05, "loss": 0.133, "step": 27500 }, { "grad_norm": 0.249442920088768, "learning_rate": 6.090985146810996e-05, "loss": 0.1222, "step": 27510 }, { "grad_norm": 0.2262355089187622, "learning_rate": 6.088295599527357e-05, "loss": 0.1268, "step": 27520 }, { "grad_norm": 0.231104776263237, "learning_rate": 6.085605721648252e-05, "loss": 0.1163, "step": 27530 }, { "grad_norm": 0.2568577527999878, "learning_rate": 6.082915513990792e-05, "loss": 0.1222, "step": 27540 }, { "grad_norm": 0.2779764235019684, "learning_rate": 6.080224977372192e-05, "loss": 0.1218, "step": 27550 }, { "grad_norm": 0.22217458486557007, "learning_rate": 6.0775341126097666e-05, "loss": 0.1243, "step": 27560 }, { "grad_norm": 0.20150502026081085, "learning_rate": 6.074842920520926e-05, "loss": 0.1204, "step": 27570 }, { "grad_norm": 0.20687387883663177, "learning_rate": 6.072151401923186e-05, "loss": 0.1206, "step": 27580 }, { "grad_norm": 0.22253432869911194, "learning_rate": 6.069459557634159e-05, "loss": 0.1154, "step": 27590 }, { "grad_norm": 0.20098307728767395, "learning_rate": 6.066767388471557e-05, "loss": 0.1256, "step": 27600 }, { "grad_norm": 0.20842154324054718, "learning_rate": 6.064074895253188e-05, "loss": 0.1215, "step": 27610 }, { "grad_norm": 0.19380976259708405, "learning_rate": 6.061382078796961e-05, "loss": 0.1174, "step": 27620 }, { "grad_norm": 0.20786823332309723, "learning_rate": 6.0586889399208814e-05, "loss": 0.1227, "step": 27630 }, { "grad_norm": 0.1936732679605484, "learning_rate": 6.0559954794430565e-05, "loss": 0.1156, "step": 27640 }, { "grad_norm": 0.2175561934709549, "learning_rate": 6.053301698181687e-05, "loss": 0.1216, "step": 27650 }, { "grad_norm": 0.24408476054668427, "learning_rate": 6.0506075969550725e-05, "loss": 0.1171, "step": 27660 }, { "grad_norm": 0.22966039180755615, "learning_rate": 6.047913176581609e-05, "loss": 0.1268, "step": 27670 }, { "grad_norm": 0.2304457426071167, "learning_rate": 6.0452184378797904e-05, "loss": 0.1199, "step": 27680 }, { "grad_norm": 0.23715627193450928, "learning_rate": 6.042523381668209e-05, "loss": 0.1167, "step": 27690 }, { "grad_norm": 0.23278795182704926, "learning_rate": 6.03982800876555e-05, "loss": 0.12, "step": 27700 }, { "grad_norm": 0.22869475185871124, "learning_rate": 6.0371323199905975e-05, "loss": 0.111, "step": 27710 }, { "grad_norm": 0.1888587772846222, "learning_rate": 6.03443631616223e-05, "loss": 0.1183, "step": 27720 }, { "grad_norm": 0.21531687676906586, "learning_rate": 6.031739998099421e-05, "loss": 0.122, "step": 27730 }, { "grad_norm": 0.2264878749847412, "learning_rate": 6.029043366621243e-05, "loss": 0.1247, "step": 27740 }, { "grad_norm": 0.21168731153011322, "learning_rate": 6.0263464225468615e-05, "loss": 0.1201, "step": 27750 }, { "grad_norm": 0.18937519192695618, "learning_rate": 6.023649166695534e-05, "loss": 0.1215, "step": 27760 }, { "grad_norm": 0.18895119428634644, "learning_rate": 6.0209515998866186e-05, "loss": 0.1228, "step": 27770 }, { "grad_norm": 0.16731856763362885, "learning_rate": 6.018253722939563e-05, "loss": 0.1269, "step": 27780 }, { "grad_norm": 0.20617303252220154, "learning_rate": 6.015555536673914e-05, "loss": 0.1258, "step": 27790 }, { "grad_norm": 0.19520699977874756, "learning_rate": 6.0128570419093054e-05, "loss": 0.1093, "step": 27800 }, { "grad_norm": 0.20403249561786652, "learning_rate": 6.010158239465471e-05, "loss": 0.1263, "step": 27810 }, { "grad_norm": 0.2172161191701889, "learning_rate": 6.007459130162235e-05, "loss": 0.125, "step": 27820 }, { "grad_norm": 0.2372017502784729, "learning_rate": 6.004759714819516e-05, "loss": 0.1166, "step": 27830 }, { "grad_norm": 0.21602587401866913, "learning_rate": 6.002059994257323e-05, "loss": 0.1196, "step": 27840 }, { "grad_norm": 0.2442028671503067, "learning_rate": 5.999359969295764e-05, "loss": 0.1185, "step": 27850 }, { "grad_norm": 0.17428410053253174, "learning_rate": 5.9966596407550314e-05, "loss": 0.1121, "step": 27860 }, { "grad_norm": 0.1974898874759674, "learning_rate": 5.993959009455416e-05, "loss": 0.1162, "step": 27870 }, { "grad_norm": 0.21297487616539001, "learning_rate": 5.991258076217298e-05, "loss": 0.1117, "step": 27880 }, { "grad_norm": 0.2261224240064621, "learning_rate": 5.988556841861147e-05, "loss": 0.1283, "step": 27890 }, { "grad_norm": 0.2273963838815689, "learning_rate": 5.985855307207531e-05, "loss": 0.1206, "step": 27900 }, { "grad_norm": 0.23375949263572693, "learning_rate": 5.9831534730771e-05, "loss": 0.1233, "step": 27910 }, { "grad_norm": 0.22987839579582214, "learning_rate": 5.980451340290605e-05, "loss": 0.1283, "step": 27920 }, { "grad_norm": 0.24274545907974243, "learning_rate": 5.97774890966888e-05, "loss": 0.1292, "step": 27930 }, { "grad_norm": 0.2568039298057556, "learning_rate": 5.975046182032851e-05, "loss": 0.1239, "step": 27940 }, { "grad_norm": 0.24553674459457397, "learning_rate": 5.972343158203537e-05, "loss": 0.1218, "step": 27950 }, { "grad_norm": 0.18306709825992584, "learning_rate": 5.969639839002045e-05, "loss": 0.1225, "step": 27960 }, { "grad_norm": 0.17481298744678497, "learning_rate": 5.966936225249572e-05, "loss": 0.1214, "step": 27970 }, { "grad_norm": 0.20446868240833282, "learning_rate": 5.9642323177674044e-05, "loss": 0.1228, "step": 27980 }, { "grad_norm": 0.24092760682106018, "learning_rate": 5.9615281173769154e-05, "loss": 0.1141, "step": 27990 }, { "grad_norm": 0.21478821337223053, "learning_rate": 5.958823624899574e-05, "loss": 0.1154, "step": 28000 }, { "grad_norm": 0.20393472909927368, "learning_rate": 5.956118841156933e-05, "loss": 0.1202, "step": 28010 }, { "grad_norm": 0.23190313577651978, "learning_rate": 5.953413766970631e-05, "loss": 0.1284, "step": 28020 }, { "grad_norm": 0.2039436250925064, "learning_rate": 5.9507084031624e-05, "loss": 0.1173, "step": 28030 }, { "grad_norm": 0.24972568452358246, "learning_rate": 5.948002750554058e-05, "loss": 0.13, "step": 28040 }, { "grad_norm": 0.22238577902317047, "learning_rate": 5.9452968099675124e-05, "loss": 0.1177, "step": 28050 }, { "grad_norm": 0.17012867331504822, "learning_rate": 5.9425905822247527e-05, "loss": 0.1162, "step": 28060 }, { "grad_norm": 0.21027101576328278, "learning_rate": 5.939884068147864e-05, "loss": 0.1123, "step": 28070 }, { "grad_norm": 0.2208753377199173, "learning_rate": 5.937177268559011e-05, "loss": 0.1206, "step": 28080 }, { "grad_norm": 0.2373145967721939, "learning_rate": 5.934470184280448e-05, "loss": 0.125, "step": 28090 }, { "grad_norm": 0.1888873428106308, "learning_rate": 5.931762816134516e-05, "loss": 0.1245, "step": 28100 }, { "grad_norm": 0.18590402603149414, "learning_rate": 5.9290551649436434e-05, "loss": 0.1189, "step": 28110 }, { "grad_norm": 0.17493823170661926, "learning_rate": 5.9263472315303416e-05, "loss": 0.1252, "step": 28120 }, { "grad_norm": 0.2176230549812317, "learning_rate": 5.9236390167172096e-05, "loss": 0.1155, "step": 28130 }, { "grad_norm": 0.22021012008190155, "learning_rate": 5.920930521326932e-05, "loss": 0.1055, "step": 28140 }, { "grad_norm": 0.2029227316379547, "learning_rate": 5.918221746182276e-05, "loss": 0.113, "step": 28150 }, { "grad_norm": 0.22180058062076569, "learning_rate": 5.9155126921061e-05, "loss": 0.1225, "step": 28160 }, { "grad_norm": 0.23064705729484558, "learning_rate": 5.91280335992134e-05, "loss": 0.118, "step": 28170 }, { "grad_norm": 0.17333678901195526, "learning_rate": 5.91009375045102e-05, "loss": 0.119, "step": 28180 }, { "grad_norm": 0.21929743885993958, "learning_rate": 5.9073838645182476e-05, "loss": 0.1119, "step": 28190 }, { "grad_norm": 0.2051183581352234, "learning_rate": 5.904673702946217e-05, "loss": 0.1259, "step": 28200 }, { "grad_norm": 0.2536568343639374, "learning_rate": 5.9019632665582004e-05, "loss": 0.1235, "step": 28210 }, { "grad_norm": 0.23740656673908234, "learning_rate": 5.899252556177559e-05, "loss": 0.1225, "step": 28220 }, { "grad_norm": 0.1897038072347641, "learning_rate": 5.896541572627735e-05, "loss": 0.1264, "step": 28230 }, { "grad_norm": 0.23404918611049652, "learning_rate": 5.893830316732253e-05, "loss": 0.1191, "step": 28240 }, { "grad_norm": 0.28561022877693176, "learning_rate": 5.8911187893147214e-05, "loss": 0.1169, "step": 28250 }, { "grad_norm": 0.219776451587677, "learning_rate": 5.888406991198828e-05, "loss": 0.1302, "step": 28260 }, { "grad_norm": 0.2357955127954483, "learning_rate": 5.885694923208349e-05, "loss": 0.1246, "step": 28270 }, { "grad_norm": 0.20508885383605957, "learning_rate": 5.882982586167138e-05, "loss": 0.1135, "step": 28280 }, { "grad_norm": 0.20581234991550446, "learning_rate": 5.880269980899131e-05, "loss": 0.1145, "step": 28290 }, { "grad_norm": 0.20068179070949554, "learning_rate": 5.8775571082283465e-05, "loss": 0.1206, "step": 28300 }, { "grad_norm": 0.19185014069080353, "learning_rate": 5.8748439689788824e-05, "loss": 0.1122, "step": 28310 }, { "grad_norm": 0.20837461948394775, "learning_rate": 5.87213056397492e-05, "loss": 0.1194, "step": 28320 }, { "grad_norm": 0.22589556872844696, "learning_rate": 5.869416894040719e-05, "loss": 0.1168, "step": 28330 }, { "grad_norm": 0.27041539549827576, "learning_rate": 5.866702960000621e-05, "loss": 0.1227, "step": 28340 }, { "grad_norm": 0.23487083613872528, "learning_rate": 5.863988762679048e-05, "loss": 0.1202, "step": 28350 }, { "grad_norm": 0.2046910673379898, "learning_rate": 5.8612743029005e-05, "loss": 0.1149, "step": 28360 }, { "grad_norm": 0.18841242790222168, "learning_rate": 5.858559581489561e-05, "loss": 0.1173, "step": 28370 }, { "grad_norm": 0.22723647952079773, "learning_rate": 5.85584459927089e-05, "loss": 0.1225, "step": 28380 }, { "grad_norm": 0.22251030802726746, "learning_rate": 5.853129357069227e-05, "loss": 0.113, "step": 28390 }, { "grad_norm": 0.20475982129573822, "learning_rate": 5.8504138557093913e-05, "loss": 0.1142, "step": 28400 }, { "grad_norm": 0.2669866681098938, "learning_rate": 5.8476980960162784e-05, "loss": 0.1281, "step": 28410 }, { "grad_norm": 0.20684634149074554, "learning_rate": 5.844982078814868e-05, "loss": 0.1177, "step": 28420 }, { "grad_norm": 0.21322405338287354, "learning_rate": 5.842265804930211e-05, "loss": 0.119, "step": 28430 }, { "grad_norm": 0.22241543233394623, "learning_rate": 5.839549275187444e-05, "loss": 0.1224, "step": 28440 }, { "grad_norm": 0.2121526151895523, "learning_rate": 5.836832490411771e-05, "loss": 0.1194, "step": 28450 }, { "grad_norm": 0.3384893536567688, "learning_rate": 5.834115451428485e-05, "loss": 0.129, "step": 28460 }, { "grad_norm": 0.22977188229560852, "learning_rate": 5.831398159062946e-05, "loss": 0.1203, "step": 28470 }, { "grad_norm": 0.25914400815963745, "learning_rate": 5.828680614140599e-05, "loss": 0.1285, "step": 28480 }, { "grad_norm": 0.26437658071517944, "learning_rate": 5.825962817486962e-05, "loss": 0.1285, "step": 28490 }, { "grad_norm": 0.20328938961029053, "learning_rate": 5.823244769927629e-05, "loss": 0.1229, "step": 28500 }, { "grad_norm": 0.2003871649503708, "learning_rate": 5.8205264722882716e-05, "loss": 0.1151, "step": 28510 }, { "grad_norm": 0.29011473059654236, "learning_rate": 5.817807925394636e-05, "loss": 0.1217, "step": 28520 }, { "grad_norm": 0.22163134813308716, "learning_rate": 5.815089130072546e-05, "loss": 0.1145, "step": 28530 }, { "grad_norm": 0.22559916973114014, "learning_rate": 5.8123700871479e-05, "loss": 0.1225, "step": 28540 }, { "grad_norm": 0.20524805784225464, "learning_rate": 5.809650797446671e-05, "loss": 0.1216, "step": 28550 }, { "grad_norm": 0.23341262340545654, "learning_rate": 5.806931261794907e-05, "loss": 0.1127, "step": 28560 }, { "grad_norm": 0.23867449164390564, "learning_rate": 5.804211481018731e-05, "loss": 0.1177, "step": 28570 }, { "grad_norm": 0.18949314951896667, "learning_rate": 5.801491455944341e-05, "loss": 0.1213, "step": 28580 }, { "grad_norm": 0.2570512592792511, "learning_rate": 5.79877118739801e-05, "loss": 0.1205, "step": 28590 }, { "grad_norm": 0.15371055901050568, "learning_rate": 5.7960506762060816e-05, "loss": 0.1205, "step": 28600 }, { "grad_norm": 0.2324678748846054, "learning_rate": 5.793329923194977e-05, "loss": 0.1182, "step": 28610 }, { "grad_norm": 0.1827525496482849, "learning_rate": 5.790608929191187e-05, "loss": 0.1314, "step": 28620 }, { "grad_norm": 0.21875670552253723, "learning_rate": 5.78788769502128e-05, "loss": 0.1215, "step": 28630 }, { "grad_norm": 0.2213621884584427, "learning_rate": 5.785166221511894e-05, "loss": 0.1216, "step": 28640 }, { "grad_norm": 0.2305217981338501, "learning_rate": 5.7824445094897415e-05, "loss": 0.1199, "step": 28650 }, { "grad_norm": 0.2467866837978363, "learning_rate": 5.7797225597816065e-05, "loss": 0.1201, "step": 28660 }, { "grad_norm": 0.21423450112342834, "learning_rate": 5.777000373214345e-05, "loss": 0.1258, "step": 28670 }, { "grad_norm": 0.21997861564159393, "learning_rate": 5.774277950614885e-05, "loss": 0.1277, "step": 28680 }, { "grad_norm": 0.2512442469596863, "learning_rate": 5.771555292810227e-05, "loss": 0.1219, "step": 28690 }, { "grad_norm": 0.2367607206106186, "learning_rate": 5.768832400627444e-05, "loss": 0.1239, "step": 28700 }, { "grad_norm": 0.21156509220600128, "learning_rate": 5.7661092748936775e-05, "loss": 0.1186, "step": 28710 }, { "grad_norm": 0.19993355870246887, "learning_rate": 5.76338591643614e-05, "loss": 0.1231, "step": 28720 }, { "grad_norm": 0.19712066650390625, "learning_rate": 5.760662326082118e-05, "loss": 0.1209, "step": 28730 }, { "grad_norm": 0.17923294007778168, "learning_rate": 5.757938504658965e-05, "loss": 0.1133, "step": 28740 }, { "grad_norm": 0.21441413462162018, "learning_rate": 5.755214452994107e-05, "loss": 0.1156, "step": 28750 }, { "grad_norm": 0.22969204187393188, "learning_rate": 5.752490171915039e-05, "loss": 0.1251, "step": 28760 }, { "grad_norm": 0.20755404233932495, "learning_rate": 5.749765662249324e-05, "loss": 0.1186, "step": 28770 }, { "grad_norm": 0.28863662481307983, "learning_rate": 5.747040924824596e-05, "loss": 0.1301, "step": 28780 }, { "grad_norm": 0.23093032836914062, "learning_rate": 5.7443159604685613e-05, "loss": 0.1219, "step": 28790 }, { "grad_norm": 0.17321772873401642, "learning_rate": 5.74159077000899e-05, "loss": 0.1241, "step": 28800 }, { "grad_norm": 0.19103097915649414, "learning_rate": 5.7388653542737235e-05, "loss": 0.1237, "step": 28810 }, { "grad_norm": 0.17633968591690063, "learning_rate": 5.736139714090672e-05, "loss": 0.1209, "step": 28820 }, { "grad_norm": 0.1929604709148407, "learning_rate": 5.73341385028781e-05, "loss": 0.1105, "step": 28830 }, { "grad_norm": 0.18906286358833313, "learning_rate": 5.7306877636931855e-05, "loss": 0.1117, "step": 28840 }, { "grad_norm": 0.21684585511684418, "learning_rate": 5.7279614551349125e-05, "loss": 0.1237, "step": 28850 }, { "grad_norm": 0.18380676209926605, "learning_rate": 5.725234925441169e-05, "loss": 0.1231, "step": 28860 }, { "grad_norm": 0.26512885093688965, "learning_rate": 5.7225081754402044e-05, "loss": 0.1242, "step": 28870 }, { "grad_norm": 0.22387909889221191, "learning_rate": 5.7197812059603326e-05, "loss": 0.1166, "step": 28880 }, { "grad_norm": 0.19758176803588867, "learning_rate": 5.717054017829934e-05, "loss": 0.1173, "step": 28890 }, { "grad_norm": 0.2317163199186325, "learning_rate": 5.7143266118774584e-05, "loss": 0.1251, "step": 28900 }, { "grad_norm": 0.18513916432857513, "learning_rate": 5.711598988931418e-05, "loss": 0.1183, "step": 28910 }, { "grad_norm": 0.21452981233596802, "learning_rate": 5.7088711498203954e-05, "loss": 0.1127, "step": 28920 }, { "grad_norm": 0.21407851576805115, "learning_rate": 5.706143095373033e-05, "loss": 0.1285, "step": 28930 }, { "grad_norm": 0.20719754695892334, "learning_rate": 5.703414826418042e-05, "loss": 0.1089, "step": 28940 }, { "grad_norm": 0.20137982070446014, "learning_rate": 5.7006863437842007e-05, "loss": 0.114, "step": 28950 }, { "grad_norm": 0.2262512594461441, "learning_rate": 5.697957648300348e-05, "loss": 0.1218, "step": 28960 }, { "grad_norm": 0.18247029185295105, "learning_rate": 5.695228740795391e-05, "loss": 0.1173, "step": 28970 }, { "grad_norm": 0.1989476978778839, "learning_rate": 5.6924996220982985e-05, "loss": 0.1108, "step": 28980 }, { "grad_norm": 0.2089000940322876, "learning_rate": 5.6897702930381045e-05, "loss": 0.1217, "step": 28990 }, { "grad_norm": 0.19312626123428345, "learning_rate": 5.687040754443908e-05, "loss": 0.1151, "step": 29000 }, { "grad_norm": 0.23519794642925262, "learning_rate": 5.6843110071448725e-05, "loss": 0.1226, "step": 29010 }, { "grad_norm": 0.19882944226264954, "learning_rate": 5.6815810519702194e-05, "loss": 0.1173, "step": 29020 }, { "grad_norm": 0.17222599685192108, "learning_rate": 5.6788508897492396e-05, "loss": 0.1138, "step": 29030 }, { "grad_norm": 0.19852153956890106, "learning_rate": 5.676120521311282e-05, "loss": 0.1267, "step": 29040 }, { "grad_norm": 0.24409440159797668, "learning_rate": 5.6733899474857634e-05, "loss": 0.1185, "step": 29050 }, { "grad_norm": 0.2172825038433075, "learning_rate": 5.670659169102157e-05, "loss": 0.1235, "step": 29060 }, { "grad_norm": 0.23597319424152374, "learning_rate": 5.6679281869900044e-05, "loss": 0.1162, "step": 29070 }, { "grad_norm": 0.19024977087974548, "learning_rate": 5.6651970019789045e-05, "loss": 0.118, "step": 29080 }, { "grad_norm": 0.19383494555950165, "learning_rate": 5.662465614898519e-05, "loss": 0.1277, "step": 29090 }, { "grad_norm": 0.2605030834674835, "learning_rate": 5.6597340265785695e-05, "loss": 0.1174, "step": 29100 }, { "grad_norm": 0.263409286737442, "learning_rate": 5.657002237848843e-05, "loss": 0.1275, "step": 29110 }, { "grad_norm": 0.23781771957874298, "learning_rate": 5.654270249539183e-05, "loss": 0.1175, "step": 29120 }, { "grad_norm": 0.2094735950231552, "learning_rate": 5.651538062479498e-05, "loss": 0.1175, "step": 29130 }, { "grad_norm": 0.21643808484077454, "learning_rate": 5.648805677499751e-05, "loss": 0.1161, "step": 29140 }, { "grad_norm": 0.19086837768554688, "learning_rate": 5.646073095429969e-05, "loss": 0.1137, "step": 29150 }, { "grad_norm": 0.1983538120985031, "learning_rate": 5.643340317100241e-05, "loss": 0.1123, "step": 29160 }, { "grad_norm": 0.21246270835399628, "learning_rate": 5.64060734334071e-05, "loss": 0.1155, "step": 29170 }, { "grad_norm": 0.22289423644542694, "learning_rate": 5.637874174981583e-05, "loss": 0.1223, "step": 29180 }, { "grad_norm": 0.19268901646137238, "learning_rate": 5.635140812853124e-05, "loss": 0.1126, "step": 29190 }, { "grad_norm": 0.19100090861320496, "learning_rate": 5.6324072577856544e-05, "loss": 0.121, "step": 29200 }, { "grad_norm": 0.3115653693675995, "learning_rate": 5.629673510609559e-05, "loss": 0.1237, "step": 29210 }, { "grad_norm": 0.24178802967071533, "learning_rate": 5.626939572155276e-05, "loss": 0.1207, "step": 29220 }, { "grad_norm": 0.20528030395507812, "learning_rate": 5.6242054432533054e-05, "loss": 0.1219, "step": 29230 }, { "grad_norm": 0.16819247603416443, "learning_rate": 5.621471124734201e-05, "loss": 0.1206, "step": 29240 }, { "grad_norm": 0.1931724101305008, "learning_rate": 5.6187366174285794e-05, "loss": 0.113, "step": 29250 }, { "grad_norm": 0.24384775757789612, "learning_rate": 5.616001922167109e-05, "loss": 0.1205, "step": 29260 }, { "grad_norm": 0.2345827966928482, "learning_rate": 5.61326703978052e-05, "loss": 0.127, "step": 29270 }, { "grad_norm": 0.2403554618358612, "learning_rate": 5.6105319710995964e-05, "loss": 0.1199, "step": 29280 }, { "grad_norm": 0.2285236418247223, "learning_rate": 5.60779671695518e-05, "loss": 0.1215, "step": 29290 }, { "grad_norm": 0.20966443419456482, "learning_rate": 5.6050612781781684e-05, "loss": 0.1138, "step": 29300 }, { "grad_norm": 0.18827258050441742, "learning_rate": 5.602325655599516e-05, "loss": 0.123, "step": 29310 }, { "grad_norm": 0.2328571230173111, "learning_rate": 5.599589850050234e-05, "loss": 0.1181, "step": 29320 }, { "grad_norm": 0.22125232219696045, "learning_rate": 5.5968538623613874e-05, "loss": 0.1295, "step": 29330 }, { "grad_norm": 0.18229006230831146, "learning_rate": 5.594117693364095e-05, "loss": 0.1169, "step": 29340 }, { "grad_norm": 0.23128105700016022, "learning_rate": 5.591381343889535e-05, "loss": 0.1224, "step": 29350 }, { "grad_norm": 0.15959466993808746, "learning_rate": 5.5886448147689355e-05, "loss": 0.1165, "step": 29360 }, { "grad_norm": 0.1895359456539154, "learning_rate": 5.585908106833585e-05, "loss": 0.1145, "step": 29370 }, { "grad_norm": 0.16929075121879578, "learning_rate": 5.5831712209148226e-05, "loss": 0.1167, "step": 29380 }, { "grad_norm": 0.20344872772693634, "learning_rate": 5.58043415784404e-05, "loss": 0.1263, "step": 29390 }, { "grad_norm": 0.17239929735660553, "learning_rate": 5.577696918452686e-05, "loss": 0.1272, "step": 29400 }, { "grad_norm": 0.21756576001644135, "learning_rate": 5.5749595035722604e-05, "loss": 0.1176, "step": 29410 }, { "grad_norm": 0.2690567076206207, "learning_rate": 5.5722219140343193e-05, "loss": 0.1209, "step": 29420 }, { "grad_norm": 0.23232996463775635, "learning_rate": 5.56948415067047e-05, "loss": 0.1221, "step": 29430 }, { "grad_norm": 0.1775432676076889, "learning_rate": 5.5667462143123704e-05, "loss": 0.1163, "step": 29440 }, { "grad_norm": 0.219065859913826, "learning_rate": 5.564008105791737e-05, "loss": 0.1228, "step": 29450 }, { "grad_norm": 0.1897243708372116, "learning_rate": 5.5612698259403316e-05, "loss": 0.121, "step": 29460 }, { "grad_norm": 0.21312034130096436, "learning_rate": 5.5585313755899724e-05, "loss": 0.1157, "step": 29470 }, { "grad_norm": 0.18105491995811462, "learning_rate": 5.5557927555725285e-05, "loss": 0.115, "step": 29480 }, { "grad_norm": 0.21026137471199036, "learning_rate": 5.55305396671992e-05, "loss": 0.1178, "step": 29490 }, { "grad_norm": 0.199519544839859, "learning_rate": 5.55031500986412e-05, "loss": 0.1154, "step": 29500 }, { "grad_norm": 0.2522449493408203, "learning_rate": 5.547575885837149e-05, "loss": 0.1218, "step": 29510 }, { "grad_norm": 0.2185608595609665, "learning_rate": 5.5448365954710825e-05, "loss": 0.1192, "step": 29520 }, { "grad_norm": 0.21522562205791473, "learning_rate": 5.5420971395980446e-05, "loss": 0.1253, "step": 29530 }, { "grad_norm": 0.23118412494659424, "learning_rate": 5.539357519050209e-05, "loss": 0.1218, "step": 29540 }, { "grad_norm": 0.18692243099212646, "learning_rate": 5.536617734659799e-05, "loss": 0.1174, "step": 29550 }, { "grad_norm": 0.2406919002532959, "learning_rate": 5.533877787259091e-05, "loss": 0.1204, "step": 29560 }, { "grad_norm": 0.19910214841365814, "learning_rate": 5.5311376776804044e-05, "loss": 0.1146, "step": 29570 }, { "grad_norm": 0.26090216636657715, "learning_rate": 5.528397406756118e-05, "loss": 0.1186, "step": 29580 }, { "grad_norm": 0.21698492765426636, "learning_rate": 5.525656975318652e-05, "loss": 0.1174, "step": 29590 }, { "grad_norm": 0.1920350342988968, "learning_rate": 5.522916384200474e-05, "loss": 0.1211, "step": 29600 }, { "grad_norm": 0.1728621870279312, "learning_rate": 5.520175634234106e-05, "loss": 0.1243, "step": 29610 }, { "grad_norm": 0.1801081746816635, "learning_rate": 5.517434726252113e-05, "loss": 0.12, "step": 29620 }, { "grad_norm": 0.1973036676645279, "learning_rate": 5.514693661087113e-05, "loss": 0.1116, "step": 29630 }, { "grad_norm": 0.26739874482154846, "learning_rate": 5.511952439571769e-05, "loss": 0.1207, "step": 29640 }, { "grad_norm": 0.18987078964710236, "learning_rate": 5.509211062538791e-05, "loss": 0.1184, "step": 29650 }, { "grad_norm": 0.20366455614566803, "learning_rate": 5.506469530820939e-05, "loss": 0.1213, "step": 29660 }, { "grad_norm": 0.26005733013153076, "learning_rate": 5.503727845251014e-05, "loss": 0.1229, "step": 29670 }, { "grad_norm": 0.21355301141738892, "learning_rate": 5.50098600666187e-05, "loss": 0.1194, "step": 29680 }, { "grad_norm": 0.17276477813720703, "learning_rate": 5.498244015886406e-05, "loss": 0.1165, "step": 29690 }, { "grad_norm": 0.16628870368003845, "learning_rate": 5.495501873757565e-05, "loss": 0.1094, "step": 29700 }, { "grad_norm": 0.2279559075832367, "learning_rate": 5.492759581108336e-05, "loss": 0.126, "step": 29710 }, { "grad_norm": 0.2785917818546295, "learning_rate": 5.490017138771759e-05, "loss": 0.1216, "step": 29720 }, { "grad_norm": 0.20549358427524567, "learning_rate": 5.487274547580912e-05, "loss": 0.1197, "step": 29730 }, { "grad_norm": 0.20748808979988098, "learning_rate": 5.484531808368923e-05, "loss": 0.1239, "step": 29740 }, { "grad_norm": 0.2441563904285431, "learning_rate": 5.4817889219689656e-05, "loss": 0.1166, "step": 29750 }, { "grad_norm": 0.24321886897087097, "learning_rate": 5.4790458892142536e-05, "loss": 0.1207, "step": 29760 }, { "grad_norm": 0.16709458827972412, "learning_rate": 5.476302710938048e-05, "loss": 0.1147, "step": 29770 }, { "grad_norm": 0.18033258616924286, "learning_rate": 5.473559387973657e-05, "loss": 0.1156, "step": 29780 }, { "grad_norm": 0.2454724907875061, "learning_rate": 5.470815921154425e-05, "loss": 0.1237, "step": 29790 }, { "grad_norm": 0.2115088552236557, "learning_rate": 5.468072311313749e-05, "loss": 0.1285, "step": 29800 }, { "grad_norm": 0.2219797521829605, "learning_rate": 5.465328559285063e-05, "loss": 0.1206, "step": 29810 }, { "grad_norm": 0.21907325088977814, "learning_rate": 5.462584665901849e-05, "loss": 0.1245, "step": 29820 }, { "grad_norm": 0.1824362576007843, "learning_rate": 5.4598406319976235e-05, "loss": 0.1119, "step": 29830 }, { "grad_norm": 0.18565422296524048, "learning_rate": 5.457096458405958e-05, "loss": 0.1252, "step": 29840 }, { "grad_norm": 0.24255795776844025, "learning_rate": 5.454352145960457e-05, "loss": 0.1245, "step": 29850 }, { "grad_norm": 0.2202218621969223, "learning_rate": 5.4516076954947715e-05, "loss": 0.1271, "step": 29860 }, { "grad_norm": 0.23618443310260773, "learning_rate": 5.448863107842591e-05, "loss": 0.1263, "step": 29870 }, { "grad_norm": 0.20601989328861237, "learning_rate": 5.446118383837651e-05, "loss": 0.114, "step": 29880 }, { "grad_norm": 0.23652039468288422, "learning_rate": 5.443373524313722e-05, "loss": 0.1159, "step": 29890 }, { "grad_norm": 0.226159930229187, "learning_rate": 5.440628530104626e-05, "loss": 0.1194, "step": 29900 }, { "grad_norm": 0.23160390555858612, "learning_rate": 5.4378834020442146e-05, "loss": 0.1178, "step": 29910 }, { "grad_norm": 0.1860545426607132, "learning_rate": 5.4351381409663884e-05, "loss": 0.1157, "step": 29920 }, { "grad_norm": 0.19822141528129578, "learning_rate": 5.432392747705084e-05, "loss": 0.1194, "step": 29930 }, { "grad_norm": 0.19457609951496124, "learning_rate": 5.429647223094278e-05, "loss": 0.1216, "step": 29940 }, { "grad_norm": 0.17559435963630676, "learning_rate": 5.4269015679679924e-05, "loss": 0.1214, "step": 29950 }, { "grad_norm": 0.18696458637714386, "learning_rate": 5.424155783160281e-05, "loss": 0.1219, "step": 29960 }, { "grad_norm": 0.183108389377594, "learning_rate": 5.4214098695052415e-05, "loss": 0.1226, "step": 29970 }, { "grad_norm": 0.17689497768878937, "learning_rate": 5.418663827837012e-05, "loss": 0.1255, "step": 29980 }, { "grad_norm": 0.20884767174720764, "learning_rate": 5.415917658989763e-05, "loss": 0.1155, "step": 29990 }, { "grad_norm": 0.1719101071357727, "learning_rate": 5.413171363797713e-05, "loss": 0.1169, "step": 30000 }, { "grad_norm": 0.2259693145751953, "learning_rate": 5.4104249430951116e-05, "loss": 0.1173, "step": 30010 }, { "grad_norm": 0.21262212097644806, "learning_rate": 5.4076783977162494e-05, "loss": 0.1208, "step": 30020 }, { "grad_norm": 0.19281397759914398, "learning_rate": 5.4049317284954525e-05, "loss": 0.1192, "step": 30030 }, { "grad_norm": 0.20686134696006775, "learning_rate": 5.4021849362670884e-05, "loss": 0.122, "step": 30040 }, { "grad_norm": 0.21844486892223358, "learning_rate": 5.3994380218655604e-05, "loss": 0.1179, "step": 30050 }, { "grad_norm": 0.19908268749713898, "learning_rate": 5.396690986125309e-05, "loss": 0.1189, "step": 30060 }, { "grad_norm": 0.23234201967716217, "learning_rate": 5.3939438298808075e-05, "loss": 0.1184, "step": 30070 }, { "grad_norm": 0.2088625431060791, "learning_rate": 5.3911965539665744e-05, "loss": 0.1254, "step": 30080 }, { "grad_norm": 0.26103994250297546, "learning_rate": 5.388449159217156e-05, "loss": 0.1214, "step": 30090 }, { "grad_norm": 0.2074834257364273, "learning_rate": 5.3857016464671385e-05, "loss": 0.1235, "step": 30100 }, { "grad_norm": 0.19488078355789185, "learning_rate": 5.382954016551146e-05, "loss": 0.1214, "step": 30110 }, { "grad_norm": 0.20153644680976868, "learning_rate": 5.380206270303835e-05, "loss": 0.1186, "step": 30120 }, { "grad_norm": 0.24516496062278748, "learning_rate": 5.377458408559897e-05, "loss": 0.1257, "step": 30130 }, { "grad_norm": 0.2581283450126648, "learning_rate": 5.374710432154061e-05, "loss": 0.1257, "step": 30140 }, { "grad_norm": 0.2642630934715271, "learning_rate": 5.3719623419210886e-05, "loss": 0.1277, "step": 30150 }, { "grad_norm": 0.2119964212179184, "learning_rate": 5.3692141386957786e-05, "loss": 0.1294, "step": 30160 }, { "grad_norm": 0.23423011600971222, "learning_rate": 5.3664658233129616e-05, "loss": 0.1214, "step": 30170 }, { "grad_norm": 0.21686610579490662, "learning_rate": 5.363717396607504e-05, "loss": 0.1209, "step": 30180 }, { "grad_norm": 0.20326924324035645, "learning_rate": 5.360968859414305e-05, "loss": 0.1161, "step": 30190 }, { "grad_norm": 0.22375643253326416, "learning_rate": 5.358220212568295e-05, "loss": 0.1199, "step": 30200 }, { "grad_norm": 0.18897129595279694, "learning_rate": 5.355471456904444e-05, "loss": 0.1197, "step": 30210 }, { "grad_norm": 0.25416672229766846, "learning_rate": 5.3527225932577495e-05, "loss": 0.1237, "step": 30220 }, { "grad_norm": 0.21442806720733643, "learning_rate": 5.349973622463246e-05, "loss": 0.1201, "step": 30230 }, { "grad_norm": 0.23255033791065216, "learning_rate": 5.3472245453559956e-05, "loss": 0.121, "step": 30240 }, { "grad_norm": 0.1964501589536667, "learning_rate": 5.3444753627710955e-05, "loss": 0.1114, "step": 30250 }, { "grad_norm": 0.2000785619020462, "learning_rate": 5.341726075543676e-05, "loss": 0.1177, "step": 30260 }, { "grad_norm": 0.20372149348258972, "learning_rate": 5.338976684508898e-05, "loss": 0.1204, "step": 30270 }, { "grad_norm": 0.19574950635433197, "learning_rate": 5.336227190501953e-05, "loss": 0.1238, "step": 30280 }, { "grad_norm": 0.22106294333934784, "learning_rate": 5.3334775943580664e-05, "loss": 0.1181, "step": 30290 }, { "grad_norm": 0.21348795294761658, "learning_rate": 5.330727896912491e-05, "loss": 0.1135, "step": 30300 }, { "grad_norm": 0.20899416506290436, "learning_rate": 5.327978099000511e-05, "loss": 0.1178, "step": 30310 }, { "grad_norm": 0.23781567811965942, "learning_rate": 5.3252282014574465e-05, "loss": 0.1133, "step": 30320 }, { "grad_norm": 0.22269272804260254, "learning_rate": 5.322478205118641e-05, "loss": 0.1242, "step": 30330 }, { "grad_norm": 0.16529721021652222, "learning_rate": 5.3197281108194704e-05, "loss": 0.1159, "step": 30340 }, { "grad_norm": 0.18154676258563995, "learning_rate": 5.316977919395342e-05, "loss": 0.1168, "step": 30350 }, { "grad_norm": 0.2020394653081894, "learning_rate": 5.314227631681691e-05, "loss": 0.1245, "step": 30360 }, { "grad_norm": 0.23704561591148376, "learning_rate": 5.311477248513982e-05, "loss": 0.1265, "step": 30370 }, { "grad_norm": 0.2539527714252472, "learning_rate": 5.30872677072771e-05, "loss": 0.1191, "step": 30380 }, { "grad_norm": 0.24536985158920288, "learning_rate": 5.3059761991583954e-05, "loss": 0.1205, "step": 30390 }, { "grad_norm": 0.26906618475914, "learning_rate": 5.303225534641592e-05, "loss": 0.1134, "step": 30400 }, { "grad_norm": 0.20502401888370514, "learning_rate": 5.300474778012875e-05, "loss": 0.1168, "step": 30410 }, { "grad_norm": 0.19248943030834198, "learning_rate": 5.297723930107855e-05, "loss": 0.124, "step": 30420 }, { "grad_norm": 0.20195019245147705, "learning_rate": 5.294972991762167e-05, "loss": 0.1234, "step": 30430 }, { "grad_norm": 0.2077111005783081, "learning_rate": 5.292221963811472e-05, "loss": 0.128, "step": 30440 }, { "grad_norm": 0.2390645444393158, "learning_rate": 5.28947084709146e-05, "loss": 0.1173, "step": 30450 }, { "grad_norm": 0.2175135314464569, "learning_rate": 5.2867196424378465e-05, "loss": 0.1241, "step": 30460 }, { "grad_norm": 0.17656353116035461, "learning_rate": 5.2839683506863765e-05, "loss": 0.1173, "step": 30470 }, { "grad_norm": 0.2233765423297882, "learning_rate": 5.281216972672821e-05, "loss": 0.1231, "step": 30480 }, { "grad_norm": 0.1804308146238327, "learning_rate": 5.278465509232973e-05, "loss": 0.1186, "step": 30490 }, { "grad_norm": 0.19048435986042023, "learning_rate": 5.275713961202655e-05, "loss": 0.1192, "step": 30500 }, { "grad_norm": 0.21828079223632812, "learning_rate": 5.2729623294177165e-05, "loss": 0.1151, "step": 30510 }, { "grad_norm": 0.2120705246925354, "learning_rate": 5.270210614714028e-05, "loss": 0.1204, "step": 30520 }, { "grad_norm": 0.2529371976852417, "learning_rate": 5.267458817927491e-05, "loss": 0.1239, "step": 30530 }, { "grad_norm": 0.2022893726825714, "learning_rate": 5.264706939894026e-05, "loss": 0.1234, "step": 30540 }, { "grad_norm": 0.22380313277244568, "learning_rate": 5.261954981449584e-05, "loss": 0.122, "step": 30550 }, { "grad_norm": 0.19318419694900513, "learning_rate": 5.2592029434301324e-05, "loss": 0.123, "step": 30560 }, { "grad_norm": 0.21027027070522308, "learning_rate": 5.256450826671672e-05, "loss": 0.1184, "step": 30570 }, { "grad_norm": 0.1957574039697647, "learning_rate": 5.253698632010221e-05, "loss": 0.1143, "step": 30580 }, { "grad_norm": 0.30700263381004333, "learning_rate": 5.2509463602818246e-05, "loss": 0.114, "step": 30590 }, { "grad_norm": 0.25869280099868774, "learning_rate": 5.248194012322549e-05, "loss": 0.1272, "step": 30600 }, { "grad_norm": 0.21676316857337952, "learning_rate": 5.245441588968486e-05, "loss": 0.1198, "step": 30610 }, { "grad_norm": 0.19789469242095947, "learning_rate": 5.242689091055748e-05, "loss": 0.1146, "step": 30620 }, { "grad_norm": 0.22810673713684082, "learning_rate": 5.239936519420473e-05, "loss": 0.1171, "step": 30630 }, { "grad_norm": 0.19678352773189545, "learning_rate": 5.2371838748988175e-05, "loss": 0.1109, "step": 30640 }, { "grad_norm": 0.15170717239379883, "learning_rate": 5.234431158326965e-05, "loss": 0.1107, "step": 30650 }, { "grad_norm": 0.17349378764629364, "learning_rate": 5.231678370541115e-05, "loss": 0.1173, "step": 30660 }, { "grad_norm": 0.22025765478610992, "learning_rate": 5.228925512377495e-05, "loss": 0.1213, "step": 30670 }, { "grad_norm": 0.1939883679151535, "learning_rate": 5.2261725846723465e-05, "loss": 0.127, "step": 30680 }, { "grad_norm": 0.18035253882408142, "learning_rate": 5.22341958826194e-05, "loss": 0.1169, "step": 30690 }, { "grad_norm": 0.16154012084007263, "learning_rate": 5.22066652398256e-05, "loss": 0.1202, "step": 30700 }, { "grad_norm": 0.18386845290660858, "learning_rate": 5.2179133926705185e-05, "loss": 0.1245, "step": 30710 }, { "grad_norm": 0.1899072825908661, "learning_rate": 5.215160195162141e-05, "loss": 0.122, "step": 30720 }, { "grad_norm": 0.23027946054935455, "learning_rate": 5.212406932293776e-05, "loss": 0.1348, "step": 30730 }, { "grad_norm": 0.2107136845588684, "learning_rate": 5.209653604901795e-05, "loss": 0.1186, "step": 30740 }, { "grad_norm": 0.21410882472991943, "learning_rate": 5.206900213822584e-05, "loss": 0.1199, "step": 30750 }, { "grad_norm": 0.21713481843471527, "learning_rate": 5.204146759892551e-05, "loss": 0.1196, "step": 30760 }, { "grad_norm": 0.2202843427658081, "learning_rate": 5.2013932439481216e-05, "loss": 0.1226, "step": 30770 }, { "grad_norm": 0.21224625408649445, "learning_rate": 5.198639666825743e-05, "loss": 0.1199, "step": 30780 }, { "grad_norm": 0.2536056637763977, "learning_rate": 5.195886029361877e-05, "loss": 0.1287, "step": 30790 }, { "grad_norm": 0.1942405104637146, "learning_rate": 5.193132332393009e-05, "loss": 0.1155, "step": 30800 }, { "grad_norm": 0.18755675852298737, "learning_rate": 5.1903785767556376e-05, "loss": 0.1136, "step": 30810 }, { "grad_norm": 0.21739201247692108, "learning_rate": 5.187624763286282e-05, "loss": 0.1157, "step": 30820 }, { "grad_norm": 0.19296903908252716, "learning_rate": 5.184870892821475e-05, "loss": 0.1318, "step": 30830 }, { "grad_norm": 0.24566048383712769, "learning_rate": 5.182116966197773e-05, "loss": 0.1205, "step": 30840 }, { "grad_norm": 0.2160448133945465, "learning_rate": 5.1793629842517466e-05, "loss": 0.1174, "step": 30850 }, { "grad_norm": 0.20638175308704376, "learning_rate": 5.17660894781998e-05, "loss": 0.1203, "step": 30860 }, { "grad_norm": 0.2622026205062866, "learning_rate": 5.173854857739079e-05, "loss": 0.125, "step": 30870 }, { "grad_norm": 0.2907429039478302, "learning_rate": 5.171100714845661e-05, "loss": 0.1238, "step": 30880 }, { "grad_norm": 0.2353971004486084, "learning_rate": 5.1683465199763646e-05, "loss": 0.1145, "step": 30890 }, { "grad_norm": 0.18618972599506378, "learning_rate": 5.16559227396784e-05, "loss": 0.1212, "step": 30900 }, { "grad_norm": 0.19961436092853546, "learning_rate": 5.1628379776567556e-05, "loss": 0.1253, "step": 30910 }, { "grad_norm": 0.20943330228328705, "learning_rate": 5.160083631879792e-05, "loss": 0.1194, "step": 30920 }, { "grad_norm": 0.18532489240169525, "learning_rate": 5.1573292374736484e-05, "loss": 0.1181, "step": 30930 }, { "grad_norm": 0.15048754215240479, "learning_rate": 5.1545747952750356e-05, "loss": 0.1113, "step": 30940 }, { "grad_norm": 0.22451560199260712, "learning_rate": 5.151820306120682e-05, "loss": 0.118, "step": 30950 }, { "grad_norm": 0.21337789297103882, "learning_rate": 5.149065770847328e-05, "loss": 0.1224, "step": 30960 }, { "grad_norm": 0.2265670895576477, "learning_rate": 5.1463111902917297e-05, "loss": 0.1268, "step": 30970 }, { "grad_norm": 0.21419039368629456, "learning_rate": 5.143556565290654e-05, "loss": 0.1203, "step": 30980 }, { "grad_norm": 0.206349715590477, "learning_rate": 5.140801896680882e-05, "loss": 0.1192, "step": 30990 }, { "grad_norm": 0.2164161056280136, "learning_rate": 5.1380471852992144e-05, "loss": 0.1272, "step": 31000 }, { "grad_norm": 0.22689417004585266, "learning_rate": 5.135292431982457e-05, "loss": 0.1249, "step": 31010 }, { "grad_norm": 0.2625069320201874, "learning_rate": 5.1325376375674294e-05, "loss": 0.125, "step": 31020 }, { "grad_norm": 0.2432997077703476, "learning_rate": 5.129782802890968e-05, "loss": 0.13, "step": 31030 }, { "grad_norm": 0.1967928260564804, "learning_rate": 5.127027928789916e-05, "loss": 0.1287, "step": 31040 }, { "grad_norm": 0.24297361075878143, "learning_rate": 5.124273016101135e-05, "loss": 0.1181, "step": 31050 }, { "grad_norm": 0.2306526005268097, "learning_rate": 5.121518065661492e-05, "loss": 0.1199, "step": 31060 }, { "grad_norm": 0.18890652060508728, "learning_rate": 5.11876307830787e-05, "loss": 0.1228, "step": 31070 }, { "grad_norm": 0.22018736600875854, "learning_rate": 5.1160080548771596e-05, "loss": 0.1189, "step": 31080 }, { "grad_norm": 0.20489759743213654, "learning_rate": 5.1132529962062656e-05, "loss": 0.127, "step": 31090 }, { "grad_norm": 0.20079779624938965, "learning_rate": 5.110497903132101e-05, "loss": 0.1155, "step": 31100 }, { "grad_norm": 0.24940215051174164, "learning_rate": 5.107742776491592e-05, "loss": 0.1246, "step": 31110 }, { "grad_norm": 0.17630228400230408, "learning_rate": 5.104987617121673e-05, "loss": 0.1203, "step": 31120 }, { "grad_norm": 0.20683562755584717, "learning_rate": 5.102232425859287e-05, "loss": 0.1177, "step": 31130 }, { "grad_norm": 0.19725115597248077, "learning_rate": 5.09947720354139e-05, "loss": 0.1267, "step": 31140 }, { "grad_norm": 0.2117319107055664, "learning_rate": 5.096721951004942e-05, "loss": 0.1275, "step": 31150 }, { "grad_norm": 0.21523983776569366, "learning_rate": 5.0939666690869227e-05, "loss": 0.115, "step": 31160 }, { "grad_norm": 0.21364252269268036, "learning_rate": 5.0912113586243096e-05, "loss": 0.1123, "step": 31170 }, { "grad_norm": 0.20159941911697388, "learning_rate": 5.0884560204540935e-05, "loss": 0.1109, "step": 31180 }, { "grad_norm": 0.2168731987476349, "learning_rate": 5.0857006554132736e-05, "loss": 0.1151, "step": 31190 }, { "grad_norm": 0.24111998081207275, "learning_rate": 5.0829452643388575e-05, "loss": 0.1206, "step": 31200 }, { "grad_norm": 0.21077078580856323, "learning_rate": 5.08018984806786e-05, "loss": 0.1162, "step": 31210 }, { "grad_norm": 0.23786409199237823, "learning_rate": 5.0774344074373036e-05, "loss": 0.128, "step": 31220 }, { "grad_norm": 0.21342825889587402, "learning_rate": 5.07467894328422e-05, "loss": 0.1103, "step": 31230 }, { "grad_norm": 0.19705861806869507, "learning_rate": 5.0719234564456454e-05, "loss": 0.1209, "step": 31240 }, { "grad_norm": 0.19826461374759674, "learning_rate": 5.0691679477586216e-05, "loss": 0.117, "step": 31250 }, { "grad_norm": 0.19825389981269836, "learning_rate": 5.0664124180602035e-05, "loss": 0.113, "step": 31260 }, { "grad_norm": 0.19681957364082336, "learning_rate": 5.063656868187447e-05, "loss": 0.1224, "step": 31270 }, { "grad_norm": 0.21215976774692535, "learning_rate": 5.060901298977413e-05, "loss": 0.1164, "step": 31280 }, { "grad_norm": 0.22221755981445312, "learning_rate": 5.0581457112671725e-05, "loss": 0.1232, "step": 31290 }, { "grad_norm": 0.22388093173503876, "learning_rate": 5.0553901058938016e-05, "loss": 0.1144, "step": 31300 }, { "grad_norm": 0.19469179213047028, "learning_rate": 5.052634483694377e-05, "loss": 0.1214, "step": 31310 }, { "grad_norm": 0.2627709209918976, "learning_rate": 5.049878845505988e-05, "loss": 0.1204, "step": 31320 }, { "grad_norm": 0.2003667652606964, "learning_rate": 5.047123192165721e-05, "loss": 0.115, "step": 31330 }, { "grad_norm": 0.21537335216999054, "learning_rate": 5.0443675245106735e-05, "loss": 0.1134, "step": 31340 }, { "grad_norm": 0.19619539380073547, "learning_rate": 5.0416118433779426e-05, "loss": 0.1164, "step": 31350 }, { "grad_norm": 0.1886649876832962, "learning_rate": 5.038856149604633e-05, "loss": 0.1202, "step": 31360 }, { "grad_norm": 0.23589736223220825, "learning_rate": 5.03610044402785e-05, "loss": 0.1099, "step": 31370 }, { "grad_norm": 0.19046293199062347, "learning_rate": 5.033344727484707e-05, "loss": 0.1166, "step": 31380 }, { "grad_norm": 0.1809900850057602, "learning_rate": 5.030589000812315e-05, "loss": 0.1206, "step": 31390 }, { "grad_norm": 0.14880792796611786, "learning_rate": 5.027833264847793e-05, "loss": 0.1154, "step": 31400 }, { "grad_norm": 0.2172730714082718, "learning_rate": 5.025077520428258e-05, "loss": 0.1213, "step": 31410 }, { "grad_norm": 0.18151159584522247, "learning_rate": 5.022321768390837e-05, "loss": 0.1106, "step": 31420 }, { "grad_norm": 0.20228742063045502, "learning_rate": 5.0195660095726516e-05, "loss": 0.1095, "step": 31430 }, { "grad_norm": 0.21552778780460358, "learning_rate": 5.016810244810829e-05, "loss": 0.1203, "step": 31440 }, { "grad_norm": 0.19775064289569855, "learning_rate": 5.0140544749424976e-05, "loss": 0.1153, "step": 31450 }, { "grad_norm": 0.20884443819522858, "learning_rate": 5.0112987008047874e-05, "loss": 0.1181, "step": 31460 }, { "grad_norm": 0.22090932726860046, "learning_rate": 5.008542923234831e-05, "loss": 0.118, "step": 31470 }, { "grad_norm": 0.24116460978984833, "learning_rate": 5.00578714306976e-05, "loss": 0.1144, "step": 31480 }, { "grad_norm": 0.22069382667541504, "learning_rate": 5.0030313611467084e-05, "loss": 0.119, "step": 31490 }, { "grad_norm": 0.21169190108776093, "learning_rate": 5.0002755783028074e-05, "loss": 0.1195, "step": 31500 }, { "grad_norm": 0.2946892976760864, "learning_rate": 4.997519795375194e-05, "loss": 0.1244, "step": 31510 }, { "grad_norm": 0.2042141705751419, "learning_rate": 4.9947640132010016e-05, "loss": 0.1136, "step": 31520 }, { "grad_norm": 0.18280360102653503, "learning_rate": 4.9920082326173625e-05, "loss": 0.1218, "step": 31530 }, { "grad_norm": 0.2301081418991089, "learning_rate": 4.9892524544614114e-05, "loss": 0.1197, "step": 31540 }, { "grad_norm": 0.1679537296295166, "learning_rate": 4.986496679570283e-05, "loss": 0.1138, "step": 31550 }, { "grad_norm": 0.2186257541179657, "learning_rate": 4.983740908781105e-05, "loss": 0.1302, "step": 31560 }, { "grad_norm": 0.24775990843772888, "learning_rate": 4.9809851429310116e-05, "loss": 0.128, "step": 31570 }, { "grad_norm": 0.19715681672096252, "learning_rate": 4.9782293828571275e-05, "loss": 0.121, "step": 31580 }, { "grad_norm": 0.2466265857219696, "learning_rate": 4.9754736293965846e-05, "loss": 0.1146, "step": 31590 }, { "grad_norm": 0.2475796788930893, "learning_rate": 4.972717883386502e-05, "loss": 0.1286, "step": 31600 }, { "grad_norm": 0.19298379123210907, "learning_rate": 4.9699621456640075e-05, "loss": 0.1172, "step": 31610 }, { "grad_norm": 0.19948694109916687, "learning_rate": 4.9672064170662214e-05, "loss": 0.11, "step": 31620 }, { "grad_norm": 0.2198144495487213, "learning_rate": 4.9644506984302583e-05, "loss": 0.117, "step": 31630 }, { "grad_norm": 0.2028421312570572, "learning_rate": 4.9616949905932356e-05, "loss": 0.1151, "step": 31640 }, { "grad_norm": 0.21118487417697906, "learning_rate": 4.9589392943922615e-05, "loss": 0.1121, "step": 31650 }, { "grad_norm": 0.24528270959854126, "learning_rate": 4.956183610664447e-05, "loss": 0.1109, "step": 31660 }, { "grad_norm": 0.20099429786205292, "learning_rate": 4.9534279402468945e-05, "loss": 0.1263, "step": 31670 }, { "grad_norm": 0.1914747804403305, "learning_rate": 4.9506722839767036e-05, "loss": 0.1068, "step": 31680 }, { "grad_norm": 0.23051980137825012, "learning_rate": 4.947916642690972e-05, "loss": 0.1296, "step": 31690 }, { "grad_norm": 0.2108468860387802, "learning_rate": 4.9451610172267874e-05, "loss": 0.1158, "step": 31700 }, { "grad_norm": 0.21195295453071594, "learning_rate": 4.9424054084212376e-05, "loss": 0.1159, "step": 31710 }, { "grad_norm": 0.24245811998844147, "learning_rate": 4.939649817111407e-05, "loss": 0.1239, "step": 31720 }, { "grad_norm": 0.23719251155853271, "learning_rate": 4.936894244134365e-05, "loss": 0.1264, "step": 31730 }, { "grad_norm": 0.19413723051548004, "learning_rate": 4.9341386903271886e-05, "loss": 0.1205, "step": 31740 }, { "grad_norm": 0.19379419088363647, "learning_rate": 4.931383156526936e-05, "loss": 0.117, "step": 31750 }, { "grad_norm": 0.16127143800258636, "learning_rate": 4.92862764357067e-05, "loss": 0.1191, "step": 31760 }, { "grad_norm": 0.18557336926460266, "learning_rate": 4.925872152295443e-05, "loss": 0.1214, "step": 31770 }, { "grad_norm": 0.22130300104618073, "learning_rate": 4.923116683538296e-05, "loss": 0.1322, "step": 31780 }, { "grad_norm": 0.22922533750534058, "learning_rate": 4.920361238136273e-05, "loss": 0.1157, "step": 31790 }, { "grad_norm": 0.20933228731155396, "learning_rate": 4.9176058169264014e-05, "loss": 0.1167, "step": 31800 }, { "grad_norm": 0.2057158201932907, "learning_rate": 4.9148504207457074e-05, "loss": 0.1102, "step": 31810 }, { "grad_norm": 0.20369075238704681, "learning_rate": 4.912095050431208e-05, "loss": 0.1284, "step": 31820 }, { "grad_norm": 0.21493232250213623, "learning_rate": 4.909339706819911e-05, "loss": 0.1194, "step": 31830 }, { "grad_norm": 0.22029545903205872, "learning_rate": 4.906584390748819e-05, "loss": 0.1216, "step": 31840 }, { "grad_norm": 0.23769105970859528, "learning_rate": 4.9038291030549195e-05, "loss": 0.1172, "step": 31850 }, { "grad_norm": 0.2507826089859009, "learning_rate": 4.9010738445751995e-05, "loss": 0.1167, "step": 31860 }, { "grad_norm": 0.2510761320590973, "learning_rate": 4.8983186161466364e-05, "loss": 0.1134, "step": 31870 }, { "grad_norm": 0.19509382545948029, "learning_rate": 4.89556341860619e-05, "loss": 0.1145, "step": 31880 }, { "grad_norm": 0.18770702183246613, "learning_rate": 4.892808252790822e-05, "loss": 0.1135, "step": 31890 }, { "grad_norm": 0.2049817144870758, "learning_rate": 4.890053119537475e-05, "loss": 0.1268, "step": 31900 }, { "grad_norm": 0.19522236287593842, "learning_rate": 4.887298019683087e-05, "loss": 0.114, "step": 31910 }, { "grad_norm": 0.1706932783126831, "learning_rate": 4.884542954064587e-05, "loss": 0.1122, "step": 31920 }, { "grad_norm": 0.23628833889961243, "learning_rate": 4.881787923518887e-05, "loss": 0.1134, "step": 31930 }, { "grad_norm": 0.21993593871593475, "learning_rate": 4.879032928882896e-05, "loss": 0.1219, "step": 31940 }, { "grad_norm": 0.21007151901721954, "learning_rate": 4.876277970993505e-05, "loss": 0.1158, "step": 31950 }, { "grad_norm": 0.2285635620355606, "learning_rate": 4.873523050687602e-05, "loss": 0.1267, "step": 31960 }, { "grad_norm": 0.19200292229652405, "learning_rate": 4.870768168802056e-05, "loss": 0.1152, "step": 31970 }, { "grad_norm": 0.17674019932746887, "learning_rate": 4.868013326173728e-05, "loss": 0.1193, "step": 31980 }, { "grad_norm": 0.24576689302921295, "learning_rate": 4.865258523639468e-05, "loss": 0.1177, "step": 31990 }, { "grad_norm": 0.19111713767051697, "learning_rate": 4.862503762036109e-05, "loss": 0.1172, "step": 32000 }, { "grad_norm": 0.24754326045513153, "learning_rate": 4.859749042200478e-05, "loss": 0.1092, "step": 32010 }, { "grad_norm": 0.22587986290454865, "learning_rate": 4.856994364969384e-05, "loss": 0.1186, "step": 32020 }, { "grad_norm": 0.23668111860752106, "learning_rate": 4.854239731179625e-05, "loss": 0.1188, "step": 32030 }, { "grad_norm": 0.19726833701133728, "learning_rate": 4.85148514166799e-05, "loss": 0.1236, "step": 32040 }, { "grad_norm": 0.17754985392093658, "learning_rate": 4.8487305972712456e-05, "loss": 0.1103, "step": 32050 }, { "grad_norm": 0.20213748514652252, "learning_rate": 4.8459760988261526e-05, "loss": 0.113, "step": 32060 }, { "grad_norm": 0.23308934271335602, "learning_rate": 4.843221647169453e-05, "loss": 0.1164, "step": 32070 }, { "grad_norm": 0.21093882620334625, "learning_rate": 4.840467243137878e-05, "loss": 0.1257, "step": 32080 }, { "grad_norm": 0.2476472109556198, "learning_rate": 4.837712887568143e-05, "loss": 0.1235, "step": 32090 }, { "grad_norm": 0.24376478791236877, "learning_rate": 4.8349585812969464e-05, "loss": 0.1198, "step": 32100 }, { "grad_norm": 0.19165101647377014, "learning_rate": 4.8322043251609775e-05, "loss": 0.1165, "step": 32110 }, { "grad_norm": 0.21269957721233368, "learning_rate": 4.8294501199969015e-05, "loss": 0.1195, "step": 32120 }, { "grad_norm": 0.18020612001419067, "learning_rate": 4.826695966641376e-05, "loss": 0.1203, "step": 32130 }, { "grad_norm": 0.17330142855644226, "learning_rate": 4.823941865931043e-05, "loss": 0.129, "step": 32140 }, { "grad_norm": 0.22186042368412018, "learning_rate": 4.82118781870252e-05, "loss": 0.1213, "step": 32150 }, { "grad_norm": 0.19089359045028687, "learning_rate": 4.8184338257924185e-05, "loss": 0.1253, "step": 32160 }, { "grad_norm": 0.2117050141096115, "learning_rate": 4.815679888037324e-05, "loss": 0.1269, "step": 32170 }, { "grad_norm": 0.19223350286483765, "learning_rate": 4.8129260062738135e-05, "loss": 0.1237, "step": 32180 }, { "grad_norm": 0.228929340839386, "learning_rate": 4.810172181338445e-05, "loss": 0.1177, "step": 32190 }, { "grad_norm": 0.2448403239250183, "learning_rate": 4.807418414067753e-05, "loss": 0.1113, "step": 32200 }, { "grad_norm": 0.2311154156923294, "learning_rate": 4.804664705298264e-05, "loss": 0.1264, "step": 32210 }, { "grad_norm": 0.20293332636356354, "learning_rate": 4.80191105586648e-05, "loss": 0.1211, "step": 32220 }, { "grad_norm": 0.17282968759536743, "learning_rate": 4.799157466608886e-05, "loss": 0.1184, "step": 32230 }, { "grad_norm": 0.19488374888896942, "learning_rate": 4.796403938361951e-05, "loss": 0.1183, "step": 32240 }, { "grad_norm": 0.16175056993961334, "learning_rate": 4.793650471962123e-05, "loss": 0.1196, "step": 32250 }, { "grad_norm": 0.215452641248703, "learning_rate": 4.790897068245835e-05, "loss": 0.1146, "step": 32260 }, { "grad_norm": 0.18948015570640564, "learning_rate": 4.7881437280494954e-05, "loss": 0.1182, "step": 32270 }, { "grad_norm": 0.1864880472421646, "learning_rate": 4.7853904522094965e-05, "loss": 0.1202, "step": 32280 }, { "grad_norm": 0.25769054889678955, "learning_rate": 4.782637241562215e-05, "loss": 0.1204, "step": 32290 }, { "grad_norm": 0.2210136502981186, "learning_rate": 4.779884096943997e-05, "loss": 0.1155, "step": 32300 }, { "grad_norm": 0.22903332114219666, "learning_rate": 4.777131019191182e-05, "loss": 0.1209, "step": 32310 }, { "grad_norm": 0.25719061493873596, "learning_rate": 4.774378009140076e-05, "loss": 0.1207, "step": 32320 }, { "grad_norm": 0.22237235307693481, "learning_rate": 4.7716250676269735e-05, "loss": 0.1164, "step": 32330 }, { "grad_norm": 0.2646999657154083, "learning_rate": 4.7688721954881485e-05, "loss": 0.1284, "step": 32340 }, { "grad_norm": 0.21037472784519196, "learning_rate": 4.7661193935598446e-05, "loss": 0.1166, "step": 32350 }, { "grad_norm": 0.2095896303653717, "learning_rate": 4.763366662678296e-05, "loss": 0.1199, "step": 32360 }, { "grad_norm": 0.21783846616744995, "learning_rate": 4.7606140036797064e-05, "loss": 0.119, "step": 32370 }, { "grad_norm": 0.21147842705249786, "learning_rate": 4.7578614174002614e-05, "loss": 0.1187, "step": 32380 }, { "grad_norm": 0.22294043004512787, "learning_rate": 4.755108904676125e-05, "loss": 0.125, "step": 32390 }, { "grad_norm": 0.21552395820617676, "learning_rate": 4.752356466343436e-05, "loss": 0.1181, "step": 32400 }, { "grad_norm": 0.24064937233924866, "learning_rate": 4.7496041032383174e-05, "loss": 0.1233, "step": 32410 }, { "grad_norm": 0.23475737869739532, "learning_rate": 4.746851816196858e-05, "loss": 0.1224, "step": 32420 }, { "grad_norm": 0.20939555764198303, "learning_rate": 4.744099606055135e-05, "loss": 0.1183, "step": 32430 }, { "grad_norm": 0.24928498268127441, "learning_rate": 4.741347473649193e-05, "loss": 0.1328, "step": 32440 }, { "grad_norm": 0.16189630329608917, "learning_rate": 4.738595419815058e-05, "loss": 0.11, "step": 32450 }, { "grad_norm": 0.19257497787475586, "learning_rate": 4.7358434453887365e-05, "loss": 0.1192, "step": 32460 }, { "grad_norm": 0.1366671770811081, "learning_rate": 4.7330915512061976e-05, "loss": 0.1197, "step": 32470 }, { "grad_norm": 0.1911262720823288, "learning_rate": 4.730339738103402e-05, "loss": 0.1116, "step": 32480 }, { "grad_norm": 0.2727947533130646, "learning_rate": 4.727588006916271e-05, "loss": 0.1255, "step": 32490 }, { "grad_norm": 0.24504177272319794, "learning_rate": 4.724836358480711e-05, "loss": 0.117, "step": 32500 }, { "grad_norm": 0.2168380618095398, "learning_rate": 4.722084793632601e-05, "loss": 0.1241, "step": 32510 }, { "grad_norm": 0.2365248203277588, "learning_rate": 4.719333313207792e-05, "loss": 0.1162, "step": 32520 }, { "grad_norm": 0.19388282299041748, "learning_rate": 4.716581918042114e-05, "loss": 0.1144, "step": 32530 }, { "grad_norm": 0.22355014085769653, "learning_rate": 4.7138306089713636e-05, "loss": 0.1266, "step": 32540 }, { "grad_norm": 0.16998662054538727, "learning_rate": 4.7110793868313183e-05, "loss": 0.1186, "step": 32550 }, { "grad_norm": 0.20774511992931366, "learning_rate": 4.708328252457729e-05, "loss": 0.1181, "step": 32560 }, { "grad_norm": 0.20144498348236084, "learning_rate": 4.7055772066863135e-05, "loss": 0.1211, "step": 32570 }, { "grad_norm": 0.18966269493103027, "learning_rate": 4.702826250352771e-05, "loss": 0.1342, "step": 32580 }, { "grad_norm": 0.2689021825790405, "learning_rate": 4.7000753842927653e-05, "loss": 0.1209, "step": 32590 }, { "grad_norm": 0.22579285502433777, "learning_rate": 4.6973246093419384e-05, "loss": 0.1243, "step": 32600 }, { "grad_norm": 0.19995740056037903, "learning_rate": 4.694573926335906e-05, "loss": 0.11, "step": 32610 }, { "grad_norm": 0.22311878204345703, "learning_rate": 4.6918233361102476e-05, "loss": 0.1161, "step": 32620 }, { "grad_norm": 0.17888659238815308, "learning_rate": 4.689072839500525e-05, "loss": 0.1155, "step": 32630 }, { "grad_norm": 0.23771390318870544, "learning_rate": 4.6863224373422635e-05, "loss": 0.123, "step": 32640 }, { "grad_norm": 0.19851325452327728, "learning_rate": 4.683572130470962e-05, "loss": 0.119, "step": 32650 }, { "grad_norm": 0.17156539857387543, "learning_rate": 4.680821919722094e-05, "loss": 0.1125, "step": 32660 }, { "grad_norm": 0.18351486325263977, "learning_rate": 4.6780718059310975e-05, "loss": 0.1233, "step": 32670 }, { "grad_norm": 0.19587408006191254, "learning_rate": 4.675321789933389e-05, "loss": 0.124, "step": 32680 }, { "grad_norm": 0.18747563660144806, "learning_rate": 4.6725718725643464e-05, "loss": 0.1141, "step": 32690 }, { "grad_norm": 0.18614698946475983, "learning_rate": 4.669822054659323e-05, "loss": 0.1202, "step": 32700 }, { "grad_norm": 0.20324373245239258, "learning_rate": 4.667072337053644e-05, "loss": 0.1281, "step": 32710 }, { "grad_norm": 0.20687054097652435, "learning_rate": 4.6643227205825965e-05, "loss": 0.1181, "step": 32720 }, { "grad_norm": 0.19831669330596924, "learning_rate": 4.6615732060814454e-05, "loss": 0.119, "step": 32730 }, { "grad_norm": 0.1512058973312378, "learning_rate": 4.658823794385417e-05, "loss": 0.1099, "step": 32740 }, { "grad_norm": 0.17499616742134094, "learning_rate": 4.6560744863297115e-05, "loss": 0.1176, "step": 32750 }, { "grad_norm": 0.2190309464931488, "learning_rate": 4.653325282749498e-05, "loss": 0.1146, "step": 32760 }, { "grad_norm": 0.2478158175945282, "learning_rate": 4.6505761844799075e-05, "loss": 0.122, "step": 32770 }, { "grad_norm": 0.26514264941215515, "learning_rate": 4.647827192356048e-05, "loss": 0.1153, "step": 32780 }, { "grad_norm": 0.1736721247434616, "learning_rate": 4.645078307212989e-05, "loss": 0.1098, "step": 32790 }, { "grad_norm": 0.20756173133850098, "learning_rate": 4.642329529885768e-05, "loss": 0.1197, "step": 32800 }, { "grad_norm": 0.2119571566581726, "learning_rate": 4.639580861209393e-05, "loss": 0.1242, "step": 32810 }, { "grad_norm": 0.19991528987884521, "learning_rate": 4.636832302018835e-05, "loss": 0.1118, "step": 32820 }, { "grad_norm": 0.21199965476989746, "learning_rate": 4.6340838531490365e-05, "loss": 0.1204, "step": 32830 }, { "grad_norm": 0.18234802782535553, "learning_rate": 4.6313355154349e-05, "loss": 0.1174, "step": 32840 }, { "grad_norm": 0.21490556001663208, "learning_rate": 4.6285872897113025e-05, "loss": 0.1177, "step": 32850 }, { "grad_norm": 0.19468538463115692, "learning_rate": 4.625839176813077e-05, "loss": 0.111, "step": 32860 }, { "grad_norm": 0.19110889732837677, "learning_rate": 4.623091177575031e-05, "loss": 0.1223, "step": 32870 }, { "grad_norm": 0.18988056480884552, "learning_rate": 4.620343292831936e-05, "loss": 0.1176, "step": 32880 }, { "grad_norm": 0.18755333125591278, "learning_rate": 4.6175955234185206e-05, "loss": 0.117, "step": 32890 }, { "grad_norm": 0.17948855459690094, "learning_rate": 4.614847870169492e-05, "loss": 0.1231, "step": 32900 }, { "grad_norm": 0.17578397691249847, "learning_rate": 4.612100333919509e-05, "loss": 0.1107, "step": 32910 }, { "grad_norm": 0.17486050724983215, "learning_rate": 4.609352915503202e-05, "loss": 0.1074, "step": 32920 }, { "grad_norm": 0.17459507286548615, "learning_rate": 4.606605615755166e-05, "loss": 0.1151, "step": 32930 }, { "grad_norm": 0.20035122334957123, "learning_rate": 4.6038584355099576e-05, "loss": 0.1166, "step": 32940 }, { "grad_norm": 0.22566702961921692, "learning_rate": 4.6011113756020964e-05, "loss": 0.1177, "step": 32950 }, { "grad_norm": 0.24275752902030945, "learning_rate": 4.598364436866066e-05, "loss": 0.1204, "step": 32960 }, { "grad_norm": 0.1763123869895935, "learning_rate": 4.595617620136316e-05, "loss": 0.1213, "step": 32970 }, { "grad_norm": 0.20311994850635529, "learning_rate": 4.592870926247257e-05, "loss": 0.1189, "step": 32980 }, { "grad_norm": 0.18918848037719727, "learning_rate": 4.5901243560332594e-05, "loss": 0.1082, "step": 32990 }, { "grad_norm": 0.15612374246120453, "learning_rate": 4.587377910328662e-05, "loss": 0.1133, "step": 33000 }, { "grad_norm": 0.1980387270450592, "learning_rate": 4.5846315899677586e-05, "loss": 0.1175, "step": 33010 }, { "grad_norm": 0.22397129237651825, "learning_rate": 4.5818853957848114e-05, "loss": 0.1127, "step": 33020 }, { "grad_norm": 0.2794690728187561, "learning_rate": 4.579139328614043e-05, "loss": 0.1173, "step": 33030 }, { "grad_norm": 0.20115213096141815, "learning_rate": 4.576393389289633e-05, "loss": 0.1126, "step": 33040 }, { "grad_norm": 0.25092387199401855, "learning_rate": 4.573647578645728e-05, "loss": 0.1249, "step": 33050 }, { "grad_norm": 0.23504510521888733, "learning_rate": 4.57090189751643e-05, "loss": 0.1184, "step": 33060 }, { "grad_norm": 0.22279003262519836, "learning_rate": 4.568156346735806e-05, "loss": 0.113, "step": 33070 }, { "grad_norm": 0.19269855320453644, "learning_rate": 4.565410927137882e-05, "loss": 0.1168, "step": 33080 }, { "grad_norm": 0.15465544164180756, "learning_rate": 4.562665639556644e-05, "loss": 0.1202, "step": 33090 }, { "grad_norm": 0.1597319096326828, "learning_rate": 4.559920484826037e-05, "loss": 0.1122, "step": 33100 }, { "grad_norm": 0.20396935939788818, "learning_rate": 4.5571754637799665e-05, "loss": 0.1183, "step": 33110 }, { "grad_norm": 0.17625358700752258, "learning_rate": 4.554430577252298e-05, "loss": 0.1146, "step": 33120 }, { "grad_norm": 0.23707254230976105, "learning_rate": 4.551685826076858e-05, "loss": 0.1134, "step": 33130 }, { "grad_norm": 0.22089916467666626, "learning_rate": 4.5489412110874246e-05, "loss": 0.1209, "step": 33140 }, { "grad_norm": 0.24471086263656616, "learning_rate": 4.5461967331177444e-05, "loss": 0.1227, "step": 33150 }, { "grad_norm": 0.22095410525798798, "learning_rate": 4.5434523930015115e-05, "loss": 0.1169, "step": 33160 }, { "grad_norm": 0.19754904508590698, "learning_rate": 4.540708191572388e-05, "loss": 0.121, "step": 33170 }, { "grad_norm": 0.22067345678806305, "learning_rate": 4.537964129663991e-05, "loss": 0.1203, "step": 33180 }, { "grad_norm": 0.2602653503417969, "learning_rate": 4.535220208109889e-05, "loss": 0.1227, "step": 33190 }, { "grad_norm": 0.22397416830062866, "learning_rate": 4.5324764277436194e-05, "loss": 0.1244, "step": 33200 }, { "grad_norm": 0.2186998873949051, "learning_rate": 4.529732789398664e-05, "loss": 0.1183, "step": 33210 }, { "grad_norm": 0.19185446202754974, "learning_rate": 4.526989293908472e-05, "loss": 0.1155, "step": 33220 }, { "grad_norm": 0.19249266386032104, "learning_rate": 4.524245942106442e-05, "loss": 0.1177, "step": 33230 }, { "grad_norm": 0.2180725336074829, "learning_rate": 4.5215027348259345e-05, "loss": 0.1208, "step": 33240 }, { "grad_norm": 0.1934894621372223, "learning_rate": 4.5187596729002616e-05, "loss": 0.1201, "step": 33250 }, { "grad_norm": 0.21772699058055878, "learning_rate": 4.516016757162693e-05, "loss": 0.118, "step": 33260 }, { "grad_norm": 0.4656614363193512, "learning_rate": 4.513273988446457e-05, "loss": 0.1232, "step": 33270 }, { "grad_norm": 0.3429129123687744, "learning_rate": 4.5105313675847296e-05, "loss": 0.1261, "step": 33280 }, { "grad_norm": 0.21528910100460052, "learning_rate": 4.5077888954106495e-05, "loss": 0.1141, "step": 33290 }, { "grad_norm": 0.215675488114357, "learning_rate": 4.505046572757309e-05, "loss": 0.1198, "step": 33300 }, { "grad_norm": 0.17360037565231323, "learning_rate": 4.502304400457749e-05, "loss": 0.1145, "step": 33310 }, { "grad_norm": 0.23837922513484955, "learning_rate": 4.499562379344973e-05, "loss": 0.1249, "step": 33320 }, { "grad_norm": 0.1970413774251938, "learning_rate": 4.4968205102519306e-05, "loss": 0.1247, "step": 33330 }, { "grad_norm": 0.23151344060897827, "learning_rate": 4.494078794011532e-05, "loss": 0.1213, "step": 33340 }, { "grad_norm": 0.18657658994197845, "learning_rate": 4.491337231456639e-05, "loss": 0.1167, "step": 33350 }, { "grad_norm": 0.19981099665164948, "learning_rate": 4.4885958234200634e-05, "loss": 0.1179, "step": 33360 }, { "grad_norm": 0.2424406111240387, "learning_rate": 4.485854570734575e-05, "loss": 0.1283, "step": 33370 }, { "grad_norm": 0.1888858675956726, "learning_rate": 4.483113474232891e-05, "loss": 0.1158, "step": 33380 }, { "grad_norm": 0.22055764496326447, "learning_rate": 4.480372534747688e-05, "loss": 0.1225, "step": 33390 }, { "grad_norm": 0.20332780480384827, "learning_rate": 4.477631753111588e-05, "loss": 0.1135, "step": 33400 }, { "grad_norm": 0.17974083125591278, "learning_rate": 4.4748911301571686e-05, "loss": 0.1153, "step": 33410 }, { "grad_norm": 0.17443206906318665, "learning_rate": 4.472150666716961e-05, "loss": 0.1142, "step": 33420 }, { "grad_norm": 0.2190767079591751, "learning_rate": 4.469410363623442e-05, "loss": 0.1141, "step": 33430 }, { "grad_norm": 0.18441133201122284, "learning_rate": 4.466670221709044e-05, "loss": 0.1126, "step": 33440 }, { "grad_norm": 0.1695360243320465, "learning_rate": 4.463930241806154e-05, "loss": 0.1108, "step": 33450 }, { "grad_norm": 0.2521215081214905, "learning_rate": 4.4611904247471006e-05, "loss": 0.1263, "step": 33460 }, { "grad_norm": 0.18633954226970673, "learning_rate": 4.458450771364171e-05, "loss": 0.118, "step": 33470 }, { "grad_norm": 0.18785515427589417, "learning_rate": 4.4557112824895965e-05, "loss": 0.112, "step": 33480 }, { "grad_norm": 0.22168222069740295, "learning_rate": 4.452971958955563e-05, "loss": 0.1181, "step": 33490 }, { "grad_norm": 0.17998717725276947, "learning_rate": 4.450232801594208e-05, "loss": 0.1188, "step": 33500 }, { "grad_norm": 0.1858266443014145, "learning_rate": 4.447493811237609e-05, "loss": 0.1199, "step": 33510 }, { "grad_norm": 0.20881007611751556, "learning_rate": 4.444754988717804e-05, "loss": 0.1184, "step": 33520 }, { "grad_norm": 0.20516306161880493, "learning_rate": 4.442016334866771e-05, "loss": 0.119, "step": 33530 }, { "grad_norm": 0.203963041305542, "learning_rate": 4.4392778505164445e-05, "loss": 0.1156, "step": 33540 }, { "grad_norm": 0.22063502669334412, "learning_rate": 4.436539536498702e-05, "loss": 0.1157, "step": 33550 }, { "grad_norm": 0.22078779339790344, "learning_rate": 4.433801393645369e-05, "loss": 0.1188, "step": 33560 }, { "grad_norm": 0.16627374291419983, "learning_rate": 4.431063422788226e-05, "loss": 0.1143, "step": 33570 }, { "grad_norm": 0.22102345526218414, "learning_rate": 4.428325624758991e-05, "loss": 0.1146, "step": 33580 }, { "grad_norm": 0.23022833466529846, "learning_rate": 4.4255880003893366e-05, "loss": 0.125, "step": 33590 }, { "grad_norm": 0.21258644759655, "learning_rate": 4.422850550510884e-05, "loss": 0.118, "step": 33600 }, { "grad_norm": 0.21827183663845062, "learning_rate": 4.4201132759551934e-05, "loss": 0.1179, "step": 33610 }, { "grad_norm": 0.2147085815668106, "learning_rate": 4.4173761775537804e-05, "loss": 0.1238, "step": 33620 }, { "grad_norm": 0.18078747391700745, "learning_rate": 4.414639256138099e-05, "loss": 0.1164, "step": 33630 }, { "grad_norm": 0.18969418108463287, "learning_rate": 4.411902512539557e-05, "loss": 0.109, "step": 33640 }, { "grad_norm": 0.20207086205482483, "learning_rate": 4.4091659475895044e-05, "loss": 0.1242, "step": 33650 }, { "grad_norm": 0.2310345619916916, "learning_rate": 4.406429562119235e-05, "loss": 0.1134, "step": 33660 }, { "grad_norm": 0.19394497573375702, "learning_rate": 4.4036933569599945e-05, "loss": 0.1207, "step": 33670 }, { "grad_norm": 0.24527059495449066, "learning_rate": 4.400957332942965e-05, "loss": 0.1101, "step": 33680 }, { "grad_norm": 0.19702480733394623, "learning_rate": 4.3982214908992844e-05, "loss": 0.1164, "step": 33690 }, { "grad_norm": 0.21617580950260162, "learning_rate": 4.3954858316600235e-05, "loss": 0.1183, "step": 33700 }, { "grad_norm": 0.19659718871116638, "learning_rate": 4.392750356056205e-05, "loss": 0.1149, "step": 33710 }, { "grad_norm": 0.18802252411842346, "learning_rate": 4.390015064918798e-05, "loss": 0.1164, "step": 33720 }, { "grad_norm": 0.16999226808547974, "learning_rate": 4.387279959078705e-05, "loss": 0.1131, "step": 33730 }, { "grad_norm": 0.1626933217048645, "learning_rate": 4.384545039366786e-05, "loss": 0.1121, "step": 33740 }, { "grad_norm": 0.18720483779907227, "learning_rate": 4.381810306613831e-05, "loss": 0.1177, "step": 33750 }, { "grad_norm": 0.19014392793178558, "learning_rate": 4.3790757616505826e-05, "loss": 0.127, "step": 33760 }, { "grad_norm": 0.2385985106229782, "learning_rate": 4.376341405307725e-05, "loss": 0.1148, "step": 33770 }, { "grad_norm": 0.19556112587451935, "learning_rate": 4.37360723841588e-05, "loss": 0.1167, "step": 33780 }, { "grad_norm": 0.18848177790641785, "learning_rate": 4.370873261805619e-05, "loss": 0.1132, "step": 33790 }, { "grad_norm": 0.19012890756130219, "learning_rate": 4.368139476307449e-05, "loss": 0.12, "step": 33800 }, { "grad_norm": 0.18686304986476898, "learning_rate": 4.365405882751822e-05, "loss": 0.1174, "step": 33810 }, { "grad_norm": 0.2258925437927246, "learning_rate": 4.3626724819691326e-05, "loss": 0.1169, "step": 33820 }, { "grad_norm": 0.2211323231458664, "learning_rate": 4.359939274789715e-05, "loss": 0.1147, "step": 33830 }, { "grad_norm": 0.2057323008775711, "learning_rate": 4.357206262043848e-05, "loss": 0.1198, "step": 33840 }, { "grad_norm": 0.2338549941778183, "learning_rate": 4.354473444561745e-05, "loss": 0.1148, "step": 33850 }, { "grad_norm": 0.15032079815864563, "learning_rate": 4.3517408231735644e-05, "loss": 0.1091, "step": 33860 }, { "grad_norm": 0.1658288538455963, "learning_rate": 4.3490083987094086e-05, "loss": 0.1111, "step": 33870 }, { "grad_norm": 0.20969781279563904, "learning_rate": 4.34627617199931e-05, "loss": 0.1161, "step": 33880 }, { "grad_norm": 0.1503712683916092, "learning_rate": 4.3435441438732526e-05, "loss": 0.1165, "step": 33890 }, { "grad_norm": 0.19762758910655975, "learning_rate": 4.340812315161149e-05, "loss": 0.1228, "step": 33900 }, { "grad_norm": 0.14812414348125458, "learning_rate": 4.338080686692859e-05, "loss": 0.1137, "step": 33910 }, { "grad_norm": 0.19364812970161438, "learning_rate": 4.3353492592981816e-05, "loss": 0.1158, "step": 33920 }, { "grad_norm": 0.15559791028499603, "learning_rate": 4.3326180338068485e-05, "loss": 0.1125, "step": 33930 }, { "grad_norm": 0.17042995989322662, "learning_rate": 4.3298870110485356e-05, "loss": 0.1131, "step": 33940 }, { "grad_norm": 0.20994487404823303, "learning_rate": 4.3271561918528567e-05, "loss": 0.1228, "step": 33950 }, { "grad_norm": 0.15954290330410004, "learning_rate": 4.324425577049359e-05, "loss": 0.116, "step": 33960 }, { "grad_norm": 0.2193266749382019, "learning_rate": 4.321695167467535e-05, "loss": 0.1225, "step": 33970 }, { "grad_norm": 0.23271112143993378, "learning_rate": 4.3189649639368093e-05, "loss": 0.1203, "step": 33980 }, { "grad_norm": 0.17527589201927185, "learning_rate": 4.316234967286547e-05, "loss": 0.1135, "step": 33990 }, { "grad_norm": 0.19740746915340424, "learning_rate": 4.313505178346046e-05, "loss": 0.1198, "step": 34000 }, { "grad_norm": 0.1984902173280716, "learning_rate": 4.3107755979445465e-05, "loss": 0.1172, "step": 34010 }, { "grad_norm": 0.22943750023841858, "learning_rate": 4.308046226911224e-05, "loss": 0.1163, "step": 34020 }, { "grad_norm": 0.1945953667163849, "learning_rate": 4.305317066075185e-05, "loss": 0.1192, "step": 34030 }, { "grad_norm": 0.21569690108299255, "learning_rate": 4.302588116265482e-05, "loss": 0.105, "step": 34040 }, { "grad_norm": 0.20343852043151855, "learning_rate": 4.299859378311094e-05, "loss": 0.1189, "step": 34050 }, { "grad_norm": 0.21104349195957184, "learning_rate": 4.2971308530409424e-05, "loss": 0.1161, "step": 34060 }, { "grad_norm": 0.19615431129932404, "learning_rate": 4.2944025412838765e-05, "loss": 0.1191, "step": 34070 }, { "grad_norm": 0.17453227937221527, "learning_rate": 4.291674443868689e-05, "loss": 0.1162, "step": 34080 }, { "grad_norm": 0.18343842029571533, "learning_rate": 4.288946561624104e-05, "loss": 0.1246, "step": 34090 }, { "grad_norm": 0.20943211019039154, "learning_rate": 4.2862188953787794e-05, "loss": 0.113, "step": 34100 }, { "grad_norm": 0.19970297813415527, "learning_rate": 4.283491445961308e-05, "loss": 0.1216, "step": 34110 }, { "grad_norm": 0.20956693589687347, "learning_rate": 4.2807642142002155e-05, "loss": 0.1158, "step": 34120 }, { "grad_norm": 0.17966735363006592, "learning_rate": 4.278037200923966e-05, "loss": 0.1087, "step": 34130 }, { "grad_norm": 0.16040270030498505, "learning_rate": 4.275310406960953e-05, "loss": 0.1114, "step": 34140 }, { "grad_norm": 0.2014777809381485, "learning_rate": 4.272583833139502e-05, "loss": 0.1229, "step": 34150 }, { "grad_norm": 0.1828417032957077, "learning_rate": 4.2698574802878794e-05, "loss": 0.1159, "step": 34160 }, { "grad_norm": 0.19892242550849915, "learning_rate": 4.2671313492342734e-05, "loss": 0.1194, "step": 34170 }, { "grad_norm": 0.15223902463912964, "learning_rate": 4.264405440806813e-05, "loss": 0.1134, "step": 34180 }, { "grad_norm": 0.17178572714328766, "learning_rate": 4.26167975583356e-05, "loss": 0.114, "step": 34190 }, { "grad_norm": 0.15885105729103088, "learning_rate": 4.2589542951425e-05, "loss": 0.1125, "step": 34200 }, { "grad_norm": 0.20659714937210083, "learning_rate": 4.2562290595615615e-05, "loss": 0.1271, "step": 34210 }, { "grad_norm": 0.17578734457492828, "learning_rate": 4.2535040499185946e-05, "loss": 0.1232, "step": 34220 }, { "grad_norm": 0.25263461470603943, "learning_rate": 4.250779267041387e-05, "loss": 0.1116, "step": 34230 }, { "grad_norm": 0.21834948658943176, "learning_rate": 4.248054711757657e-05, "loss": 0.1099, "step": 34240 }, { "grad_norm": 0.18782414495944977, "learning_rate": 4.245330384895052e-05, "loss": 0.1183, "step": 34250 }, { "grad_norm": 0.21088306605815887, "learning_rate": 4.242606287281151e-05, "loss": 0.1177, "step": 34260 }, { "grad_norm": 0.17884500324726105, "learning_rate": 4.2398824197434595e-05, "loss": 0.1197, "step": 34270 }, { "grad_norm": 0.20790398120880127, "learning_rate": 4.23715878310942e-05, "loss": 0.1176, "step": 34280 }, { "grad_norm": 0.179665207862854, "learning_rate": 4.234435378206402e-05, "loss": 0.1196, "step": 34290 }, { "grad_norm": 0.1620020568370819, "learning_rate": 4.2317122058617006e-05, "loss": 0.1105, "step": 34300 }, { "grad_norm": 0.1947091817855835, "learning_rate": 4.2289892669025485e-05, "loss": 0.1079, "step": 34310 }, { "grad_norm": 0.18927840888500214, "learning_rate": 4.226266562156097e-05, "loss": 0.1136, "step": 34320 }, { "grad_norm": 0.20055967569351196, "learning_rate": 4.223544092449435e-05, "loss": 0.1158, "step": 34330 }, { "grad_norm": 0.18008123338222504, "learning_rate": 4.2208218586095784e-05, "loss": 0.1164, "step": 34340 }, { "grad_norm": 0.18124252557754517, "learning_rate": 4.218099861463466e-05, "loss": 0.1235, "step": 34350 }, { "grad_norm": 0.1858607530593872, "learning_rate": 4.215378101837972e-05, "loss": 0.1259, "step": 34360 }, { "grad_norm": 0.1693916916847229, "learning_rate": 4.2126565805598937e-05, "loss": 0.1144, "step": 34370 }, { "grad_norm": 0.23810701072216034, "learning_rate": 4.209935298455957e-05, "loss": 0.1178, "step": 34380 }, { "grad_norm": 0.17258644104003906, "learning_rate": 4.207214256352817e-05, "loss": 0.1155, "step": 34390 }, { "grad_norm": 0.18481455743312836, "learning_rate": 4.2044934550770524e-05, "loss": 0.121, "step": 34400 }, { "grad_norm": 0.18147437274456024, "learning_rate": 4.201772895455174e-05, "loss": 0.1086, "step": 34410 }, { "grad_norm": 0.15975771844387054, "learning_rate": 4.199052578313613e-05, "loss": 0.1158, "step": 34420 }, { "grad_norm": 0.22862942516803741, "learning_rate": 4.1963325044787294e-05, "loss": 0.119, "step": 34430 }, { "grad_norm": 0.18111783266067505, "learning_rate": 4.193612674776814e-05, "loss": 0.1197, "step": 34440 }, { "grad_norm": 0.18415573239326477, "learning_rate": 4.1908930900340745e-05, "loss": 0.1226, "step": 34450 }, { "grad_norm": 0.20438522100448608, "learning_rate": 4.1881737510766536e-05, "loss": 0.1256, "step": 34460 }, { "grad_norm": 0.15269343554973602, "learning_rate": 4.185454658730609e-05, "loss": 0.1214, "step": 34470 }, { "grad_norm": 0.24314522743225098, "learning_rate": 4.1827358138219355e-05, "loss": 0.1201, "step": 34480 }, { "grad_norm": 0.23733195662498474, "learning_rate": 4.1800172171765404e-05, "loss": 0.1225, "step": 34490 }, { "grad_norm": 0.2161019891500473, "learning_rate": 4.177298869620264e-05, "loss": 0.1151, "step": 34500 }, { "grad_norm": 0.21383677423000336, "learning_rate": 4.1745807719788705e-05, "loss": 0.119, "step": 34510 }, { "grad_norm": 0.1871655136346817, "learning_rate": 4.1718629250780445e-05, "loss": 0.1139, "step": 34520 }, { "grad_norm": 0.20969799160957336, "learning_rate": 4.1691453297433956e-05, "loss": 0.1182, "step": 34530 }, { "grad_norm": 0.17866046726703644, "learning_rate": 4.166427986800457e-05, "loss": 0.1089, "step": 34540 }, { "grad_norm": 0.2306760549545288, "learning_rate": 4.163710897074688e-05, "loss": 0.1168, "step": 34550 }, { "grad_norm": 0.21021708846092224, "learning_rate": 4.1609940613914686e-05, "loss": 0.1206, "step": 34560 }, { "grad_norm": 0.16086211800575256, "learning_rate": 4.1582774805760996e-05, "loss": 0.1128, "step": 34570 }, { "grad_norm": 0.1916075497865677, "learning_rate": 4.155561155453809e-05, "loss": 0.119, "step": 34580 }, { "grad_norm": 0.21613289415836334, "learning_rate": 4.15284508684974e-05, "loss": 0.1235, "step": 34590 }, { "grad_norm": 0.197369784116745, "learning_rate": 4.1501292755889675e-05, "loss": 0.111, "step": 34600 }, { "grad_norm": 0.1898033171892166, "learning_rate": 4.1474137224964833e-05, "loss": 0.1156, "step": 34610 }, { "grad_norm": 0.32127222418785095, "learning_rate": 4.144698428397197e-05, "loss": 0.1219, "step": 34620 }, { "grad_norm": 0.25001728534698486, "learning_rate": 4.1419833941159466e-05, "loss": 0.1264, "step": 34630 }, { "grad_norm": 0.18568047881126404, "learning_rate": 4.1392686204774846e-05, "loss": 0.1196, "step": 34640 }, { "grad_norm": 0.190387561917305, "learning_rate": 4.13655410830649e-05, "loss": 0.1127, "step": 34650 }, { "grad_norm": 0.17399857938289642, "learning_rate": 4.1338398584275594e-05, "loss": 0.1133, "step": 34660 }, { "grad_norm": 0.18143627047538757, "learning_rate": 4.1311258716652104e-05, "loss": 0.1112, "step": 34670 }, { "grad_norm": 0.20335881412029266, "learning_rate": 4.128412148843881e-05, "loss": 0.1134, "step": 34680 }, { "grad_norm": 0.17179091274738312, "learning_rate": 4.125698690787926e-05, "loss": 0.1192, "step": 34690 }, { "grad_norm": 0.2281274050474167, "learning_rate": 4.1229854983216245e-05, "loss": 0.1159, "step": 34700 }, { "grad_norm": 0.17209306359291077, "learning_rate": 4.120272572269175e-05, "loss": 0.1158, "step": 34710 }, { "grad_norm": 0.1702210158109665, "learning_rate": 4.117559913454687e-05, "loss": 0.1161, "step": 34720 }, { "grad_norm": 0.1889793574810028, "learning_rate": 4.114847522702201e-05, "loss": 0.1202, "step": 34730 }, { "grad_norm": 0.17574988305568695, "learning_rate": 4.112135400835664e-05, "loss": 0.1197, "step": 34740 }, { "grad_norm": 0.1768784075975418, "learning_rate": 4.109423548678949e-05, "loss": 0.115, "step": 34750 }, { "grad_norm": 0.2088901549577713, "learning_rate": 4.106711967055848e-05, "loss": 0.1176, "step": 34760 }, { "grad_norm": 0.2150862067937851, "learning_rate": 4.1040006567900636e-05, "loss": 0.1197, "step": 34770 }, { "grad_norm": 0.2501806616783142, "learning_rate": 4.101289618705224e-05, "loss": 0.1205, "step": 34780 }, { "grad_norm": 0.19337797164916992, "learning_rate": 4.0985788536248675e-05, "loss": 0.1214, "step": 34790 }, { "grad_norm": 0.2704721689224243, "learning_rate": 4.095868362372454e-05, "loss": 0.1214, "step": 34800 }, { "grad_norm": 0.204759418964386, "learning_rate": 4.0931581457713614e-05, "loss": 0.1161, "step": 34810 }, { "grad_norm": 0.2040865421295166, "learning_rate": 4.09044820464488e-05, "loss": 0.1136, "step": 34820 }, { "grad_norm": 0.18924549221992493, "learning_rate": 4.087738539816219e-05, "loss": 0.1094, "step": 34830 }, { "grad_norm": 0.19445806741714478, "learning_rate": 4.085029152108501e-05, "loss": 0.1246, "step": 34840 }, { "grad_norm": 0.16829392313957214, "learning_rate": 4.0823200423447714e-05, "loss": 0.1117, "step": 34850 }, { "grad_norm": 0.1789558082818985, "learning_rate": 4.079611211347981e-05, "loss": 0.1111, "step": 34860 }, { "grad_norm": 0.20466038584709167, "learning_rate": 4.076902659941002e-05, "loss": 0.1087, "step": 34870 }, { "grad_norm": 0.1462222784757614, "learning_rate": 4.074194388946624e-05, "loss": 0.1125, "step": 34880 }, { "grad_norm": 0.17788584530353546, "learning_rate": 4.071486399187545e-05, "loss": 0.1149, "step": 34890 }, { "grad_norm": 0.17353998124599457, "learning_rate": 4.0687786914863836e-05, "loss": 0.1166, "step": 34900 }, { "grad_norm": 0.186956524848938, "learning_rate": 4.0660712666656666e-05, "loss": 0.1178, "step": 34910 }, { "grad_norm": 0.21584585309028625, "learning_rate": 4.0633641255478394e-05, "loss": 0.1201, "step": 34920 }, { "grad_norm": 0.2239702045917511, "learning_rate": 4.0606572689552624e-05, "loss": 0.1198, "step": 34930 }, { "grad_norm": 0.17948947846889496, "learning_rate": 4.0579506977102036e-05, "loss": 0.1167, "step": 34940 }, { "grad_norm": 0.16375325620174408, "learning_rate": 4.055244412634849e-05, "loss": 0.118, "step": 34950 }, { "grad_norm": 0.24287362396717072, "learning_rate": 4.052538414551298e-05, "loss": 0.1136, "step": 34960 }, { "grad_norm": 0.1648644506931305, "learning_rate": 4.0498327042815596e-05, "loss": 0.116, "step": 34970 }, { "grad_norm": 0.19858837127685547, "learning_rate": 4.047127282647559e-05, "loss": 0.1118, "step": 34980 }, { "grad_norm": 0.15601298213005066, "learning_rate": 4.04442215047113e-05, "loss": 0.1111, "step": 34990 }, { "grad_norm": 0.17039990425109863, "learning_rate": 4.041717308574023e-05, "loss": 0.1171, "step": 35000 }, { "grad_norm": 0.2461501657962799, "learning_rate": 4.039012757777893e-05, "loss": 0.1141, "step": 35010 }, { "grad_norm": 0.24578535556793213, "learning_rate": 4.036308498904314e-05, "loss": 0.1222, "step": 35020 }, { "grad_norm": 0.1677270084619522, "learning_rate": 4.033604532774771e-05, "loss": 0.1205, "step": 35030 }, { "grad_norm": 0.21464009582996368, "learning_rate": 4.030900860210652e-05, "loss": 0.1209, "step": 35040 }, { "grad_norm": 0.23066437244415283, "learning_rate": 4.028197482033266e-05, "loss": 0.1216, "step": 35050 }, { "grad_norm": 0.2100825309753418, "learning_rate": 4.0254943990638246e-05, "loss": 0.1095, "step": 35060 }, { "grad_norm": 0.2171335071325302, "learning_rate": 4.022791612123454e-05, "loss": 0.1213, "step": 35070 }, { "grad_norm": 0.18246395885944366, "learning_rate": 4.020089122033192e-05, "loss": 0.121, "step": 35080 }, { "grad_norm": 0.21530261635780334, "learning_rate": 4.01738692961398e-05, "loss": 0.1166, "step": 35090 }, { "grad_norm": 0.16892920434474945, "learning_rate": 4.014685035686675e-05, "loss": 0.1187, "step": 35100 }, { "grad_norm": 0.17266447842121124, "learning_rate": 4.011983441072039e-05, "loss": 0.1201, "step": 35110 }, { "grad_norm": 0.17995871603488922, "learning_rate": 4.0092821465907485e-05, "loss": 0.1166, "step": 35120 }, { "grad_norm": 0.1586918830871582, "learning_rate": 4.006581153063383e-05, "loss": 0.117, "step": 35130 }, { "grad_norm": 0.20838187634944916, "learning_rate": 4.003880461310432e-05, "loss": 0.1133, "step": 35140 }, { "grad_norm": 0.20887157320976257, "learning_rate": 4.001180072152298e-05, "loss": 0.1258, "step": 35150 }, { "grad_norm": 0.20225273072719574, "learning_rate": 3.998479986409285e-05, "loss": 0.1198, "step": 35160 }, { "grad_norm": 0.2396678328514099, "learning_rate": 3.995780204901607e-05, "loss": 0.1262, "step": 35170 }, { "grad_norm": 0.22500085830688477, "learning_rate": 3.993080728449391e-05, "loss": 0.1132, "step": 35180 }, { "grad_norm": 0.22991278767585754, "learning_rate": 3.990381557872661e-05, "loss": 0.115, "step": 35190 }, { "grad_norm": 0.20904825627803802, "learning_rate": 3.987682693991359e-05, "loss": 0.1183, "step": 35200 }, { "grad_norm": 0.2097807377576828, "learning_rate": 3.9849841376253226e-05, "loss": 0.1218, "step": 35210 }, { "grad_norm": 0.1952189952135086, "learning_rate": 3.982285889594306e-05, "loss": 0.1176, "step": 35220 }, { "grad_norm": 0.1512131243944168, "learning_rate": 3.9795879507179665e-05, "loss": 0.1122, "step": 35230 }, { "grad_norm": 0.19283968210220337, "learning_rate": 3.9768903218158634e-05, "loss": 0.1176, "step": 35240 }, { "grad_norm": 0.1944636106491089, "learning_rate": 3.974193003707468e-05, "loss": 0.1258, "step": 35250 }, { "grad_norm": 0.19030414521694183, "learning_rate": 3.971495997212152e-05, "loss": 0.1181, "step": 35260 }, { "grad_norm": 0.16506077349185944, "learning_rate": 3.9687993031491985e-05, "loss": 0.1146, "step": 35270 }, { "grad_norm": 0.1595265120267868, "learning_rate": 3.966102922337787e-05, "loss": 0.1194, "step": 35280 }, { "grad_norm": 0.19187650084495544, "learning_rate": 3.963406855597009e-05, "loss": 0.1191, "step": 35290 }, { "grad_norm": 0.2025662064552307, "learning_rate": 3.960711103745861e-05, "loss": 0.114, "step": 35300 }, { "grad_norm": 0.22408734261989594, "learning_rate": 3.958015667603237e-05, "loss": 0.1227, "step": 35310 }, { "grad_norm": 0.20384548604488373, "learning_rate": 3.955320547987943e-05, "loss": 0.1105, "step": 35320 }, { "grad_norm": 0.20373141765594482, "learning_rate": 3.952625745718681e-05, "loss": 0.1086, "step": 35330 }, { "grad_norm": 0.1681358516216278, "learning_rate": 3.949931261614064e-05, "loss": 0.1149, "step": 35340 }, { "grad_norm": 0.22953973710536957, "learning_rate": 3.947237096492605e-05, "loss": 0.1221, "step": 35350 }, { "grad_norm": 0.1555200219154358, "learning_rate": 3.944543251172719e-05, "loss": 0.1161, "step": 35360 }, { "grad_norm": 0.200561985373497, "learning_rate": 3.941849726472725e-05, "loss": 0.1211, "step": 35370 }, { "grad_norm": 0.1653200089931488, "learning_rate": 3.939156523210846e-05, "loss": 0.1126, "step": 35380 }, { "grad_norm": 0.18569400906562805, "learning_rate": 3.9364636422052046e-05, "loss": 0.1087, "step": 35390 }, { "grad_norm": 0.17198850214481354, "learning_rate": 3.933771084273828e-05, "loss": 0.1116, "step": 35400 }, { "grad_norm": 0.1607934832572937, "learning_rate": 3.931078850234643e-05, "loss": 0.1154, "step": 35410 }, { "grad_norm": 0.17042501270771027, "learning_rate": 3.928386940905483e-05, "loss": 0.1127, "step": 35420 }, { "grad_norm": 0.16638483107089996, "learning_rate": 3.925695357104073e-05, "loss": 0.1071, "step": 35430 }, { "grad_norm": 0.18636047840118408, "learning_rate": 3.923004099648049e-05, "loss": 0.1162, "step": 35440 }, { "grad_norm": 0.18114455044269562, "learning_rate": 3.920313169354944e-05, "loss": 0.1227, "step": 35450 }, { "grad_norm": 0.22623461484909058, "learning_rate": 3.9176225670421897e-05, "loss": 0.1163, "step": 35460 }, { "grad_norm": 0.19250677525997162, "learning_rate": 3.9149322935271224e-05, "loss": 0.1107, "step": 35470 }, { "grad_norm": 0.16837935149669647, "learning_rate": 3.9122423496269725e-05, "loss": 0.1089, "step": 35480 }, { "grad_norm": 0.1832878440618515, "learning_rate": 3.909552736158877e-05, "loss": 0.1208, "step": 35490 }, { "grad_norm": 0.1394355148077011, "learning_rate": 3.90686345393987e-05, "loss": 0.1203, "step": 35500 }, { "grad_norm": 0.14968983829021454, "learning_rate": 3.9041745037868816e-05, "loss": 0.1124, "step": 35510 }, { "grad_norm": 0.16015022993087769, "learning_rate": 3.9014858865167465e-05, "loss": 0.1044, "step": 35520 }, { "grad_norm": 0.19097647070884705, "learning_rate": 3.8987976029461935e-05, "loss": 0.1096, "step": 35530 }, { "grad_norm": 0.16473661363124847, "learning_rate": 3.896109653891853e-05, "loss": 0.1183, "step": 35540 }, { "grad_norm": 0.20564454793930054, "learning_rate": 3.893422040170254e-05, "loss": 0.1121, "step": 35550 }, { "grad_norm": 0.21511279046535492, "learning_rate": 3.8907347625978207e-05, "loss": 0.1132, "step": 35560 }, { "grad_norm": 0.24046003818511963, "learning_rate": 3.88804782199088e-05, "loss": 0.1157, "step": 35570 }, { "grad_norm": 0.18294817209243774, "learning_rate": 3.8853612191656495e-05, "loss": 0.1172, "step": 35580 }, { "grad_norm": 0.2533971071243286, "learning_rate": 3.88267495493825e-05, "loss": 0.1113, "step": 35590 }, { "grad_norm": 0.20200100541114807, "learning_rate": 3.8799890301247004e-05, "loss": 0.1185, "step": 35600 }, { "grad_norm": 0.21025966107845306, "learning_rate": 3.8773034455409096e-05, "loss": 0.117, "step": 35610 }, { "grad_norm": 0.1756962239742279, "learning_rate": 3.8746182020026904e-05, "loss": 0.1135, "step": 35620 }, { "grad_norm": 0.1607566475868225, "learning_rate": 3.871933300325745e-05, "loss": 0.1127, "step": 35630 }, { "grad_norm": 0.1923205405473709, "learning_rate": 3.869248741325679e-05, "loss": 0.1206, "step": 35640 }, { "grad_norm": 0.19439150393009186, "learning_rate": 3.866564525817992e-05, "loss": 0.1146, "step": 35650 }, { "grad_norm": 0.1682727187871933, "learning_rate": 3.8638806546180725e-05, "loss": 0.1103, "step": 35660 }, { "grad_norm": 0.20234312117099762, "learning_rate": 3.861197128541213e-05, "loss": 0.1227, "step": 35670 }, { "grad_norm": 0.19979414343833923, "learning_rate": 3.858513948402599e-05, "loss": 0.1174, "step": 35680 }, { "grad_norm": 0.16512157022953033, "learning_rate": 3.8558311150173077e-05, "loss": 0.1257, "step": 35690 }, { "grad_norm": 0.1669427454471588, "learning_rate": 3.853148629200312e-05, "loss": 0.121, "step": 35700 }, { "grad_norm": 0.16840983927249908, "learning_rate": 3.850466491766482e-05, "loss": 0.1128, "step": 35710 }, { "grad_norm": 0.17265765368938446, "learning_rate": 3.847784703530583e-05, "loss": 0.1162, "step": 35720 }, { "grad_norm": 0.18702086806297302, "learning_rate": 3.845103265307266e-05, "loss": 0.1186, "step": 35730 }, { "grad_norm": 0.20829178392887115, "learning_rate": 3.842422177911086e-05, "loss": 0.1064, "step": 35740 }, { "grad_norm": 0.18208034336566925, "learning_rate": 3.8397414421564826e-05, "loss": 0.108, "step": 35750 }, { "grad_norm": 0.15207265317440033, "learning_rate": 3.8370610588577935e-05, "loss": 0.1122, "step": 35760 }, { "grad_norm": 0.16806092858314514, "learning_rate": 3.834381028829251e-05, "loss": 0.1121, "step": 35770 }, { "grad_norm": 0.21647505462169647, "learning_rate": 3.8317013528849745e-05, "loss": 0.1188, "step": 35780 }, { "grad_norm": 0.23985417187213898, "learning_rate": 3.8290220318389815e-05, "loss": 0.1138, "step": 35790 }, { "grad_norm": 0.20498883724212646, "learning_rate": 3.8263430665051746e-05, "loss": 0.1095, "step": 35800 }, { "grad_norm": 0.191909521818161, "learning_rate": 3.8236644576973554e-05, "loss": 0.1229, "step": 35810 }, { "grad_norm": 0.19562900066375732, "learning_rate": 3.820986206229217e-05, "loss": 0.1088, "step": 35820 }, { "grad_norm": 0.19434072077274323, "learning_rate": 3.8183083129143384e-05, "loss": 0.1214, "step": 35830 }, { "grad_norm": 0.22009608149528503, "learning_rate": 3.815630778566193e-05, "loss": 0.1107, "step": 35840 }, { "grad_norm": 0.20711824297904968, "learning_rate": 3.812953603998145e-05, "loss": 0.1147, "step": 35850 }, { "grad_norm": 0.17955948412418365, "learning_rate": 3.8102767900234504e-05, "loss": 0.1114, "step": 35860 }, { "grad_norm": 0.22996573150157928, "learning_rate": 3.807600337455256e-05, "loss": 0.1152, "step": 35870 }, { "grad_norm": 0.19446024298667908, "learning_rate": 3.804924247106593e-05, "loss": 0.1133, "step": 35880 }, { "grad_norm": 0.19629578292369843, "learning_rate": 3.8022485197903925e-05, "loss": 0.1127, "step": 35890 }, { "grad_norm": 0.1861361265182495, "learning_rate": 3.799573156319464e-05, "loss": 0.1228, "step": 35900 }, { "grad_norm": 0.19218160212039948, "learning_rate": 3.796898157506515e-05, "loss": 0.117, "step": 35910 }, { "grad_norm": 0.16926009953022003, "learning_rate": 3.794223524164143e-05, "loss": 0.1155, "step": 35920 }, { "grad_norm": 0.24673694372177124, "learning_rate": 3.7915492571048245e-05, "loss": 0.123, "step": 35930 }, { "grad_norm": 0.19017930328845978, "learning_rate": 3.788875357140937e-05, "loss": 0.113, "step": 35940 }, { "grad_norm": 0.17220520973205566, "learning_rate": 3.786201825084736e-05, "loss": 0.1121, "step": 35950 }, { "grad_norm": 0.15550123155117035, "learning_rate": 3.783528661748372e-05, "loss": 0.1085, "step": 35960 }, { "grad_norm": 0.18400226533412933, "learning_rate": 3.780855867943882e-05, "loss": 0.1152, "step": 35970 }, { "grad_norm": 0.20244067907333374, "learning_rate": 3.778183444483189e-05, "loss": 0.1181, "step": 35980 }, { "grad_norm": 0.1875724047422409, "learning_rate": 3.775511392178108e-05, "loss": 0.1132, "step": 35990 }, { "grad_norm": 0.18913164734840393, "learning_rate": 3.772839711840332e-05, "loss": 0.1166, "step": 36000 }, { "grad_norm": 0.15030664205551147, "learning_rate": 3.7701684042814515e-05, "loss": 0.115, "step": 36010 }, { "grad_norm": 0.18015702068805695, "learning_rate": 3.76749747031294e-05, "loss": 0.1111, "step": 36020 }, { "grad_norm": 0.1744326502084732, "learning_rate": 3.764826910746152e-05, "loss": 0.1214, "step": 36030 }, { "grad_norm": 0.19299782812595367, "learning_rate": 3.762156726392338e-05, "loss": 0.1167, "step": 36040 }, { "grad_norm": 0.1829201877117157, "learning_rate": 3.759486918062625e-05, "loss": 0.1148, "step": 36050 }, { "grad_norm": 0.21160605549812317, "learning_rate": 3.756817486568033e-05, "loss": 0.118, "step": 36060 }, { "grad_norm": 0.21072132885456085, "learning_rate": 3.7541484327194654e-05, "loss": 0.1194, "step": 36070 }, { "grad_norm": 0.21855220198631287, "learning_rate": 3.751479757327707e-05, "loss": 0.1199, "step": 36080 }, { "grad_norm": 0.17385663092136383, "learning_rate": 3.7488114612034345e-05, "loss": 0.1184, "step": 36090 }, { "grad_norm": 0.1803685575723648, "learning_rate": 3.7461435451572044e-05, "loss": 0.1155, "step": 36100 }, { "grad_norm": 0.15138030052185059, "learning_rate": 3.743476009999459e-05, "loss": 0.1174, "step": 36110 }, { "grad_norm": 0.17414860427379608, "learning_rate": 3.7408088565405245e-05, "loss": 0.1093, "step": 36120 }, { "grad_norm": 0.16372938454151154, "learning_rate": 3.738142085590612e-05, "loss": 0.1172, "step": 36130 }, { "grad_norm": 0.16081401705741882, "learning_rate": 3.7354756979598194e-05, "loss": 0.1152, "step": 36140 }, { "grad_norm": 0.1514350324869156, "learning_rate": 3.7328096944581187e-05, "loss": 0.1087, "step": 36150 }, { "grad_norm": 0.16193628311157227, "learning_rate": 3.730144075895377e-05, "loss": 0.1172, "step": 36160 }, { "grad_norm": 0.18337656557559967, "learning_rate": 3.727478843081335e-05, "loss": 0.1108, "step": 36170 }, { "grad_norm": 0.16857552528381348, "learning_rate": 3.72481399682562e-05, "loss": 0.1153, "step": 36180 }, { "grad_norm": 0.2125975489616394, "learning_rate": 3.722149537937747e-05, "loss": 0.1129, "step": 36190 }, { "grad_norm": 0.15001599490642548, "learning_rate": 3.7194854672271015e-05, "loss": 0.1063, "step": 36200 }, { "grad_norm": 0.19903843104839325, "learning_rate": 3.7168217855029644e-05, "loss": 0.1133, "step": 36210 }, { "grad_norm": 0.23452764749526978, "learning_rate": 3.7141584935744856e-05, "loss": 0.1155, "step": 36220 }, { "grad_norm": 0.2042313814163208, "learning_rate": 3.7114955922507055e-05, "loss": 0.1122, "step": 36230 }, { "grad_norm": 0.16606391966342926, "learning_rate": 3.708833082340545e-05, "loss": 0.1174, "step": 36240 }, { "grad_norm": 0.1684076488018036, "learning_rate": 3.7061709646528034e-05, "loss": 0.1033, "step": 36250 }, { "grad_norm": 0.1489981710910797, "learning_rate": 3.7035092399961604e-05, "loss": 0.1164, "step": 36260 }, { "grad_norm": 0.18157078325748444, "learning_rate": 3.700847909179177e-05, "loss": 0.1252, "step": 36270 }, { "grad_norm": 0.14420253038406372, "learning_rate": 3.698186973010297e-05, "loss": 0.1276, "step": 36280 }, { "grad_norm": 0.1638721078634262, "learning_rate": 3.695526432297844e-05, "loss": 0.114, "step": 36290 }, { "grad_norm": 0.1714656800031662, "learning_rate": 3.692866287850017e-05, "loss": 0.1164, "step": 36300 }, { "grad_norm": 0.17327572405338287, "learning_rate": 3.6902065404749006e-05, "loss": 0.1182, "step": 36310 }, { "grad_norm": 0.18742063641548157, "learning_rate": 3.6875471909804516e-05, "loss": 0.112, "step": 36320 }, { "grad_norm": 0.22141538560390472, "learning_rate": 3.6848882401745135e-05, "loss": 0.1214, "step": 36330 }, { "grad_norm": 0.1459421068429947, "learning_rate": 3.682229688864806e-05, "loss": 0.1102, "step": 36340 }, { "grad_norm": 0.20078754425048828, "learning_rate": 3.6795715378589235e-05, "loss": 0.1177, "step": 36350 }, { "grad_norm": 0.18831795454025269, "learning_rate": 3.676913787964345e-05, "loss": 0.1124, "step": 36360 }, { "grad_norm": 0.15165765583515167, "learning_rate": 3.674256439988423e-05, "loss": 0.1232, "step": 36370 }, { "grad_norm": 0.17198140919208527, "learning_rate": 3.6715994947383904e-05, "loss": 0.1153, "step": 36380 }, { "grad_norm": 0.2153671532869339, "learning_rate": 3.668942953021357e-05, "loss": 0.1179, "step": 36390 }, { "grad_norm": 0.18156060576438904, "learning_rate": 3.66628681564431e-05, "loss": 0.1165, "step": 36400 }, { "grad_norm": 0.20002177357673645, "learning_rate": 3.663631083414114e-05, "loss": 0.1175, "step": 36410 }, { "grad_norm": 0.1509268879890442, "learning_rate": 3.660975757137509e-05, "loss": 0.1065, "step": 36420 }, { "grad_norm": 0.19294124841690063, "learning_rate": 3.658320837621114e-05, "loss": 0.1248, "step": 36430 }, { "grad_norm": 0.1965116411447525, "learning_rate": 3.655666325671426e-05, "loss": 0.1244, "step": 36440 }, { "grad_norm": 0.15327756106853485, "learning_rate": 3.65301222209481e-05, "loss": 0.1076, "step": 36450 }, { "grad_norm": 0.19989068806171417, "learning_rate": 3.650358527697519e-05, "loss": 0.1131, "step": 36460 }, { "grad_norm": 0.170302152633667, "learning_rate": 3.64770524328567e-05, "loss": 0.1214, "step": 36470 }, { "grad_norm": 0.17396023869514465, "learning_rate": 3.645052369665265e-05, "loss": 0.1123, "step": 36480 }, { "grad_norm": 0.21986931562423706, "learning_rate": 3.6423999076421724e-05, "loss": 0.1152, "step": 36490 }, { "grad_norm": 0.18073570728302002, "learning_rate": 3.639747858022142e-05, "loss": 0.1188, "step": 36500 }, { "grad_norm": 0.1849719136953354, "learning_rate": 3.637096221610799e-05, "loss": 0.1153, "step": 36510 }, { "grad_norm": 0.15867944061756134, "learning_rate": 3.634444999213638e-05, "loss": 0.115, "step": 36520 }, { "grad_norm": 0.19044099748134613, "learning_rate": 3.6317941916360296e-05, "loss": 0.1142, "step": 36530 }, { "grad_norm": 0.20880654454231262, "learning_rate": 3.629143799683221e-05, "loss": 0.1141, "step": 36540 }, { "grad_norm": 0.22972123324871063, "learning_rate": 3.626493824160331e-05, "loss": 0.1139, "step": 36550 }, { "grad_norm": 0.18356670439243317, "learning_rate": 3.623844265872352e-05, "loss": 0.1053, "step": 36560 }, { "grad_norm": 0.24531082808971405, "learning_rate": 3.621195125624149e-05, "loss": 0.114, "step": 36570 }, { "grad_norm": 0.20142048597335815, "learning_rate": 3.618546404220463e-05, "loss": 0.1145, "step": 36580 }, { "grad_norm": 0.1783338189125061, "learning_rate": 3.615898102465903e-05, "loss": 0.1172, "step": 36590 }, { "grad_norm": 0.15022428333759308, "learning_rate": 3.6132502211649544e-05, "loss": 0.1091, "step": 36600 }, { "grad_norm": 0.15007150173187256, "learning_rate": 3.610602761121975e-05, "loss": 0.1099, "step": 36610 }, { "grad_norm": 0.16899503767490387, "learning_rate": 3.6079557231411897e-05, "loss": 0.1224, "step": 36620 }, { "grad_norm": 0.14641901850700378, "learning_rate": 3.6053091080267035e-05, "loss": 0.1113, "step": 36630 }, { "grad_norm": 0.14250364899635315, "learning_rate": 3.602662916582483e-05, "loss": 0.1148, "step": 36640 }, { "grad_norm": 0.15869751572608948, "learning_rate": 3.600017149612375e-05, "loss": 0.1188, "step": 36650 }, { "grad_norm": 0.18198679387569427, "learning_rate": 3.5973718079200935e-05, "loss": 0.1169, "step": 36660 }, { "grad_norm": 0.17474590241909027, "learning_rate": 3.5947268923092216e-05, "loss": 0.1114, "step": 36670 }, { "grad_norm": 0.1901651918888092, "learning_rate": 3.592082403583216e-05, "loss": 0.1189, "step": 36680 }, { "grad_norm": 0.1792629361152649, "learning_rate": 3.5894383425454004e-05, "loss": 0.1151, "step": 36690 }, { "grad_norm": 0.18898804485797882, "learning_rate": 3.586794709998975e-05, "loss": 0.1108, "step": 36700 }, { "grad_norm": 0.19218651950359344, "learning_rate": 3.584151506747002e-05, "loss": 0.1167, "step": 36710 }, { "grad_norm": 0.18123339116573334, "learning_rate": 3.581508733592418e-05, "loss": 0.1073, "step": 36720 }, { "grad_norm": 0.1664261817932129, "learning_rate": 3.5788663913380297e-05, "loss": 0.119, "step": 36730 }, { "grad_norm": 0.1758383959531784, "learning_rate": 3.576224480786506e-05, "loss": 0.1207, "step": 36740 }, { "grad_norm": 0.22911956906318665, "learning_rate": 3.573583002740393e-05, "loss": 0.1256, "step": 36750 }, { "grad_norm": 0.23598328232765198, "learning_rate": 3.570941958002103e-05, "loss": 0.1199, "step": 36760 }, { "grad_norm": 0.16639618575572968, "learning_rate": 3.568301347373912e-05, "loss": 0.1111, "step": 36770 }, { "grad_norm": 0.22414694726467133, "learning_rate": 3.5656611716579726e-05, "loss": 0.1117, "step": 36780 }, { "grad_norm": 0.2060069739818573, "learning_rate": 3.5630214316562946e-05, "loss": 0.1173, "step": 36790 }, { "grad_norm": 0.19611403346061707, "learning_rate": 3.560382128170766e-05, "loss": 0.1224, "step": 36800 }, { "grad_norm": 0.21389730274677277, "learning_rate": 3.5577432620031374e-05, "loss": 0.1179, "step": 36810 }, { "grad_norm": 0.19604003429412842, "learning_rate": 3.5551048339550216e-05, "loss": 0.1183, "step": 36820 }, { "grad_norm": 0.2009672224521637, "learning_rate": 3.55246684482791e-05, "loss": 0.1151, "step": 36830 }, { "grad_norm": 0.17435002326965332, "learning_rate": 3.5498292954231496e-05, "loss": 0.1196, "step": 36840 }, { "grad_norm": 0.20257316529750824, "learning_rate": 3.54719218654196e-05, "loss": 0.1186, "step": 36850 }, { "grad_norm": 0.2276190221309662, "learning_rate": 3.544555518985425e-05, "loss": 0.1177, "step": 36860 }, { "grad_norm": 0.1969987154006958, "learning_rate": 3.541919293554494e-05, "loss": 0.107, "step": 36870 }, { "grad_norm": 0.1692531555891037, "learning_rate": 3.539283511049985e-05, "loss": 0.1145, "step": 36880 }, { "grad_norm": 0.17195619642734528, "learning_rate": 3.5366481722725755e-05, "loss": 0.1177, "step": 36890 }, { "grad_norm": 0.1914987862110138, "learning_rate": 3.534013278022816e-05, "loss": 0.1155, "step": 36900 }, { "grad_norm": 0.22749674320220947, "learning_rate": 3.531378829101113e-05, "loss": 0.121, "step": 36910 }, { "grad_norm": 0.16866527497768402, "learning_rate": 3.528744826307746e-05, "loss": 0.1186, "step": 36920 }, { "grad_norm": 0.15486721694469452, "learning_rate": 3.5261112704428554e-05, "loss": 0.1121, "step": 36930 }, { "grad_norm": 0.16227945685386658, "learning_rate": 3.523478162306443e-05, "loss": 0.1077, "step": 36940 }, { "grad_norm": 0.21194511651992798, "learning_rate": 3.520845502698381e-05, "loss": 0.1163, "step": 36950 }, { "grad_norm": 0.19163821637630463, "learning_rate": 3.5182132924184005e-05, "loss": 0.1143, "step": 36960 }, { "grad_norm": 0.20677579939365387, "learning_rate": 3.5155815322660966e-05, "loss": 0.1149, "step": 36970 }, { "grad_norm": 0.15915293991565704, "learning_rate": 3.512950223040931e-05, "loss": 0.1144, "step": 36980 }, { "grad_norm": 0.2018166482448578, "learning_rate": 3.5103193655422216e-05, "loss": 0.1154, "step": 36990 }, { "grad_norm": 0.16575299203395844, "learning_rate": 3.5076889605691596e-05, "loss": 0.1177, "step": 37000 }, { "grad_norm": 0.15428854525089264, "learning_rate": 3.505059008920787e-05, "loss": 0.1192, "step": 37010 }, { "grad_norm": 0.21746696531772614, "learning_rate": 3.502429511396016e-05, "loss": 0.1208, "step": 37020 }, { "grad_norm": 0.18170802295207977, "learning_rate": 3.4998004687936196e-05, "loss": 0.1139, "step": 37030 }, { "grad_norm": 0.17470712959766388, "learning_rate": 3.497171881912229e-05, "loss": 0.1127, "step": 37040 }, { "grad_norm": 0.16942180693149567, "learning_rate": 3.494543751550342e-05, "loss": 0.119, "step": 37050 }, { "grad_norm": 0.1932782381772995, "learning_rate": 3.491916078506313e-05, "loss": 0.1113, "step": 37060 }, { "grad_norm": 0.17460650205612183, "learning_rate": 3.489288863578361e-05, "loss": 0.1178, "step": 37070 }, { "grad_norm": 0.23391300439834595, "learning_rate": 3.4866621075645646e-05, "loss": 0.1187, "step": 37080 }, { "grad_norm": 0.20139864087104797, "learning_rate": 3.4840358112628614e-05, "loss": 0.1189, "step": 37090 }, { "grad_norm": 0.2376893013715744, "learning_rate": 3.481409975471053e-05, "loss": 0.115, "step": 37100 }, { "grad_norm": 0.22217267751693726, "learning_rate": 3.4787846009867986e-05, "loss": 0.1156, "step": 37110 }, { "grad_norm": 0.19119024276733398, "learning_rate": 3.476159688607615e-05, "loss": 0.1102, "step": 37120 }, { "grad_norm": 0.195794478058815, "learning_rate": 3.4735352391308854e-05, "loss": 0.1091, "step": 37130 }, { "grad_norm": 0.207433819770813, "learning_rate": 3.4709112533538446e-05, "loss": 0.1227, "step": 37140 }, { "grad_norm": 0.18547506630420685, "learning_rate": 3.4682877320735934e-05, "loss": 0.1142, "step": 37150 }, { "grad_norm": 0.171724334359169, "learning_rate": 3.465664676087085e-05, "loss": 0.105, "step": 37160 }, { "grad_norm": 0.14291353523731232, "learning_rate": 3.463042086191136e-05, "loss": 0.1105, "step": 37170 }, { "grad_norm": 0.21758873760700226, "learning_rate": 3.460419963182423e-05, "loss": 0.1176, "step": 37180 }, { "grad_norm": 0.21737925708293915, "learning_rate": 3.457798307857473e-05, "loss": 0.1191, "step": 37190 }, { "grad_norm": 0.195791095495224, "learning_rate": 3.455177121012678e-05, "loss": 0.1152, "step": 37200 }, { "grad_norm": 0.258527934551239, "learning_rate": 3.452556403444285e-05, "loss": 0.1171, "step": 37210 }, { "grad_norm": 0.2363341897726059, "learning_rate": 3.4499361559483975e-05, "loss": 0.1166, "step": 37220 }, { "grad_norm": 0.1746969074010849, "learning_rate": 3.44731637932098e-05, "loss": 0.1119, "step": 37230 }, { "grad_norm": 0.17574140429496765, "learning_rate": 3.44469707435785e-05, "loss": 0.1137, "step": 37240 }, { "grad_norm": 0.17484202980995178, "learning_rate": 3.4420782418546835e-05, "loss": 0.1181, "step": 37250 }, { "grad_norm": 0.155159130692482, "learning_rate": 3.439459882607012e-05, "loss": 0.1119, "step": 37260 }, { "grad_norm": 0.1679462194442749, "learning_rate": 3.436841997410225e-05, "loss": 0.1129, "step": 37270 }, { "grad_norm": 0.17005887627601624, "learning_rate": 3.434224587059567e-05, "loss": 0.1225, "step": 37280 }, { "grad_norm": 0.15792316198349, "learning_rate": 3.431607652350136e-05, "loss": 0.107, "step": 37290 }, { "grad_norm": 0.16192831099033356, "learning_rate": 3.428991194076891e-05, "loss": 0.115, "step": 37300 }, { "grad_norm": 0.15587258338928223, "learning_rate": 3.4263752130346394e-05, "loss": 0.119, "step": 37310 }, { "grad_norm": 0.17020675539970398, "learning_rate": 3.4237597100180515e-05, "loss": 0.1098, "step": 37320 }, { "grad_norm": 0.16206467151641846, "learning_rate": 3.4211446858216427e-05, "loss": 0.1157, "step": 37330 }, { "grad_norm": 0.16805532574653625, "learning_rate": 3.4185301412397915e-05, "loss": 0.1064, "step": 37340 }, { "grad_norm": 0.15757577121257782, "learning_rate": 3.415916077066729e-05, "loss": 0.1136, "step": 37350 }, { "grad_norm": 0.17929595708847046, "learning_rate": 3.413302494096535e-05, "loss": 0.1171, "step": 37360 }, { "grad_norm": 0.14765921235084534, "learning_rate": 3.410689393123151e-05, "loss": 0.1182, "step": 37370 }, { "grad_norm": 0.14596323668956757, "learning_rate": 3.408076774940364e-05, "loss": 0.1142, "step": 37380 }, { "grad_norm": 0.16636936366558075, "learning_rate": 3.40546464034182e-05, "loss": 0.1116, "step": 37390 }, { "grad_norm": 0.20114775002002716, "learning_rate": 3.4028529901210185e-05, "loss": 0.1189, "step": 37400 }, { "grad_norm": 0.20368021726608276, "learning_rate": 3.4002418250713086e-05, "loss": 0.1091, "step": 37410 }, { "grad_norm": 0.21940702199935913, "learning_rate": 3.3976311459858936e-05, "loss": 0.115, "step": 37420 }, { "grad_norm": 0.23983827233314514, "learning_rate": 3.395020953657826e-05, "loss": 0.1193, "step": 37430 }, { "grad_norm": 0.21061819791793823, "learning_rate": 3.3924112488800165e-05, "loss": 0.1113, "step": 37440 }, { "grad_norm": 0.17232157289981842, "learning_rate": 3.389802032445225e-05, "loss": 0.1111, "step": 37450 }, { "grad_norm": 0.15664853155612946, "learning_rate": 3.38719330514606e-05, "loss": 0.1132, "step": 37460 }, { "grad_norm": 0.18708881735801697, "learning_rate": 3.3845850677749866e-05, "loss": 0.1188, "step": 37470 }, { "grad_norm": 0.16384519636631012, "learning_rate": 3.3819773211243157e-05, "loss": 0.1148, "step": 37480 }, { "grad_norm": 0.14966526627540588, "learning_rate": 3.379370065986213e-05, "loss": 0.1183, "step": 37490 }, { "grad_norm": 0.1605735719203949, "learning_rate": 3.3767633031526955e-05, "loss": 0.1208, "step": 37500 }, { "grad_norm": 0.20874230563640594, "learning_rate": 3.374157033415626e-05, "loss": 0.1158, "step": 37510 }, { "grad_norm": 0.17613664269447327, "learning_rate": 3.371551257566723e-05, "loss": 0.1136, "step": 37520 }, { "grad_norm": 0.15261895954608917, "learning_rate": 3.36894597639755e-05, "loss": 0.1152, "step": 37530 }, { "grad_norm": 0.17491023242473602, "learning_rate": 3.366341190699523e-05, "loss": 0.1115, "step": 37540 }, { "grad_norm": 0.1619729995727539, "learning_rate": 3.36373690126391e-05, "loss": 0.1098, "step": 37550 }, { "grad_norm": 0.15133036673069, "learning_rate": 3.3611331088818234e-05, "loss": 0.1094, "step": 37560 }, { "grad_norm": 0.1735580414533615, "learning_rate": 3.3585298143442265e-05, "loss": 0.1151, "step": 37570 }, { "grad_norm": 0.2029537409543991, "learning_rate": 3.35592701844193e-05, "loss": 0.1098, "step": 37580 }, { "grad_norm": 0.20482021570205688, "learning_rate": 3.353324721965596e-05, "loss": 0.1212, "step": 37590 }, { "grad_norm": 0.20228266716003418, "learning_rate": 3.350722925705736e-05, "loss": 0.1207, "step": 37600 }, { "grad_norm": 0.15523633360862732, "learning_rate": 3.348121630452703e-05, "loss": 0.1121, "step": 37610 }, { "grad_norm": 0.1676321029663086, "learning_rate": 3.3455208369967044e-05, "loss": 0.1103, "step": 37620 }, { "grad_norm": 0.17217214405536652, "learning_rate": 3.34292054612779e-05, "loss": 0.1105, "step": 37630 }, { "grad_norm": 0.19833329319953918, "learning_rate": 3.340320758635861e-05, "loss": 0.1234, "step": 37640 }, { "grad_norm": 0.18287986516952515, "learning_rate": 3.337721475310666e-05, "loss": 0.1199, "step": 37650 }, { "grad_norm": 0.18385714292526245, "learning_rate": 3.335122696941795e-05, "loss": 0.1112, "step": 37660 }, { "grad_norm": 0.17018334567546844, "learning_rate": 3.332524424318692e-05, "loss": 0.1147, "step": 37670 }, { "grad_norm": 0.1692025065422058, "learning_rate": 3.32992665823064e-05, "loss": 0.1167, "step": 37680 }, { "grad_norm": 0.1805708110332489, "learning_rate": 3.327329399466774e-05, "loss": 0.1085, "step": 37690 }, { "grad_norm": 0.20939113199710846, "learning_rate": 3.324732648816072e-05, "loss": 0.1201, "step": 37700 }, { "grad_norm": 0.18941515684127808, "learning_rate": 3.322136407067358e-05, "loss": 0.1137, "step": 37710 }, { "grad_norm": 0.13343532383441925, "learning_rate": 3.3195406750093036e-05, "loss": 0.1173, "step": 37720 }, { "grad_norm": 0.1962229162454605, "learning_rate": 3.3169454534304205e-05, "loss": 0.1157, "step": 37730 }, { "grad_norm": 0.15630969405174255, "learning_rate": 3.3143507431190725e-05, "loss": 0.114, "step": 37740 }, { "grad_norm": 0.2060731053352356, "learning_rate": 3.311756544863459e-05, "loss": 0.1113, "step": 37750 }, { "grad_norm": 0.16436129808425903, "learning_rate": 3.309162859451633e-05, "loss": 0.1107, "step": 37760 }, { "grad_norm": 0.12147890776395798, "learning_rate": 3.306569687671487e-05, "loss": 0.113, "step": 37770 }, { "grad_norm": 0.15232378244400024, "learning_rate": 3.303977030310756e-05, "loss": 0.108, "step": 37780 }, { "grad_norm": 0.20000338554382324, "learning_rate": 3.3013848881570245e-05, "loss": 0.1144, "step": 37790 }, { "grad_norm": 0.1645292341709137, "learning_rate": 3.298793261997712e-05, "loss": 0.1099, "step": 37800 }, { "grad_norm": 0.18287430703639984, "learning_rate": 3.2962021526200893e-05, "loss": 0.1129, "step": 37810 }, { "grad_norm": 0.15433213114738464, "learning_rate": 3.293611560811268e-05, "loss": 0.1158, "step": 37820 }, { "grad_norm": 0.2029055804014206, "learning_rate": 3.291021487358199e-05, "loss": 0.1189, "step": 37830 }, { "grad_norm": 0.18673492968082428, "learning_rate": 3.28843193304768e-05, "loss": 0.1179, "step": 37840 }, { "grad_norm": 0.15747246146202087, "learning_rate": 3.2858428986663456e-05, "loss": 0.1121, "step": 37850 }, { "grad_norm": 0.1662473976612091, "learning_rate": 3.283254385000681e-05, "loss": 0.1159, "step": 37860 }, { "grad_norm": 0.17564664781093597, "learning_rate": 3.2806663928370076e-05, "loss": 0.1164, "step": 37870 }, { "grad_norm": 0.16275785863399506, "learning_rate": 3.278078922961485e-05, "loss": 0.1022, "step": 37880 }, { "grad_norm": 0.1534149944782257, "learning_rate": 3.275491976160123e-05, "loss": 0.1102, "step": 37890 }, { "grad_norm": 0.19671675562858582, "learning_rate": 3.2729055532187645e-05, "loss": 0.112, "step": 37900 }, { "grad_norm": 0.22477781772613525, "learning_rate": 3.270319654923097e-05, "loss": 0.1115, "step": 37910 }, { "grad_norm": 0.16597090661525726, "learning_rate": 3.2677342820586506e-05, "loss": 0.1181, "step": 37920 }, { "grad_norm": 0.22146670520305634, "learning_rate": 3.2651494354107905e-05, "loss": 0.1125, "step": 37930 }, { "grad_norm": 0.18089738488197327, "learning_rate": 3.2625651157647266e-05, "loss": 0.1084, "step": 37940 }, { "grad_norm": 0.15995442867279053, "learning_rate": 3.259981323905505e-05, "loss": 0.113, "step": 37950 }, { "grad_norm": 0.1734006106853485, "learning_rate": 3.257398060618014e-05, "loss": 0.1164, "step": 37960 }, { "grad_norm": 0.23734056949615479, "learning_rate": 3.254815326686983e-05, "loss": 0.1155, "step": 37970 }, { "grad_norm": 0.1900714486837387, "learning_rate": 3.2522331228969774e-05, "loss": 0.1128, "step": 37980 }, { "grad_norm": 0.19185630977153778, "learning_rate": 3.2496514500324006e-05, "loss": 0.1093, "step": 37990 }, { "grad_norm": 0.18153473734855652, "learning_rate": 3.247070308877498e-05, "loss": 0.1147, "step": 38000 }, { "grad_norm": 0.19516925513744354, "learning_rate": 3.2444897002163515e-05, "loss": 0.1143, "step": 38010 }, { "grad_norm": 0.19339346885681152, "learning_rate": 3.241909624832885e-05, "loss": 0.1121, "step": 38020 }, { "grad_norm": 0.14510531723499298, "learning_rate": 3.239330083510852e-05, "loss": 0.1111, "step": 38030 }, { "grad_norm": 0.18466563522815704, "learning_rate": 3.236751077033855e-05, "loss": 0.1153, "step": 38040 }, { "grad_norm": 0.12592484056949615, "learning_rate": 3.234172606185322e-05, "loss": 0.1142, "step": 38050 }, { "grad_norm": 0.18041828274726868, "learning_rate": 3.231594671748528e-05, "loss": 0.1146, "step": 38060 }, { "grad_norm": 0.1926681101322174, "learning_rate": 3.2290172745065815e-05, "loss": 0.125, "step": 38070 }, { "grad_norm": 0.17454302310943604, "learning_rate": 3.226440415242426e-05, "loss": 0.1222, "step": 38080 }, { "grad_norm": 0.2001204490661621, "learning_rate": 3.223864094738846e-05, "loss": 0.1131, "step": 38090 }, { "grad_norm": 0.188945934176445, "learning_rate": 3.221288313778456e-05, "loss": 0.1198, "step": 38100 }, { "grad_norm": 0.2595844566822052, "learning_rate": 3.2187130731437125e-05, "loss": 0.1215, "step": 38110 }, { "grad_norm": 0.18383632600307465, "learning_rate": 3.216138373616905e-05, "loss": 0.1172, "step": 38120 }, { "grad_norm": 0.19809237122535706, "learning_rate": 3.21356421598016e-05, "loss": 0.119, "step": 38130 }, { "grad_norm": 0.15610243380069733, "learning_rate": 3.210990601015438e-05, "loss": 0.1084, "step": 38140 }, { "grad_norm": 0.19397033751010895, "learning_rate": 3.208417529504535e-05, "loss": 0.1154, "step": 38150 }, { "grad_norm": 0.16204942762851715, "learning_rate": 3.205845002229084e-05, "loss": 0.1165, "step": 38160 }, { "grad_norm": 0.17962566018104553, "learning_rate": 3.203273019970547e-05, "loss": 0.1166, "step": 38170 }, { "grad_norm": 0.16969826817512512, "learning_rate": 3.200701583510227e-05, "loss": 0.112, "step": 38180 }, { "grad_norm": 0.17070050537586212, "learning_rate": 3.198130693629261e-05, "loss": 0.1147, "step": 38190 }, { "grad_norm": 0.22916370630264282, "learning_rate": 3.195560351108612e-05, "loss": 0.1239, "step": 38200 }, { "grad_norm": 0.21370436251163483, "learning_rate": 3.1929905567290865e-05, "loss": 0.1273, "step": 38210 }, { "grad_norm": 0.19002754986286163, "learning_rate": 3.1904213112713164e-05, "loss": 0.1186, "step": 38220 }, { "grad_norm": 0.1995343267917633, "learning_rate": 3.187852615515774e-05, "loss": 0.1176, "step": 38230 }, { "grad_norm": 0.1521925926208496, "learning_rate": 3.1852844702427606e-05, "loss": 0.1127, "step": 38240 }, { "grad_norm": 0.20110304653644562, "learning_rate": 3.18271687623241e-05, "loss": 0.1163, "step": 38250 }, { "grad_norm": 0.19302524626255035, "learning_rate": 3.1801498342646896e-05, "loss": 0.115, "step": 38260 }, { "grad_norm": 0.18158049881458282, "learning_rate": 3.177583345119398e-05, "loss": 0.1126, "step": 38270 }, { "grad_norm": 0.21951907873153687, "learning_rate": 3.17501740957617e-05, "loss": 0.1215, "step": 38280 }, { "grad_norm": 0.15288269519805908, "learning_rate": 3.172452028414467e-05, "loss": 0.1151, "step": 38290 }, { "grad_norm": 0.18432939052581787, "learning_rate": 3.169887202413583e-05, "loss": 0.1136, "step": 38300 }, { "grad_norm": 0.18656140565872192, "learning_rate": 3.167322932352646e-05, "loss": 0.1265, "step": 38310 }, { "grad_norm": 0.14348533749580383, "learning_rate": 3.164759219010613e-05, "loss": 0.105, "step": 38320 }, { "grad_norm": 0.1532903015613556, "learning_rate": 3.1621960631662725e-05, "loss": 0.1067, "step": 38330 }, { "grad_norm": 0.2059893161058426, "learning_rate": 3.159633465598245e-05, "loss": 0.1116, "step": 38340 }, { "grad_norm": 0.18836423754692078, "learning_rate": 3.1570714270849767e-05, "loss": 0.1184, "step": 38350 }, { "grad_norm": 0.1762850135564804, "learning_rate": 3.1545099484047516e-05, "loss": 0.1137, "step": 38360 }, { "grad_norm": 0.1982356160879135, "learning_rate": 3.151949030335674e-05, "loss": 0.1192, "step": 38370 }, { "grad_norm": 0.16846595704555511, "learning_rate": 3.149388673655687e-05, "loss": 0.1149, "step": 38380 }, { "grad_norm": 0.18292540311813354, "learning_rate": 3.146828879142559e-05, "loss": 0.1226, "step": 38390 }, { "grad_norm": 0.15401200950145721, "learning_rate": 3.1442696475738866e-05, "loss": 0.1191, "step": 38400 }, { "grad_norm": 0.18494541943073273, "learning_rate": 3.141710979727098e-05, "loss": 0.1107, "step": 38410 }, { "grad_norm": 0.18226176500320435, "learning_rate": 3.139152876379447e-05, "loss": 0.1065, "step": 38420 }, { "grad_norm": 0.20225317776203156, "learning_rate": 3.1365953383080214e-05, "loss": 0.1145, "step": 38430 }, { "grad_norm": 0.19994445145130157, "learning_rate": 3.134038366289731e-05, "loss": 0.1171, "step": 38440 }, { "grad_norm": 0.234098881483078, "learning_rate": 3.131481961101317e-05, "loss": 0.1249, "step": 38450 }, { "grad_norm": 0.2286107987165451, "learning_rate": 3.128926123519349e-05, "loss": 0.1136, "step": 38460 }, { "grad_norm": 0.1552436798810959, "learning_rate": 3.1263708543202194e-05, "loss": 0.1102, "step": 38470 }, { "grad_norm": 0.15919995307922363, "learning_rate": 3.123816154280155e-05, "loss": 0.1208, "step": 38480 }, { "grad_norm": 0.2101457715034485, "learning_rate": 3.121262024175207e-05, "loss": 0.1186, "step": 38490 }, { "grad_norm": 0.1622799187898636, "learning_rate": 3.118708464781248e-05, "loss": 0.1275, "step": 38500 }, { "grad_norm": 0.1698935478925705, "learning_rate": 3.116155476873987e-05, "loss": 0.1159, "step": 38510 }, { "grad_norm": 0.17723028361797333, "learning_rate": 3.11360306122895e-05, "loss": 0.1202, "step": 38520 }, { "grad_norm": 0.1563873589038849, "learning_rate": 3.1110512186214975e-05, "loss": 0.1068, "step": 38530 }, { "grad_norm": 0.16692779958248138, "learning_rate": 3.1084999498268095e-05, "loss": 0.1045, "step": 38540 }, { "grad_norm": 0.18873237073421478, "learning_rate": 3.1059492556198934e-05, "loss": 0.1179, "step": 38550 }, { "grad_norm": 0.18464729189872742, "learning_rate": 3.103399136775586e-05, "loss": 0.1206, "step": 38560 }, { "grad_norm": 0.17609232664108276, "learning_rate": 3.100849594068541e-05, "loss": 0.1184, "step": 38570 }, { "grad_norm": 0.17106343805789948, "learning_rate": 3.0983006282732484e-05, "loss": 0.113, "step": 38580 }, { "grad_norm": 0.17688226699829102, "learning_rate": 3.0957522401640116e-05, "loss": 0.1148, "step": 38590 }, { "grad_norm": 0.1717468947172165, "learning_rate": 3.0932044305149645e-05, "loss": 0.12, "step": 38600 }, { "grad_norm": 0.1823488026857376, "learning_rate": 3.090657200100068e-05, "loss": 0.1131, "step": 38610 }, { "grad_norm": 0.1309354305267334, "learning_rate": 3.088110549693099e-05, "loss": 0.1048, "step": 38620 }, { "grad_norm": 0.16415035724639893, "learning_rate": 3.085564480067667e-05, "loss": 0.113, "step": 38630 }, { "grad_norm": 0.19354897737503052, "learning_rate": 3.0830189919971955e-05, "loss": 0.1165, "step": 38640 }, { "grad_norm": 0.18565532565116882, "learning_rate": 3.080474086254939e-05, "loss": 0.1177, "step": 38650 }, { "grad_norm": 0.1908581554889679, "learning_rate": 3.077929763613975e-05, "loss": 0.111, "step": 38660 }, { "grad_norm": 0.18579010665416718, "learning_rate": 3.075386024847198e-05, "loss": 0.1151, "step": 38670 }, { "grad_norm": 0.17255905270576477, "learning_rate": 3.072842870727331e-05, "loss": 0.1155, "step": 38680 }, { "grad_norm": 0.20782768726348877, "learning_rate": 3.070300302026916e-05, "loss": 0.1147, "step": 38690 }, { "grad_norm": 0.17977607250213623, "learning_rate": 3.067758319518318e-05, "loss": 0.1101, "step": 38700 }, { "grad_norm": 0.16724838316440582, "learning_rate": 3.065216923973725e-05, "loss": 0.1138, "step": 38710 }, { "grad_norm": 0.13702434301376343, "learning_rate": 3.062676116165145e-05, "loss": 0.1105, "step": 38720 }, { "grad_norm": 0.19772027432918549, "learning_rate": 3.06013589686441e-05, "loss": 0.1096, "step": 38730 }, { "grad_norm": 0.17440897226333618, "learning_rate": 3.05759626684317e-05, "loss": 0.116, "step": 38740 }, { "grad_norm": 0.17783544957637787, "learning_rate": 3.055057226872896e-05, "loss": 0.1077, "step": 38750 }, { "grad_norm": 0.18277600407600403, "learning_rate": 3.052518777724887e-05, "loss": 0.1084, "step": 38760 }, { "grad_norm": 0.23269619047641754, "learning_rate": 3.04998092017025e-05, "loss": 0.1128, "step": 38770 }, { "grad_norm": 0.27947255969047546, "learning_rate": 3.0474436549799246e-05, "loss": 0.1157, "step": 38780 }, { "grad_norm": 0.21705637872219086, "learning_rate": 3.044906982924661e-05, "loss": 0.1153, "step": 38790 }, { "grad_norm": 0.20596317946910858, "learning_rate": 3.0423709047750337e-05, "loss": 0.112, "step": 38800 }, { "grad_norm": 0.15843643248081207, "learning_rate": 3.03983542130144e-05, "loss": 0.1112, "step": 38810 }, { "grad_norm": 0.20636513829231262, "learning_rate": 3.0373005332740877e-05, "loss": 0.1116, "step": 38820 }, { "grad_norm": 0.14324547350406647, "learning_rate": 3.034766241463013e-05, "loss": 0.1078, "step": 38830 }, { "grad_norm": 0.1710210144519806, "learning_rate": 3.032232546638064e-05, "loss": 0.1238, "step": 38840 }, { "grad_norm": 0.17326971888542175, "learning_rate": 3.0296994495689114e-05, "loss": 0.1105, "step": 38850 }, { "grad_norm": 0.1305495649576187, "learning_rate": 3.0271669510250444e-05, "loss": 0.1128, "step": 38860 }, { "grad_norm": 0.17440740764141083, "learning_rate": 3.024635051775766e-05, "loss": 0.1248, "step": 38870 }, { "grad_norm": 0.20146067440509796, "learning_rate": 3.022103752590205e-05, "loss": 0.1193, "step": 38880 }, { "grad_norm": 0.16205622255802155, "learning_rate": 3.0195730542372992e-05, "loss": 0.1108, "step": 38890 }, { "grad_norm": 0.16281405091285706, "learning_rate": 3.0170429574858084e-05, "loss": 0.1062, "step": 38900 }, { "grad_norm": 0.14997996389865875, "learning_rate": 3.0145134631043127e-05, "loss": 0.1157, "step": 38910 }, { "grad_norm": 0.17083202302455902, "learning_rate": 3.0119845718612018e-05, "loss": 0.1176, "step": 38920 }, { "grad_norm": 0.15955917537212372, "learning_rate": 3.009456284524688e-05, "loss": 0.112, "step": 38930 }, { "grad_norm": 0.21765759587287903, "learning_rate": 3.0069286018627967e-05, "loss": 0.1147, "step": 38940 }, { "grad_norm": 0.1511438488960266, "learning_rate": 3.0044015246433743e-05, "loss": 0.1133, "step": 38950 }, { "grad_norm": 0.24416539072990417, "learning_rate": 3.0018750536340755e-05, "loss": 0.1217, "step": 38960 }, { "grad_norm": 0.1796288788318634, "learning_rate": 2.999349189602378e-05, "loss": 0.1161, "step": 38970 }, { "grad_norm": 0.17860651016235352, "learning_rate": 2.9968239333155733e-05, "loss": 0.1223, "step": 38980 }, { "grad_norm": 0.17763040959835052, "learning_rate": 2.994299285540767e-05, "loss": 0.1138, "step": 38990 }, { "grad_norm": 0.17284606397151947, "learning_rate": 2.9917752470448813e-05, "loss": 0.1174, "step": 39000 }, { "grad_norm": 0.24619194865226746, "learning_rate": 2.9892518185946495e-05, "loss": 0.1228, "step": 39010 }, { "grad_norm": 0.20512934029102325, "learning_rate": 2.986729000956624e-05, "loss": 0.1264, "step": 39020 }, { "grad_norm": 0.19742442667484283, "learning_rate": 2.9842067948971736e-05, "loss": 0.1153, "step": 39030 }, { "grad_norm": 0.20225957036018372, "learning_rate": 2.9816852011824727e-05, "loss": 0.1163, "step": 39040 }, { "grad_norm": 0.2068500518798828, "learning_rate": 2.979164220578519e-05, "loss": 0.1209, "step": 39050 }, { "grad_norm": 0.1856566220521927, "learning_rate": 2.9766438538511165e-05, "loss": 0.1152, "step": 39060 }, { "grad_norm": 0.18743719160556793, "learning_rate": 2.9741241017658873e-05, "loss": 0.1152, "step": 39070 }, { "grad_norm": 0.16044433414936066, "learning_rate": 2.971604965088267e-05, "loss": 0.1117, "step": 39080 }, { "grad_norm": 0.20254477858543396, "learning_rate": 2.9690864445835008e-05, "loss": 0.1187, "step": 39090 }, { "grad_norm": 0.19305503368377686, "learning_rate": 2.966568541016651e-05, "loss": 0.1159, "step": 39100 }, { "grad_norm": 0.22801737487316132, "learning_rate": 2.9640512551525867e-05, "loss": 0.122, "step": 39110 }, { "grad_norm": 0.1745585799217224, "learning_rate": 2.961534587755995e-05, "loss": 0.1149, "step": 39120 }, { "grad_norm": 0.16976520419120789, "learning_rate": 2.959018539591375e-05, "loss": 0.1158, "step": 39130 }, { "grad_norm": 0.19531933963298798, "learning_rate": 2.9565031114230325e-05, "loss": 0.1215, "step": 39140 }, { "grad_norm": 0.17521534860134125, "learning_rate": 2.9539883040150895e-05, "loss": 0.1135, "step": 39150 }, { "grad_norm": 0.1881144642829895, "learning_rate": 2.9514741181314774e-05, "loss": 0.1148, "step": 39160 }, { "grad_norm": 0.15067915618419647, "learning_rate": 2.94896055453594e-05, "loss": 0.1125, "step": 39170 }, { "grad_norm": 0.1679619401693344, "learning_rate": 2.9464476139920332e-05, "loss": 0.1079, "step": 39180 }, { "grad_norm": 0.15564760565757751, "learning_rate": 2.9439352972631186e-05, "loss": 0.1161, "step": 39190 }, { "grad_norm": 0.16873423755168915, "learning_rate": 2.9414236051123757e-05, "loss": 0.1134, "step": 39200 }, { "grad_norm": 0.17949311435222626, "learning_rate": 2.938912538302785e-05, "loss": 0.1131, "step": 39210 }, { "grad_norm": 0.1778033971786499, "learning_rate": 2.9364020975971464e-05, "loss": 0.1166, "step": 39220 }, { "grad_norm": 0.18122683465480804, "learning_rate": 2.9338922837580657e-05, "loss": 0.1041, "step": 39230 }, { "grad_norm": 0.189020574092865, "learning_rate": 2.931383097547955e-05, "loss": 0.1157, "step": 39240 }, { "grad_norm": 0.2222643494606018, "learning_rate": 2.928874539729043e-05, "loss": 0.1171, "step": 39250 }, { "grad_norm": 0.15444447100162506, "learning_rate": 2.926366611063358e-05, "loss": 0.1152, "step": 39260 }, { "grad_norm": 0.17570362985134125, "learning_rate": 2.9238593123127463e-05, "loss": 0.1108, "step": 39270 }, { "grad_norm": 0.14614172279834747, "learning_rate": 2.9213526442388583e-05, "loss": 0.1169, "step": 39280 }, { "grad_norm": 0.15219731628894806, "learning_rate": 2.9188466076031545e-05, "loss": 0.11, "step": 39290 }, { "grad_norm": 0.18900607526302338, "learning_rate": 2.9163412031669012e-05, "loss": 0.1127, "step": 39300 }, { "grad_norm": 0.1913411021232605, "learning_rate": 2.913836431691175e-05, "loss": 0.1147, "step": 39310 }, { "grad_norm": 0.17476791143417358, "learning_rate": 2.9113322939368583e-05, "loss": 0.1168, "step": 39320 }, { "grad_norm": 0.177902951836586, "learning_rate": 2.9088287906646427e-05, "loss": 0.1098, "step": 39330 }, { "grad_norm": 0.21437551081180573, "learning_rate": 2.906325922635024e-05, "loss": 0.1245, "step": 39340 }, { "grad_norm": 0.1824629157781601, "learning_rate": 2.903823690608313e-05, "loss": 0.1133, "step": 39350 }, { "grad_norm": 0.16901688277721405, "learning_rate": 2.9013220953446174e-05, "loss": 0.1099, "step": 39360 }, { "grad_norm": 0.14798346161842346, "learning_rate": 2.8988211376038564e-05, "loss": 0.1107, "step": 39370 }, { "grad_norm": 0.16735868155956268, "learning_rate": 2.8963208181457564e-05, "loss": 0.1178, "step": 39380 }, { "grad_norm": 0.11023800075054169, "learning_rate": 2.8938211377298453e-05, "loss": 0.1153, "step": 39390 }, { "grad_norm": 0.12647463381290436, "learning_rate": 2.8913220971154652e-05, "loss": 0.1115, "step": 39400 }, { "grad_norm": 0.15483886003494263, "learning_rate": 2.888823697061753e-05, "loss": 0.1151, "step": 39410 }, { "grad_norm": 0.17524592578411102, "learning_rate": 2.8863259383276618e-05, "loss": 0.1045, "step": 39420 }, { "grad_norm": 0.15205973386764526, "learning_rate": 2.8838288216719395e-05, "loss": 0.1186, "step": 39430 }, { "grad_norm": 0.18921810388565063, "learning_rate": 2.8813323478531484e-05, "loss": 0.1182, "step": 39440 }, { "grad_norm": 0.17486488819122314, "learning_rate": 2.8788365176296496e-05, "loss": 0.1156, "step": 39450 }, { "grad_norm": 0.2023821324110031, "learning_rate": 2.876341331759611e-05, "loss": 0.1143, "step": 39460 }, { "grad_norm": 0.13761906325817108, "learning_rate": 2.8738467910010036e-05, "loss": 0.1132, "step": 39470 }, { "grad_norm": 0.1573120355606079, "learning_rate": 2.8713528961116032e-05, "loss": 0.1126, "step": 39480 }, { "grad_norm": 0.18319450318813324, "learning_rate": 2.8688596478489875e-05, "loss": 0.118, "step": 39490 }, { "grad_norm": 0.1729348748922348, "learning_rate": 2.8663670469705434e-05, "loss": 0.1073, "step": 39500 }, { "grad_norm": 0.1378379911184311, "learning_rate": 2.8638750942334546e-05, "loss": 0.1157, "step": 39510 }, { "grad_norm": 0.18679848313331604, "learning_rate": 2.8613837903947115e-05, "loss": 0.1115, "step": 39520 }, { "grad_norm": 0.164638951420784, "learning_rate": 2.858893136211106e-05, "loss": 0.1138, "step": 39530 }, { "grad_norm": 0.14405861496925354, "learning_rate": 2.8564031324392315e-05, "loss": 0.1183, "step": 39540 }, { "grad_norm": 0.1808837354183197, "learning_rate": 2.85391377983549e-05, "loss": 0.1196, "step": 39550 }, { "grad_norm": 0.19837120175361633, "learning_rate": 2.851425079156075e-05, "loss": 0.1088, "step": 39560 }, { "grad_norm": 0.18048006296157837, "learning_rate": 2.848937031156994e-05, "loss": 0.1221, "step": 39570 }, { "grad_norm": 0.1911044716835022, "learning_rate": 2.846449636594044e-05, "loss": 0.1101, "step": 39580 }, { "grad_norm": 0.2032880336046219, "learning_rate": 2.843962896222836e-05, "loss": 0.1174, "step": 39590 }, { "grad_norm": 0.1970837265253067, "learning_rate": 2.8414768107987722e-05, "loss": 0.1113, "step": 39600 }, { "grad_norm": 0.158486008644104, "learning_rate": 2.838991381077061e-05, "loss": 0.1193, "step": 39610 }, { "grad_norm": 0.22456412017345428, "learning_rate": 2.83650660781271e-05, "loss": 0.1124, "step": 39620 }, { "grad_norm": 0.18232735991477966, "learning_rate": 2.8340224917605285e-05, "loss": 0.1179, "step": 39630 }, { "grad_norm": 0.15895119309425354, "learning_rate": 2.831539033675122e-05, "loss": 0.1091, "step": 39640 }, { "grad_norm": 0.1391766369342804, "learning_rate": 2.8290562343109038e-05, "loss": 0.1098, "step": 39650 }, { "grad_norm": 0.18646027147769928, "learning_rate": 2.826574094422082e-05, "loss": 0.1181, "step": 39660 }, { "grad_norm": 0.1903359740972519, "learning_rate": 2.8240926147626645e-05, "loss": 0.116, "step": 39670 }, { "grad_norm": 0.1651301085948944, "learning_rate": 2.8216117960864586e-05, "loss": 0.1102, "step": 39680 }, { "grad_norm": 0.1521131694316864, "learning_rate": 2.8191316391470703e-05, "loss": 0.1151, "step": 39690 }, { "grad_norm": 0.14715617895126343, "learning_rate": 2.816652144697911e-05, "loss": 0.1175, "step": 39700 }, { "grad_norm": 0.1681675910949707, "learning_rate": 2.8141733134921783e-05, "loss": 0.1206, "step": 39710 }, { "grad_norm": 0.20296958088874817, "learning_rate": 2.811695146282884e-05, "loss": 0.1158, "step": 39720 }, { "grad_norm": 0.16190862655639648, "learning_rate": 2.8092176438228212e-05, "loss": 0.1141, "step": 39730 }, { "grad_norm": 0.17378878593444824, "learning_rate": 2.806740806864598e-05, "loss": 0.1141, "step": 39740 }, { "grad_norm": 0.15416887402534485, "learning_rate": 2.804264636160604e-05, "loss": 0.1144, "step": 39750 }, { "grad_norm": 0.15781910717487335, "learning_rate": 2.8017891324630402e-05, "loss": 0.1136, "step": 39760 }, { "grad_norm": 0.1800372451543808, "learning_rate": 2.7993142965238976e-05, "loss": 0.1171, "step": 39770 }, { "grad_norm": 0.139891117811203, "learning_rate": 2.7968401290949665e-05, "loss": 0.1151, "step": 39780 }, { "grad_norm": 0.18535159528255463, "learning_rate": 2.7943666309278328e-05, "loss": 0.1133, "step": 39790 }, { "grad_norm": 0.12375925481319427, "learning_rate": 2.7918938027738783e-05, "loss": 0.1072, "step": 39800 }, { "grad_norm": 0.19290076196193695, "learning_rate": 2.789421645384287e-05, "loss": 0.125, "step": 39810 }, { "grad_norm": 0.16750742495059967, "learning_rate": 2.786950159510032e-05, "loss": 0.1102, "step": 39820 }, { "grad_norm": 0.1605137437582016, "learning_rate": 2.7844793459018876e-05, "loss": 0.1137, "step": 39830 }, { "grad_norm": 0.1562051922082901, "learning_rate": 2.7820092053104195e-05, "loss": 0.1092, "step": 39840 }, { "grad_norm": 0.1937750279903412, "learning_rate": 2.7795397384859933e-05, "loss": 0.1232, "step": 39850 }, { "grad_norm": 0.23650015890598297, "learning_rate": 2.7770709461787638e-05, "loss": 0.117, "step": 39860 }, { "grad_norm": 0.16471122205257416, "learning_rate": 2.7746028291386915e-05, "loss": 0.1158, "step": 39870 }, { "grad_norm": 0.1420920491218567, "learning_rate": 2.772135388115519e-05, "loss": 0.1096, "step": 39880 }, { "grad_norm": 0.15287138521671295, "learning_rate": 2.7696686238587945e-05, "loss": 0.1137, "step": 39890 }, { "grad_norm": 0.18459618091583252, "learning_rate": 2.7672025371178505e-05, "loss": 0.1155, "step": 39900 }, { "grad_norm": 0.17518974840641022, "learning_rate": 2.7647371286418238e-05, "loss": 0.1169, "step": 39910 }, { "grad_norm": 0.17439380288124084, "learning_rate": 2.762272399179639e-05, "loss": 0.1232, "step": 39920 }, { "grad_norm": 0.14077205955982208, "learning_rate": 2.7598083494800154e-05, "loss": 0.1131, "step": 39930 }, { "grad_norm": 0.1377743035554886, "learning_rate": 2.7573449802914664e-05, "loss": 0.1166, "step": 39940 }, { "grad_norm": 0.17099516093730927, "learning_rate": 2.7548822923622964e-05, "loss": 0.1115, "step": 39950 }, { "grad_norm": 0.20154549181461334, "learning_rate": 2.752420286440609e-05, "loss": 0.1128, "step": 39960 }, { "grad_norm": 0.15092740952968597, "learning_rate": 2.749958963274295e-05, "loss": 0.1074, "step": 39970 }, { "grad_norm": 0.14173413813114166, "learning_rate": 2.747498323611039e-05, "loss": 0.1109, "step": 39980 }, { "grad_norm": 0.17003405094146729, "learning_rate": 2.7450383681983184e-05, "loss": 0.1144, "step": 39990 }, { "grad_norm": 0.1487162560224533, "learning_rate": 2.742579097783403e-05, "loss": 0.1104, "step": 40000 }, { "grad_norm": 0.14584632217884064, "learning_rate": 2.7401205131133512e-05, "loss": 0.11, "step": 40010 }, { "grad_norm": 0.15060706436634064, "learning_rate": 2.7376626149350238e-05, "loss": 0.1105, "step": 40020 }, { "grad_norm": 0.1438777893781662, "learning_rate": 2.735205403995056e-05, "loss": 0.1127, "step": 40030 }, { "grad_norm": 0.16140048205852509, "learning_rate": 2.7327488810398917e-05, "loss": 0.1129, "step": 40040 }, { "grad_norm": 0.17747361958026886, "learning_rate": 2.7302930468157507e-05, "loss": 0.1128, "step": 40050 }, { "grad_norm": 0.1686062067747116, "learning_rate": 2.727837902068655e-05, "loss": 0.1181, "step": 40060 }, { "grad_norm": 0.1912623792886734, "learning_rate": 2.7253834475444123e-05, "loss": 0.1139, "step": 40070 }, { "grad_norm": 0.16058893501758575, "learning_rate": 2.7229296839886204e-05, "loss": 0.11, "step": 40080 }, { "grad_norm": 0.13165277242660522, "learning_rate": 2.720476612146668e-05, "loss": 0.1021, "step": 40090 }, { "grad_norm": 0.13083484768867493, "learning_rate": 2.7180242327637317e-05, "loss": 0.1061, "step": 40100 }, { "grad_norm": 0.1349671334028244, "learning_rate": 2.7155725465847826e-05, "loss": 0.1016, "step": 40110 }, { "grad_norm": 0.14855024218559265, "learning_rate": 2.713121554354578e-05, "loss": 0.1125, "step": 40120 }, { "grad_norm": 0.14877532422542572, "learning_rate": 2.7106712568176628e-05, "loss": 0.1086, "step": 40130 }, { "grad_norm": 0.15517203509807587, "learning_rate": 2.708221654718374e-05, "loss": 0.1169, "step": 40140 }, { "grad_norm": 0.14011937379837036, "learning_rate": 2.7057727488008357e-05, "loss": 0.117, "step": 40150 }, { "grad_norm": 0.15917344391345978, "learning_rate": 2.703324539808961e-05, "loss": 0.1144, "step": 40160 }, { "grad_norm": 0.19038310647010803, "learning_rate": 2.7008770284864505e-05, "loss": 0.1148, "step": 40170 }, { "grad_norm": 0.15207454562187195, "learning_rate": 2.6984302155767916e-05, "loss": 0.1114, "step": 40180 }, { "grad_norm": 0.193206325173378, "learning_rate": 2.6959841018232683e-05, "loss": 0.1131, "step": 40190 }, { "grad_norm": 0.15737080574035645, "learning_rate": 2.693538687968937e-05, "loss": 0.1122, "step": 40200 }, { "grad_norm": 0.18311521410942078, "learning_rate": 2.6910939747566556e-05, "loss": 0.112, "step": 40210 }, { "grad_norm": 0.16276542842388153, "learning_rate": 2.6886499629290607e-05, "loss": 0.1118, "step": 40220 }, { "grad_norm": 0.19168858230113983, "learning_rate": 2.6862066532285802e-05, "loss": 0.1151, "step": 40230 }, { "grad_norm": 0.1860428750514984, "learning_rate": 2.6837640463974262e-05, "loss": 0.113, "step": 40240 }, { "grad_norm": 0.16934487223625183, "learning_rate": 2.681322143177596e-05, "loss": 0.1065, "step": 40250 }, { "grad_norm": 0.18347863852977753, "learning_rate": 2.678880944310882e-05, "loss": 0.119, "step": 40260 }, { "grad_norm": 0.17574092745780945, "learning_rate": 2.6764404505388474e-05, "loss": 0.1142, "step": 40270 }, { "grad_norm": 0.17093797028064728, "learning_rate": 2.6740006626028558e-05, "loss": 0.1206, "step": 40280 }, { "grad_norm": 0.1692546159029007, "learning_rate": 2.671561581244048e-05, "loss": 0.1065, "step": 40290 }, { "grad_norm": 0.15914058685302734, "learning_rate": 2.6691232072033536e-05, "loss": 0.1101, "step": 40300 }, { "grad_norm": 0.17882676422595978, "learning_rate": 2.6666855412214852e-05, "loss": 0.1169, "step": 40310 }, { "grad_norm": 0.16904477775096893, "learning_rate": 2.664248584038942e-05, "loss": 0.1123, "step": 40320 }, { "grad_norm": 0.17546342313289642, "learning_rate": 2.6618123363960047e-05, "loss": 0.1112, "step": 40330 }, { "grad_norm": 0.14696362614631653, "learning_rate": 2.659376799032748e-05, "loss": 0.1129, "step": 40340 }, { "grad_norm": 0.1710735708475113, "learning_rate": 2.6569419726890145e-05, "loss": 0.1141, "step": 40350 }, { "grad_norm": 0.16745592653751373, "learning_rate": 2.654507858104447e-05, "loss": 0.1096, "step": 40360 }, { "grad_norm": 0.214598149061203, "learning_rate": 2.652074456018463e-05, "loss": 0.1126, "step": 40370 }, { "grad_norm": 0.18136604130268097, "learning_rate": 2.6496417671702646e-05, "loss": 0.112, "step": 40380 }, { "grad_norm": 0.16984225809574127, "learning_rate": 2.6472097922988427e-05, "loss": 0.11, "step": 40390 }, { "grad_norm": 0.18809273838996887, "learning_rate": 2.6447785321429607e-05, "loss": 0.1156, "step": 40400 }, { "grad_norm": 0.13981984555721283, "learning_rate": 2.6423479874411784e-05, "loss": 0.1131, "step": 40410 }, { "grad_norm": 0.2083340436220169, "learning_rate": 2.6399181589318234e-05, "loss": 0.1307, "step": 40420 }, { "grad_norm": 0.20444230735301971, "learning_rate": 2.6374890473530188e-05, "loss": 0.1171, "step": 40430 }, { "grad_norm": 0.17723578214645386, "learning_rate": 2.635060653442664e-05, "loss": 0.1202, "step": 40440 }, { "grad_norm": 0.19085174798965454, "learning_rate": 2.6326329779384395e-05, "loss": 0.1156, "step": 40450 }, { "grad_norm": 0.15236587822437286, "learning_rate": 2.63020602157781e-05, "loss": 0.1205, "step": 40460 }, { "grad_norm": 0.1931127905845642, "learning_rate": 2.62777978509802e-05, "loss": 0.1093, "step": 40470 }, { "grad_norm": 0.1712074875831604, "learning_rate": 2.6253542692360954e-05, "loss": 0.1134, "step": 40480 }, { "grad_norm": 0.12056834995746613, "learning_rate": 2.6229294747288458e-05, "loss": 0.1041, "step": 40490 }, { "grad_norm": 0.1761419177055359, "learning_rate": 2.6205054023128596e-05, "loss": 0.1184, "step": 40500 }, { "grad_norm": 0.14737151563167572, "learning_rate": 2.6180820527245043e-05, "loss": 0.1224, "step": 40510 }, { "grad_norm": 0.22672070562839508, "learning_rate": 2.6156594266999313e-05, "loss": 0.1169, "step": 40520 }, { "grad_norm": 0.1679825484752655, "learning_rate": 2.6132375249750672e-05, "loss": 0.1164, "step": 40530 }, { "grad_norm": 0.13544531166553497, "learning_rate": 2.6108163482856286e-05, "loss": 0.1077, "step": 40540 }, { "grad_norm": 0.15551169216632843, "learning_rate": 2.6083958973670964e-05, "loss": 0.1133, "step": 40550 }, { "grad_norm": 0.15929365158081055, "learning_rate": 2.6059761729547483e-05, "loss": 0.1107, "step": 40560 }, { "grad_norm": 0.16238397359848022, "learning_rate": 2.603557175783624e-05, "loss": 0.1123, "step": 40570 }, { "grad_norm": 0.15232840180397034, "learning_rate": 2.601138906588559e-05, "loss": 0.1163, "step": 40580 }, { "grad_norm": 0.18920135498046875, "learning_rate": 2.598721366104152e-05, "loss": 0.1192, "step": 40590 }, { "grad_norm": 0.1525282859802246, "learning_rate": 2.5963045550647945e-05, "loss": 0.11, "step": 40600 }, { "grad_norm": 0.18420109152793884, "learning_rate": 2.5938884742046466e-05, "loss": 0.1067, "step": 40610 }, { "grad_norm": 0.1365787386894226, "learning_rate": 2.5914731242576507e-05, "loss": 0.1195, "step": 40620 }, { "grad_norm": 0.16094467043876648, "learning_rate": 2.5890585059575268e-05, "loss": 0.1132, "step": 40630 }, { "grad_norm": 0.15818347036838531, "learning_rate": 2.5866446200377688e-05, "loss": 0.1084, "step": 40640 }, { "grad_norm": 0.1993614286184311, "learning_rate": 2.5842314672316566e-05, "loss": 0.1211, "step": 40650 }, { "grad_norm": 0.21340171992778778, "learning_rate": 2.581819048272239e-05, "loss": 0.1156, "step": 40660 }, { "grad_norm": 0.14479529857635498, "learning_rate": 2.5794073638923478e-05, "loss": 0.1109, "step": 40670 }, { "grad_norm": 0.1516212373971939, "learning_rate": 2.576996414824586e-05, "loss": 0.1054, "step": 40680 }, { "grad_norm": 0.17086389660835266, "learning_rate": 2.574586201801339e-05, "loss": 0.1178, "step": 40690 }, { "grad_norm": 0.16144154965877533, "learning_rate": 2.572176725554762e-05, "loss": 0.1104, "step": 40700 }, { "grad_norm": 0.16478702425956726, "learning_rate": 2.5697679868167966e-05, "loss": 0.1122, "step": 40710 }, { "grad_norm": 0.15890125930309296, "learning_rate": 2.5673599863191468e-05, "loss": 0.118, "step": 40720 }, { "grad_norm": 0.16189613938331604, "learning_rate": 2.564952724793306e-05, "loss": 0.1053, "step": 40730 }, { "grad_norm": 0.17337100207805634, "learning_rate": 2.5625462029705306e-05, "loss": 0.1153, "step": 40740 }, { "grad_norm": 0.17508287727832794, "learning_rate": 2.5601404215818624e-05, "loss": 0.1241, "step": 40750 }, { "grad_norm": 0.2615421712398529, "learning_rate": 2.5577353813581144e-05, "loss": 0.1163, "step": 40760 }, { "grad_norm": 0.17321103811264038, "learning_rate": 2.5553310830298733e-05, "loss": 0.1069, "step": 40770 }, { "grad_norm": 0.1766384243965149, "learning_rate": 2.5529275273275012e-05, "loss": 0.1177, "step": 40780 }, { "grad_norm": 0.12115247547626495, "learning_rate": 2.550524714981133e-05, "loss": 0.1247, "step": 40790 }, { "grad_norm": 0.1580183506011963, "learning_rate": 2.5481226467206837e-05, "loss": 0.1054, "step": 40800 }, { "grad_norm": 0.1354752779006958, "learning_rate": 2.5457213232758365e-05, "loss": 0.1126, "step": 40810 }, { "grad_norm": 0.1470417082309723, "learning_rate": 2.5433207453760498e-05, "loss": 0.1051, "step": 40820 }, { "grad_norm": 0.14218653738498688, "learning_rate": 2.5409209137505552e-05, "loss": 0.1056, "step": 40830 }, { "grad_norm": 0.15669794380664825, "learning_rate": 2.5385218291283597e-05, "loss": 0.1129, "step": 40840 }, { "grad_norm": 0.20379309356212616, "learning_rate": 2.5361234922382383e-05, "loss": 0.1143, "step": 40850 }, { "grad_norm": 0.19495193660259247, "learning_rate": 2.533725903808749e-05, "loss": 0.1152, "step": 40860 }, { "grad_norm": 0.24146519601345062, "learning_rate": 2.5313290645682085e-05, "loss": 0.1196, "step": 40870 }, { "grad_norm": 0.2064671367406845, "learning_rate": 2.52893297524472e-05, "loss": 0.1146, "step": 40880 }, { "grad_norm": 0.17201051115989685, "learning_rate": 2.526537636566145e-05, "loss": 0.1087, "step": 40890 }, { "grad_norm": 0.15848033130168915, "learning_rate": 2.5241430492601305e-05, "loss": 0.1127, "step": 40900 }, { "grad_norm": 0.15511149168014526, "learning_rate": 2.5217492140540867e-05, "loss": 0.1094, "step": 40910 }, { "grad_norm": 0.16584976017475128, "learning_rate": 2.5193561316751967e-05, "loss": 0.1089, "step": 40920 }, { "grad_norm": 0.18083100020885468, "learning_rate": 2.516963802850416e-05, "loss": 0.112, "step": 40930 }, { "grad_norm": 0.12796790897846222, "learning_rate": 2.5145722283064698e-05, "loss": 0.109, "step": 40940 }, { "grad_norm": 0.1658654361963272, "learning_rate": 2.5121814087698602e-05, "loss": 0.1189, "step": 40950 }, { "grad_norm": 0.1742628663778305, "learning_rate": 2.509791344966848e-05, "loss": 0.1167, "step": 40960 }, { "grad_norm": 0.15491874516010284, "learning_rate": 2.5074020376234768e-05, "loss": 0.1142, "step": 40970 }, { "grad_norm": 0.19761642813682556, "learning_rate": 2.5050134874655534e-05, "loss": 0.1161, "step": 40980 }, { "grad_norm": 0.14843164384365082, "learning_rate": 2.5026256952186566e-05, "loss": 0.111, "step": 40990 }, { "grad_norm": 0.15392237901687622, "learning_rate": 2.5002386616081335e-05, "loss": 0.1114, "step": 41000 }, { "grad_norm": 0.16924671828746796, "learning_rate": 2.497852387359103e-05, "loss": 0.1145, "step": 41010 }, { "grad_norm": 0.14356094598770142, "learning_rate": 2.4954668731964496e-05, "loss": 0.118, "step": 41020 }, { "grad_norm": 0.17260567843914032, "learning_rate": 2.4930821198448364e-05, "loss": 0.1149, "step": 41030 }, { "grad_norm": 0.15269352495670319, "learning_rate": 2.4906981280286796e-05, "loss": 0.1022, "step": 41040 }, { "grad_norm": 0.1445702314376831, "learning_rate": 2.488314898472179e-05, "loss": 0.113, "step": 41050 }, { "grad_norm": 0.17079563438892365, "learning_rate": 2.485932431899295e-05, "loss": 0.1138, "step": 41060 }, { "grad_norm": 0.16203315556049347, "learning_rate": 2.4835507290337584e-05, "loss": 0.115, "step": 41070 }, { "grad_norm": 0.18515200912952423, "learning_rate": 2.4811697905990672e-05, "loss": 0.1152, "step": 41080 }, { "grad_norm": 0.18594594299793243, "learning_rate": 2.4787896173184854e-05, "loss": 0.105, "step": 41090 }, { "grad_norm": 0.16839824616909027, "learning_rate": 2.4764102099150534e-05, "loss": 0.1122, "step": 41100 }, { "grad_norm": 0.19346627593040466, "learning_rate": 2.4740315691115644e-05, "loss": 0.1075, "step": 41110 }, { "grad_norm": 0.20319603383541107, "learning_rate": 2.4716536956305918e-05, "loss": 0.1089, "step": 41120 }, { "grad_norm": 0.1720401793718338, "learning_rate": 2.4692765901944697e-05, "loss": 0.1104, "step": 41130 }, { "grad_norm": 0.18389275670051575, "learning_rate": 2.4669002535253e-05, "loss": 0.1103, "step": 41140 }, { "grad_norm": 0.16037489473819733, "learning_rate": 2.46452468634495e-05, "loss": 0.1126, "step": 41150 }, { "grad_norm": 0.22546391189098358, "learning_rate": 2.462149889375055e-05, "loss": 0.1128, "step": 41160 }, { "grad_norm": 0.17275159060955048, "learning_rate": 2.459775863337014e-05, "loss": 0.1203, "step": 41170 }, { "grad_norm": 0.1399308741092682, "learning_rate": 2.4574026089519985e-05, "loss": 0.1046, "step": 41180 }, { "grad_norm": 0.16120316088199615, "learning_rate": 2.4550301269409333e-05, "loss": 0.1121, "step": 41190 }, { "grad_norm": 0.14056219160556793, "learning_rate": 2.4526584180245216e-05, "loss": 0.1215, "step": 41200 }, { "grad_norm": 0.16316522657871246, "learning_rate": 2.4502874829232236e-05, "loss": 0.112, "step": 41210 }, { "grad_norm": 0.1430041640996933, "learning_rate": 2.447917322357267e-05, "loss": 0.1125, "step": 41220 }, { "grad_norm": 0.15368354320526123, "learning_rate": 2.4455479370466443e-05, "loss": 0.1109, "step": 41230 }, { "grad_norm": 0.16528300940990448, "learning_rate": 2.4431793277111097e-05, "loss": 0.1178, "step": 41240 }, { "grad_norm": 0.13962562382221222, "learning_rate": 2.4408114950701905e-05, "loss": 0.1086, "step": 41250 }, { "grad_norm": 0.13152629137039185, "learning_rate": 2.4384444398431634e-05, "loss": 0.1139, "step": 41260 }, { "grad_norm": 0.14655588567256927, "learning_rate": 2.4360781627490837e-05, "loss": 0.109, "step": 41270 }, { "grad_norm": 0.15226484835147858, "learning_rate": 2.433712664506762e-05, "loss": 0.1156, "step": 41280 }, { "grad_norm": 0.167071133852005, "learning_rate": 2.431347945834774e-05, "loss": 0.1054, "step": 41290 }, { "grad_norm": 0.15129148960113525, "learning_rate": 2.428984007451458e-05, "loss": 0.1129, "step": 41300 }, { "grad_norm": 0.18695223331451416, "learning_rate": 2.426620850074917e-05, "loss": 0.1106, "step": 41310 }, { "grad_norm": 0.17905311286449432, "learning_rate": 2.424258474423014e-05, "loss": 0.1192, "step": 41320 }, { "grad_norm": 0.17377005517482758, "learning_rate": 2.421896881213382e-05, "loss": 0.1045, "step": 41330 }, { "grad_norm": 0.13471625745296478, "learning_rate": 2.419536071163402e-05, "loss": 0.1036, "step": 41340 }, { "grad_norm": 0.17949159443378448, "learning_rate": 2.417176044990233e-05, "loss": 0.119, "step": 41350 }, { "grad_norm": 0.17275752127170563, "learning_rate": 2.4148168034107855e-05, "loss": 0.1103, "step": 41360 }, { "grad_norm": 0.16702187061309814, "learning_rate": 2.4124583471417355e-05, "loss": 0.113, "step": 41370 }, { "grad_norm": 0.1636355072259903, "learning_rate": 2.41010067689952e-05, "loss": 0.1102, "step": 41380 }, { "grad_norm": 0.18707937002182007, "learning_rate": 2.4077437934003338e-05, "loss": 0.1155, "step": 41390 }, { "grad_norm": 0.16406425833702087, "learning_rate": 2.405387697360143e-05, "loss": 0.1077, "step": 41400 }, { "grad_norm": 0.17349228262901306, "learning_rate": 2.4030323894946595e-05, "loss": 0.1119, "step": 41410 }, { "grad_norm": 0.1345822960138321, "learning_rate": 2.40067787051937e-05, "loss": 0.1042, "step": 41420 }, { "grad_norm": 0.15782900154590607, "learning_rate": 2.3983241411495087e-05, "loss": 0.1208, "step": 41430 }, { "grad_norm": 0.13364717364311218, "learning_rate": 2.3959712021000823e-05, "loss": 0.1111, "step": 41440 }, { "grad_norm": 0.1614425927400589, "learning_rate": 2.3936190540858495e-05, "loss": 0.1171, "step": 41450 }, { "grad_norm": 0.16491125524044037, "learning_rate": 2.39126769782133e-05, "loss": 0.1119, "step": 41460 }, { "grad_norm": 0.1590110957622528, "learning_rate": 2.388917134020805e-05, "loss": 0.1119, "step": 41470 }, { "grad_norm": 0.1323145031929016, "learning_rate": 2.3865673633983128e-05, "loss": 0.1033, "step": 41480 }, { "grad_norm": 0.17451949417591095, "learning_rate": 2.3842183866676492e-05, "loss": 0.1113, "step": 41490 }, { "grad_norm": 0.1565479189157486, "learning_rate": 2.381870204542377e-05, "loss": 0.1081, "step": 41500 }, { "grad_norm": 0.16047561168670654, "learning_rate": 2.379522817735808e-05, "loss": 0.1064, "step": 41510 }, { "grad_norm": 0.13598328828811646, "learning_rate": 2.377176226961018e-05, "loss": 0.1041, "step": 41520 }, { "grad_norm": 0.1483488380908966, "learning_rate": 2.3748304329308384e-05, "loss": 0.1118, "step": 41530 }, { "grad_norm": 0.2265770137310028, "learning_rate": 2.372485436357858e-05, "loss": 0.1108, "step": 41540 }, { "grad_norm": 0.18827588856220245, "learning_rate": 2.3701412379544296e-05, "loss": 0.1171, "step": 41550 }, { "grad_norm": 0.16691653430461884, "learning_rate": 2.367797838432653e-05, "loss": 0.107, "step": 41560 }, { "grad_norm": 0.1902560442686081, "learning_rate": 2.3654552385043967e-05, "loss": 0.1202, "step": 41570 }, { "grad_norm": 0.16391156613826752, "learning_rate": 2.3631134388812742e-05, "loss": 0.107, "step": 41580 }, { "grad_norm": 0.1557219922542572, "learning_rate": 2.3607724402746684e-05, "loss": 0.1066, "step": 41590 }, { "grad_norm": 0.1521712690591812, "learning_rate": 2.35843224339571e-05, "loss": 0.1113, "step": 41600 }, { "grad_norm": 0.1963447779417038, "learning_rate": 2.3560928489552897e-05, "loss": 0.1168, "step": 41610 }, { "grad_norm": 0.18205322325229645, "learning_rate": 2.353754257664053e-05, "loss": 0.1179, "step": 41620 }, { "grad_norm": 0.17020447552204132, "learning_rate": 2.3514164702324037e-05, "loss": 0.105, "step": 41630 }, { "grad_norm": 0.20098662376403809, "learning_rate": 2.3490794873704963e-05, "loss": 0.1129, "step": 41640 }, { "grad_norm": 0.19478219747543335, "learning_rate": 2.3467433097882496e-05, "loss": 0.1186, "step": 41650 }, { "grad_norm": 0.22032995522022247, "learning_rate": 2.34440793819533e-05, "loss": 0.1131, "step": 41660 }, { "grad_norm": 0.19550302624702454, "learning_rate": 2.3420733733011617e-05, "loss": 0.113, "step": 41670 }, { "grad_norm": 0.13851511478424072, "learning_rate": 2.3397396158149243e-05, "loss": 0.1029, "step": 41680 }, { "grad_norm": 0.19113288819789886, "learning_rate": 2.3374066664455498e-05, "loss": 0.1151, "step": 41690 }, { "grad_norm": 0.18209396302700043, "learning_rate": 2.3350745259017315e-05, "loss": 0.1036, "step": 41700 }, { "grad_norm": 0.1977216601371765, "learning_rate": 2.332743194891906e-05, "loss": 0.1116, "step": 41710 }, { "grad_norm": 0.1550934910774231, "learning_rate": 2.330412674124276e-05, "loss": 0.1181, "step": 41720 }, { "grad_norm": 0.1830894649028778, "learning_rate": 2.328082964306786e-05, "loss": 0.1188, "step": 41730 }, { "grad_norm": 0.15978936851024628, "learning_rate": 2.325754066147145e-05, "loss": 0.1088, "step": 41740 }, { "grad_norm": 0.15769453346729279, "learning_rate": 2.32342598035281e-05, "loss": 0.1053, "step": 41750 }, { "grad_norm": 0.12252137064933777, "learning_rate": 2.321098707630991e-05, "loss": 0.1085, "step": 41760 }, { "grad_norm": 0.14149433374404907, "learning_rate": 2.318772248688652e-05, "loss": 0.1168, "step": 41770 }, { "grad_norm": 0.191127747297287, "learning_rate": 2.3164466042325107e-05, "loss": 0.1149, "step": 41780 }, { "grad_norm": 0.1577165573835373, "learning_rate": 2.3141217749690353e-05, "loss": 0.1129, "step": 41790 }, { "grad_norm": 0.19654813408851624, "learning_rate": 2.3117977616044466e-05, "loss": 0.1197, "step": 41800 }, { "grad_norm": 0.15691901743412018, "learning_rate": 2.309474564844722e-05, "loss": 0.1153, "step": 41810 }, { "grad_norm": 0.19530437886714935, "learning_rate": 2.307152185395585e-05, "loss": 0.1118, "step": 41820 }, { "grad_norm": 0.17328672111034393, "learning_rate": 2.3048306239625144e-05, "loss": 0.1209, "step": 41830 }, { "grad_norm": 0.17448127269744873, "learning_rate": 2.3025098812507378e-05, "loss": 0.116, "step": 41840 }, { "grad_norm": 0.17228682339191437, "learning_rate": 2.3001899579652366e-05, "loss": 0.1097, "step": 41850 }, { "grad_norm": 0.1885160654783249, "learning_rate": 2.2978708548107393e-05, "loss": 0.1094, "step": 41860 }, { "grad_norm": 0.19045375287532806, "learning_rate": 2.2955525724917348e-05, "loss": 0.1125, "step": 41870 }, { "grad_norm": 0.1933370679616928, "learning_rate": 2.2932351117124477e-05, "loss": 0.1058, "step": 41880 }, { "grad_norm": 0.178427591919899, "learning_rate": 2.29091847317687e-05, "loss": 0.1183, "step": 41890 }, { "grad_norm": 0.17546458542346954, "learning_rate": 2.2886026575887277e-05, "loss": 0.106, "step": 41900 }, { "grad_norm": 0.13385678827762604, "learning_rate": 2.2862876656515094e-05, "loss": 0.1147, "step": 41910 }, { "grad_norm": 0.1465286910533905, "learning_rate": 2.2839734980684464e-05, "loss": 0.1244, "step": 41920 }, { "grad_norm": 0.1403263956308365, "learning_rate": 2.281660155542522e-05, "loss": 0.1132, "step": 41930 }, { "grad_norm": 0.1510915905237198, "learning_rate": 2.279347638776469e-05, "loss": 0.1105, "step": 41940 }, { "grad_norm": 0.16123706102371216, "learning_rate": 2.2770359484727665e-05, "loss": 0.117, "step": 41950 }, { "grad_norm": 0.16617447137832642, "learning_rate": 2.27472508533365e-05, "loss": 0.1173, "step": 41960 }, { "grad_norm": 0.1580059677362442, "learning_rate": 2.2724150500610948e-05, "loss": 0.1106, "step": 41970 }, { "grad_norm": 0.16243228316307068, "learning_rate": 2.2701058433568302e-05, "loss": 0.1051, "step": 41980 }, { "grad_norm": 0.1265542060136795, "learning_rate": 2.2677974659223318e-05, "loss": 0.1086, "step": 41990 }, { "grad_norm": 0.14875507354736328, "learning_rate": 2.2654899184588235e-05, "loss": 0.1038, "step": 42000 }, { "grad_norm": 0.18470658361911774, "learning_rate": 2.2631832016672756e-05, "loss": 0.1167, "step": 42010 }, { "grad_norm": 0.18332581222057343, "learning_rate": 2.2608773162484127e-05, "loss": 0.1204, "step": 42020 }, { "grad_norm": 0.14157319068908691, "learning_rate": 2.2585722629026958e-05, "loss": 0.1154, "step": 42030 }, { "grad_norm": 0.14067302644252777, "learning_rate": 2.2562680423303457e-05, "loss": 0.1173, "step": 42040 }, { "grad_norm": 0.16854876279830933, "learning_rate": 2.2539646552313165e-05, "loss": 0.111, "step": 42050 }, { "grad_norm": 0.18526022136211395, "learning_rate": 2.251662102305322e-05, "loss": 0.1121, "step": 42060 }, { "grad_norm": 0.17730867862701416, "learning_rate": 2.2493603842518152e-05, "loss": 0.1252, "step": 42070 }, { "grad_norm": 0.1953623741865158, "learning_rate": 2.2470595017699974e-05, "loss": 0.1075, "step": 42080 }, { "grad_norm": 0.17926205694675446, "learning_rate": 2.244759455558816e-05, "loss": 0.1188, "step": 42090 }, { "grad_norm": 0.17324623465538025, "learning_rate": 2.2424602463169614e-05, "loss": 0.1151, "step": 42100 }, { "grad_norm": 0.1726798266172409, "learning_rate": 2.2401618747428776e-05, "loss": 0.1107, "step": 42110 }, { "grad_norm": 0.1516299545764923, "learning_rate": 2.237864341534747e-05, "loss": 0.1097, "step": 42120 }, { "grad_norm": 0.16043101251125336, "learning_rate": 2.2355676473904998e-05, "loss": 0.1082, "step": 42130 }, { "grad_norm": 0.12855760753154755, "learning_rate": 2.2332717930078108e-05, "loss": 0.1066, "step": 42140 }, { "grad_norm": 0.15669910609722137, "learning_rate": 2.2309767790840992e-05, "loss": 0.1189, "step": 42150 }, { "grad_norm": 0.12342929095029831, "learning_rate": 2.228682606316529e-05, "loss": 0.1114, "step": 42160 }, { "grad_norm": 0.13860735297203064, "learning_rate": 2.2263892754020138e-05, "loss": 0.1093, "step": 42170 }, { "grad_norm": 0.16511273384094238, "learning_rate": 2.2240967870372004e-05, "loss": 0.1111, "step": 42180 }, { "grad_norm": 0.17737460136413574, "learning_rate": 2.2218051419184933e-05, "loss": 0.1094, "step": 42190 }, { "grad_norm": 0.20116500556468964, "learning_rate": 2.219514340742026e-05, "loss": 0.1133, "step": 42200 }, { "grad_norm": 0.1888701468706131, "learning_rate": 2.2172243842036898e-05, "loss": 0.1112, "step": 42210 }, { "grad_norm": 0.1595032811164856, "learning_rate": 2.2149352729991107e-05, "loss": 0.1068, "step": 42220 }, { "grad_norm": 0.13059546053409576, "learning_rate": 2.2126470078236605e-05, "loss": 0.1081, "step": 42230 }, { "grad_norm": 0.13840100169181824, "learning_rate": 2.2103595893724533e-05, "loss": 0.1121, "step": 42240 }, { "grad_norm": 0.20247679948806763, "learning_rate": 2.208073018340345e-05, "loss": 0.1232, "step": 42250 }, { "grad_norm": 0.1506248414516449, "learning_rate": 2.2057872954219405e-05, "loss": 0.1114, "step": 42260 }, { "grad_norm": 0.11639156192541122, "learning_rate": 2.203502421311575e-05, "loss": 0.1099, "step": 42270 }, { "grad_norm": 0.1345321238040924, "learning_rate": 2.2012183967033388e-05, "loss": 0.1232, "step": 42280 }, { "grad_norm": 0.1510654091835022, "learning_rate": 2.198935222291056e-05, "loss": 0.1094, "step": 42290 }, { "grad_norm": 0.16759485006332397, "learning_rate": 2.1966528987682948e-05, "loss": 0.112, "step": 42300 }, { "grad_norm": 0.13633033633232117, "learning_rate": 2.194371426828365e-05, "loss": 0.1, "step": 42310 }, { "grad_norm": 0.1329352855682373, "learning_rate": 2.192090807164317e-05, "loss": 0.1154, "step": 42320 }, { "grad_norm": 0.1631133109331131, "learning_rate": 2.1898110404689422e-05, "loss": 0.1164, "step": 42330 }, { "grad_norm": 0.15213367342948914, "learning_rate": 2.1875321274347776e-05, "loss": 0.1203, "step": 42340 }, { "grad_norm": 0.1454755663871765, "learning_rate": 2.18525406875409e-05, "loss": 0.1104, "step": 42350 }, { "grad_norm": 0.17325951159000397, "learning_rate": 2.1829768651188997e-05, "loss": 0.116, "step": 42360 }, { "grad_norm": 0.16703234612941742, "learning_rate": 2.180700517220958e-05, "loss": 0.1111, "step": 42370 }, { "grad_norm": 0.17787861824035645, "learning_rate": 2.1784250257517603e-05, "loss": 0.1265, "step": 42380 }, { "grad_norm": 0.17005619406700134, "learning_rate": 2.1761503914025406e-05, "loss": 0.1072, "step": 42390 }, { "grad_norm": 0.16351741552352905, "learning_rate": 2.1738766148642705e-05, "loss": 0.1067, "step": 42400 }, { "grad_norm": 0.16943877935409546, "learning_rate": 2.1716036968276683e-05, "loss": 0.1039, "step": 42410 }, { "grad_norm": 0.13689880073070526, "learning_rate": 2.1693316379831808e-05, "loss": 0.1128, "step": 42420 }, { "grad_norm": 0.1739533692598343, "learning_rate": 2.1670604390210037e-05, "loss": 0.1103, "step": 42430 }, { "grad_norm": 0.13949620723724365, "learning_rate": 2.1647901006310656e-05, "loss": 0.1173, "step": 42440 }, { "grad_norm": 0.1456248164176941, "learning_rate": 2.1625206235030353e-05, "loss": 0.1116, "step": 42450 }, { "grad_norm": 0.16065306961536407, "learning_rate": 2.160252008326321e-05, "loss": 0.1178, "step": 42460 }, { "grad_norm": 0.14571118354797363, "learning_rate": 2.157984255790067e-05, "loss": 0.1155, "step": 42470 }, { "grad_norm": 0.15482401847839355, "learning_rate": 2.1557173665831553e-05, "loss": 0.1173, "step": 42480 }, { "grad_norm": 0.13323909044265747, "learning_rate": 2.153451341394212e-05, "loss": 0.1061, "step": 42490 }, { "grad_norm": 0.10407952964305878, "learning_rate": 2.151186180911589e-05, "loss": 0.1088, "step": 42500 }, { "grad_norm": 0.11094868928194046, "learning_rate": 2.1489218858233877e-05, "loss": 0.1114, "step": 42510 }, { "grad_norm": 0.10917127877473831, "learning_rate": 2.1466584568174392e-05, "loss": 0.1081, "step": 42520 }, { "grad_norm": 0.10914647579193115, "learning_rate": 2.1443958945813132e-05, "loss": 0.1188, "step": 42530 }, { "grad_norm": 0.17030157148838043, "learning_rate": 2.1421341998023163e-05, "loss": 0.108, "step": 42540 }, { "grad_norm": 0.1336601972579956, "learning_rate": 2.139873373167491e-05, "loss": 0.1127, "step": 42550 }, { "grad_norm": 0.18791203200817108, "learning_rate": 2.13761341536362e-05, "loss": 0.1112, "step": 42560 }, { "grad_norm": 0.18384087085723877, "learning_rate": 2.1353543270772136e-05, "loss": 0.1078, "step": 42570 }, { "grad_norm": 0.1896393895149231, "learning_rate": 2.1330961089945297e-05, "loss": 0.119, "step": 42580 }, { "grad_norm": 0.16888427734375, "learning_rate": 2.130838761801548e-05, "loss": 0.1159, "step": 42590 }, { "grad_norm": 0.14361757040023804, "learning_rate": 2.1285822861839966e-05, "loss": 0.1133, "step": 42600 }, { "grad_norm": 0.15379901230335236, "learning_rate": 2.126326682827331e-05, "loss": 0.1198, "step": 42610 }, { "grad_norm": 0.1454397737979889, "learning_rate": 2.124071952416744e-05, "loss": 0.1182, "step": 42620 }, { "grad_norm": 0.13812625408172607, "learning_rate": 2.1218180956371634e-05, "loss": 0.1101, "step": 42630 }, { "grad_norm": 0.16350069642066956, "learning_rate": 2.119565113173252e-05, "loss": 0.1106, "step": 42640 }, { "grad_norm": 0.14590254426002502, "learning_rate": 2.1173130057094033e-05, "loss": 0.1105, "step": 42650 }, { "grad_norm": 0.19698244333267212, "learning_rate": 2.115061773929753e-05, "loss": 0.1166, "step": 42660 }, { "grad_norm": 0.1504298448562622, "learning_rate": 2.1128114185181623e-05, "loss": 0.1196, "step": 42670 }, { "grad_norm": 0.1911879926919937, "learning_rate": 2.1105619401582317e-05, "loss": 0.1214, "step": 42680 }, { "grad_norm": 0.18015940487384796, "learning_rate": 2.1083133395332928e-05, "loss": 0.1169, "step": 42690 }, { "grad_norm": 0.15656857192516327, "learning_rate": 2.1060656173264082e-05, "loss": 0.1112, "step": 42700 }, { "grad_norm": 0.11893272399902344, "learning_rate": 2.103818774220383e-05, "loss": 0.1135, "step": 42710 }, { "grad_norm": 0.18443337082862854, "learning_rate": 2.1015728108977412e-05, "loss": 0.124, "step": 42720 }, { "grad_norm": 0.15422114729881287, "learning_rate": 2.0993277280407548e-05, "loss": 0.1017, "step": 42730 }, { "grad_norm": 0.17281194031238556, "learning_rate": 2.0970835263314132e-05, "loss": 0.1109, "step": 42740 }, { "grad_norm": 0.11077912151813507, "learning_rate": 2.094840206451451e-05, "loss": 0.1136, "step": 42750 }, { "grad_norm": 0.1753210574388504, "learning_rate": 2.0925977690823273e-05, "loss": 0.1081, "step": 42760 }, { "grad_norm": 0.14395849406719208, "learning_rate": 2.0903562149052364e-05, "loss": 0.1072, "step": 42770 }, { "grad_norm": 0.1649383157491684, "learning_rate": 2.0881155446011025e-05, "loss": 0.1079, "step": 42780 }, { "grad_norm": 0.15679877996444702, "learning_rate": 2.0858757588505823e-05, "loss": 0.115, "step": 42790 }, { "grad_norm": 0.19712699949741364, "learning_rate": 2.0836368583340622e-05, "loss": 0.1118, "step": 42800 }, { "grad_norm": 0.14590710401535034, "learning_rate": 2.081398843731664e-05, "loss": 0.1083, "step": 42810 }, { "grad_norm": 0.17271912097930908, "learning_rate": 2.0791617157232357e-05, "loss": 0.1144, "step": 42820 }, { "grad_norm": 0.13080282509326935, "learning_rate": 2.0769254749883576e-05, "loss": 0.1108, "step": 42830 }, { "grad_norm": 0.1641426682472229, "learning_rate": 2.0746901222063415e-05, "loss": 0.1113, "step": 42840 }, { "grad_norm": 0.17499466240406036, "learning_rate": 2.072455658056226e-05, "loss": 0.105, "step": 42850 }, { "grad_norm": 0.18334433436393738, "learning_rate": 2.0702220832167873e-05, "loss": 0.1063, "step": 42860 }, { "grad_norm": 0.1786607950925827, "learning_rate": 2.0679893983665205e-05, "loss": 0.108, "step": 42870 }, { "grad_norm": 0.1413411647081375, "learning_rate": 2.0657576041836622e-05, "loss": 0.1105, "step": 42880 }, { "grad_norm": 0.15591557323932648, "learning_rate": 2.0635267013461666e-05, "loss": 0.1119, "step": 42890 }, { "grad_norm": 0.15204180777072906, "learning_rate": 2.061296690531728e-05, "loss": 0.1154, "step": 42900 }, { "grad_norm": 0.12667182087898254, "learning_rate": 2.0590675724177622e-05, "loss": 0.1067, "step": 42910 }, { "grad_norm": 0.15597005188465118, "learning_rate": 2.0568393476814167e-05, "loss": 0.1181, "step": 42920 }, { "grad_norm": 0.11890352517366409, "learning_rate": 2.0546120169995685e-05, "loss": 0.105, "step": 42930 }, { "grad_norm": 0.15216805040836334, "learning_rate": 2.0523855810488214e-05, "loss": 0.12, "step": 42940 }, { "grad_norm": 0.1549912840127945, "learning_rate": 2.050160040505505e-05, "loss": 0.1142, "step": 42950 }, { "grad_norm": 0.14457967877388, "learning_rate": 2.0479353960456843e-05, "loss": 0.1121, "step": 42960 }, { "grad_norm": 0.16938713192939758, "learning_rate": 2.0457116483451456e-05, "loss": 0.1184, "step": 42970 }, { "grad_norm": 0.18320591747760773, "learning_rate": 2.0434887980794043e-05, "loss": 0.1166, "step": 42980 }, { "grad_norm": 0.14827963709831238, "learning_rate": 2.0412668459237043e-05, "loss": 0.1095, "step": 42990 }, { "grad_norm": 0.1172785609960556, "learning_rate": 2.039045792553016e-05, "loss": 0.1043, "step": 43000 }, { "grad_norm": 0.1332283765077591, "learning_rate": 2.036825638642036e-05, "loss": 0.1099, "step": 43010 }, { "grad_norm": 0.15411953628063202, "learning_rate": 2.0346063848651868e-05, "loss": 0.1115, "step": 43020 }, { "grad_norm": 0.17499692738056183, "learning_rate": 2.0323880318966254e-05, "loss": 0.1124, "step": 43030 }, { "grad_norm": 0.19950826466083527, "learning_rate": 2.030170580410221e-05, "loss": 0.1155, "step": 43040 }, { "grad_norm": 0.2036200910806656, "learning_rate": 2.0279540310795837e-05, "loss": 0.1069, "step": 43050 }, { "grad_norm": 0.14687447249889374, "learning_rate": 2.0257383845780365e-05, "loss": 0.1133, "step": 43060 }, { "grad_norm": 0.1656857281923294, "learning_rate": 2.0235236415786384e-05, "loss": 0.1122, "step": 43070 }, { "grad_norm": 0.1489773690700531, "learning_rate": 2.021309802754169e-05, "loss": 0.1051, "step": 43080 }, { "grad_norm": 0.15871946513652802, "learning_rate": 2.0190968687771332e-05, "loss": 0.1136, "step": 43090 }, { "grad_norm": 0.147311732172966, "learning_rate": 2.016884840319763e-05, "loss": 0.1083, "step": 43100 }, { "grad_norm": 0.14710462093353271, "learning_rate": 2.0146737180540122e-05, "loss": 0.1069, "step": 43110 }, { "grad_norm": 0.1741078495979309, "learning_rate": 2.012463502651564e-05, "loss": 0.1138, "step": 43120 }, { "grad_norm": 0.14987386763095856, "learning_rate": 2.0102541947838228e-05, "loss": 0.1102, "step": 43130 }, { "grad_norm": 0.165403351187706, "learning_rate": 2.0080457951219173e-05, "loss": 0.1107, "step": 43140 }, { "grad_norm": 0.15813769400119781, "learning_rate": 2.0058383043367017e-05, "loss": 0.1143, "step": 43150 }, { "grad_norm": 0.13078269362449646, "learning_rate": 2.0036317230987528e-05, "loss": 0.1111, "step": 43160 }, { "grad_norm": 0.13249528408050537, "learning_rate": 2.0014260520783696e-05, "loss": 0.1119, "step": 43170 }, { "grad_norm": 0.13900040090084076, "learning_rate": 1.9992212919455834e-05, "loss": 0.1077, "step": 43180 }, { "grad_norm": 0.13896676898002625, "learning_rate": 1.9970174433701333e-05, "loss": 0.1075, "step": 43190 }, { "grad_norm": 0.1345243901014328, "learning_rate": 1.9948145070214992e-05, "loss": 0.1107, "step": 43200 }, { "grad_norm": 0.15144795179367065, "learning_rate": 1.9926124835688663e-05, "loss": 0.1104, "step": 43210 }, { "grad_norm": 0.17428845167160034, "learning_rate": 1.9904113736811576e-05, "loss": 0.1219, "step": 43220 }, { "grad_norm": 0.165151447057724, "learning_rate": 1.9882111780270096e-05, "loss": 0.1134, "step": 43230 }, { "grad_norm": 0.17596925795078278, "learning_rate": 1.986011897274784e-05, "loss": 0.1082, "step": 43240 }, { "grad_norm": 0.14325889945030212, "learning_rate": 1.983813532092565e-05, "loss": 0.1065, "step": 43250 }, { "grad_norm": 0.1615118384361267, "learning_rate": 1.981616083148155e-05, "loss": 0.109, "step": 43260 }, { "grad_norm": 0.13429269194602966, "learning_rate": 1.9794195511090845e-05, "loss": 0.1094, "step": 43270 }, { "grad_norm": 0.15762944519519806, "learning_rate": 1.977223936642601e-05, "loss": 0.1116, "step": 43280 }, { "grad_norm": 0.1360010802745819, "learning_rate": 1.975029240415674e-05, "loss": 0.1169, "step": 43290 }, { "grad_norm": 0.12928389012813568, "learning_rate": 1.9728354630949936e-05, "loss": 0.1077, "step": 43300 }, { "grad_norm": 0.18680834770202637, "learning_rate": 1.9706426053469716e-05, "loss": 0.1143, "step": 43310 }, { "grad_norm": 0.13229598104953766, "learning_rate": 1.9684506678377396e-05, "loss": 0.103, "step": 43320 }, { "grad_norm": 0.1446089893579483, "learning_rate": 1.9662596512331544e-05, "loss": 0.1088, "step": 43330 }, { "grad_norm": 0.1362665742635727, "learning_rate": 1.964069556198782e-05, "loss": 0.113, "step": 43340 }, { "grad_norm": 0.16008929908275604, "learning_rate": 1.9618803833999232e-05, "loss": 0.1144, "step": 43350 }, { "grad_norm": 0.17116603255271912, "learning_rate": 1.9596921335015838e-05, "loss": 0.1117, "step": 43360 }, { "grad_norm": 0.15270733833312988, "learning_rate": 1.957504807168501e-05, "loss": 0.102, "step": 43370 }, { "grad_norm": 0.17028763890266418, "learning_rate": 1.9553184050651253e-05, "loss": 0.1081, "step": 43380 }, { "grad_norm": 0.17437255382537842, "learning_rate": 1.953132927855628e-05, "loss": 0.1151, "step": 43390 }, { "grad_norm": 0.15730096399784088, "learning_rate": 1.9509483762038995e-05, "loss": 0.1125, "step": 43400 }, { "grad_norm": 0.16687437891960144, "learning_rate": 1.9487647507735467e-05, "loss": 0.1065, "step": 43410 }, { "grad_norm": 0.14048653841018677, "learning_rate": 1.9465820522279032e-05, "loss": 0.1103, "step": 43420 }, { "grad_norm": 0.12773896753787994, "learning_rate": 1.9444002812300078e-05, "loss": 0.1139, "step": 43430 }, { "grad_norm": 0.1662198156118393, "learning_rate": 1.94221943844263e-05, "loss": 0.1145, "step": 43440 }, { "grad_norm": 0.16820034384727478, "learning_rate": 1.9400395245282515e-05, "loss": 0.1184, "step": 43450 }, { "grad_norm": 0.16262304782867432, "learning_rate": 1.937860540149071e-05, "loss": 0.1173, "step": 43460 }, { "grad_norm": 0.16684606671333313, "learning_rate": 1.9356824859670082e-05, "loss": 0.1127, "step": 43470 }, { "grad_norm": 0.12161793559789658, "learning_rate": 1.9335053626436967e-05, "loss": 0.1121, "step": 43480 }, { "grad_norm": 0.16286087036132812, "learning_rate": 1.9313291708404885e-05, "loss": 0.1228, "step": 43490 }, { "grad_norm": 0.16363060474395752, "learning_rate": 1.9291539112184587e-05, "loss": 0.1171, "step": 43500 }, { "grad_norm": 0.13570481538772583, "learning_rate": 1.9269795844383854e-05, "loss": 0.1083, "step": 43510 }, { "grad_norm": 0.15208160877227783, "learning_rate": 1.9248061911607777e-05, "loss": 0.117, "step": 43520 }, { "grad_norm": 0.12700368463993073, "learning_rate": 1.9226337320458538e-05, "loss": 0.11, "step": 43530 }, { "grad_norm": 0.1318783462047577, "learning_rate": 1.9204622077535488e-05, "loss": 0.112, "step": 43540 }, { "grad_norm": 0.22043979167938232, "learning_rate": 1.9182916189435147e-05, "loss": 0.1107, "step": 43550 }, { "grad_norm": 0.1969509869813919, "learning_rate": 1.916121966275117e-05, "loss": 0.1101, "step": 43560 }, { "grad_norm": 0.14710378646850586, "learning_rate": 1.9139532504074443e-05, "loss": 0.1054, "step": 43570 }, { "grad_norm": 0.1262083798646927, "learning_rate": 1.9117854719992885e-05, "loss": 0.1046, "step": 43580 }, { "grad_norm": 0.12516939640045166, "learning_rate": 1.9096186317091687e-05, "loss": 0.112, "step": 43590 }, { "grad_norm": 0.14050789177417755, "learning_rate": 1.9074527301953116e-05, "loss": 0.1127, "step": 43600 }, { "grad_norm": 0.14604438841342926, "learning_rate": 1.9052877681156607e-05, "loss": 0.1131, "step": 43610 }, { "grad_norm": 0.1211899071931839, "learning_rate": 1.903123746127875e-05, "loss": 0.1168, "step": 43620 }, { "grad_norm": 0.14107628166675568, "learning_rate": 1.900960664889327e-05, "loss": 0.1084, "step": 43630 }, { "grad_norm": 0.18519648909568787, "learning_rate": 1.8987985250571015e-05, "loss": 0.1103, "step": 43640 }, { "grad_norm": 0.15967901051044464, "learning_rate": 1.8966373272880054e-05, "loss": 0.1119, "step": 43650 }, { "grad_norm": 0.15806938707828522, "learning_rate": 1.8944770722385462e-05, "loss": 0.1138, "step": 43660 }, { "grad_norm": 0.17695188522338867, "learning_rate": 1.8923177605649576e-05, "loss": 0.106, "step": 43670 }, { "grad_norm": 0.1501394808292389, "learning_rate": 1.8901593929231802e-05, "loss": 0.1148, "step": 43680 }, { "grad_norm": 0.1808118373155594, "learning_rate": 1.8880019699688684e-05, "loss": 0.1072, "step": 43690 }, { "grad_norm": 0.14153094589710236, "learning_rate": 1.8858454923573904e-05, "loss": 0.1121, "step": 43700 }, { "grad_norm": 0.17644870281219482, "learning_rate": 1.8836899607438253e-05, "loss": 0.1184, "step": 43710 }, { "grad_norm": 0.154730424284935, "learning_rate": 1.8815353757829723e-05, "loss": 0.1151, "step": 43720 }, { "grad_norm": 0.1564897745847702, "learning_rate": 1.879381738129331e-05, "loss": 0.1104, "step": 43730 }, { "grad_norm": 0.13729918003082275, "learning_rate": 1.8772290484371236e-05, "loss": 0.1182, "step": 43740 }, { "grad_norm": 0.16312219202518463, "learning_rate": 1.8750773073602795e-05, "loss": 0.1145, "step": 43750 }, { "grad_norm": 0.1076166108250618, "learning_rate": 1.8729265155524405e-05, "loss": 0.1026, "step": 43760 }, { "grad_norm": 0.1578301042318344, "learning_rate": 1.8707766736669607e-05, "loss": 0.1089, "step": 43770 }, { "grad_norm": 0.15296518802642822, "learning_rate": 1.8686277823569055e-05, "loss": 0.1154, "step": 43780 }, { "grad_norm": 0.1363750398159027, "learning_rate": 1.8664798422750484e-05, "loss": 0.1079, "step": 43790 }, { "grad_norm": 0.12295347452163696, "learning_rate": 1.8643328540738832e-05, "loss": 0.1143, "step": 43800 }, { "grad_norm": 0.12611587345600128, "learning_rate": 1.862186818405601e-05, "loss": 0.1066, "step": 43810 }, { "grad_norm": 0.1777186542749405, "learning_rate": 1.8600417359221156e-05, "loss": 0.1147, "step": 43820 }, { "grad_norm": 0.17416433990001678, "learning_rate": 1.8578976072750454e-05, "loss": 0.1065, "step": 43830 }, { "grad_norm": 0.14321114122867584, "learning_rate": 1.8557544331157194e-05, "loss": 0.1125, "step": 43840 }, { "grad_norm": 0.1906372457742691, "learning_rate": 1.8536122140951785e-05, "loss": 0.1146, "step": 43850 }, { "grad_norm": 0.16640646755695343, "learning_rate": 1.8514709508641688e-05, "loss": 0.117, "step": 43860 }, { "grad_norm": 0.1737520694732666, "learning_rate": 1.8493306440731555e-05, "loss": 0.1116, "step": 43870 }, { "grad_norm": 0.13332073390483856, "learning_rate": 1.8471912943723013e-05, "loss": 0.1064, "step": 43880 }, { "grad_norm": 0.12183927744626999, "learning_rate": 1.8450529024114894e-05, "loss": 0.1088, "step": 43890 }, { "grad_norm": 0.13640019297599792, "learning_rate": 1.842915468840301e-05, "loss": 0.1064, "step": 43900 }, { "grad_norm": 0.1286996603012085, "learning_rate": 1.840778994308037e-05, "loss": 0.1092, "step": 43910 }, { "grad_norm": 0.12723030149936676, "learning_rate": 1.8386434794637004e-05, "loss": 0.1122, "step": 43920 }, { "grad_norm": 0.14802886545658112, "learning_rate": 1.8365089249560034e-05, "loss": 0.1105, "step": 43930 }, { "grad_norm": 0.1202504113316536, "learning_rate": 1.8343753314333683e-05, "loss": 0.1105, "step": 43940 }, { "grad_norm": 0.15969328582286835, "learning_rate": 1.8322426995439236e-05, "loss": 0.1181, "step": 43950 }, { "grad_norm": 0.15740452706813812, "learning_rate": 1.8301110299355058e-05, "loss": 0.1091, "step": 43960 }, { "grad_norm": 0.20084500312805176, "learning_rate": 1.8279803232556625e-05, "loss": 0.1046, "step": 43970 }, { "grad_norm": 0.14433303475379944, "learning_rate": 1.8258505801516444e-05, "loss": 0.1093, "step": 43980 }, { "grad_norm": 0.16697558760643005, "learning_rate": 1.8237218012704117e-05, "loss": 0.1115, "step": 43990 }, { "grad_norm": 0.17834673821926117, "learning_rate": 1.821593987258631e-05, "loss": 0.1114, "step": 44000 }, { "grad_norm": 0.14853307604789734, "learning_rate": 1.8194671387626744e-05, "loss": 0.107, "step": 44010 }, { "grad_norm": 0.14646592736244202, "learning_rate": 1.8173412564286276e-05, "loss": 0.1154, "step": 44020 }, { "grad_norm": 0.149038165807724, "learning_rate": 1.8152163409022697e-05, "loss": 0.1148, "step": 44030 }, { "grad_norm": 0.1669139862060547, "learning_rate": 1.8130923928291023e-05, "loss": 0.113, "step": 44040 }, { "grad_norm": 0.13914506137371063, "learning_rate": 1.8109694128543163e-05, "loss": 0.1089, "step": 44050 }, { "grad_norm": 0.16546490788459778, "learning_rate": 1.8088474016228237e-05, "loss": 0.1097, "step": 44060 }, { "grad_norm": 0.18186326324939728, "learning_rate": 1.8067263597792328e-05, "loss": 0.1128, "step": 44070 }, { "grad_norm": 0.17459674179553986, "learning_rate": 1.80460628796786e-05, "loss": 0.1092, "step": 44080 }, { "grad_norm": 0.1581047922372818, "learning_rate": 1.8024871868327276e-05, "loss": 0.1158, "step": 44090 }, { "grad_norm": 0.17612189054489136, "learning_rate": 1.8003690570175608e-05, "loss": 0.1139, "step": 44100 }, { "grad_norm": 0.14931903779506683, "learning_rate": 1.7982518991657943e-05, "loss": 0.1045, "step": 44110 }, { "grad_norm": 0.1345987468957901, "learning_rate": 1.7961357139205643e-05, "loss": 0.1111, "step": 44120 }, { "grad_norm": 0.14223432540893555, "learning_rate": 1.7940205019247108e-05, "loss": 0.1146, "step": 44130 }, { "grad_norm": 0.15219751000404358, "learning_rate": 1.79190626382078e-05, "loss": 0.1056, "step": 44140 }, { "grad_norm": 0.13838626444339752, "learning_rate": 1.7897930002510215e-05, "loss": 0.1074, "step": 44150 }, { "grad_norm": 0.14084604382514954, "learning_rate": 1.787680711857387e-05, "loss": 0.108, "step": 44160 }, { "grad_norm": 0.16451166570186615, "learning_rate": 1.7855693992815398e-05, "loss": 0.1103, "step": 44170 }, { "grad_norm": 0.12987196445465088, "learning_rate": 1.7834590631648328e-05, "loss": 0.1124, "step": 44180 }, { "grad_norm": 0.10904335975646973, "learning_rate": 1.7813497041483384e-05, "loss": 0.1086, "step": 44190 }, { "grad_norm": 0.14143817126750946, "learning_rate": 1.779241322872817e-05, "loss": 0.1077, "step": 44200 }, { "grad_norm": 0.12120719254016876, "learning_rate": 1.777133919978744e-05, "loss": 0.1227, "step": 44210 }, { "grad_norm": 0.1545756757259369, "learning_rate": 1.7750274961062912e-05, "loss": 0.1167, "step": 44220 }, { "grad_norm": 0.15621556341648102, "learning_rate": 1.772922051895335e-05, "loss": 0.1054, "step": 44230 }, { "grad_norm": 0.15491512417793274, "learning_rate": 1.770817587985453e-05, "loss": 0.1114, "step": 44240 }, { "grad_norm": 0.16804538667201996, "learning_rate": 1.7687141050159246e-05, "loss": 0.1133, "step": 44250 }, { "grad_norm": 0.210036501288414, "learning_rate": 1.7666116036257375e-05, "loss": 0.1142, "step": 44260 }, { "grad_norm": 0.16318020224571228, "learning_rate": 1.764510084453569e-05, "loss": 0.1098, "step": 44270 }, { "grad_norm": 0.17178870737552643, "learning_rate": 1.76240954813781e-05, "loss": 0.1081, "step": 44280 }, { "grad_norm": 0.12588942050933838, "learning_rate": 1.7603099953165476e-05, "loss": 0.1125, "step": 44290 }, { "grad_norm": 0.15651647746562958, "learning_rate": 1.7582114266275683e-05, "loss": 0.1108, "step": 44300 }, { "grad_norm": 0.20982229709625244, "learning_rate": 1.756113842708364e-05, "loss": 0.1188, "step": 44310 }, { "grad_norm": 0.16139748692512512, "learning_rate": 1.7540172441961245e-05, "loss": 0.1125, "step": 44320 }, { "grad_norm": 0.13995130360126495, "learning_rate": 1.7519216317277387e-05, "loss": 0.1155, "step": 44330 }, { "grad_norm": 0.14143489301204681, "learning_rate": 1.7498270059398046e-05, "loss": 0.1209, "step": 44340 }, { "grad_norm": 0.15293316543102264, "learning_rate": 1.7477333674686062e-05, "loss": 0.1133, "step": 44350 }, { "grad_norm": 0.17736998200416565, "learning_rate": 1.745640716950142e-05, "loss": 0.1166, "step": 44360 }, { "grad_norm": 0.15171000361442566, "learning_rate": 1.7435490550201017e-05, "loss": 0.105, "step": 44370 }, { "grad_norm": 0.12271373718976974, "learning_rate": 1.7414583823138762e-05, "loss": 0.1111, "step": 44380 }, { "grad_norm": 0.15975496172904968, "learning_rate": 1.739368699466558e-05, "loss": 0.1081, "step": 44390 }, { "grad_norm": 0.16830816864967346, "learning_rate": 1.737280007112935e-05, "loss": 0.1093, "step": 44400 }, { "grad_norm": 0.15977512300014496, "learning_rate": 1.735192305887502e-05, "loss": 0.1072, "step": 44410 }, { "grad_norm": 0.1403706967830658, "learning_rate": 1.733105596424441e-05, "loss": 0.1234, "step": 44420 }, { "grad_norm": 0.17559856176376343, "learning_rate": 1.7310198793576437e-05, "loss": 0.1118, "step": 44430 }, { "grad_norm": 0.14942258596420288, "learning_rate": 1.7289351553206952e-05, "loss": 0.1065, "step": 44440 }, { "grad_norm": 0.13453371822834015, "learning_rate": 1.7268514249468788e-05, "loss": 0.1109, "step": 44450 }, { "grad_norm": 0.1768960952758789, "learning_rate": 1.7247686888691765e-05, "loss": 0.1106, "step": 44460 }, { "grad_norm": 0.14952757954597473, "learning_rate": 1.7226869477202694e-05, "loss": 0.11, "step": 44470 }, { "grad_norm": 0.1293233186006546, "learning_rate": 1.7206062021325336e-05, "loss": 0.1035, "step": 44480 }, { "grad_norm": 0.12213372439146042, "learning_rate": 1.7185264527380502e-05, "loss": 0.1114, "step": 44490 }, { "grad_norm": 0.14927136898040771, "learning_rate": 1.716447700168584e-05, "loss": 0.1155, "step": 44500 }, { "grad_norm": 0.11322814971208572, "learning_rate": 1.714369945055611e-05, "loss": 0.1049, "step": 44510 }, { "grad_norm": 0.13117651641368866, "learning_rate": 1.7122931880302968e-05, "loss": 0.107, "step": 44520 }, { "grad_norm": 0.14805686473846436, "learning_rate": 1.710217429723505e-05, "loss": 0.1121, "step": 44530 }, { "grad_norm": 0.17185617983341217, "learning_rate": 1.7081426707657972e-05, "loss": 0.1135, "step": 44540 }, { "grad_norm": 0.18211746215820312, "learning_rate": 1.7060689117874275e-05, "loss": 0.1117, "step": 44550 }, { "grad_norm": 0.14445935189723969, "learning_rate": 1.703996153418354e-05, "loss": 0.1126, "step": 44560 }, { "grad_norm": 0.11583346128463745, "learning_rate": 1.7019243962882205e-05, "loss": 0.1069, "step": 44570 }, { "grad_norm": 0.1302737146615982, "learning_rate": 1.6998536410263754e-05, "loss": 0.1148, "step": 44580 }, { "grad_norm": 0.14083774387836456, "learning_rate": 1.6977838882618596e-05, "loss": 0.1237, "step": 44590 }, { "grad_norm": 0.13715636730194092, "learning_rate": 1.6957151386234088e-05, "loss": 0.1035, "step": 44600 }, { "grad_norm": 0.1459553837776184, "learning_rate": 1.6936473927394536e-05, "loss": 0.1072, "step": 44610 }, { "grad_norm": 0.15944728255271912, "learning_rate": 1.6915806512381222e-05, "loss": 0.1098, "step": 44620 }, { "grad_norm": 0.135390505194664, "learning_rate": 1.6895149147472344e-05, "loss": 0.112, "step": 44630 }, { "grad_norm": 0.15597274899482727, "learning_rate": 1.6874501838943073e-05, "loss": 0.1107, "step": 44640 }, { "grad_norm": 0.14322637021541595, "learning_rate": 1.6853864593065506e-05, "loss": 0.1182, "step": 44650 }, { "grad_norm": 0.13528715074062347, "learning_rate": 1.683323741610871e-05, "loss": 0.1117, "step": 44660 }, { "grad_norm": 0.1282854825258255, "learning_rate": 1.6812620314338674e-05, "loss": 0.1109, "step": 44670 }, { "grad_norm": 0.1345527470111847, "learning_rate": 1.6792013294018326e-05, "loss": 0.1116, "step": 44680 }, { "grad_norm": 0.15566964447498322, "learning_rate": 1.6771416361407526e-05, "loss": 0.1089, "step": 44690 }, { "grad_norm": 0.17948497831821442, "learning_rate": 1.675082952276308e-05, "loss": 0.1178, "step": 44700 }, { "grad_norm": 0.1434749960899353, "learning_rate": 1.6730252784338757e-05, "loss": 0.1094, "step": 44710 }, { "grad_norm": 0.1470109224319458, "learning_rate": 1.6709686152385166e-05, "loss": 0.1073, "step": 44720 }, { "grad_norm": 0.12772907316684723, "learning_rate": 1.668912963314998e-05, "loss": 0.1116, "step": 44730 }, { "grad_norm": 0.17196252942085266, "learning_rate": 1.6668583232877653e-05, "loss": 0.1199, "step": 44740 }, { "grad_norm": 0.16992585361003876, "learning_rate": 1.6648046957809698e-05, "loss": 0.1111, "step": 44750 }, { "grad_norm": 0.17573659121990204, "learning_rate": 1.6627520814184462e-05, "loss": 0.1126, "step": 44760 }, { "grad_norm": 0.16940560936927795, "learning_rate": 1.660700480823726e-05, "loss": 0.1133, "step": 44770 }, { "grad_norm": 0.15176604688167572, "learning_rate": 1.65864989462003e-05, "loss": 0.1121, "step": 44780 }, { "grad_norm": 0.17101867496967316, "learning_rate": 1.656600323430273e-05, "loss": 0.1096, "step": 44790 }, { "grad_norm": 0.16281579434871674, "learning_rate": 1.654551767877059e-05, "loss": 0.1135, "step": 44800 }, { "grad_norm": 0.15870517492294312, "learning_rate": 1.6525042285826874e-05, "loss": 0.112, "step": 44810 }, { "grad_norm": 0.16801774501800537, "learning_rate": 1.6504577061691468e-05, "loss": 0.112, "step": 44820 }, { "grad_norm": 0.1553623080253601, "learning_rate": 1.6484122012581143e-05, "loss": 0.1207, "step": 44830 }, { "grad_norm": 0.1341180056333542, "learning_rate": 1.6463677144709623e-05, "loss": 0.1156, "step": 44840 }, { "grad_norm": 0.16701234877109528, "learning_rate": 1.6443242464287493e-05, "loss": 0.1127, "step": 44850 }, { "grad_norm": 0.14682258665561676, "learning_rate": 1.642281797752232e-05, "loss": 0.1052, "step": 44860 }, { "grad_norm": 0.1464449167251587, "learning_rate": 1.6402403690618456e-05, "loss": 0.1146, "step": 44870 }, { "grad_norm": 0.13000880181789398, "learning_rate": 1.6381999609777295e-05, "loss": 0.1048, "step": 44880 }, { "grad_norm": 0.12511098384857178, "learning_rate": 1.6361605741196983e-05, "loss": 0.1073, "step": 44890 }, { "grad_norm": 0.12369772791862488, "learning_rate": 1.63412220910727e-05, "loss": 0.1176, "step": 44900 }, { "grad_norm": 0.12441133707761765, "learning_rate": 1.6320848665596433e-05, "loss": 0.1068, "step": 44910 }, { "grad_norm": 0.19913077354431152, "learning_rate": 1.6300485470957095e-05, "loss": 0.1097, "step": 44920 }, { "grad_norm": 0.1938401758670807, "learning_rate": 1.6280132513340483e-05, "loss": 0.1099, "step": 44930 }, { "grad_norm": 0.10974758118391037, "learning_rate": 1.62597897989293e-05, "loss": 0.1146, "step": 44940 }, { "grad_norm": 0.14178045094013214, "learning_rate": 1.623945733390309e-05, "loss": 0.1075, "step": 44950 }, { "grad_norm": 0.13367672264575958, "learning_rate": 1.6219135124438374e-05, "loss": 0.1077, "step": 44960 }, { "grad_norm": 0.13265183568000793, "learning_rate": 1.6198823176708465e-05, "loss": 0.1051, "step": 44970 }, { "grad_norm": 0.13972842693328857, "learning_rate": 1.6178521496883613e-05, "loss": 0.1065, "step": 44980 }, { "grad_norm": 0.14716282486915588, "learning_rate": 1.6158230091130926e-05, "loss": 0.1084, "step": 44990 }, { "grad_norm": 0.14334063231945038, "learning_rate": 1.613794896561438e-05, "loss": 0.1126, "step": 45000 }, { "grad_norm": 0.14163604378700256, "learning_rate": 1.6117678126494894e-05, "loss": 0.1052, "step": 45010 }, { "grad_norm": 0.17550383508205414, "learning_rate": 1.6097417579930153e-05, "loss": 0.1147, "step": 45020 }, { "grad_norm": 0.16312941908836365, "learning_rate": 1.6077167332074834e-05, "loss": 0.1107, "step": 45030 }, { "grad_norm": 0.1476052701473236, "learning_rate": 1.605692738908037e-05, "loss": 0.1144, "step": 45040 }, { "grad_norm": 0.12057986110448837, "learning_rate": 1.6036697757095176e-05, "loss": 0.1164, "step": 45050 }, { "grad_norm": 0.1340552419424057, "learning_rate": 1.6016478442264428e-05, "loss": 0.1097, "step": 45060 }, { "grad_norm": 0.17221085727214813, "learning_rate": 1.599626945073026e-05, "loss": 0.1205, "step": 45070 }, { "grad_norm": 0.17295411229133606, "learning_rate": 1.597607078863162e-05, "loss": 0.1124, "step": 45080 }, { "grad_norm": 0.14208437502384186, "learning_rate": 1.595588246210432e-05, "loss": 0.1172, "step": 45090 }, { "grad_norm": 0.16589610278606415, "learning_rate": 1.5935704477281048e-05, "loss": 0.1116, "step": 45100 }, { "grad_norm": 0.1326311230659485, "learning_rate": 1.5915536840291323e-05, "loss": 0.1077, "step": 45110 }, { "grad_norm": 0.14768970012664795, "learning_rate": 1.5895379557261576e-05, "loss": 0.1063, "step": 45120 }, { "grad_norm": 0.12116739153862, "learning_rate": 1.5875232634315033e-05, "loss": 0.1068, "step": 45130 }, { "grad_norm": 0.10582974553108215, "learning_rate": 1.5855096077571812e-05, "loss": 0.1096, "step": 45140 }, { "grad_norm": 0.12862828373908997, "learning_rate": 1.5834969893148855e-05, "loss": 0.1191, "step": 45150 }, { "grad_norm": 0.12215283513069153, "learning_rate": 1.581485408715997e-05, "loss": 0.1077, "step": 45160 }, { "grad_norm": 0.14074186980724335, "learning_rate": 1.5794748665715785e-05, "loss": 0.1098, "step": 45170 }, { "grad_norm": 0.16100017726421356, "learning_rate": 1.5774653634923857e-05, "loss": 0.1111, "step": 45180 }, { "grad_norm": 0.14676107466220856, "learning_rate": 1.575456900088845e-05, "loss": 0.1049, "step": 45190 }, { "grad_norm": 0.11832284182310104, "learning_rate": 1.5734494769710816e-05, "loss": 0.1104, "step": 45200 }, { "grad_norm": 0.1552916318178177, "learning_rate": 1.5714430947488912e-05, "loss": 0.1086, "step": 45210 }, { "grad_norm": 0.14855654537677765, "learning_rate": 1.5694377540317645e-05, "loss": 0.1094, "step": 45220 }, { "grad_norm": 0.13253524899482727, "learning_rate": 1.5674334554288694e-05, "loss": 0.1092, "step": 45230 }, { "grad_norm": 0.14086377620697021, "learning_rate": 1.5654301995490582e-05, "loss": 0.119, "step": 45240 }, { "grad_norm": 0.18073435127735138, "learning_rate": 1.5634279870008685e-05, "loss": 0.1126, "step": 45250 }, { "grad_norm": 0.1337520182132721, "learning_rate": 1.5614268183925174e-05, "loss": 0.1185, "step": 45260 }, { "grad_norm": 0.12890087068080902, "learning_rate": 1.5594266943319097e-05, "loss": 0.1118, "step": 45270 }, { "grad_norm": 0.12479250878095627, "learning_rate": 1.5574276154266294e-05, "loss": 0.1058, "step": 45280 }, { "grad_norm": 0.13299570977687836, "learning_rate": 1.5554295822839437e-05, "loss": 0.1133, "step": 45290 }, { "grad_norm": 0.12546037137508392, "learning_rate": 1.5534325955108025e-05, "loss": 0.1089, "step": 45300 }, { "grad_norm": 0.1455797553062439, "learning_rate": 1.5514366557138373e-05, "loss": 0.1151, "step": 45310 }, { "grad_norm": 0.14824365079402924, "learning_rate": 1.5494417634993602e-05, "loss": 0.1082, "step": 45320 }, { "grad_norm": 0.16193534433841705, "learning_rate": 1.547447919473372e-05, "loss": 0.1081, "step": 45330 }, { "grad_norm": 0.11702370643615723, "learning_rate": 1.5454551242415434e-05, "loss": 0.1073, "step": 45340 }, { "grad_norm": 0.1218416765332222, "learning_rate": 1.543463378409239e-05, "loss": 0.1017, "step": 45350 }, { "grad_norm": 0.1305549442768097, "learning_rate": 1.541472682581493e-05, "loss": 0.1236, "step": 45360 }, { "grad_norm": 0.1811550259590149, "learning_rate": 1.5394830373630298e-05, "loss": 0.1171, "step": 45370 }, { "grad_norm": 0.14549125730991364, "learning_rate": 1.5374944433582506e-05, "loss": 0.1098, "step": 45380 }, { "grad_norm": 0.13005430996418, "learning_rate": 1.5355069011712375e-05, "loss": 0.1146, "step": 45390 }, { "grad_norm": 0.12026368826627731, "learning_rate": 1.5335204114057526e-05, "loss": 0.1076, "step": 45400 }, { "grad_norm": 0.2309873402118683, "learning_rate": 1.5315349746652387e-05, "loss": 0.113, "step": 45410 }, { "grad_norm": 0.15505361557006836, "learning_rate": 1.5295505915528212e-05, "loss": 0.1155, "step": 45420 }, { "grad_norm": 0.1267695128917694, "learning_rate": 1.5275672626713024e-05, "loss": 0.1083, "step": 45430 }, { "grad_norm": 0.13780532777309418, "learning_rate": 1.5255849886231643e-05, "loss": 0.1129, "step": 45440 }, { "grad_norm": 0.1360113024711609, "learning_rate": 1.523603770010571e-05, "loss": 0.1121, "step": 45450 }, { "grad_norm": 0.13943804800510406, "learning_rate": 1.521623607435363e-05, "loss": 0.102, "step": 45460 }, { "grad_norm": 0.14801372587680817, "learning_rate": 1.5196445014990612e-05, "loss": 0.1053, "step": 45470 }, { "grad_norm": 0.15888594090938568, "learning_rate": 1.5176664528028672e-05, "loss": 0.1151, "step": 45480 }, { "grad_norm": 0.15009811520576477, "learning_rate": 1.5156894619476574e-05, "loss": 0.1088, "step": 45490 }, { "grad_norm": 0.15993036329746246, "learning_rate": 1.5137135295339938e-05, "loss": 0.1116, "step": 45500 }, { "grad_norm": 0.1327071338891983, "learning_rate": 1.5117386561621073e-05, "loss": 0.1172, "step": 45510 }, { "grad_norm": 0.13777360320091248, "learning_rate": 1.5097648424319167e-05, "loss": 0.1061, "step": 45520 }, { "grad_norm": 0.14065143465995789, "learning_rate": 1.5077920889430119e-05, "loss": 0.1075, "step": 45530 }, { "grad_norm": 0.19633132219314575, "learning_rate": 1.5058203962946644e-05, "loss": 0.1068, "step": 45540 }, { "grad_norm": 0.17504900693893433, "learning_rate": 1.503849765085822e-05, "loss": 0.1076, "step": 45550 }, { "grad_norm": 0.18146465718746185, "learning_rate": 1.501880195915109e-05, "loss": 0.1135, "step": 45560 }, { "grad_norm": 0.13558650016784668, "learning_rate": 1.499911689380833e-05, "loss": 0.1065, "step": 45570 }, { "grad_norm": 0.1417693942785263, "learning_rate": 1.4979442460809683e-05, "loss": 0.1168, "step": 45580 }, { "grad_norm": 0.10874201357364655, "learning_rate": 1.4959778666131763e-05, "loss": 0.1028, "step": 45590 }, { "grad_norm": 0.13194090127944946, "learning_rate": 1.4940125515747905e-05, "loss": 0.1025, "step": 45600 }, { "grad_norm": 0.10587682574987411, "learning_rate": 1.4920483015628211e-05, "loss": 0.1081, "step": 45610 }, { "grad_norm": 0.1472221314907074, "learning_rate": 1.490085117173956e-05, "loss": 0.1087, "step": 45620 }, { "grad_norm": 0.14447185397148132, "learning_rate": 1.488122999004558e-05, "loss": 0.1139, "step": 45630 }, { "grad_norm": 0.11932117491960526, "learning_rate": 1.486161947650666e-05, "loss": 0.1036, "step": 45640 }, { "grad_norm": 0.10950460284948349, "learning_rate": 1.4842019637079995e-05, "loss": 0.1043, "step": 45650 }, { "grad_norm": 0.12140078097581863, "learning_rate": 1.482243047771944e-05, "loss": 0.1126, "step": 45660 }, { "grad_norm": 0.1654125303030014, "learning_rate": 1.4802852004375712e-05, "loss": 0.1126, "step": 45670 }, { "grad_norm": 0.14145910739898682, "learning_rate": 1.4783284222996218e-05, "loss": 0.1091, "step": 45680 }, { "grad_norm": 0.1393076479434967, "learning_rate": 1.4763727139525135e-05, "loss": 0.1104, "step": 45690 }, { "grad_norm": 0.1286647617816925, "learning_rate": 1.4744180759903392e-05, "loss": 0.1111, "step": 45700 }, { "grad_norm": 0.13021565973758698, "learning_rate": 1.4724645090068635e-05, "loss": 0.1137, "step": 45710 }, { "grad_norm": 0.12115708738565445, "learning_rate": 1.4705120135955341e-05, "loss": 0.1099, "step": 45720 }, { "grad_norm": 0.16120345890522003, "learning_rate": 1.4685605903494614e-05, "loss": 0.1182, "step": 45730 }, { "grad_norm": 0.13048061728477478, "learning_rate": 1.46661023986144e-05, "loss": 0.1204, "step": 45740 }, { "grad_norm": 0.1290426105260849, "learning_rate": 1.4646609627239344e-05, "loss": 0.1151, "step": 45750 }, { "grad_norm": 0.10141265392303467, "learning_rate": 1.4627127595290835e-05, "loss": 0.1188, "step": 45760 }, { "grad_norm": 0.19037297368049622, "learning_rate": 1.460765630868699e-05, "loss": 0.1193, "step": 45770 }, { "grad_norm": 0.1489568054676056, "learning_rate": 1.4588195773342678e-05, "loss": 0.1075, "step": 45780 }, { "grad_norm": 0.16194090247154236, "learning_rate": 1.4568745995169485e-05, "loss": 0.1099, "step": 45790 }, { "grad_norm": 0.11427454650402069, "learning_rate": 1.4549306980075778e-05, "loss": 0.1119, "step": 45800 }, { "grad_norm": 0.15301400423049927, "learning_rate": 1.4529878733966557e-05, "loss": 0.117, "step": 45810 }, { "grad_norm": 0.11638613045215607, "learning_rate": 1.4510461262743658e-05, "loss": 0.1065, "step": 45820 }, { "grad_norm": 0.1505279839038849, "learning_rate": 1.4491054572305585e-05, "loss": 0.1124, "step": 45830 }, { "grad_norm": 0.11889341473579407, "learning_rate": 1.4471658668547566e-05, "loss": 0.1211, "step": 45840 }, { "grad_norm": 0.16197741031646729, "learning_rate": 1.4452273557361579e-05, "loss": 0.1151, "step": 45850 }, { "grad_norm": 0.13233740627765656, "learning_rate": 1.4432899244636282e-05, "loss": 0.109, "step": 45860 }, { "grad_norm": 0.15955163538455963, "learning_rate": 1.4413535736257134e-05, "loss": 0.1178, "step": 45870 }, { "grad_norm": 0.13937640190124512, "learning_rate": 1.439418303810619e-05, "loss": 0.1091, "step": 45880 }, { "grad_norm": 0.13745476305484772, "learning_rate": 1.4374841156062352e-05, "loss": 0.1087, "step": 45890 }, { "grad_norm": 0.12225349247455597, "learning_rate": 1.4355510096001112e-05, "loss": 0.1076, "step": 45900 }, { "grad_norm": 0.12938497960567474, "learning_rate": 1.4336189863794786e-05, "loss": 0.1026, "step": 45910 }, { "grad_norm": 0.14510555565357208, "learning_rate": 1.4316880465312327e-05, "loss": 0.1085, "step": 45920 }, { "grad_norm": 0.12484189867973328, "learning_rate": 1.4297581906419426e-05, "loss": 0.1071, "step": 45930 }, { "grad_norm": 0.13037548959255219, "learning_rate": 1.4278294192978475e-05, "loss": 0.1061, "step": 45940 }, { "grad_norm": 0.1468481570482254, "learning_rate": 1.4259017330848574e-05, "loss": 0.111, "step": 45950 }, { "grad_norm": 0.15718454122543335, "learning_rate": 1.4239751325885498e-05, "loss": 0.1143, "step": 45960 }, { "grad_norm": 0.11645479500293732, "learning_rate": 1.4220496183941795e-05, "loss": 0.1169, "step": 45970 }, { "grad_norm": 0.13887526094913483, "learning_rate": 1.4201251910866648e-05, "loss": 0.1064, "step": 45980 }, { "grad_norm": 0.13665121793746948, "learning_rate": 1.4182018512505957e-05, "loss": 0.1065, "step": 45990 }, { "grad_norm": 0.16075584292411804, "learning_rate": 1.4162795994702327e-05, "loss": 0.1072, "step": 46000 }, { "grad_norm": 0.11545664072036743, "learning_rate": 1.4143584363295032e-05, "loss": 0.1075, "step": 46010 }, { "grad_norm": 0.13172535598278046, "learning_rate": 1.4124383624120101e-05, "loss": 0.1123, "step": 46020 }, { "grad_norm": 0.12637905776500702, "learning_rate": 1.4105193783010151e-05, "loss": 0.1101, "step": 46030 }, { "grad_norm": 0.11473871767520905, "learning_rate": 1.4086014845794621e-05, "loss": 0.1053, "step": 46040 }, { "grad_norm": 0.1278090476989746, "learning_rate": 1.4066846818299489e-05, "loss": 0.1168, "step": 46050 }, { "grad_norm": 0.14618757367134094, "learning_rate": 1.4047689706347555e-05, "loss": 0.1116, "step": 46060 }, { "grad_norm": 0.1482781320810318, "learning_rate": 1.402854351575822e-05, "loss": 0.1077, "step": 46070 }, { "grad_norm": 0.18308775126934052, "learning_rate": 1.4009408252347588e-05, "loss": 0.1086, "step": 46080 }, { "grad_norm": 0.17099495232105255, "learning_rate": 1.399028392192846e-05, "loss": 0.1255, "step": 46090 }, { "grad_norm": 0.13722187280654907, "learning_rate": 1.397117053031029e-05, "loss": 0.1104, "step": 46100 }, { "grad_norm": 0.1372317373752594, "learning_rate": 1.3952068083299213e-05, "loss": 0.1109, "step": 46110 }, { "grad_norm": 0.11051382124423981, "learning_rate": 1.3932976586698082e-05, "loss": 0.1058, "step": 46120 }, { "grad_norm": 0.1768316775560379, "learning_rate": 1.3913896046306363e-05, "loss": 0.1066, "step": 46130 }, { "grad_norm": 0.1429564356803894, "learning_rate": 1.389482646792023e-05, "loss": 0.1063, "step": 46140 }, { "grad_norm": 0.16416317224502563, "learning_rate": 1.387576785733251e-05, "loss": 0.1006, "step": 46150 }, { "grad_norm": 0.14149898290634155, "learning_rate": 1.3856720220332703e-05, "loss": 0.1175, "step": 46160 }, { "grad_norm": 0.18094103038311005, "learning_rate": 1.383768356270701e-05, "loss": 0.1118, "step": 46170 }, { "grad_norm": 0.16064384579658508, "learning_rate": 1.3818657890238207e-05, "loss": 0.1027, "step": 46180 }, { "grad_norm": 0.15508636832237244, "learning_rate": 1.3799643208705859e-05, "loss": 0.1086, "step": 46190 }, { "grad_norm": 0.14518742263317108, "learning_rate": 1.3780639523886058e-05, "loss": 0.1193, "step": 46200 }, { "grad_norm": 0.16905958950519562, "learning_rate": 1.3761646841551668e-05, "loss": 0.1192, "step": 46210 }, { "grad_norm": 0.12153981626033783, "learning_rate": 1.3742665167472146e-05, "loss": 0.1125, "step": 46220 }, { "grad_norm": 0.1565435379743576, "learning_rate": 1.372369450741363e-05, "loss": 0.1201, "step": 46230 }, { "grad_norm": 0.10054558515548706, "learning_rate": 1.3704734867138901e-05, "loss": 0.1129, "step": 46240 }, { "grad_norm": 0.1443062275648117, "learning_rate": 1.36857862524074e-05, "loss": 0.1142, "step": 46250 }, { "grad_norm": 0.16316311061382294, "learning_rate": 1.3666848668975213e-05, "loss": 0.1108, "step": 46260 }, { "grad_norm": 0.15780700743198395, "learning_rate": 1.3647922122595063e-05, "loss": 0.108, "step": 46270 }, { "grad_norm": 0.11819455027580261, "learning_rate": 1.3629006619016366e-05, "loss": 0.103, "step": 46280 }, { "grad_norm": 0.12776252627372742, "learning_rate": 1.3610102163985139e-05, "loss": 0.1163, "step": 46290 }, { "grad_norm": 0.13357453048229218, "learning_rate": 1.3591208763244057e-05, "loss": 0.1145, "step": 46300 }, { "grad_norm": 0.1437278836965561, "learning_rate": 1.3572326422532428e-05, "loss": 0.1274, "step": 46310 }, { "grad_norm": 0.10215375572443008, "learning_rate": 1.355345514758622e-05, "loss": 0.1166, "step": 46320 }, { "grad_norm": 0.12904348969459534, "learning_rate": 1.3534594944138007e-05, "loss": 0.1088, "step": 46330 }, { "grad_norm": 0.13648740947246552, "learning_rate": 1.3515745817917069e-05, "loss": 0.1127, "step": 46340 }, { "grad_norm": 0.14524395763874054, "learning_rate": 1.3496907774649208e-05, "loss": 0.1172, "step": 46350 }, { "grad_norm": 0.1488252431154251, "learning_rate": 1.3478080820056987e-05, "loss": 0.1026, "step": 46360 }, { "grad_norm": 0.12044497579336166, "learning_rate": 1.3459264959859474e-05, "loss": 0.1104, "step": 46370 }, { "grad_norm": 0.11794409155845642, "learning_rate": 1.3440460199772487e-05, "loss": 0.1114, "step": 46380 }, { "grad_norm": 0.10258769989013672, "learning_rate": 1.3421666545508382e-05, "loss": 0.1077, "step": 46390 }, { "grad_norm": 0.1552349030971527, "learning_rate": 1.3402884002776194e-05, "loss": 0.1099, "step": 46400 }, { "grad_norm": 0.11270218342542648, "learning_rate": 1.3384112577281555e-05, "loss": 0.112, "step": 46410 }, { "grad_norm": 0.12137056887149811, "learning_rate": 1.3365352274726711e-05, "loss": 0.1009, "step": 46420 }, { "grad_norm": 0.12803393602371216, "learning_rate": 1.3346603100810578e-05, "loss": 0.1089, "step": 46430 }, { "grad_norm": 0.15708783268928528, "learning_rate": 1.3327865061228645e-05, "loss": 0.1167, "step": 46440 }, { "grad_norm": 0.147589311003685, "learning_rate": 1.330913816167304e-05, "loss": 0.1029, "step": 46450 }, { "grad_norm": 0.14907512068748474, "learning_rate": 1.3290422407832492e-05, "loss": 0.1156, "step": 46460 }, { "grad_norm": 0.1622483730316162, "learning_rate": 1.3271717805392354e-05, "loss": 0.1099, "step": 46470 }, { "grad_norm": 0.1352822184562683, "learning_rate": 1.3253024360034582e-05, "loss": 0.1157, "step": 46480 }, { "grad_norm": 0.13678544759750366, "learning_rate": 1.323434207743779e-05, "loss": 0.1089, "step": 46490 }, { "grad_norm": 0.13440345227718353, "learning_rate": 1.3215670963277105e-05, "loss": 0.1109, "step": 46500 }, { "grad_norm": 0.1293165534734726, "learning_rate": 1.3197011023224376e-05, "loss": 0.1131, "step": 46510 }, { "grad_norm": 0.1632993370294571, "learning_rate": 1.3178362262947941e-05, "loss": 0.1147, "step": 46520 }, { "grad_norm": 0.1591636836528778, "learning_rate": 1.3159724688112845e-05, "loss": 0.1114, "step": 46530 }, { "grad_norm": 0.13248753547668457, "learning_rate": 1.3141098304380683e-05, "loss": 0.105, "step": 46540 }, { "grad_norm": 0.12858466804027557, "learning_rate": 1.3122483117409651e-05, "loss": 0.1021, "step": 46550 }, { "grad_norm": 0.14505347609519958, "learning_rate": 1.3103879132854552e-05, "loss": 0.1168, "step": 46560 }, { "grad_norm": 0.14904595911502838, "learning_rate": 1.3085286356366771e-05, "loss": 0.111, "step": 46570 }, { "grad_norm": 0.13069133460521698, "learning_rate": 1.3066704793594337e-05, "loss": 0.1077, "step": 46580 }, { "grad_norm": 0.1277744472026825, "learning_rate": 1.3048134450181816e-05, "loss": 0.1075, "step": 46590 }, { "grad_norm": 0.13062499463558197, "learning_rate": 1.3029575331770394e-05, "loss": 0.1062, "step": 46600 }, { "grad_norm": 0.13435880839824677, "learning_rate": 1.3011027443997837e-05, "loss": 0.1055, "step": 46610 }, { "grad_norm": 0.1407468020915985, "learning_rate": 1.2992490792498507e-05, "loss": 0.1167, "step": 46620 }, { "grad_norm": 0.1257590502500534, "learning_rate": 1.297396538290333e-05, "loss": 0.102, "step": 46630 }, { "grad_norm": 0.12758371233940125, "learning_rate": 1.2955451220839888e-05, "loss": 0.1179, "step": 46640 }, { "grad_norm": 0.10601192712783813, "learning_rate": 1.2936948311932223e-05, "loss": 0.11, "step": 46650 }, { "grad_norm": 0.12170565128326416, "learning_rate": 1.2918456661801104e-05, "loss": 0.1114, "step": 46660 }, { "grad_norm": 0.1549772024154663, "learning_rate": 1.2899976276063736e-05, "loss": 0.117, "step": 46670 }, { "grad_norm": 0.1535765677690506, "learning_rate": 1.2881507160334022e-05, "loss": 0.1149, "step": 46680 }, { "grad_norm": 0.16288699209690094, "learning_rate": 1.286304932022238e-05, "loss": 0.1175, "step": 46690 }, { "grad_norm": 0.1408219039440155, "learning_rate": 1.2844602761335806e-05, "loss": 0.1062, "step": 46700 }, { "grad_norm": 0.14000992476940155, "learning_rate": 1.2826167489277885e-05, "loss": 0.1154, "step": 46710 }, { "grad_norm": 0.11954362690448761, "learning_rate": 1.2807743509648745e-05, "loss": 0.1077, "step": 46720 }, { "grad_norm": 0.15386730432510376, "learning_rate": 1.2789330828045149e-05, "loss": 0.117, "step": 46730 }, { "grad_norm": 0.14184515178203583, "learning_rate": 1.2770929450060332e-05, "loss": 0.1115, "step": 46740 }, { "grad_norm": 0.1166420429944992, "learning_rate": 1.2752539381284184e-05, "loss": 0.1208, "step": 46750 }, { "grad_norm": 0.1579785943031311, "learning_rate": 1.273416062730311e-05, "loss": 0.1137, "step": 46760 }, { "grad_norm": 0.14272554218769073, "learning_rate": 1.2715793193700088e-05, "loss": 0.1115, "step": 46770 }, { "grad_norm": 0.12897519767284393, "learning_rate": 1.2697437086054664e-05, "loss": 0.112, "step": 46780 }, { "grad_norm": 0.1412162184715271, "learning_rate": 1.2679092309942937e-05, "loss": 0.1113, "step": 46790 }, { "grad_norm": 0.147404283285141, "learning_rate": 1.266075887093755e-05, "loss": 0.1152, "step": 46800 }, { "grad_norm": 0.14341221749782562, "learning_rate": 1.2642436774607757e-05, "loss": 0.1116, "step": 46810 }, { "grad_norm": 0.13299161195755005, "learning_rate": 1.2624126026519278e-05, "loss": 0.1072, "step": 46820 }, { "grad_norm": 0.1438307911157608, "learning_rate": 1.2605826632234474e-05, "loss": 0.1171, "step": 46830 }, { "grad_norm": 0.13044312596321106, "learning_rate": 1.2587538597312198e-05, "loss": 0.1045, "step": 46840 }, { "grad_norm": 0.1679314523935318, "learning_rate": 1.2569261927307884e-05, "loss": 0.1059, "step": 46850 }, { "grad_norm": 0.14520226418972015, "learning_rate": 1.2550996627773493e-05, "loss": 0.1126, "step": 46860 }, { "grad_norm": 0.12882505357265472, "learning_rate": 1.2532742704257527e-05, "loss": 0.1077, "step": 46870 }, { "grad_norm": 0.12179265916347504, "learning_rate": 1.2514500162305087e-05, "loss": 0.1112, "step": 46880 }, { "grad_norm": 0.14344185590744019, "learning_rate": 1.2496269007457728e-05, "loss": 0.1058, "step": 46890 }, { "grad_norm": 0.11053489148616791, "learning_rate": 1.2478049245253625e-05, "loss": 0.1117, "step": 46900 }, { "grad_norm": 0.14801174402236938, "learning_rate": 1.2459840881227459e-05, "loss": 0.1161, "step": 46910 }, { "grad_norm": 0.1286049485206604, "learning_rate": 1.2441643920910435e-05, "loss": 0.1135, "step": 46920 }, { "grad_norm": 0.10989635437726974, "learning_rate": 1.2423458369830322e-05, "loss": 0.1145, "step": 46930 }, { "grad_norm": 0.1491285115480423, "learning_rate": 1.2405284233511406e-05, "loss": 0.1194, "step": 46940 }, { "grad_norm": 0.1653369665145874, "learning_rate": 1.2387121517474487e-05, "loss": 0.1203, "step": 46950 }, { "grad_norm": 0.14976778626441956, "learning_rate": 1.2368970227236975e-05, "loss": 0.1132, "step": 46960 }, { "grad_norm": 0.1444115787744522, "learning_rate": 1.2350830368312688e-05, "loss": 0.1048, "step": 46970 }, { "grad_norm": 0.1602237969636917, "learning_rate": 1.2332701946212083e-05, "loss": 0.1186, "step": 46980 }, { "grad_norm": 0.14180772006511688, "learning_rate": 1.2314584966442077e-05, "loss": 0.1111, "step": 46990 }, { "grad_norm": 0.14297282695770264, "learning_rate": 1.2296479434506136e-05, "loss": 0.1178, "step": 47000 }, { "grad_norm": 0.1357341855764389, "learning_rate": 1.2278385355904232e-05, "loss": 0.1074, "step": 47010 }, { "grad_norm": 0.13702791929244995, "learning_rate": 1.2260302736132867e-05, "loss": 0.1095, "step": 47020 }, { "grad_norm": 0.12936747074127197, "learning_rate": 1.2242231580685098e-05, "loss": 0.1099, "step": 47030 }, { "grad_norm": 0.1406611204147339, "learning_rate": 1.2224171895050413e-05, "loss": 0.1155, "step": 47040 }, { "grad_norm": 0.17026038467884064, "learning_rate": 1.2206123684714903e-05, "loss": 0.1133, "step": 47050 }, { "grad_norm": 0.13442188501358032, "learning_rate": 1.2188086955161132e-05, "loss": 0.1028, "step": 47060 }, { "grad_norm": 0.11239272356033325, "learning_rate": 1.2170061711868175e-05, "loss": 0.1122, "step": 47070 }, { "grad_norm": 0.13685746490955353, "learning_rate": 1.215204796031163e-05, "loss": 0.1107, "step": 47080 }, { "grad_norm": 0.17337577044963837, "learning_rate": 1.2134045705963599e-05, "loss": 0.1163, "step": 47090 }, { "grad_norm": 0.1513880491256714, "learning_rate": 1.2116054954292689e-05, "loss": 0.1137, "step": 47100 }, { "grad_norm": 0.1774810552597046, "learning_rate": 1.2098075710764011e-05, "loss": 0.1154, "step": 47110 }, { "grad_norm": 0.14775978028774261, "learning_rate": 1.2080107980839183e-05, "loss": 0.1162, "step": 47120 }, { "grad_norm": 0.14420852065086365, "learning_rate": 1.2062151769976343e-05, "loss": 0.1047, "step": 47130 }, { "grad_norm": 0.16873139142990112, "learning_rate": 1.204420708363011e-05, "loss": 0.1095, "step": 47140 }, { "grad_norm": 0.11428532004356384, "learning_rate": 1.2026273927251597e-05, "loss": 0.1042, "step": 47150 }, { "grad_norm": 0.1668015718460083, "learning_rate": 1.2008352306288424e-05, "loss": 0.1171, "step": 47160 }, { "grad_norm": 0.09843345731496811, "learning_rate": 1.1990442226184695e-05, "loss": 0.1104, "step": 47170 }, { "grad_norm": 0.09085176140069962, "learning_rate": 1.1972543692381066e-05, "loss": 0.109, "step": 47180 }, { "grad_norm": 0.12634478509426117, "learning_rate": 1.1954656710314576e-05, "loss": 0.1159, "step": 47190 }, { "grad_norm": 0.11203251779079437, "learning_rate": 1.1936781285418875e-05, "loss": 0.1078, "step": 47200 }, { "grad_norm": 0.11738769710063934, "learning_rate": 1.1918917423123993e-05, "loss": 0.1078, "step": 47210 }, { "grad_norm": 0.10415909439325333, "learning_rate": 1.1901065128856537e-05, "loss": 0.109, "step": 47220 }, { "grad_norm": 0.16004197299480438, "learning_rate": 1.1883224408039551e-05, "loss": 0.1192, "step": 47230 }, { "grad_norm": 0.11353921890258789, "learning_rate": 1.1865395266092578e-05, "loss": 0.1097, "step": 47240 }, { "grad_norm": 0.14178748428821564, "learning_rate": 1.1847577708431633e-05, "loss": 0.1109, "step": 47250 }, { "grad_norm": 0.1264931559562683, "learning_rate": 1.1829771740469225e-05, "loss": 0.1081, "step": 47260 }, { "grad_norm": 0.09925087541341782, "learning_rate": 1.1811977367614324e-05, "loss": 0.1039, "step": 47270 }, { "grad_norm": 0.10234832763671875, "learning_rate": 1.1794194595272412e-05, "loss": 0.111, "step": 47280 }, { "grad_norm": 0.14889006316661835, "learning_rate": 1.1776423428845423e-05, "loss": 0.1127, "step": 47290 }, { "grad_norm": 0.133755162358284, "learning_rate": 1.1758663873731756e-05, "loss": 0.1151, "step": 47300 }, { "grad_norm": 0.14816118776798248, "learning_rate": 1.1740915935326302e-05, "loss": 0.1154, "step": 47310 }, { "grad_norm": 0.1535797119140625, "learning_rate": 1.1723179619020396e-05, "loss": 0.1174, "step": 47320 }, { "grad_norm": 0.12299135327339172, "learning_rate": 1.1705454930201914e-05, "loss": 0.1126, "step": 47330 }, { "grad_norm": 0.12972575426101685, "learning_rate": 1.1687741874255087e-05, "loss": 0.1094, "step": 47340 }, { "grad_norm": 0.1247764602303505, "learning_rate": 1.1670040456560728e-05, "loss": 0.1012, "step": 47350 }, { "grad_norm": 0.10799866914749146, "learning_rate": 1.1652350682496005e-05, "loss": 0.1117, "step": 47360 }, { "grad_norm": 0.11297688633203506, "learning_rate": 1.163467255743465e-05, "loss": 0.1033, "step": 47370 }, { "grad_norm": 0.12076500803232193, "learning_rate": 1.1617006086746796e-05, "loss": 0.1107, "step": 47380 }, { "grad_norm": 0.13924115896224976, "learning_rate": 1.1599351275799047e-05, "loss": 0.1083, "step": 47390 }, { "grad_norm": 0.12791769206523895, "learning_rate": 1.1581708129954466e-05, "loss": 0.1073, "step": 47400 }, { "grad_norm": 0.1352231502532959, "learning_rate": 1.1564076654572587e-05, "loss": 0.1118, "step": 47410 }, { "grad_norm": 0.11342546343803406, "learning_rate": 1.1546456855009358e-05, "loss": 0.1162, "step": 47420 }, { "grad_norm": 0.1486702710390091, "learning_rate": 1.1528848736617248e-05, "loss": 0.1147, "step": 47430 }, { "grad_norm": 0.11432099342346191, "learning_rate": 1.1511252304745112e-05, "loss": 0.1032, "step": 47440 }, { "grad_norm": 0.10790374875068665, "learning_rate": 1.1493667564738297e-05, "loss": 0.1098, "step": 47450 }, { "grad_norm": 0.11654920130968094, "learning_rate": 1.1476094521938574e-05, "loss": 0.1113, "step": 47460 }, { "grad_norm": 0.14527195692062378, "learning_rate": 1.1458533181684167e-05, "loss": 0.1139, "step": 47470 }, { "grad_norm": 0.13036949932575226, "learning_rate": 1.1440983549309753e-05, "loss": 0.1069, "step": 47480 }, { "grad_norm": 0.1044112890958786, "learning_rate": 1.1423445630146434e-05, "loss": 0.1074, "step": 47490 }, { "grad_norm": 0.11996691673994064, "learning_rate": 1.1405919429521799e-05, "loss": 0.1095, "step": 47500 }, { "grad_norm": 0.1257437765598297, "learning_rate": 1.1388404952759802e-05, "loss": 0.1096, "step": 47510 }, { "grad_norm": 0.13714811205863953, "learning_rate": 1.1370902205180923e-05, "loss": 0.1189, "step": 47520 }, { "grad_norm": 0.13630525767803192, "learning_rate": 1.1353411192101987e-05, "loss": 0.1119, "step": 47530 }, { "grad_norm": 0.180074080824852, "learning_rate": 1.133593191883634e-05, "loss": 0.1127, "step": 47540 }, { "grad_norm": 0.14645284414291382, "learning_rate": 1.1318464390693711e-05, "loss": 0.1188, "step": 47550 }, { "grad_norm": 0.12325705587863922, "learning_rate": 1.1301008612980257e-05, "loss": 0.1072, "step": 47560 }, { "grad_norm": 0.11606752127408981, "learning_rate": 1.128356459099863e-05, "loss": 0.1163, "step": 47570 }, { "grad_norm": 0.13873761892318726, "learning_rate": 1.1266132330047802e-05, "loss": 0.1134, "step": 47580 }, { "grad_norm": 0.14702513813972473, "learning_rate": 1.1248711835423281e-05, "loss": 0.1068, "step": 47590 }, { "grad_norm": 0.09802252799272537, "learning_rate": 1.123130311241693e-05, "loss": 0.1166, "step": 47600 }, { "grad_norm": 0.13784490525722504, "learning_rate": 1.1213906166317068e-05, "loss": 0.1156, "step": 47610 }, { "grad_norm": 0.13750314712524414, "learning_rate": 1.1196521002408427e-05, "loss": 0.1185, "step": 47620 }, { "grad_norm": 0.1501462757587433, "learning_rate": 1.1179147625972159e-05, "loss": 0.1227, "step": 47630 }, { "grad_norm": 0.15277139842510223, "learning_rate": 1.1161786042285822e-05, "loss": 0.1123, "step": 47640 }, { "grad_norm": 0.11663714051246643, "learning_rate": 1.1144436256623447e-05, "loss": 0.1147, "step": 47650 }, { "grad_norm": 0.10190234333276749, "learning_rate": 1.1127098274255392e-05, "loss": 0.1104, "step": 47660 }, { "grad_norm": 0.10819143056869507, "learning_rate": 1.1109772100448512e-05, "loss": 0.1067, "step": 47670 }, { "grad_norm": 0.11326853185892105, "learning_rate": 1.1092457740466033e-05, "loss": 0.1073, "step": 47680 }, { "grad_norm": 0.13170038163661957, "learning_rate": 1.10751551995676e-05, "loss": 0.1151, "step": 47690 }, { "grad_norm": 0.15063051879405975, "learning_rate": 1.1057864483009262e-05, "loss": 0.113, "step": 47700 }, { "grad_norm": 0.14403791725635529, "learning_rate": 1.1040585596043473e-05, "loss": 0.1122, "step": 47710 }, { "grad_norm": 0.13264481723308563, "learning_rate": 1.1023318543919148e-05, "loss": 0.1073, "step": 47720 }, { "grad_norm": 0.1169225201010704, "learning_rate": 1.10060633318815e-05, "loss": 0.1097, "step": 47730 }, { "grad_norm": 0.12693433463573456, "learning_rate": 1.0988819965172248e-05, "loss": 0.1169, "step": 47740 }, { "grad_norm": 0.13129231333732605, "learning_rate": 1.0971588449029462e-05, "loss": 0.1038, "step": 47750 }, { "grad_norm": 0.12516094744205475, "learning_rate": 1.095436878868762e-05, "loss": 0.1041, "step": 47760 }, { "grad_norm": 0.1324813812971115, "learning_rate": 1.0937160989377598e-05, "loss": 0.1064, "step": 47770 }, { "grad_norm": 0.13676196336746216, "learning_rate": 1.0919965056326676e-05, "loss": 0.1103, "step": 47780 }, { "grad_norm": 0.15620243549346924, "learning_rate": 1.0902780994758504e-05, "loss": 0.1207, "step": 47790 }, { "grad_norm": 0.1485121250152588, "learning_rate": 1.0885608809893193e-05, "loss": 0.1159, "step": 47800 }, { "grad_norm": 0.14767535030841827, "learning_rate": 1.0868448506947142e-05, "loss": 0.1088, "step": 47810 }, { "grad_norm": 0.1186603456735611, "learning_rate": 1.0851300091133243e-05, "loss": 0.1093, "step": 47820 }, { "grad_norm": 0.10603129118680954, "learning_rate": 1.083416356766071e-05, "loss": 0.1043, "step": 47830 }, { "grad_norm": 0.10595536231994629, "learning_rate": 1.0817038941735175e-05, "loss": 0.1089, "step": 47840 }, { "grad_norm": 0.09856719523668289, "learning_rate": 1.0799926218558642e-05, "loss": 0.1121, "step": 47850 }, { "grad_norm": 0.1180100291967392, "learning_rate": 1.0782825403329488e-05, "loss": 0.1091, "step": 47860 }, { "grad_norm": 0.10863153636455536, "learning_rate": 1.076573650124254e-05, "loss": 0.1055, "step": 47870 }, { "grad_norm": 0.14936229586601257, "learning_rate": 1.0748659517488891e-05, "loss": 0.1131, "step": 47880 }, { "grad_norm": 0.14611200988292694, "learning_rate": 1.0731594457256138e-05, "loss": 0.1092, "step": 47890 }, { "grad_norm": 0.13664449751377106, "learning_rate": 1.0714541325728139e-05, "loss": 0.1151, "step": 47900 }, { "grad_norm": 0.16398070752620697, "learning_rate": 1.0697500128085231e-05, "loss": 0.1141, "step": 47910 }, { "grad_norm": 0.18521994352340698, "learning_rate": 1.0680470869504055e-05, "loss": 0.115, "step": 47920 }, { "grad_norm": 0.14114034175872803, "learning_rate": 1.066345355515766e-05, "loss": 0.1159, "step": 47930 }, { "grad_norm": 0.15722870826721191, "learning_rate": 1.0646448190215453e-05, "loss": 0.1148, "step": 47940 }, { "grad_norm": 0.13035964965820312, "learning_rate": 1.0629454779843217e-05, "loss": 0.1099, "step": 47950 }, { "grad_norm": 0.13422897458076477, "learning_rate": 1.0612473329203082e-05, "loss": 0.1135, "step": 47960 }, { "grad_norm": 0.14466674625873566, "learning_rate": 1.0595503843453596e-05, "loss": 0.108, "step": 47970 }, { "grad_norm": 0.10142262279987335, "learning_rate": 1.0578546327749634e-05, "loss": 0.1047, "step": 47980 }, { "grad_norm": 0.10207919031381607, "learning_rate": 1.0561600787242425e-05, "loss": 0.1066, "step": 47990 }, { "grad_norm": 0.13754822313785553, "learning_rate": 1.0544667227079591e-05, "loss": 0.1067, "step": 48000 }, { "grad_norm": 0.1420045644044876, "learning_rate": 1.0527745652405085e-05, "loss": 0.1089, "step": 48010 }, { "grad_norm": 0.11071006208658218, "learning_rate": 1.051083606835927e-05, "loss": 0.1109, "step": 48020 }, { "grad_norm": 0.13336291909217834, "learning_rate": 1.049393848007878e-05, "loss": 0.1107, "step": 48030 }, { "grad_norm": 0.15425771474838257, "learning_rate": 1.0477052892696709e-05, "loss": 0.1089, "step": 48040 }, { "grad_norm": 0.14403167366981506, "learning_rate": 1.0460179311342394e-05, "loss": 0.1069, "step": 48050 }, { "grad_norm": 0.10393102467060089, "learning_rate": 1.0443317741141634e-05, "loss": 0.1073, "step": 48060 }, { "grad_norm": 0.11811482906341553, "learning_rate": 1.0426468187216514e-05, "loss": 0.1014, "step": 48070 }, { "grad_norm": 0.11080525070428848, "learning_rate": 1.0409630654685477e-05, "loss": 0.1044, "step": 48080 }, { "grad_norm": 0.12049199640750885, "learning_rate": 1.039280514866332e-05, "loss": 0.1144, "step": 48090 }, { "grad_norm": 0.1330070048570633, "learning_rate": 1.0375991674261198e-05, "loss": 0.1032, "step": 48100 }, { "grad_norm": 0.11715873330831528, "learning_rate": 1.0359190236586575e-05, "loss": 0.1055, "step": 48110 }, { "grad_norm": 0.13427409529685974, "learning_rate": 1.0342400840743322e-05, "loss": 0.111, "step": 48120 }, { "grad_norm": 0.11253461986780167, "learning_rate": 1.0325623491831593e-05, "loss": 0.1144, "step": 48130 }, { "grad_norm": 0.14201635122299194, "learning_rate": 1.0308858194947906e-05, "loss": 0.1063, "step": 48140 }, { "grad_norm": 0.1461545079946518, "learning_rate": 1.0292104955185111e-05, "loss": 0.1129, "step": 48150 }, { "grad_norm": 0.13380339741706848, "learning_rate": 1.0275363777632396e-05, "loss": 0.1116, "step": 48160 }, { "grad_norm": 0.14485220611095428, "learning_rate": 1.0258634667375321e-05, "loss": 0.1108, "step": 48170 }, { "grad_norm": 0.14379441738128662, "learning_rate": 1.02419176294957e-05, "loss": 0.1032, "step": 48180 }, { "grad_norm": 0.13585199415683746, "learning_rate": 1.0225212669071782e-05, "loss": 0.1188, "step": 48190 }, { "grad_norm": 0.18269841372966766, "learning_rate": 1.0208519791178029e-05, "loss": 0.1117, "step": 48200 }, { "grad_norm": 0.13257868587970734, "learning_rate": 1.019183900088535e-05, "loss": 0.1053, "step": 48210 }, { "grad_norm": 0.10488693416118622, "learning_rate": 1.0175170303260906e-05, "loss": 0.1083, "step": 48220 }, { "grad_norm": 0.14345848560333252, "learning_rate": 1.0158513703368206e-05, "loss": 0.1077, "step": 48230 }, { "grad_norm": 0.10246195644140244, "learning_rate": 1.0141869206267095e-05, "loss": 0.1072, "step": 48240 }, { "grad_norm": 0.1240563914179802, "learning_rate": 1.0125236817013723e-05, "loss": 0.1122, "step": 48250 }, { "grad_norm": 0.12793248891830444, "learning_rate": 1.010861654066056e-05, "loss": 0.1113, "step": 48260 }, { "grad_norm": 0.11946389824151993, "learning_rate": 1.0092008382256434e-05, "loss": 0.115, "step": 48270 }, { "grad_norm": 0.09987153112888336, "learning_rate": 1.0075412346846458e-05, "loss": 0.1015, "step": 48280 }, { "grad_norm": 0.11880692839622498, "learning_rate": 1.0058828439472056e-05, "loss": 0.1061, "step": 48290 }, { "grad_norm": 0.10485197603702545, "learning_rate": 1.0042256665170996e-05, "loss": 0.1022, "step": 48300 }, { "grad_norm": 0.12322656810283661, "learning_rate": 1.0025697028977332e-05, "loss": 0.1118, "step": 48310 }, { "grad_norm": 0.1360408216714859, "learning_rate": 1.0009149535921454e-05, "loss": 0.1081, "step": 48320 }, { "grad_norm": 0.16438591480255127, "learning_rate": 9.992614191030031e-06, "loss": 0.112, "step": 48330 }, { "grad_norm": 0.12981776893138885, "learning_rate": 9.976090999326115e-06, "loss": 0.1193, "step": 48340 }, { "grad_norm": 0.09404639899730682, "learning_rate": 9.959579965828952e-06, "loss": 0.1123, "step": 48350 }, { "grad_norm": 0.14300668239593506, "learning_rate": 9.943081095554218e-06, "loss": 0.1198, "step": 48360 }, { "grad_norm": 0.1236882358789444, "learning_rate": 9.926594393513783e-06, "loss": 0.1002, "step": 48370 }, { "grad_norm": 0.11937066912651062, "learning_rate": 9.910119864715906e-06, "loss": 0.1105, "step": 48380 }, { "grad_norm": 0.13072726130485535, "learning_rate": 9.8936575141651e-06, "loss": 0.1148, "step": 48390 }, { "grad_norm": 0.15660101175308228, "learning_rate": 9.877207346862194e-06, "loss": 0.1066, "step": 48400 }, { "grad_norm": 0.12327699363231659, "learning_rate": 9.860769367804312e-06, "loss": 0.1041, "step": 48410 }, { "grad_norm": 0.14354023337364197, "learning_rate": 9.844343581984877e-06, "loss": 0.113, "step": 48420 }, { "grad_norm": 0.12594257295131683, "learning_rate": 9.82792999439362e-06, "loss": 0.1096, "step": 48430 }, { "grad_norm": 0.12139374762773514, "learning_rate": 9.811528610016546e-06, "loss": 0.1106, "step": 48440 }, { "grad_norm": 0.11877790093421936, "learning_rate": 9.79513943383597e-06, "loss": 0.1111, "step": 48450 }, { "grad_norm": 0.1201973408460617, "learning_rate": 9.778762470830489e-06, "loss": 0.1008, "step": 48460 }, { "grad_norm": 0.11072327196598053, "learning_rate": 9.762397725974982e-06, "loss": 0.1086, "step": 48470 }, { "grad_norm": 0.08829410374164581, "learning_rate": 9.746045204240622e-06, "loss": 0.1055, "step": 48480 }, { "grad_norm": 0.12638036906719208, "learning_rate": 9.729704910594917e-06, "loss": 0.1079, "step": 48490 }, { "grad_norm": 0.1383623480796814, "learning_rate": 9.713376850001554e-06, "loss": 0.1119, "step": 48500 }, { "grad_norm": 0.14167790114879608, "learning_rate": 9.697061027420622e-06, "loss": 0.1098, "step": 48510 }, { "grad_norm": 0.10887055099010468, "learning_rate": 9.680757447808385e-06, "loss": 0.1069, "step": 48520 }, { "grad_norm": 0.09982580691576004, "learning_rate": 9.664466116117488e-06, "loss": 0.1071, "step": 48530 }, { "grad_norm": 0.11027026921510696, "learning_rate": 9.64818703729678e-06, "loss": 0.1126, "step": 48540 }, { "grad_norm": 0.14374983310699463, "learning_rate": 9.631920216291423e-06, "loss": 0.1088, "step": 48550 }, { "grad_norm": 0.10510057955980301, "learning_rate": 9.615665658042849e-06, "loss": 0.1074, "step": 48560 }, { "grad_norm": 0.1187683492898941, "learning_rate": 9.599423367488747e-06, "loss": 0.1111, "step": 48570 }, { "grad_norm": 0.12052513659000397, "learning_rate": 9.583193349563124e-06, "loss": 0.1173, "step": 48580 }, { "grad_norm": 0.12325143069028854, "learning_rate": 9.566975609196216e-06, "loss": 0.1106, "step": 48590 }, { "grad_norm": 0.12321695685386658, "learning_rate": 9.550770151314548e-06, "loss": 0.1118, "step": 48600 }, { "grad_norm": 0.14289681613445282, "learning_rate": 9.53457698084091e-06, "loss": 0.1124, "step": 48610 }, { "grad_norm": 0.13743162155151367, "learning_rate": 9.518396102694355e-06, "loss": 0.1144, "step": 48620 }, { "grad_norm": 0.12489745020866394, "learning_rate": 9.502227521790198e-06, "loss": 0.1177, "step": 48630 }, { "grad_norm": 0.14769434928894043, "learning_rate": 9.486071243040063e-06, "loss": 0.1081, "step": 48640 }, { "grad_norm": 0.0943843200802803, "learning_rate": 9.469927271351747e-06, "loss": 0.1065, "step": 48650 }, { "grad_norm": 0.11873137950897217, "learning_rate": 9.453795611629419e-06, "loss": 0.1109, "step": 48660 }, { "grad_norm": 0.09626450389623642, "learning_rate": 9.437676268773399e-06, "loss": 0.1128, "step": 48670 }, { "grad_norm": 0.13305015861988068, "learning_rate": 9.421569247680357e-06, "loss": 0.1196, "step": 48680 }, { "grad_norm": 0.11691856384277344, "learning_rate": 9.40547455324316e-06, "loss": 0.1098, "step": 48690 }, { "grad_norm": 0.11343192309141159, "learning_rate": 9.389392190350965e-06, "loss": 0.1105, "step": 48700 }, { "grad_norm": 0.10358737409114838, "learning_rate": 9.373322163889153e-06, "loss": 0.1141, "step": 48710 }, { "grad_norm": 0.09327808022499084, "learning_rate": 9.357264478739375e-06, "loss": 0.1096, "step": 48720 }, { "grad_norm": 0.12710829079151154, "learning_rate": 9.341219139779567e-06, "loss": 0.1133, "step": 48730 }, { "grad_norm": 0.12278195470571518, "learning_rate": 9.325186151883824e-06, "loss": 0.1099, "step": 48740 }, { "grad_norm": 0.11682227998971939, "learning_rate": 9.30916551992258e-06, "loss": 0.113, "step": 48750 }, { "grad_norm": 0.11382704973220825, "learning_rate": 9.293157248762479e-06, "loss": 0.1108, "step": 48760 }, { "grad_norm": 0.11776898801326752, "learning_rate": 9.2771613432664e-06, "loss": 0.1087, "step": 48770 }, { "grad_norm": 0.1622689962387085, "learning_rate": 9.261177808293481e-06, "loss": 0.1105, "step": 48780 }, { "grad_norm": 0.11622191220521927, "learning_rate": 9.245206648699096e-06, "loss": 0.1059, "step": 48790 }, { "grad_norm": 0.10994213819503784, "learning_rate": 9.22924786933485e-06, "loss": 0.1115, "step": 48800 }, { "grad_norm": 0.15565425157546997, "learning_rate": 9.213301475048642e-06, "loss": 0.1176, "step": 48810 }, { "grad_norm": 0.14641477167606354, "learning_rate": 9.197367470684504e-06, "loss": 0.1084, "step": 48820 }, { "grad_norm": 0.10821662098169327, "learning_rate": 9.181445861082816e-06, "loss": 0.1061, "step": 48830 }, { "grad_norm": 0.13266219198703766, "learning_rate": 9.16553665108012e-06, "loss": 0.1069, "step": 48840 }, { "grad_norm": 0.14071589708328247, "learning_rate": 9.149639845509223e-06, "loss": 0.108, "step": 48850 }, { "grad_norm": 0.12604375183582306, "learning_rate": 9.133755449199144e-06, "loss": 0.1113, "step": 48860 }, { "grad_norm": 0.1770804226398468, "learning_rate": 9.117883466975135e-06, "loss": 0.1191, "step": 48870 }, { "grad_norm": 0.19571487605571747, "learning_rate": 9.10202390365873e-06, "loss": 0.106, "step": 48880 }, { "grad_norm": 0.1454371213912964, "learning_rate": 9.086176764067583e-06, "loss": 0.1172, "step": 48890 }, { "grad_norm": 0.11418801546096802, "learning_rate": 9.070342053015684e-06, "loss": 0.114, "step": 48900 }, { "grad_norm": 0.12224393337965012, "learning_rate": 9.054519775313187e-06, "loss": 0.105, "step": 48910 }, { "grad_norm": 0.10682941228151321, "learning_rate": 9.038709935766476e-06, "loss": 0.1082, "step": 48920 }, { "grad_norm": 0.11563534289598465, "learning_rate": 9.02291253917817e-06, "loss": 0.1017, "step": 48930 }, { "grad_norm": 0.10044797509908676, "learning_rate": 9.007127590347091e-06, "loss": 0.1125, "step": 48940 }, { "grad_norm": 0.12081050872802734, "learning_rate": 8.991355094068288e-06, "loss": 0.1192, "step": 48950 }, { "grad_norm": 0.1134074479341507, "learning_rate": 8.975595055133062e-06, "loss": 0.1084, "step": 48960 }, { "grad_norm": 0.10733383148908615, "learning_rate": 8.959847478328848e-06, "loss": 0.1049, "step": 48970 }, { "grad_norm": 0.12504751980304718, "learning_rate": 8.944112368439378e-06, "loss": 0.1143, "step": 48980 }, { "grad_norm": 0.10812456160783768, "learning_rate": 8.928389730244552e-06, "loss": 0.1127, "step": 48990 }, { "grad_norm": 0.10262217372655869, "learning_rate": 8.912679568520494e-06, "loss": 0.1058, "step": 49000 }, { "grad_norm": 0.15629959106445312, "learning_rate": 8.896981888039534e-06, "loss": 0.1154, "step": 49010 }, { "grad_norm": 0.1007416695356369, "learning_rate": 8.881296693570201e-06, "loss": 0.1132, "step": 49020 }, { "grad_norm": 0.11586650460958481, "learning_rate": 8.865623989877281e-06, "loss": 0.1109, "step": 49030 }, { "grad_norm": 0.13148027658462524, "learning_rate": 8.849963781721681e-06, "loss": 0.1094, "step": 49040 }, { "grad_norm": 0.10573603957891464, "learning_rate": 8.834316073860588e-06, "loss": 0.1131, "step": 49050 }, { "grad_norm": 0.11881647258996964, "learning_rate": 8.818680871047357e-06, "loss": 0.1097, "step": 49060 }, { "grad_norm": 0.13660939037799835, "learning_rate": 8.803058178031549e-06, "loss": 0.116, "step": 49070 }, { "grad_norm": 0.1156497448682785, "learning_rate": 8.787447999558922e-06, "loss": 0.1147, "step": 49080 }, { "grad_norm": 0.10411906242370605, "learning_rate": 8.77185034037144e-06, "loss": 0.1098, "step": 49090 }, { "grad_norm": 0.1080666184425354, "learning_rate": 8.756265205207259e-06, "loss": 0.1062, "step": 49100 }, { "grad_norm": 0.12539683282375336, "learning_rate": 8.740692598800732e-06, "loss": 0.1062, "step": 49110 }, { "grad_norm": 0.13223014771938324, "learning_rate": 8.72513252588239e-06, "loss": 0.1153, "step": 49120 }, { "grad_norm": 0.11037858575582504, "learning_rate": 8.709584991178998e-06, "loss": 0.111, "step": 49130 }, { "grad_norm": 0.1422225832939148, "learning_rate": 8.694049999413479e-06, "loss": 0.117, "step": 49140 }, { "grad_norm": 0.1575486958026886, "learning_rate": 8.678527555304945e-06, "loss": 0.1076, "step": 49150 }, { "grad_norm": 0.131959930062294, "learning_rate": 8.663017663568712e-06, "loss": 0.1071, "step": 49160 }, { "grad_norm": 0.1428167223930359, "learning_rate": 8.647520328916259e-06, "loss": 0.1165, "step": 49170 }, { "grad_norm": 0.1275341510772705, "learning_rate": 8.632035556055307e-06, "loss": 0.1079, "step": 49180 }, { "grad_norm": 0.10086194425821304, "learning_rate": 8.616563349689672e-06, "loss": 0.1092, "step": 49190 }, { "grad_norm": 0.15151813626289368, "learning_rate": 8.601103714519448e-06, "loss": 0.1129, "step": 49200 }, { "grad_norm": 0.1268327534198761, "learning_rate": 8.58565665524082e-06, "loss": 0.107, "step": 49210 }, { "grad_norm": 0.10860607773065567, "learning_rate": 8.570222176546222e-06, "loss": 0.1124, "step": 49220 }, { "grad_norm": 0.09018322080373764, "learning_rate": 8.554800283124242e-06, "loss": 0.0981, "step": 49230 }, { "grad_norm": 0.10996763408184052, "learning_rate": 8.539390979659639e-06, "loss": 0.1163, "step": 49240 }, { "grad_norm": 0.12113502621650696, "learning_rate": 8.523994270833352e-06, "loss": 0.1094, "step": 49250 }, { "grad_norm": 0.11764181405305862, "learning_rate": 8.5086101613225e-06, "loss": 0.1116, "step": 49260 }, { "grad_norm": 0.13711628317832947, "learning_rate": 8.493238655800346e-06, "loss": 0.1045, "step": 49270 }, { "grad_norm": 0.13345307111740112, "learning_rate": 8.47787975893638e-06, "loss": 0.1056, "step": 49280 }, { "grad_norm": 0.10761746019124985, "learning_rate": 8.462533475396211e-06, "loss": 0.1011, "step": 49290 }, { "grad_norm": 0.09938359260559082, "learning_rate": 8.447199809841643e-06, "loss": 0.1087, "step": 49300 }, { "grad_norm": 0.09557273983955383, "learning_rate": 8.431878766930635e-06, "loss": 0.1139, "step": 49310 }, { "grad_norm": 0.11192207038402557, "learning_rate": 8.416570351317304e-06, "loss": 0.1093, "step": 49320 }, { "grad_norm": 0.12293162196874619, "learning_rate": 8.401274567651973e-06, "loss": 0.106, "step": 49330 }, { "grad_norm": 0.11102282255887985, "learning_rate": 8.385991420581058e-06, "loss": 0.1035, "step": 49340 }, { "grad_norm": 0.09364127367734909, "learning_rate": 8.370720914747215e-06, "loss": 0.1122, "step": 49350 }, { "grad_norm": 0.09006351977586746, "learning_rate": 8.355463054789181e-06, "loss": 0.1033, "step": 49360 }, { "grad_norm": 0.15114150941371918, "learning_rate": 8.340217845341919e-06, "loss": 0.1138, "step": 49370 }, { "grad_norm": 0.12026996910572052, "learning_rate": 8.324985291036514e-06, "loss": 0.1143, "step": 49380 }, { "grad_norm": 0.12234007567167282, "learning_rate": 8.309765396500213e-06, "loss": 0.1104, "step": 49390 }, { "grad_norm": 0.14502350986003876, "learning_rate": 8.294558166356419e-06, "loss": 0.1113, "step": 49400 }, { "grad_norm": 0.13522008061408997, "learning_rate": 8.279363605224683e-06, "loss": 0.1107, "step": 49410 }, { "grad_norm": 0.1241595670580864, "learning_rate": 8.264181717720704e-06, "loss": 0.1074, "step": 49420 }, { "grad_norm": 0.11153550446033478, "learning_rate": 8.249012508456361e-06, "loss": 0.1028, "step": 49430 }, { "grad_norm": 0.11081883311271667, "learning_rate": 8.233855982039646e-06, "loss": 0.106, "step": 49440 }, { "grad_norm": 0.11420412361621857, "learning_rate": 8.218712143074708e-06, "loss": 0.1028, "step": 49450 }, { "grad_norm": 0.14319419860839844, "learning_rate": 8.203580996161858e-06, "loss": 0.1063, "step": 49460 }, { "grad_norm": 0.13104113936424255, "learning_rate": 8.188462545897512e-06, "loss": 0.1072, "step": 49470 }, { "grad_norm": 0.11065342277288437, "learning_rate": 8.173356796874304e-06, "loss": 0.1081, "step": 49480 }, { "grad_norm": 0.11870473623275757, "learning_rate": 8.158263753680906e-06, "loss": 0.1063, "step": 49490 }, { "grad_norm": 0.1273178607225418, "learning_rate": 8.143183420902239e-06, "loss": 0.1049, "step": 49500 }, { "grad_norm": 0.09836186468601227, "learning_rate": 8.128115803119258e-06, "loss": 0.1042, "step": 49510 }, { "grad_norm": 0.12167353183031082, "learning_rate": 8.11306090490916e-06, "loss": 0.1089, "step": 49520 }, { "grad_norm": 0.11764392256736755, "learning_rate": 8.098018730845169e-06, "loss": 0.1126, "step": 49530 }, { "grad_norm": 0.12088625878095627, "learning_rate": 8.082989285496745e-06, "loss": 0.1081, "step": 49540 }, { "grad_norm": 0.11227581650018692, "learning_rate": 8.067972573429416e-06, "loss": 0.1007, "step": 49550 }, { "grad_norm": 0.12654267251491547, "learning_rate": 8.052968599204874e-06, "loss": 0.1074, "step": 49560 }, { "grad_norm": 0.1256699562072754, "learning_rate": 8.037977367380922e-06, "loss": 0.103, "step": 49570 }, { "grad_norm": 0.10669589787721634, "learning_rate": 8.022998882511495e-06, "loss": 0.1059, "step": 49580 }, { "grad_norm": 0.12420185655355453, "learning_rate": 8.008033149146677e-06, "loss": 0.1146, "step": 49590 }, { "grad_norm": 0.12960539758205414, "learning_rate": 7.993080171832656e-06, "loss": 0.1098, "step": 49600 }, { "grad_norm": 0.1084837019443512, "learning_rate": 7.978139955111752e-06, "loss": 0.1058, "step": 49610 }, { "grad_norm": 0.1066007986664772, "learning_rate": 7.9632125035224e-06, "loss": 0.1102, "step": 49620 }, { "grad_norm": 0.141916885972023, "learning_rate": 7.948297821599177e-06, "loss": 0.1094, "step": 49630 }, { "grad_norm": 0.10942168533802032, "learning_rate": 7.933395913872755e-06, "loss": 0.113, "step": 49640 }, { "grad_norm": 0.13620075583457947, "learning_rate": 7.918506784869972e-06, "loss": 0.1132, "step": 49650 }, { "grad_norm": 0.13929742574691772, "learning_rate": 7.903630439113707e-06, "loss": 0.1104, "step": 49660 }, { "grad_norm": 0.13034822046756744, "learning_rate": 7.888766881123044e-06, "loss": 0.1052, "step": 49670 }, { "grad_norm": 0.13505122065544128, "learning_rate": 7.873916115413099e-06, "loss": 0.1105, "step": 49680 }, { "grad_norm": 0.13549505174160004, "learning_rate": 7.85907814649518e-06, "loss": 0.1117, "step": 49690 }, { "grad_norm": 0.11599671840667725, "learning_rate": 7.844252978876649e-06, "loss": 0.1095, "step": 49700 }, { "grad_norm": 0.12525209784507751, "learning_rate": 7.829440617061001e-06, "loss": 0.1147, "step": 49710 }, { "grad_norm": 0.1345919817686081, "learning_rate": 7.814641065547851e-06, "loss": 0.112, "step": 49720 }, { "grad_norm": 0.11033210903406143, "learning_rate": 7.79985432883289e-06, "loss": 0.1081, "step": 49730 }, { "grad_norm": 0.15025225281715393, "learning_rate": 7.78508041140797e-06, "loss": 0.1144, "step": 49740 }, { "grad_norm": 0.10911034047603607, "learning_rate": 7.770319317760993e-06, "loss": 0.1043, "step": 49750 }, { "grad_norm": 0.11894109100103378, "learning_rate": 7.755571052376004e-06, "loss": 0.1136, "step": 49760 }, { "grad_norm": 0.1007147803902626, "learning_rate": 7.740835619733128e-06, "loss": 0.1118, "step": 49770 }, { "grad_norm": 0.13468126952648163, "learning_rate": 7.726113024308601e-06, "loss": 0.1033, "step": 49780 }, { "grad_norm": 0.10988551378250122, "learning_rate": 7.711403270574746e-06, "loss": 0.1046, "step": 49790 }, { "grad_norm": 0.09851910918951035, "learning_rate": 7.696706363000039e-06, "loss": 0.1028, "step": 49800 }, { "grad_norm": 0.13327790796756744, "learning_rate": 7.682022306048959e-06, "loss": 0.1184, "step": 49810 }, { "grad_norm": 0.1454268842935562, "learning_rate": 7.667351104182186e-06, "loss": 0.1149, "step": 49820 }, { "grad_norm": 0.11143365502357483, "learning_rate": 7.652692761856395e-06, "loss": 0.1145, "step": 49830 }, { "grad_norm": 0.12318305671215057, "learning_rate": 7.63804728352444e-06, "loss": 0.116, "step": 49840 }, { "grad_norm": 0.1196972206234932, "learning_rate": 7.623414673635215e-06, "loss": 0.1203, "step": 49850 }, { "grad_norm": 0.1303410530090332, "learning_rate": 7.608794936633723e-06, "loss": 0.1063, "step": 49860 }, { "grad_norm": 0.1583877056837082, "learning_rate": 7.594188076961056e-06, "loss": 0.1122, "step": 49870 }, { "grad_norm": 0.13827328383922577, "learning_rate": 7.579594099054382e-06, "loss": 0.113, "step": 49880 }, { "grad_norm": 0.16361215710639954, "learning_rate": 7.565013007346983e-06, "loss": 0.1121, "step": 49890 }, { "grad_norm": 0.12819212675094604, "learning_rate": 7.5504448062682035e-06, "loss": 0.0972, "step": 49900 }, { "grad_norm": 0.12373697012662888, "learning_rate": 7.53588950024347e-06, "loss": 0.1124, "step": 49910 }, { "grad_norm": 0.11282319575548172, "learning_rate": 7.5213470936943145e-06, "loss": 0.1143, "step": 49920 }, { "grad_norm": 0.12525004148483276, "learning_rate": 7.506817591038323e-06, "loss": 0.1073, "step": 49930 }, { "grad_norm": 0.12758809328079224, "learning_rate": 7.492300996689183e-06, "loss": 0.1089, "step": 49940 }, { "grad_norm": 0.10865786671638489, "learning_rate": 7.477797315056645e-06, "loss": 0.1064, "step": 49950 }, { "grad_norm": 0.11718955636024475, "learning_rate": 7.463306550546539e-06, "loss": 0.1207, "step": 49960 }, { "grad_norm": 0.11570987850427628, "learning_rate": 7.448828707560812e-06, "loss": 0.1085, "step": 49970 }, { "grad_norm": 0.10539610683917999, "learning_rate": 7.4343637904974e-06, "loss": 0.1045, "step": 49980 }, { "grad_norm": 0.11993521451950073, "learning_rate": 7.419911803750401e-06, "loss": 0.1182, "step": 49990 }, { "grad_norm": 0.11200883984565735, "learning_rate": 7.405472751709935e-06, "loss": 0.1166, "step": 50000 } ], "logging_steps": 10, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }