{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 1.5010359287261963, "learning_rate": 3.0000000000000004e-07, "loss": 1.5525, "step": 10 }, { "grad_norm": 1.2124391794204712, "learning_rate": 6.333333333333333e-07, "loss": 1.5669, "step": 20 }, { "grad_norm": 1.014855146408081, "learning_rate": 9.666666666666668e-07, "loss": 1.5298, "step": 30 }, { "grad_norm": 1.2825695276260376, "learning_rate": 1.3e-06, "loss": 1.5427, "step": 40 }, { "grad_norm": 1.2744516134262085, "learning_rate": 1.6333333333333333e-06, "loss": 1.501, "step": 50 }, { "grad_norm": 2.0881614685058594, "learning_rate": 1.9666666666666668e-06, "loss": 1.3912, "step": 60 }, { "grad_norm": 1.7104469537734985, "learning_rate": 2.3e-06, "loss": 1.2726, "step": 70 }, { "grad_norm": 1.938237190246582, "learning_rate": 2.6333333333333337e-06, "loss": 1.2484, "step": 80 }, { "grad_norm": 1.7891820669174194, "learning_rate": 2.966666666666667e-06, "loss": 1.2466, "step": 90 }, { "grad_norm": 2.449373960494995, "learning_rate": 3.3e-06, "loss": 1.2145, "step": 100 }, { "grad_norm": 1.8976929187774658, "learning_rate": 3.633333333333334e-06, "loss": 1.1818, "step": 110 }, { "grad_norm": 2.151960611343384, "learning_rate": 3.966666666666667e-06, "loss": 1.2056, "step": 120 }, { "grad_norm": 1.8281904458999634, "learning_rate": 4.2999999999999995e-06, "loss": 1.1796, "step": 130 }, { "grad_norm": 2.6051571369171143, "learning_rate": 4.633333333333334e-06, "loss": 1.1886, "step": 140 }, { "grad_norm": 2.5162341594696045, "learning_rate": 4.966666666666667e-06, "loss": 1.205, "step": 150 }, { "grad_norm": 1.8132128715515137, "learning_rate": 5.3e-06, "loss": 1.1725, "step": 160 }, { "grad_norm": 2.2009341716766357, "learning_rate": 5.633333333333333e-06, "loss": 1.1474, "step": 170 }, { "grad_norm": 2.1833715438842773, "learning_rate": 5.9666666666666666e-06, "loss": 1.1293, "step": 180 }, { "grad_norm": 2.273012638092041, "learning_rate": 6.300000000000001e-06, "loss": 1.1463, "step": 190 }, { "grad_norm": 1.7340333461761475, "learning_rate": 6.633333333333333e-06, "loss": 1.1285, "step": 200 }, { "grad_norm": 1.890699028968811, "learning_rate": 6.966666666666667e-06, "loss": 1.1369, "step": 210 }, { "grad_norm": 2.147536277770996, "learning_rate": 7.2999999999999996e-06, "loss": 1.1076, "step": 220 }, { "grad_norm": 1.3702398538589478, "learning_rate": 7.633333333333334e-06, "loss": 1.1139, "step": 230 }, { "grad_norm": 1.8564649820327759, "learning_rate": 7.966666666666666e-06, "loss": 1.1135, "step": 240 }, { "grad_norm": 1.9417624473571777, "learning_rate": 8.3e-06, "loss": 1.1141, "step": 250 }, { "grad_norm": 2.0505709648132324, "learning_rate": 8.633333333333334e-06, "loss": 1.1081, "step": 260 }, { "grad_norm": 2.1652414798736572, "learning_rate": 8.966666666666668e-06, "loss": 1.1084, "step": 270 }, { "grad_norm": 1.9243570566177368, "learning_rate": 9.3e-06, "loss": 1.1021, "step": 280 }, { "grad_norm": 1.9513193368911743, "learning_rate": 9.633333333333335e-06, "loss": 1.0914, "step": 290 }, { "grad_norm": 1.951107144355774, "learning_rate": 9.966666666666667e-06, "loss": 1.0804, "step": 300 }, { "grad_norm": 1.8376394510269165, "learning_rate": 1.03e-05, "loss": 1.1004, "step": 310 }, { "grad_norm": 1.4982789754867554, "learning_rate": 1.0633333333333334e-05, "loss": 1.0837, "step": 320 }, { "grad_norm": 1.9461957216262817, "learning_rate": 1.0966666666666666e-05, "loss": 1.0645, "step": 330 }, { "grad_norm": 1.6889629364013672, "learning_rate": 1.13e-05, "loss": 1.0929, "step": 340 }, { "grad_norm": 2.166200637817383, "learning_rate": 1.1633333333333334e-05, "loss": 1.0887, "step": 350 }, { "grad_norm": 1.9847972393035889, "learning_rate": 1.1966666666666668e-05, "loss": 1.0716, "step": 360 }, { "grad_norm": 1.6193352937698364, "learning_rate": 1.23e-05, "loss": 1.0516, "step": 370 }, { "grad_norm": 1.6432912349700928, "learning_rate": 1.2633333333333333e-05, "loss": 1.0967, "step": 380 }, { "grad_norm": 1.9338219165802002, "learning_rate": 1.2966666666666669e-05, "loss": 1.0581, "step": 390 }, { "grad_norm": 2.0325162410736084, "learning_rate": 1.3300000000000001e-05, "loss": 1.0646, "step": 400 }, { "grad_norm": 1.7107360363006592, "learning_rate": 1.3633333333333334e-05, "loss": 1.0917, "step": 410 }, { "grad_norm": 1.9763637781143188, "learning_rate": 1.3966666666666666e-05, "loss": 1.0651, "step": 420 }, { "grad_norm": 2.281877279281616, "learning_rate": 1.43e-05, "loss": 1.0221, "step": 430 }, { "grad_norm": 2.198232889175415, "learning_rate": 1.4633333333333334e-05, "loss": 1.0421, "step": 440 }, { "grad_norm": 2.2839407920837402, "learning_rate": 1.4966666666666668e-05, "loss": 1.0418, "step": 450 }, { "grad_norm": 1.7887474298477173, "learning_rate": 1.53e-05, "loss": 1.0328, "step": 460 }, { "grad_norm": 2.4262583255767822, "learning_rate": 1.563333333333333e-05, "loss": 0.9861, "step": 470 }, { "grad_norm": 1.9350372552871704, "learning_rate": 1.5966666666666667e-05, "loss": 0.9758, "step": 480 }, { "grad_norm": 2.2590487003326416, "learning_rate": 1.63e-05, "loss": 0.9717, "step": 490 }, { "grad_norm": 2.4763693809509277, "learning_rate": 1.6633333333333336e-05, "loss": 0.8933, "step": 500 }, { "grad_norm": 2.3228254318237305, "learning_rate": 1.6966666666666668e-05, "loss": 0.9293, "step": 510 }, { "grad_norm": 13.062117576599121, "learning_rate": 1.73e-05, "loss": 0.8848, "step": 520 }, { "grad_norm": 2.6293792724609375, "learning_rate": 1.7633333333333336e-05, "loss": 0.8508, "step": 530 }, { "grad_norm": 3.2583248615264893, "learning_rate": 1.796666666666667e-05, "loss": 0.8565, "step": 540 }, { "grad_norm": 2.7124991416931152, "learning_rate": 1.83e-05, "loss": 0.7941, "step": 550 }, { "grad_norm": 2.979337692260742, "learning_rate": 1.8633333333333333e-05, "loss": 0.7834, "step": 560 }, { "grad_norm": 3.62507963180542, "learning_rate": 1.896666666666667e-05, "loss": 0.7358, "step": 570 }, { "grad_norm": 3.2734127044677734, "learning_rate": 1.93e-05, "loss": 0.7808, "step": 580 }, { "grad_norm": 3.669417142868042, "learning_rate": 1.9633333333333334e-05, "loss": 0.7651, "step": 590 }, { "grad_norm": 3.283047676086426, "learning_rate": 1.9966666666666666e-05, "loss": 0.7033, "step": 600 }, { "grad_norm": 2.7269959449768066, "learning_rate": 2.0300000000000002e-05, "loss": 0.7298, "step": 610 }, { "grad_norm": 3.0878970623016357, "learning_rate": 2.0633333333333335e-05, "loss": 0.6686, "step": 620 }, { "grad_norm": 2.8599672317504883, "learning_rate": 2.0966666666666667e-05, "loss": 0.6666, "step": 630 }, { "grad_norm": 2.478567600250244, "learning_rate": 2.13e-05, "loss": 0.6012, "step": 640 }, { "grad_norm": 2.6333370208740234, "learning_rate": 2.1633333333333332e-05, "loss": 0.5933, "step": 650 }, { "grad_norm": 4.01058292388916, "learning_rate": 2.1966666666666668e-05, "loss": 0.6168, "step": 660 }, { "grad_norm": 2.9297633171081543, "learning_rate": 2.23e-05, "loss": 0.5587, "step": 670 }, { "grad_norm": 3.2352454662323, "learning_rate": 2.2633333333333336e-05, "loss": 0.5739, "step": 680 }, { "grad_norm": 3.0889785289764404, "learning_rate": 2.2966666666666668e-05, "loss": 0.535, "step": 690 }, { "grad_norm": 2.64123272895813, "learning_rate": 2.3300000000000004e-05, "loss": 0.5538, "step": 700 }, { "grad_norm": 2.943326711654663, "learning_rate": 2.3633333333333336e-05, "loss": 0.5437, "step": 710 }, { "grad_norm": 4.187772750854492, "learning_rate": 2.396666666666667e-05, "loss": 0.4927, "step": 720 }, { "grad_norm": 2.947031021118164, "learning_rate": 2.43e-05, "loss": 0.5201, "step": 730 }, { "grad_norm": 2.1675961017608643, "learning_rate": 2.4633333333333334e-05, "loss": 0.5218, "step": 740 }, { "grad_norm": 2.0846035480499268, "learning_rate": 2.496666666666667e-05, "loss": 0.4934, "step": 750 }, { "grad_norm": 2.7021639347076416, "learning_rate": 2.5300000000000002e-05, "loss": 0.485, "step": 760 }, { "grad_norm": 2.3447108268737793, "learning_rate": 2.5633333333333338e-05, "loss": 0.4955, "step": 770 }, { "grad_norm": 2.612253189086914, "learning_rate": 2.5966666666666667e-05, "loss": 0.4834, "step": 780 }, { "grad_norm": 3.0442042350769043, "learning_rate": 2.6300000000000002e-05, "loss": 0.4245, "step": 790 }, { "grad_norm": 3.174814224243164, "learning_rate": 2.663333333333333e-05, "loss": 0.507, "step": 800 }, { "grad_norm": 3.078709363937378, "learning_rate": 2.6966666666666667e-05, "loss": 0.4368, "step": 810 }, { "grad_norm": 3.0821094512939453, "learning_rate": 2.7300000000000003e-05, "loss": 0.4019, "step": 820 }, { "grad_norm": 3.3967466354370117, "learning_rate": 2.7633333333333332e-05, "loss": 0.3825, "step": 830 }, { "grad_norm": 2.6441216468811035, "learning_rate": 2.7966666666666668e-05, "loss": 0.407, "step": 840 }, { "grad_norm": 3.021688938140869, "learning_rate": 2.83e-05, "loss": 0.4352, "step": 850 }, { "grad_norm": 2.9694321155548096, "learning_rate": 2.8633333333333336e-05, "loss": 0.3948, "step": 860 }, { "grad_norm": 2.0211377143859863, "learning_rate": 2.8966666666666668e-05, "loss": 0.3975, "step": 870 }, { "grad_norm": 2.649658441543579, "learning_rate": 2.93e-05, "loss": 0.3644, "step": 880 }, { "grad_norm": 2.3843441009521484, "learning_rate": 2.9633333333333336e-05, "loss": 0.3445, "step": 890 }, { "grad_norm": 3.299147129058838, "learning_rate": 2.9966666666666672e-05, "loss": 0.3324, "step": 900 }, { "grad_norm": 3.089663028717041, "learning_rate": 3.03e-05, "loss": 0.3697, "step": 910 }, { "grad_norm": 2.508145332336426, "learning_rate": 3.063333333333334e-05, "loss": 0.3771, "step": 920 }, { "grad_norm": 2.094851493835449, "learning_rate": 3.096666666666666e-05, "loss": 0.3471, "step": 930 }, { "grad_norm": 3.6942086219787598, "learning_rate": 3.13e-05, "loss": 0.3629, "step": 940 }, { "grad_norm": 3.1799986362457275, "learning_rate": 3.1633333333333334e-05, "loss": 0.3634, "step": 950 }, { "grad_norm": 2.316878318786621, "learning_rate": 3.196666666666667e-05, "loss": 0.3246, "step": 960 }, { "grad_norm": 2.3948347568511963, "learning_rate": 3.2300000000000006e-05, "loss": 0.3555, "step": 970 }, { "grad_norm": 2.223820686340332, "learning_rate": 3.263333333333333e-05, "loss": 0.3109, "step": 980 }, { "grad_norm": 2.753469467163086, "learning_rate": 3.296666666666667e-05, "loss": 0.2948, "step": 990 }, { "grad_norm": 4.029375076293945, "learning_rate": 3.33e-05, "loss": 0.302, "step": 1000 }, { "grad_norm": 3.0689263343811035, "learning_rate": 3.3633333333333335e-05, "loss": 0.3198, "step": 1010 }, { "grad_norm": 2.722245454788208, "learning_rate": 3.396666666666667e-05, "loss": 0.3141, "step": 1020 }, { "grad_norm": 2.896317720413208, "learning_rate": 3.430000000000001e-05, "loss": 0.2873, "step": 1030 }, { "grad_norm": 2.2943873405456543, "learning_rate": 3.463333333333333e-05, "loss": 0.2885, "step": 1040 }, { "grad_norm": 3.0464224815368652, "learning_rate": 3.496666666666667e-05, "loss": 0.3321, "step": 1050 }, { "grad_norm": 2.759852409362793, "learning_rate": 3.53e-05, "loss": 0.2772, "step": 1060 }, { "grad_norm": 3.09073543548584, "learning_rate": 3.563333333333334e-05, "loss": 0.2944, "step": 1070 }, { "grad_norm": 1.7184548377990723, "learning_rate": 3.596666666666667e-05, "loss": 0.2171, "step": 1080 }, { "grad_norm": 2.5566325187683105, "learning_rate": 3.63e-05, "loss": 0.3368, "step": 1090 }, { "grad_norm": 2.2110397815704346, "learning_rate": 3.6633333333333334e-05, "loss": 0.2738, "step": 1100 }, { "grad_norm": 1.6593278646469116, "learning_rate": 3.6966666666666666e-05, "loss": 0.2801, "step": 1110 }, { "grad_norm": 2.406130075454712, "learning_rate": 3.73e-05, "loss": 0.2728, "step": 1120 }, { "grad_norm": 2.4783928394317627, "learning_rate": 3.763333333333334e-05, "loss": 0.2376, "step": 1130 }, { "grad_norm": 2.022433042526245, "learning_rate": 3.796666666666667e-05, "loss": 0.2121, "step": 1140 }, { "grad_norm": 1.8714436292648315, "learning_rate": 3.83e-05, "loss": 0.2122, "step": 1150 }, { "grad_norm": 2.1609947681427, "learning_rate": 3.8633333333333335e-05, "loss": 0.2136, "step": 1160 }, { "grad_norm": 2.5822458267211914, "learning_rate": 3.896666666666667e-05, "loss": 0.2045, "step": 1170 }, { "grad_norm": 2.7156293392181396, "learning_rate": 3.9300000000000007e-05, "loss": 0.2487, "step": 1180 }, { "grad_norm": 1.8696635961532593, "learning_rate": 3.963333333333333e-05, "loss": 0.2194, "step": 1190 }, { "grad_norm": 2.0956344604492188, "learning_rate": 3.996666666666667e-05, "loss": 0.2456, "step": 1200 }, { "grad_norm": 1.7319245338439941, "learning_rate": 4.0300000000000004e-05, "loss": 0.2385, "step": 1210 }, { "grad_norm": 2.2370188236236572, "learning_rate": 4.0633333333333336e-05, "loss": 0.2235, "step": 1220 }, { "grad_norm": 2.651639938354492, "learning_rate": 4.096666666666667e-05, "loss": 0.2285, "step": 1230 }, { "grad_norm": 2.159344434738159, "learning_rate": 4.13e-05, "loss": 0.2154, "step": 1240 }, { "grad_norm": 2.1763393878936768, "learning_rate": 4.1633333333333333e-05, "loss": 0.2233, "step": 1250 }, { "grad_norm": 2.284054756164551, "learning_rate": 4.196666666666667e-05, "loss": 0.2284, "step": 1260 }, { "grad_norm": 2.2450003623962402, "learning_rate": 4.23e-05, "loss": 0.2205, "step": 1270 }, { "grad_norm": 2.914175510406494, "learning_rate": 4.263333333333334e-05, "loss": 0.2378, "step": 1280 }, { "grad_norm": 3.282472848892212, "learning_rate": 4.296666666666666e-05, "loss": 0.2164, "step": 1290 }, { "grad_norm": 2.248265266418457, "learning_rate": 4.33e-05, "loss": 0.2347, "step": 1300 }, { "grad_norm": 1.85403311252594, "learning_rate": 4.3633333333333335e-05, "loss": 0.2112, "step": 1310 }, { "grad_norm": 2.3942599296569824, "learning_rate": 4.396666666666667e-05, "loss": 0.2781, "step": 1320 }, { "grad_norm": 2.4758377075195312, "learning_rate": 4.43e-05, "loss": 0.1915, "step": 1330 }, { "grad_norm": 1.4877569675445557, "learning_rate": 4.463333333333334e-05, "loss": 0.1901, "step": 1340 }, { "grad_norm": 1.9537400007247925, "learning_rate": 4.496666666666667e-05, "loss": 0.1797, "step": 1350 }, { "grad_norm": 3.272366523742676, "learning_rate": 4.53e-05, "loss": 0.2611, "step": 1360 }, { "grad_norm": 1.948183298110962, "learning_rate": 4.5633333333333336e-05, "loss": 0.1939, "step": 1370 }, { "grad_norm": 1.9928641319274902, "learning_rate": 4.596666666666667e-05, "loss": 0.2118, "step": 1380 }, { "grad_norm": 3.0318148136138916, "learning_rate": 4.630000000000001e-05, "loss": 0.2153, "step": 1390 }, { "grad_norm": 2.5359442234039307, "learning_rate": 4.663333333333333e-05, "loss": 0.2537, "step": 1400 }, { "grad_norm": 2.128546953201294, "learning_rate": 4.696666666666667e-05, "loss": 0.1811, "step": 1410 }, { "grad_norm": 2.055798053741455, "learning_rate": 4.73e-05, "loss": 0.2322, "step": 1420 }, { "grad_norm": 2.1923317909240723, "learning_rate": 4.763333333333334e-05, "loss": 0.223, "step": 1430 }, { "grad_norm": 2.2780303955078125, "learning_rate": 4.796666666666667e-05, "loss": 0.2304, "step": 1440 }, { "grad_norm": 1.237210988998413, "learning_rate": 4.83e-05, "loss": 0.1936, "step": 1450 }, { "grad_norm": 2.0921897888183594, "learning_rate": 4.8633333333333334e-05, "loss": 0.2266, "step": 1460 }, { "grad_norm": 1.7888177633285522, "learning_rate": 4.8966666666666667e-05, "loss": 0.1964, "step": 1470 }, { "grad_norm": 1.5368609428405762, "learning_rate": 4.93e-05, "loss": 0.1655, "step": 1480 }, { "grad_norm": 2.129121780395508, "learning_rate": 4.963333333333334e-05, "loss": 0.2225, "step": 1490 }, { "grad_norm": 1.4553990364074707, "learning_rate": 4.996666666666667e-05, "loss": 0.1859, "step": 1500 }, { "grad_norm": 1.7757089138031006, "learning_rate": 5.03e-05, "loss": 0.2137, "step": 1510 }, { "grad_norm": 1.94987154006958, "learning_rate": 5.0633333333333335e-05, "loss": 0.1962, "step": 1520 }, { "grad_norm": 1.6136974096298218, "learning_rate": 5.0966666666666674e-05, "loss": 0.2027, "step": 1530 }, { "grad_norm": 1.6413114070892334, "learning_rate": 5.130000000000001e-05, "loss": 0.2271, "step": 1540 }, { "grad_norm": 2.035320281982422, "learning_rate": 5.163333333333333e-05, "loss": 0.2707, "step": 1550 }, { "grad_norm": 2.1973636150360107, "learning_rate": 5.196666666666667e-05, "loss": 0.2399, "step": 1560 }, { "grad_norm": 1.8697173595428467, "learning_rate": 5.2300000000000004e-05, "loss": 0.1794, "step": 1570 }, { "grad_norm": 1.9656593799591064, "learning_rate": 5.2633333333333336e-05, "loss": 0.2121, "step": 1580 }, { "grad_norm": 1.852837324142456, "learning_rate": 5.296666666666666e-05, "loss": 0.2066, "step": 1590 }, { "grad_norm": 1.5176286697387695, "learning_rate": 5.330000000000001e-05, "loss": 0.1942, "step": 1600 }, { "grad_norm": 1.8346984386444092, "learning_rate": 5.3633333333333334e-05, "loss": 0.1893, "step": 1610 }, { "grad_norm": 1.7617892026901245, "learning_rate": 5.3966666666666666e-05, "loss": 0.189, "step": 1620 }, { "grad_norm": 1.772983193397522, "learning_rate": 5.4300000000000005e-05, "loss": 0.1994, "step": 1630 }, { "grad_norm": 2.2190332412719727, "learning_rate": 5.463333333333334e-05, "loss": 0.2211, "step": 1640 }, { "grad_norm": 1.96506929397583, "learning_rate": 5.496666666666666e-05, "loss": 0.2578, "step": 1650 }, { "grad_norm": 1.9038299322128296, "learning_rate": 5.530000000000001e-05, "loss": 0.2049, "step": 1660 }, { "grad_norm": 1.4873415231704712, "learning_rate": 5.5633333333333335e-05, "loss": 0.2227, "step": 1670 }, { "grad_norm": 1.787426233291626, "learning_rate": 5.596666666666667e-05, "loss": 0.2255, "step": 1680 }, { "grad_norm": 1.9160351753234863, "learning_rate": 5.63e-05, "loss": 0.2075, "step": 1690 }, { "grad_norm": 1.7899158000946045, "learning_rate": 5.663333333333334e-05, "loss": 0.2378, "step": 1700 }, { "grad_norm": 1.966518521308899, "learning_rate": 5.696666666666667e-05, "loss": 0.1847, "step": 1710 }, { "grad_norm": 1.9451677799224854, "learning_rate": 5.73e-05, "loss": 0.2059, "step": 1720 }, { "grad_norm": 1.4753328561782837, "learning_rate": 5.7633333333333336e-05, "loss": 0.2179, "step": 1730 }, { "grad_norm": 2.231715440750122, "learning_rate": 5.796666666666667e-05, "loss": 0.2162, "step": 1740 }, { "grad_norm": 1.9007164239883423, "learning_rate": 5.83e-05, "loss": 0.1989, "step": 1750 }, { "grad_norm": 1.638191819190979, "learning_rate": 5.863333333333334e-05, "loss": 0.196, "step": 1760 }, { "grad_norm": 1.9802309274673462, "learning_rate": 5.896666666666667e-05, "loss": 0.2076, "step": 1770 }, { "grad_norm": 1.8987586498260498, "learning_rate": 5.93e-05, "loss": 0.2008, "step": 1780 }, { "grad_norm": 1.3714555501937866, "learning_rate": 5.9633333333333344e-05, "loss": 0.1951, "step": 1790 }, { "grad_norm": 1.7758967876434326, "learning_rate": 5.996666666666667e-05, "loss": 0.2047, "step": 1800 }, { "grad_norm": 1.1921807527542114, "learning_rate": 6.03e-05, "loss": 0.1844, "step": 1810 }, { "grad_norm": 1.3567193746566772, "learning_rate": 6.063333333333333e-05, "loss": 0.2113, "step": 1820 }, { "grad_norm": 2.0091586112976074, "learning_rate": 6.0966666666666674e-05, "loss": 0.1817, "step": 1830 }, { "grad_norm": 1.6412349939346313, "learning_rate": 6.13e-05, "loss": 0.189, "step": 1840 }, { "grad_norm": 1.6914161443710327, "learning_rate": 6.163333333333333e-05, "loss": 0.2127, "step": 1850 }, { "grad_norm": 1.3190102577209473, "learning_rate": 6.196666666666668e-05, "loss": 0.239, "step": 1860 }, { "grad_norm": 1.841160535812378, "learning_rate": 6.23e-05, "loss": 0.2117, "step": 1870 }, { "grad_norm": 1.9003639221191406, "learning_rate": 6.263333333333333e-05, "loss": 0.2066, "step": 1880 }, { "grad_norm": 1.8111066818237305, "learning_rate": 6.296666666666667e-05, "loss": 0.2057, "step": 1890 }, { "grad_norm": 1.4908910989761353, "learning_rate": 6.330000000000001e-05, "loss": 0.2046, "step": 1900 }, { "grad_norm": 1.1898845434188843, "learning_rate": 6.363333333333334e-05, "loss": 0.209, "step": 1910 }, { "grad_norm": 1.4947654008865356, "learning_rate": 6.396666666666667e-05, "loss": 0.1907, "step": 1920 }, { "grad_norm": 1.3463621139526367, "learning_rate": 6.43e-05, "loss": 0.2324, "step": 1930 }, { "grad_norm": 1.5817949771881104, "learning_rate": 6.463333333333334e-05, "loss": 0.2104, "step": 1940 }, { "grad_norm": 1.3222252130508423, "learning_rate": 6.496666666666667e-05, "loss": 0.1759, "step": 1950 }, { "grad_norm": 1.5581730604171753, "learning_rate": 6.53e-05, "loss": 0.181, "step": 1960 }, { "grad_norm": 1.9255026578903198, "learning_rate": 6.563333333333333e-05, "loss": 0.1888, "step": 1970 }, { "grad_norm": 2.0426762104034424, "learning_rate": 6.596666666666667e-05, "loss": 0.2073, "step": 1980 }, { "grad_norm": 1.4214354753494263, "learning_rate": 6.630000000000001e-05, "loss": 0.195, "step": 1990 }, { "grad_norm": 2.036141872406006, "learning_rate": 6.663333333333333e-05, "loss": 0.1782, "step": 2000 }, { "grad_norm": 1.1438493728637695, "learning_rate": 6.696666666666666e-05, "loss": 0.1818, "step": 2010 }, { "grad_norm": 1.6118632555007935, "learning_rate": 6.730000000000001e-05, "loss": 0.1916, "step": 2020 }, { "grad_norm": 1.6401573419570923, "learning_rate": 6.763333333333334e-05, "loss": 0.217, "step": 2030 }, { "grad_norm": 1.419750452041626, "learning_rate": 6.796666666666666e-05, "loss": 0.1946, "step": 2040 }, { "grad_norm": 1.18424391746521, "learning_rate": 6.83e-05, "loss": 0.1746, "step": 2050 }, { "grad_norm": 1.406874418258667, "learning_rate": 6.863333333333334e-05, "loss": 0.1956, "step": 2060 }, { "grad_norm": 1.3530786037445068, "learning_rate": 6.896666666666667e-05, "loss": 0.1891, "step": 2070 }, { "grad_norm": 1.7386308908462524, "learning_rate": 6.93e-05, "loss": 0.1791, "step": 2080 }, { "grad_norm": 1.1751360893249512, "learning_rate": 6.963333333333334e-05, "loss": 0.207, "step": 2090 }, { "grad_norm": 1.252974033355713, "learning_rate": 6.996666666666667e-05, "loss": 0.1842, "step": 2100 }, { "grad_norm": 1.6621707677841187, "learning_rate": 7.03e-05, "loss": 0.1996, "step": 2110 }, { "grad_norm": 1.402913212776184, "learning_rate": 7.063333333333333e-05, "loss": 0.2284, "step": 2120 }, { "grad_norm": 1.609904170036316, "learning_rate": 7.096666666666667e-05, "loss": 0.2113, "step": 2130 }, { "grad_norm": 1.341609239578247, "learning_rate": 7.13e-05, "loss": 0.213, "step": 2140 }, { "grad_norm": 1.5592175722122192, "learning_rate": 7.163333333333334e-05, "loss": 0.2287, "step": 2150 }, { "grad_norm": 1.4033401012420654, "learning_rate": 7.196666666666668e-05, "loss": 0.2041, "step": 2160 }, { "grad_norm": 1.468656063079834, "learning_rate": 7.23e-05, "loss": 0.207, "step": 2170 }, { "grad_norm": 1.321109414100647, "learning_rate": 7.263333333333334e-05, "loss": 0.1903, "step": 2180 }, { "grad_norm": 1.6698410511016846, "learning_rate": 7.296666666666667e-05, "loss": 0.1731, "step": 2190 }, { "grad_norm": 1.4736465215682983, "learning_rate": 7.33e-05, "loss": 0.1554, "step": 2200 }, { "grad_norm": 1.429604411125183, "learning_rate": 7.363333333333334e-05, "loss": 0.2154, "step": 2210 }, { "grad_norm": 1.6474733352661133, "learning_rate": 7.396666666666667e-05, "loss": 0.2381, "step": 2220 }, { "grad_norm": 1.4938603639602661, "learning_rate": 7.43e-05, "loss": 0.1888, "step": 2230 }, { "grad_norm": 1.3768140077590942, "learning_rate": 7.463333333333334e-05, "loss": 0.1502, "step": 2240 }, { "grad_norm": 1.5291926860809326, "learning_rate": 7.496666666666667e-05, "loss": 0.1784, "step": 2250 }, { "grad_norm": 1.4036003351211548, "learning_rate": 7.53e-05, "loss": 0.2201, "step": 2260 }, { "grad_norm": 1.3246618509292603, "learning_rate": 7.563333333333333e-05, "loss": 0.221, "step": 2270 }, { "grad_norm": 1.6254242658615112, "learning_rate": 7.596666666666668e-05, "loss": 0.2151, "step": 2280 }, { "grad_norm": 1.573975920677185, "learning_rate": 7.630000000000001e-05, "loss": 0.1904, "step": 2290 }, { "grad_norm": 1.2009119987487793, "learning_rate": 7.663333333333333e-05, "loss": 0.2469, "step": 2300 }, { "grad_norm": 1.5803437232971191, "learning_rate": 7.696666666666668e-05, "loss": 0.1853, "step": 2310 }, { "grad_norm": 1.0142064094543457, "learning_rate": 7.730000000000001e-05, "loss": 0.1946, "step": 2320 }, { "grad_norm": 1.4663245677947998, "learning_rate": 7.763333333333334e-05, "loss": 0.1692, "step": 2330 }, { "grad_norm": 1.3807011842727661, "learning_rate": 7.796666666666666e-05, "loss": 0.2078, "step": 2340 }, { "grad_norm": 1.6009043455123901, "learning_rate": 7.83e-05, "loss": 0.1779, "step": 2350 }, { "grad_norm": 1.6911698579788208, "learning_rate": 7.863333333333334e-05, "loss": 0.1772, "step": 2360 }, { "grad_norm": 1.611280083656311, "learning_rate": 7.896666666666667e-05, "loss": 0.2145, "step": 2370 }, { "grad_norm": 1.4255794286727905, "learning_rate": 7.93e-05, "loss": 0.1815, "step": 2380 }, { "grad_norm": 1.4662569761276245, "learning_rate": 7.963333333333334e-05, "loss": 0.1853, "step": 2390 }, { "grad_norm": 1.5171114206314087, "learning_rate": 7.996666666666667e-05, "loss": 0.216, "step": 2400 }, { "grad_norm": 1.4752658605575562, "learning_rate": 8.030000000000001e-05, "loss": 0.2428, "step": 2410 }, { "grad_norm": 2.164395809173584, "learning_rate": 8.063333333333333e-05, "loss": 0.2013, "step": 2420 }, { "grad_norm": 1.1783969402313232, "learning_rate": 8.096666666666667e-05, "loss": 0.177, "step": 2430 }, { "grad_norm": 1.8139854669570923, "learning_rate": 8.13e-05, "loss": 0.1894, "step": 2440 }, { "grad_norm": 1.3259992599487305, "learning_rate": 8.163333333333334e-05, "loss": 0.2036, "step": 2450 }, { "grad_norm": 1.1950873136520386, "learning_rate": 8.196666666666668e-05, "loss": 0.1564, "step": 2460 }, { "grad_norm": 1.483264446258545, "learning_rate": 8.23e-05, "loss": 0.2244, "step": 2470 }, { "grad_norm": 1.2849138975143433, "learning_rate": 8.263333333333334e-05, "loss": 0.1725, "step": 2480 }, { "grad_norm": 1.2604809999465942, "learning_rate": 8.296666666666667e-05, "loss": 0.1783, "step": 2490 }, { "grad_norm": 1.3654593229293823, "learning_rate": 8.33e-05, "loss": 0.1806, "step": 2500 }, { "grad_norm": 1.1738306283950806, "learning_rate": 8.363333333333334e-05, "loss": 0.2008, "step": 2510 }, { "grad_norm": 1.1897372007369995, "learning_rate": 8.396666666666667e-05, "loss": 0.1808, "step": 2520 }, { "grad_norm": 1.317838430404663, "learning_rate": 8.43e-05, "loss": 0.1624, "step": 2530 }, { "grad_norm": 1.585335373878479, "learning_rate": 8.463333333333335e-05, "loss": 0.1727, "step": 2540 }, { "grad_norm": 1.135698676109314, "learning_rate": 8.496666666666667e-05, "loss": 0.1907, "step": 2550 }, { "grad_norm": 1.4098966121673584, "learning_rate": 8.53e-05, "loss": 0.2018, "step": 2560 }, { "grad_norm": 1.22854483127594, "learning_rate": 8.563333333333333e-05, "loss": 0.2121, "step": 2570 }, { "grad_norm": 1.2817273139953613, "learning_rate": 8.596666666666668e-05, "loss": 0.172, "step": 2580 }, { "grad_norm": 1.3770641088485718, "learning_rate": 8.63e-05, "loss": 0.1999, "step": 2590 }, { "grad_norm": 1.0441067218780518, "learning_rate": 8.663333333333333e-05, "loss": 0.2086, "step": 2600 }, { "grad_norm": 1.3772320747375488, "learning_rate": 8.696666666666668e-05, "loss": 0.1677, "step": 2610 }, { "grad_norm": 1.096062421798706, "learning_rate": 8.730000000000001e-05, "loss": 0.1957, "step": 2620 }, { "grad_norm": 1.309760332107544, "learning_rate": 8.763333333333334e-05, "loss": 0.1547, "step": 2630 }, { "grad_norm": 1.0641523599624634, "learning_rate": 8.796666666666667e-05, "loss": 0.2189, "step": 2640 }, { "grad_norm": 1.186310052871704, "learning_rate": 8.83e-05, "loss": 0.1554, "step": 2650 }, { "grad_norm": 1.1318175792694092, "learning_rate": 8.863333333333334e-05, "loss": 0.1747, "step": 2660 }, { "grad_norm": 1.484761357307434, "learning_rate": 8.896666666666667e-05, "loss": 0.156, "step": 2670 }, { "grad_norm": 0.9705097675323486, "learning_rate": 8.93e-05, "loss": 0.184, "step": 2680 }, { "grad_norm": 1.0836840867996216, "learning_rate": 8.963333333333333e-05, "loss": 0.1876, "step": 2690 }, { "grad_norm": 0.8978180885314941, "learning_rate": 8.996666666666667e-05, "loss": 0.2061, "step": 2700 }, { "grad_norm": 1.490487813949585, "learning_rate": 9.030000000000001e-05, "loss": 0.1964, "step": 2710 }, { "grad_norm": 1.3114856481552124, "learning_rate": 9.063333333333333e-05, "loss": 0.1943, "step": 2720 }, { "grad_norm": 1.344236135482788, "learning_rate": 9.096666666666666e-05, "loss": 0.1697, "step": 2730 }, { "grad_norm": 1.443164587020874, "learning_rate": 9.130000000000001e-05, "loss": 0.1905, "step": 2740 }, { "grad_norm": 1.0813146829605103, "learning_rate": 9.163333333333334e-05, "loss": 0.1471, "step": 2750 }, { "grad_norm": 1.1751703023910522, "learning_rate": 9.196666666666666e-05, "loss": 0.1582, "step": 2760 }, { "grad_norm": 1.042534351348877, "learning_rate": 9.230000000000001e-05, "loss": 0.175, "step": 2770 }, { "grad_norm": 1.1272408962249756, "learning_rate": 9.263333333333334e-05, "loss": 0.2116, "step": 2780 }, { "grad_norm": 1.268645167350769, "learning_rate": 9.296666666666667e-05, "loss": 0.15, "step": 2790 }, { "grad_norm": 1.2685943841934204, "learning_rate": 9.33e-05, "loss": 0.1758, "step": 2800 }, { "grad_norm": 1.1503596305847168, "learning_rate": 9.363333333333334e-05, "loss": 0.1896, "step": 2810 }, { "grad_norm": 1.5217326879501343, "learning_rate": 9.396666666666667e-05, "loss": 0.1887, "step": 2820 }, { "grad_norm": 1.0893689393997192, "learning_rate": 9.43e-05, "loss": 0.1767, "step": 2830 }, { "grad_norm": 1.295325756072998, "learning_rate": 9.463333333333333e-05, "loss": 0.1997, "step": 2840 }, { "grad_norm": 1.4090317487716675, "learning_rate": 9.496666666666667e-05, "loss": 0.1527, "step": 2850 }, { "grad_norm": 1.3767322301864624, "learning_rate": 9.53e-05, "loss": 0.1725, "step": 2860 }, { "grad_norm": 1.839415192604065, "learning_rate": 9.563333333333334e-05, "loss": 0.181, "step": 2870 }, { "grad_norm": 1.1493103504180908, "learning_rate": 9.596666666666668e-05, "loss": 0.1697, "step": 2880 }, { "grad_norm": 0.9746655821800232, "learning_rate": 9.63e-05, "loss": 0.2375, "step": 2890 }, { "grad_norm": 2.023308038711548, "learning_rate": 9.663333333333334e-05, "loss": 0.1877, "step": 2900 }, { "grad_norm": 0.8602619171142578, "learning_rate": 9.696666666666667e-05, "loss": 0.2318, "step": 2910 }, { "grad_norm": 1.0124236345291138, "learning_rate": 9.730000000000001e-05, "loss": 0.1566, "step": 2920 }, { "grad_norm": 1.161813735961914, "learning_rate": 9.763333333333334e-05, "loss": 0.1908, "step": 2930 }, { "grad_norm": 1.1843147277832031, "learning_rate": 9.796666666666667e-05, "loss": 0.1855, "step": 2940 }, { "grad_norm": 1.320243239402771, "learning_rate": 9.83e-05, "loss": 0.1808, "step": 2950 }, { "grad_norm": 1.3531924486160278, "learning_rate": 9.863333333333334e-05, "loss": 0.1702, "step": 2960 }, { "grad_norm": 1.106434941291809, "learning_rate": 9.896666666666667e-05, "loss": 0.1672, "step": 2970 }, { "grad_norm": 1.4475525617599487, "learning_rate": 9.93e-05, "loss": 0.2087, "step": 2980 }, { "grad_norm": 1.205426812171936, "learning_rate": 9.963333333333333e-05, "loss": 0.1977, "step": 2990 }, { "grad_norm": 1.1478064060211182, "learning_rate": 9.996666666666668e-05, "loss": 0.1913, "step": 3000 }, { "grad_norm": 0.9739011526107788, "learning_rate": 9.999999384858465e-05, "loss": 0.1828, "step": 3010 }, { "grad_norm": 0.9858922362327576, "learning_rate": 9.999997258443473e-05, "loss": 0.1664, "step": 3020 }, { "grad_norm": 0.8175076246261597, "learning_rate": 9.999993613161331e-05, "loss": 0.185, "step": 3030 }, { "grad_norm": 1.2363545894622803, "learning_rate": 9.999988449013146e-05, "loss": 0.1852, "step": 3040 }, { "grad_norm": 1.2952274084091187, "learning_rate": 9.99998176600049e-05, "loss": 0.1655, "step": 3050 }, { "grad_norm": 0.9189832210540771, "learning_rate": 9.999973564125389e-05, "loss": 0.2307, "step": 3060 }, { "grad_norm": 1.1008625030517578, "learning_rate": 9.999963843390335e-05, "loss": 0.1992, "step": 3070 }, { "grad_norm": 1.2709870338439941, "learning_rate": 9.999952603798282e-05, "loss": 0.1812, "step": 3080 }, { "grad_norm": 1.2905864715576172, "learning_rate": 9.999939845352646e-05, "loss": 0.1588, "step": 3090 }, { "grad_norm": 1.0956246852874756, "learning_rate": 9.999925568057298e-05, "loss": 0.1589, "step": 3100 }, { "grad_norm": 1.6194196939468384, "learning_rate": 9.999909771916578e-05, "loss": 0.2073, "step": 3110 }, { "grad_norm": 0.9712592959403992, "learning_rate": 9.999892456935285e-05, "loss": 0.1796, "step": 3120 }, { "grad_norm": 1.2482966184616089, "learning_rate": 9.999873623118679e-05, "loss": 0.1541, "step": 3130 }, { "grad_norm": 1.022146224975586, "learning_rate": 9.999853270472479e-05, "loss": 0.1806, "step": 3140 }, { "grad_norm": 1.0014681816101074, "learning_rate": 9.999831399002871e-05, "loss": 0.1566, "step": 3150 }, { "grad_norm": 1.1831233501434326, "learning_rate": 9.999808008716494e-05, "loss": 0.1664, "step": 3160 }, { "grad_norm": 1.4484535455703735, "learning_rate": 9.999783099620459e-05, "loss": 0.1732, "step": 3170 }, { "grad_norm": 1.230710744857788, "learning_rate": 9.999756671722328e-05, "loss": 0.1861, "step": 3180 }, { "grad_norm": 0.9835123419761658, "learning_rate": 9.99972872503013e-05, "loss": 0.1612, "step": 3190 }, { "grad_norm": 1.1855614185333252, "learning_rate": 9.999699259552359e-05, "loss": 0.1594, "step": 3200 }, { "grad_norm": 1.1162168979644775, "learning_rate": 9.99966827529796e-05, "loss": 0.1404, "step": 3210 }, { "grad_norm": 1.067151665687561, "learning_rate": 9.999635772276348e-05, "loss": 0.1595, "step": 3220 }, { "grad_norm": 1.2876273393630981, "learning_rate": 9.999601750497396e-05, "loss": 0.1804, "step": 3230 }, { "grad_norm": 0.9359829425811768, "learning_rate": 9.99956620997144e-05, "loss": 0.1876, "step": 3240 }, { "grad_norm": 1.1717841625213623, "learning_rate": 9.999529150709275e-05, "loss": 0.1921, "step": 3250 }, { "grad_norm": 1.1096442937850952, "learning_rate": 9.999490572722158e-05, "loss": 0.2104, "step": 3260 }, { "grad_norm": 1.2584998607635498, "learning_rate": 9.99945047602181e-05, "loss": 0.1788, "step": 3270 }, { "grad_norm": 0.9697068929672241, "learning_rate": 9.99940886062041e-05, "loss": 0.2016, "step": 3280 }, { "grad_norm": 1.136383295059204, "learning_rate": 9.999365726530599e-05, "loss": 0.1742, "step": 3290 }, { "grad_norm": 1.1347447633743286, "learning_rate": 9.999321073765481e-05, "loss": 0.1757, "step": 3300 }, { "grad_norm": 0.8531554937362671, "learning_rate": 9.99927490233862e-05, "loss": 0.1578, "step": 3310 }, { "grad_norm": 1.1021595001220703, "learning_rate": 9.999227212264043e-05, "loss": 0.1811, "step": 3320 }, { "grad_norm": 1.119384527206421, "learning_rate": 9.999178003556236e-05, "loss": 0.1538, "step": 3330 }, { "grad_norm": 1.3471107482910156, "learning_rate": 9.999127276230146e-05, "loss": 0.1623, "step": 3340 }, { "grad_norm": 0.9315221905708313, "learning_rate": 9.999075030301184e-05, "loss": 0.1713, "step": 3350 }, { "grad_norm": 0.9798449873924255, "learning_rate": 9.999021265785221e-05, "loss": 0.1658, "step": 3360 }, { "grad_norm": 0.8041617274284363, "learning_rate": 9.998965982698589e-05, "loss": 0.1607, "step": 3370 }, { "grad_norm": 1.0764963626861572, "learning_rate": 9.998909181058082e-05, "loss": 0.1477, "step": 3380 }, { "grad_norm": 1.0197232961654663, "learning_rate": 9.998850860880953e-05, "loss": 0.1618, "step": 3390 }, { "grad_norm": 1.058061957359314, "learning_rate": 9.998791022184922e-05, "loss": 0.1748, "step": 3400 }, { "grad_norm": 1.255689024925232, "learning_rate": 9.99872966498816e-05, "loss": 0.169, "step": 3410 }, { "grad_norm": 1.802088975906372, "learning_rate": 9.998666789309313e-05, "loss": 0.2162, "step": 3420 }, { "grad_norm": 0.9632624983787537, "learning_rate": 9.998602395167475e-05, "loss": 0.1745, "step": 3430 }, { "grad_norm": 1.1646955013275146, "learning_rate": 9.998536482582213e-05, "loss": 0.1794, "step": 3440 }, { "grad_norm": 0.9702215194702148, "learning_rate": 9.998469051573544e-05, "loss": 0.1682, "step": 3450 }, { "grad_norm": 1.1819888353347778, "learning_rate": 9.998400102161954e-05, "loss": 0.1563, "step": 3460 }, { "grad_norm": 1.2018296718597412, "learning_rate": 9.998329634368388e-05, "loss": 0.1612, "step": 3470 }, { "grad_norm": 1.0526212453842163, "learning_rate": 9.998257648214253e-05, "loss": 0.1285, "step": 3480 }, { "grad_norm": 1.440714955329895, "learning_rate": 9.998184143721417e-05, "loss": 0.1564, "step": 3490 }, { "grad_norm": 1.173736572265625, "learning_rate": 9.998109120912206e-05, "loss": 0.1793, "step": 3500 }, { "grad_norm": 1.1304090023040771, "learning_rate": 9.998032579809411e-05, "loss": 0.1622, "step": 3510 }, { "grad_norm": 1.4098598957061768, "learning_rate": 9.997954520436286e-05, "loss": 0.1807, "step": 3520 }, { "grad_norm": 0.9826326966285706, "learning_rate": 9.997874942816538e-05, "loss": 0.1621, "step": 3530 }, { "grad_norm": 1.3891226053237915, "learning_rate": 9.997793846974345e-05, "loss": 0.1719, "step": 3540 }, { "grad_norm": 1.3484325408935547, "learning_rate": 9.997711232934341e-05, "loss": 0.1753, "step": 3550 }, { "grad_norm": 1.0355767011642456, "learning_rate": 9.99762710072162e-05, "loss": 0.1864, "step": 3560 }, { "grad_norm": 1.3911041021347046, "learning_rate": 9.997541450361743e-05, "loss": 0.1445, "step": 3570 }, { "grad_norm": 1.227480173110962, "learning_rate": 9.997454281880723e-05, "loss": 0.1737, "step": 3580 }, { "grad_norm": 1.3046683073043823, "learning_rate": 9.997365595305044e-05, "loss": 0.165, "step": 3590 }, { "grad_norm": 0.9704902768135071, "learning_rate": 9.997275390661644e-05, "loss": 0.1631, "step": 3600 }, { "grad_norm": 1.2779918909072876, "learning_rate": 9.997183667977926e-05, "loss": 0.1636, "step": 3610 }, { "grad_norm": 0.9269623160362244, "learning_rate": 9.997090427281752e-05, "loss": 0.1455, "step": 3620 }, { "grad_norm": 1.1644238233566284, "learning_rate": 9.996995668601448e-05, "loss": 0.1321, "step": 3630 }, { "grad_norm": 1.4765958786010742, "learning_rate": 9.996899391965798e-05, "loss": 0.178, "step": 3640 }, { "grad_norm": 0.8685243725776672, "learning_rate": 9.996801597404048e-05, "loss": 0.138, "step": 3650 }, { "grad_norm": 1.3047888278961182, "learning_rate": 9.996702284945905e-05, "loss": 0.1973, "step": 3660 }, { "grad_norm": 0.9078426361083984, "learning_rate": 9.996601454621539e-05, "loss": 0.1854, "step": 3670 }, { "grad_norm": 0.9688925743103027, "learning_rate": 9.996499106461577e-05, "loss": 0.1491, "step": 3680 }, { "grad_norm": 1.12204110622406, "learning_rate": 9.996395240497112e-05, "loss": 0.2021, "step": 3690 }, { "grad_norm": 1.1787536144256592, "learning_rate": 9.996289856759696e-05, "loss": 0.1581, "step": 3700 }, { "grad_norm": 1.3809843063354492, "learning_rate": 9.996182955281342e-05, "loss": 0.1787, "step": 3710 }, { "grad_norm": 1.20012366771698, "learning_rate": 9.996074536094519e-05, "loss": 0.22, "step": 3720 }, { "grad_norm": 1.0635907649993896, "learning_rate": 9.995964599232168e-05, "loss": 0.1455, "step": 3730 }, { "grad_norm": 0.8101170659065247, "learning_rate": 9.995853144727683e-05, "loss": 0.1584, "step": 3740 }, { "grad_norm": 0.9238809943199158, "learning_rate": 9.99574017261492e-05, "loss": 0.142, "step": 3750 }, { "grad_norm": 1.1434462070465088, "learning_rate": 9.995625682928198e-05, "loss": 0.147, "step": 3760 }, { "grad_norm": 1.1573954820632935, "learning_rate": 9.995509675702295e-05, "loss": 0.1629, "step": 3770 }, { "grad_norm": 1.2654308080673218, "learning_rate": 9.995392150972451e-05, "loss": 0.1368, "step": 3780 }, { "grad_norm": 1.0783894062042236, "learning_rate": 9.995273108774366e-05, "loss": 0.1481, "step": 3790 }, { "grad_norm": 0.726858377456665, "learning_rate": 9.995152549144205e-05, "loss": 0.139, "step": 3800 }, { "grad_norm": 0.8703896403312683, "learning_rate": 9.995030472118587e-05, "loss": 0.1438, "step": 3810 }, { "grad_norm": 1.2182265520095825, "learning_rate": 9.9949068777346e-05, "loss": 0.1972, "step": 3820 }, { "grad_norm": 1.1941533088684082, "learning_rate": 9.994781766029786e-05, "loss": 0.1608, "step": 3830 }, { "grad_norm": 0.9086639285087585, "learning_rate": 9.994655137042151e-05, "loss": 0.162, "step": 3840 }, { "grad_norm": 1.0803718566894531, "learning_rate": 9.99452699081016e-05, "loss": 0.1439, "step": 3850 }, { "grad_norm": 1.2756205797195435, "learning_rate": 9.994397327372743e-05, "loss": 0.1316, "step": 3860 }, { "grad_norm": 0.9949019551277161, "learning_rate": 9.994266146769286e-05, "loss": 0.1497, "step": 3870 }, { "grad_norm": 1.2938073873519897, "learning_rate": 9.994133449039642e-05, "loss": 0.1691, "step": 3880 }, { "grad_norm": 0.8042252063751221, "learning_rate": 9.993999234224118e-05, "loss": 0.1409, "step": 3890 }, { "grad_norm": 1.0879441499710083, "learning_rate": 9.993863502363485e-05, "loss": 0.1255, "step": 3900 }, { "grad_norm": 0.9718165993690491, "learning_rate": 9.993726253498976e-05, "loss": 0.1668, "step": 3910 }, { "grad_norm": 1.033963680267334, "learning_rate": 9.993587487672282e-05, "loss": 0.1848, "step": 3920 }, { "grad_norm": 0.8439921140670776, "learning_rate": 9.993447204925558e-05, "loss": 0.1202, "step": 3930 }, { "grad_norm": 1.2208585739135742, "learning_rate": 9.993305405301416e-05, "loss": 0.1398, "step": 3940 }, { "grad_norm": 0.9819973707199097, "learning_rate": 9.993162088842935e-05, "loss": 0.1483, "step": 3950 }, { "grad_norm": 0.8631084561347961, "learning_rate": 9.993017255593646e-05, "loss": 0.1885, "step": 3960 }, { "grad_norm": 1.095736026763916, "learning_rate": 9.992870905597548e-05, "loss": 0.1421, "step": 3970 }, { "grad_norm": 1.1526933908462524, "learning_rate": 9.9927230388991e-05, "loss": 0.1359, "step": 3980 }, { "grad_norm": 1.257192850112915, "learning_rate": 9.992573655543215e-05, "loss": 0.1995, "step": 3990 }, { "grad_norm": 1.0526734590530396, "learning_rate": 9.992422755575277e-05, "loss": 0.1324, "step": 4000 }, { "grad_norm": 1.3490657806396484, "learning_rate": 9.992270339041123e-05, "loss": 0.1861, "step": 4010 }, { "grad_norm": 0.9668695330619812, "learning_rate": 9.992116405987053e-05, "loss": 0.1876, "step": 4020 }, { "grad_norm": 0.9477154016494751, "learning_rate": 9.991960956459828e-05, "loss": 0.1399, "step": 4030 }, { "grad_norm": 0.8765524625778198, "learning_rate": 9.991803990506669e-05, "loss": 0.1575, "step": 4040 }, { "grad_norm": 0.8193208575248718, "learning_rate": 9.991645508175258e-05, "loss": 0.1164, "step": 4050 }, { "grad_norm": 1.0950297117233276, "learning_rate": 9.99148550951374e-05, "loss": 0.1553, "step": 4060 }, { "grad_norm": 1.0754482746124268, "learning_rate": 9.991323994570716e-05, "loss": 0.1105, "step": 4070 }, { "grad_norm": 1.1915477514266968, "learning_rate": 9.99116096339525e-05, "loss": 0.1878, "step": 4080 }, { "grad_norm": 1.1513535976409912, "learning_rate": 9.990996416036869e-05, "loss": 0.1415, "step": 4090 }, { "grad_norm": 1.030468463897705, "learning_rate": 9.990830352545555e-05, "loss": 0.1596, "step": 4100 }, { "grad_norm": 0.9468449354171753, "learning_rate": 9.990662772971756e-05, "loss": 0.1306, "step": 4110 }, { "grad_norm": 0.7746255397796631, "learning_rate": 9.990493677366376e-05, "loss": 0.1561, "step": 4120 }, { "grad_norm": 1.1519911289215088, "learning_rate": 9.990323065780786e-05, "loss": 0.153, "step": 4130 }, { "grad_norm": 1.1214290857315063, "learning_rate": 9.990150938266808e-05, "loss": 0.1588, "step": 4140 }, { "grad_norm": 1.0658485889434814, "learning_rate": 9.989977294876733e-05, "loss": 0.1434, "step": 4150 }, { "grad_norm": 1.0043483972549438, "learning_rate": 9.989802135663308e-05, "loss": 0.1618, "step": 4160 }, { "grad_norm": 0.7936168313026428, "learning_rate": 9.989625460679743e-05, "loss": 0.1332, "step": 4170 }, { "grad_norm": 1.0533448457717896, "learning_rate": 9.989447269979706e-05, "loss": 0.1699, "step": 4180 }, { "grad_norm": 1.0714876651763916, "learning_rate": 9.989267563617328e-05, "loss": 0.1338, "step": 4190 }, { "grad_norm": 1.2933329343795776, "learning_rate": 9.989086341647198e-05, "loss": 0.1568, "step": 4200 }, { "grad_norm": 0.8907761573791504, "learning_rate": 9.988903604124366e-05, "loss": 0.1658, "step": 4210 }, { "grad_norm": 0.9112685918807983, "learning_rate": 9.988719351104343e-05, "loss": 0.187, "step": 4220 }, { "grad_norm": 1.5526094436645508, "learning_rate": 9.9885335826431e-05, "loss": 0.1461, "step": 4230 }, { "grad_norm": 0.9482647180557251, "learning_rate": 9.988346298797071e-05, "loss": 0.1452, "step": 4240 }, { "grad_norm": 0.8067628145217896, "learning_rate": 9.988157499623146e-05, "loss": 0.1495, "step": 4250 }, { "grad_norm": 0.7394295930862427, "learning_rate": 9.987967185178677e-05, "loss": 0.1387, "step": 4260 }, { "grad_norm": 0.7353919148445129, "learning_rate": 9.987775355521476e-05, "loss": 0.1153, "step": 4270 }, { "grad_norm": 0.9972267746925354, "learning_rate": 9.987582010709817e-05, "loss": 0.131, "step": 4280 }, { "grad_norm": 0.9666521549224854, "learning_rate": 9.987387150802431e-05, "loss": 0.1784, "step": 4290 }, { "grad_norm": 1.1468828916549683, "learning_rate": 9.987190775858517e-05, "loss": 0.1569, "step": 4300 }, { "grad_norm": 0.9518359899520874, "learning_rate": 9.98699288593772e-05, "loss": 0.1594, "step": 4310 }, { "grad_norm": 0.9169225692749023, "learning_rate": 9.986793481100161e-05, "loss": 0.1516, "step": 4320 }, { "grad_norm": 1.1210041046142578, "learning_rate": 9.986592561406412e-05, "loss": 0.1442, "step": 4330 }, { "grad_norm": 0.9424847960472107, "learning_rate": 9.986390126917503e-05, "loss": 0.1452, "step": 4340 }, { "grad_norm": 1.1076087951660156, "learning_rate": 9.986186177694933e-05, "loss": 0.1727, "step": 4350 }, { "grad_norm": 1.0766842365264893, "learning_rate": 9.985980713800656e-05, "loss": 0.1873, "step": 4360 }, { "grad_norm": 1.0978039503097534, "learning_rate": 9.985773735297084e-05, "loss": 0.1608, "step": 4370 }, { "grad_norm": 1.0702990293502808, "learning_rate": 9.985565242247092e-05, "loss": 0.1498, "step": 4380 }, { "grad_norm": 0.8602754473686218, "learning_rate": 9.985355234714016e-05, "loss": 0.1404, "step": 4390 }, { "grad_norm": 0.9580897092819214, "learning_rate": 9.985143712761652e-05, "loss": 0.1309, "step": 4400 }, { "grad_norm": 0.9266173243522644, "learning_rate": 9.984930676454252e-05, "loss": 0.1556, "step": 4410 }, { "grad_norm": 0.7936074137687683, "learning_rate": 9.984716125856532e-05, "loss": 0.1311, "step": 4420 }, { "grad_norm": 0.817727267742157, "learning_rate": 9.984500061033667e-05, "loss": 0.1409, "step": 4430 }, { "grad_norm": 0.9453954696655273, "learning_rate": 9.984282482051293e-05, "loss": 0.1415, "step": 4440 }, { "grad_norm": 0.8071401119232178, "learning_rate": 9.9840633889755e-05, "loss": 0.12, "step": 4450 }, { "grad_norm": 1.1751935482025146, "learning_rate": 9.983842781872848e-05, "loss": 0.1644, "step": 4460 }, { "grad_norm": 1.1144529581069946, "learning_rate": 9.98362066081035e-05, "loss": 0.1467, "step": 4470 }, { "grad_norm": 1.1253125667572021, "learning_rate": 9.983397025855479e-05, "loss": 0.1261, "step": 4480 }, { "grad_norm": 1.433658480644226, "learning_rate": 9.983171877076171e-05, "loss": 0.1428, "step": 4490 }, { "grad_norm": 1.0738686323165894, "learning_rate": 9.98294521454082e-05, "loss": 0.1567, "step": 4500 }, { "grad_norm": 0.9866392016410828, "learning_rate": 9.98271703831828e-05, "loss": 0.1538, "step": 4510 }, { "grad_norm": 1.119602084159851, "learning_rate": 9.982487348477865e-05, "loss": 0.1374, "step": 4520 }, { "grad_norm": 1.072619915008545, "learning_rate": 9.982256145089347e-05, "loss": 0.1476, "step": 4530 }, { "grad_norm": 0.7546165585517883, "learning_rate": 9.982023428222962e-05, "loss": 0.1475, "step": 4540 }, { "grad_norm": 0.8524878621101379, "learning_rate": 9.981789197949403e-05, "loss": 0.1341, "step": 4550 }, { "grad_norm": 1.1534829139709473, "learning_rate": 9.98155345433982e-05, "loss": 0.1456, "step": 4560 }, { "grad_norm": 0.8286758065223694, "learning_rate": 9.981316197465831e-05, "loss": 0.1662, "step": 4570 }, { "grad_norm": 1.5444588661193848, "learning_rate": 9.981077427399504e-05, "loss": 0.1759, "step": 4580 }, { "grad_norm": 0.8499327301979065, "learning_rate": 9.980837144213371e-05, "loss": 0.1361, "step": 4590 }, { "grad_norm": 0.9888041019439697, "learning_rate": 9.980595347980426e-05, "loss": 0.1445, "step": 4600 }, { "grad_norm": 1.0755330324172974, "learning_rate": 9.980352038774119e-05, "loss": 0.1519, "step": 4610 }, { "grad_norm": 0.9364874958992004, "learning_rate": 9.98010721666836e-05, "loss": 0.1404, "step": 4620 }, { "grad_norm": 1.0495823621749878, "learning_rate": 9.979860881737523e-05, "loss": 0.1225, "step": 4630 }, { "grad_norm": 0.6128596663475037, "learning_rate": 9.979613034056434e-05, "loss": 0.1143, "step": 4640 }, { "grad_norm": 1.370629906654358, "learning_rate": 9.979363673700386e-05, "loss": 0.1519, "step": 4650 }, { "grad_norm": 1.3092387914657593, "learning_rate": 9.979112800745124e-05, "loss": 0.1526, "step": 4660 }, { "grad_norm": 1.027348279953003, "learning_rate": 9.978860415266861e-05, "loss": 0.1386, "step": 4670 }, { "grad_norm": 0.8014964461326599, "learning_rate": 9.978606517342262e-05, "loss": 0.1239, "step": 4680 }, { "grad_norm": 0.5752570629119873, "learning_rate": 9.978351107048456e-05, "loss": 0.1818, "step": 4690 }, { "grad_norm": 0.8231750130653381, "learning_rate": 9.978094184463029e-05, "loss": 0.1384, "step": 4700 }, { "grad_norm": 1.0453721284866333, "learning_rate": 9.977835749664029e-05, "loss": 0.1407, "step": 4710 }, { "grad_norm": 0.9754924178123474, "learning_rate": 9.97757580272996e-05, "loss": 0.1689, "step": 4720 }, { "grad_norm": 0.6779398322105408, "learning_rate": 9.977314343739786e-05, "loss": 0.1486, "step": 4730 }, { "grad_norm": 0.6877363920211792, "learning_rate": 9.977051372772934e-05, "loss": 0.1329, "step": 4740 }, { "grad_norm": 1.0206977128982544, "learning_rate": 9.976786889909286e-05, "loss": 0.1566, "step": 4750 }, { "grad_norm": 1.131162405014038, "learning_rate": 9.976520895229185e-05, "loss": 0.1547, "step": 4760 }, { "grad_norm": 0.964643657207489, "learning_rate": 9.976253388813433e-05, "loss": 0.1564, "step": 4770 }, { "grad_norm": 0.9891523718833923, "learning_rate": 9.975984370743293e-05, "loss": 0.1478, "step": 4780 }, { "grad_norm": 1.0060348510742188, "learning_rate": 9.975713841100485e-05, "loss": 0.1506, "step": 4790 }, { "grad_norm": 1.0522239208221436, "learning_rate": 9.975441799967187e-05, "loss": 0.1283, "step": 4800 }, { "grad_norm": 0.7301154136657715, "learning_rate": 9.975168247426039e-05, "loss": 0.1291, "step": 4810 }, { "grad_norm": 1.0475102663040161, "learning_rate": 9.974893183560139e-05, "loss": 0.1329, "step": 4820 }, { "grad_norm": 0.8429783582687378, "learning_rate": 9.974616608453045e-05, "loss": 0.2034, "step": 4830 }, { "grad_norm": 0.8131240606307983, "learning_rate": 9.974338522188772e-05, "loss": 0.1537, "step": 4840 }, { "grad_norm": 1.7306773662567139, "learning_rate": 9.974058924851797e-05, "loss": 0.1512, "step": 4850 }, { "grad_norm": 0.911236584186554, "learning_rate": 9.973777816527051e-05, "loss": 0.1303, "step": 4860 }, { "grad_norm": 0.9552854299545288, "learning_rate": 9.973495197299931e-05, "loss": 0.1575, "step": 4870 }, { "grad_norm": 0.8220639228820801, "learning_rate": 9.973211067256287e-05, "loss": 0.1421, "step": 4880 }, { "grad_norm": 1.0478966236114502, "learning_rate": 9.97292542648243e-05, "loss": 0.1353, "step": 4890 }, { "grad_norm": 0.8397774696350098, "learning_rate": 9.972638275065131e-05, "loss": 0.1292, "step": 4900 }, { "grad_norm": 0.7532806396484375, "learning_rate": 9.972349613091621e-05, "loss": 0.1255, "step": 4910 }, { "grad_norm": 0.9562700390815735, "learning_rate": 9.972059440649584e-05, "loss": 0.1219, "step": 4920 }, { "grad_norm": 1.2835731506347656, "learning_rate": 9.971767757827168e-05, "loss": 0.1371, "step": 4930 }, { "grad_norm": 1.1234405040740967, "learning_rate": 9.971474564712982e-05, "loss": 0.1211, "step": 4940 }, { "grad_norm": 0.9548580050468445, "learning_rate": 9.971179861396084e-05, "loss": 0.1486, "step": 4950 }, { "grad_norm": 1.0849323272705078, "learning_rate": 9.970883647966003e-05, "loss": 0.1634, "step": 4960 }, { "grad_norm": 1.1055625677108765, "learning_rate": 9.970585924512717e-05, "loss": 0.1287, "step": 4970 }, { "grad_norm": 1.0817433595657349, "learning_rate": 9.970286691126669e-05, "loss": 0.1367, "step": 4980 }, { "grad_norm": 0.7910575270652771, "learning_rate": 9.969985947898756e-05, "loss": 0.1345, "step": 4990 }, { "grad_norm": 0.9663563370704651, "learning_rate": 9.969683694920337e-05, "loss": 0.1183, "step": 5000 }, { "grad_norm": 0.838074803352356, "learning_rate": 9.969379932283228e-05, "loss": 0.1168, "step": 5010 }, { "grad_norm": 1.0864468812942505, "learning_rate": 9.969074660079704e-05, "loss": 0.1256, "step": 5020 }, { "grad_norm": 0.7012449502944946, "learning_rate": 9.968767878402501e-05, "loss": 0.1389, "step": 5030 }, { "grad_norm": 1.338454246520996, "learning_rate": 9.968459587344808e-05, "loss": 0.1328, "step": 5040 }, { "grad_norm": 0.9210155010223389, "learning_rate": 9.968149787000278e-05, "loss": 0.1349, "step": 5050 }, { "grad_norm": 0.865938127040863, "learning_rate": 9.967838477463018e-05, "loss": 0.1233, "step": 5060 }, { "grad_norm": 1.2779878377914429, "learning_rate": 9.967525658827597e-05, "loss": 0.1456, "step": 5070 }, { "grad_norm": 1.0192033052444458, "learning_rate": 9.967211331189042e-05, "loss": 0.1356, "step": 5080 }, { "grad_norm": 1.075439453125, "learning_rate": 9.966895494642834e-05, "loss": 0.1772, "step": 5090 }, { "grad_norm": 0.8100889325141907, "learning_rate": 9.96657814928492e-05, "loss": 0.1197, "step": 5100 }, { "grad_norm": 0.9356569647789001, "learning_rate": 9.966259295211697e-05, "loss": 0.1586, "step": 5110 }, { "grad_norm": 1.0405375957489014, "learning_rate": 9.965938932520028e-05, "loss": 0.1337, "step": 5120 }, { "grad_norm": 0.9246542453765869, "learning_rate": 9.965617061307229e-05, "loss": 0.1191, "step": 5130 }, { "grad_norm": 0.8143234848976135, "learning_rate": 9.965293681671077e-05, "loss": 0.1466, "step": 5140 }, { "grad_norm": 0.8443891406059265, "learning_rate": 9.964968793709804e-05, "loss": 0.15, "step": 5150 }, { "grad_norm": 1.0543866157531738, "learning_rate": 9.964642397522106e-05, "loss": 0.1361, "step": 5160 }, { "grad_norm": 1.029094934463501, "learning_rate": 9.96431449320713e-05, "loss": 0.1419, "step": 5170 }, { "grad_norm": 0.9264340996742249, "learning_rate": 9.963985080864486e-05, "loss": 0.1255, "step": 5180 }, { "grad_norm": 1.07936429977417, "learning_rate": 9.96365416059424e-05, "loss": 0.1595, "step": 5190 }, { "grad_norm": 0.9322939515113831, "learning_rate": 9.963321732496919e-05, "loss": 0.1207, "step": 5200 }, { "grad_norm": 0.8766630291938782, "learning_rate": 9.962987796673506e-05, "loss": 0.1185, "step": 5210 }, { "grad_norm": 0.8196833729743958, "learning_rate": 9.962652353225438e-05, "loss": 0.1456, "step": 5220 }, { "grad_norm": 0.9844681024551392, "learning_rate": 9.962315402254619e-05, "loss": 0.1324, "step": 5230 }, { "grad_norm": 0.8784993290901184, "learning_rate": 9.9619769438634e-05, "loss": 0.1227, "step": 5240 }, { "grad_norm": 0.8097463846206665, "learning_rate": 9.9616369781546e-05, "loss": 0.1086, "step": 5250 }, { "grad_norm": 0.6633681058883667, "learning_rate": 9.961295505231491e-05, "loss": 0.1347, "step": 5260 }, { "grad_norm": 1.006287693977356, "learning_rate": 9.960952525197804e-05, "loss": 0.1488, "step": 5270 }, { "grad_norm": 0.9176791906356812, "learning_rate": 9.960608038157724e-05, "loss": 0.1467, "step": 5280 }, { "grad_norm": 1.0561805963516235, "learning_rate": 9.960262044215901e-05, "loss": 0.1503, "step": 5290 }, { "grad_norm": 1.0727053880691528, "learning_rate": 9.959914543477435e-05, "loss": 0.1358, "step": 5300 }, { "grad_norm": 0.8659074902534485, "learning_rate": 9.959565536047892e-05, "loss": 0.1445, "step": 5310 }, { "grad_norm": 1.177690863609314, "learning_rate": 9.959215022033288e-05, "loss": 0.1605, "step": 5320 }, { "grad_norm": 1.0577738285064697, "learning_rate": 9.9588630015401e-05, "loss": 0.1592, "step": 5330 }, { "grad_norm": 1.0318773984909058, "learning_rate": 9.958509474675264e-05, "loss": 0.1636, "step": 5340 }, { "grad_norm": 0.7076526880264282, "learning_rate": 9.958154441546171e-05, "loss": 0.1527, "step": 5350 }, { "grad_norm": 0.9383074641227722, "learning_rate": 9.957797902260673e-05, "loss": 0.1573, "step": 5360 }, { "grad_norm": 0.788246214389801, "learning_rate": 9.957439856927073e-05, "loss": 0.1427, "step": 5370 }, { "grad_norm": 0.8753125667572021, "learning_rate": 9.957080305654139e-05, "loss": 0.1383, "step": 5380 }, { "grad_norm": 1.1280510425567627, "learning_rate": 9.956719248551092e-05, "loss": 0.1374, "step": 5390 }, { "grad_norm": 1.1322604417800903, "learning_rate": 9.956356685727612e-05, "loss": 0.1405, "step": 5400 }, { "grad_norm": 1.030678629875183, "learning_rate": 9.955992617293836e-05, "loss": 0.1317, "step": 5410 }, { "grad_norm": 0.8094638586044312, "learning_rate": 9.955627043360358e-05, "loss": 0.136, "step": 5420 }, { "grad_norm": 0.9443933963775635, "learning_rate": 9.955259964038231e-05, "loss": 0.1309, "step": 5430 }, { "grad_norm": 0.7085103392601013, "learning_rate": 9.954891379438962e-05, "loss": 0.164, "step": 5440 }, { "grad_norm": 0.7687110900878906, "learning_rate": 9.954521289674519e-05, "loss": 0.1434, "step": 5450 }, { "grad_norm": 0.8744443655014038, "learning_rate": 9.954149694857325e-05, "loss": 0.1478, "step": 5460 }, { "grad_norm": 1.1620943546295166, "learning_rate": 9.953776595100258e-05, "loss": 0.1164, "step": 5470 }, { "grad_norm": 0.6770328879356384, "learning_rate": 9.95340199051666e-05, "loss": 0.1185, "step": 5480 }, { "grad_norm": 0.9250375628471375, "learning_rate": 9.953025881220325e-05, "loss": 0.1372, "step": 5490 }, { "grad_norm": 1.0552353858947754, "learning_rate": 9.952648267325504e-05, "loss": 0.1344, "step": 5500 }, { "grad_norm": 1.0231132507324219, "learning_rate": 9.952269148946905e-05, "loss": 0.1224, "step": 5510 }, { "grad_norm": 1.168219804763794, "learning_rate": 9.951888526199697e-05, "loss": 0.1238, "step": 5520 }, { "grad_norm": 0.988857626914978, "learning_rate": 9.951506399199501e-05, "loss": 0.1243, "step": 5530 }, { "grad_norm": 1.5287972688674927, "learning_rate": 9.951122768062399e-05, "loss": 0.1532, "step": 5540 }, { "grad_norm": 1.0617619752883911, "learning_rate": 9.950737632904927e-05, "loss": 0.1722, "step": 5550 }, { "grad_norm": 1.1047580242156982, "learning_rate": 9.950350993844077e-05, "loss": 0.1518, "step": 5560 }, { "grad_norm": 0.8931300044059753, "learning_rate": 9.949962850997303e-05, "loss": 0.136, "step": 5570 }, { "grad_norm": 0.9175195097923279, "learning_rate": 9.949573204482512e-05, "loss": 0.1444, "step": 5580 }, { "grad_norm": 0.8270457983016968, "learning_rate": 9.949182054418064e-05, "loss": 0.1186, "step": 5590 }, { "grad_norm": 0.9638673067092896, "learning_rate": 9.948789400922787e-05, "loss": 0.1378, "step": 5600 }, { "grad_norm": 1.0094046592712402, "learning_rate": 9.948395244115953e-05, "loss": 0.1362, "step": 5610 }, { "grad_norm": 0.868220865726471, "learning_rate": 9.9479995841173e-05, "loss": 0.1806, "step": 5620 }, { "grad_norm": 0.8548880815505981, "learning_rate": 9.947602421047017e-05, "loss": 0.1413, "step": 5630 }, { "grad_norm": 1.236607551574707, "learning_rate": 9.947203755025753e-05, "loss": 0.1508, "step": 5640 }, { "grad_norm": 1.0321433544158936, "learning_rate": 9.946803586174611e-05, "loss": 0.1343, "step": 5650 }, { "grad_norm": 1.1432808637619019, "learning_rate": 9.946401914615151e-05, "loss": 0.1674, "step": 5660 }, { "grad_norm": 0.9682312607765198, "learning_rate": 9.945998740469394e-05, "loss": 0.1541, "step": 5670 }, { "grad_norm": 0.7769452333450317, "learning_rate": 9.945594063859809e-05, "loss": 0.157, "step": 5680 }, { "grad_norm": 0.984912097454071, "learning_rate": 9.94518788490933e-05, "loss": 0.1543, "step": 5690 }, { "grad_norm": 0.9963728785514832, "learning_rate": 9.944780203741341e-05, "loss": 0.1286, "step": 5700 }, { "grad_norm": 0.9789602756500244, "learning_rate": 9.944371020479686e-05, "loss": 0.1243, "step": 5710 }, { "grad_norm": 1.0684499740600586, "learning_rate": 9.943960335248662e-05, "loss": 0.1382, "step": 5720 }, { "grad_norm": 0.8862773180007935, "learning_rate": 9.943548148173027e-05, "loss": 0.1412, "step": 5730 }, { "grad_norm": 0.9948656558990479, "learning_rate": 9.943134459377992e-05, "loss": 0.1177, "step": 5740 }, { "grad_norm": 0.9666841626167297, "learning_rate": 9.942719268989222e-05, "loss": 0.1241, "step": 5750 }, { "grad_norm": 1.0312432050704956, "learning_rate": 9.942302577132844e-05, "loss": 0.1159, "step": 5760 }, { "grad_norm": 0.8448374271392822, "learning_rate": 9.941884383935438e-05, "loss": 0.1451, "step": 5770 }, { "grad_norm": 0.7511975169181824, "learning_rate": 9.941464689524039e-05, "loss": 0.1341, "step": 5780 }, { "grad_norm": 0.8034070134162903, "learning_rate": 9.941043494026139e-05, "loss": 0.1121, "step": 5790 }, { "grad_norm": 0.8245567083358765, "learning_rate": 9.940620797569685e-05, "loss": 0.1405, "step": 5800 }, { "grad_norm": 1.088132619857788, "learning_rate": 9.940196600283082e-05, "loss": 0.1242, "step": 5810 }, { "grad_norm": 1.0688196420669556, "learning_rate": 9.939770902295192e-05, "loss": 0.1357, "step": 5820 }, { "grad_norm": 0.7021129727363586, "learning_rate": 9.939343703735329e-05, "loss": 0.1439, "step": 5830 }, { "grad_norm": 1.0861544609069824, "learning_rate": 9.938915004733264e-05, "loss": 0.1189, "step": 5840 }, { "grad_norm": 0.7978534698486328, "learning_rate": 9.938484805419224e-05, "loss": 0.1138, "step": 5850 }, { "grad_norm": 0.9971507787704468, "learning_rate": 9.938053105923894e-05, "loss": 0.1167, "step": 5860 }, { "grad_norm": 0.7416560053825378, "learning_rate": 9.937619906378413e-05, "loss": 0.1207, "step": 5870 }, { "grad_norm": 1.076420545578003, "learning_rate": 9.937185206914374e-05, "loss": 0.1089, "step": 5880 }, { "grad_norm": 1.0931743383407593, "learning_rate": 9.936749007663829e-05, "loss": 0.1198, "step": 5890 }, { "grad_norm": 0.7670378684997559, "learning_rate": 9.93631130875928e-05, "loss": 0.1226, "step": 5900 }, { "grad_norm": 0.8509029150009155, "learning_rate": 9.935872110333692e-05, "loss": 0.1403, "step": 5910 }, { "grad_norm": 0.8293117880821228, "learning_rate": 9.935431412520484e-05, "loss": 0.1149, "step": 5920 }, { "grad_norm": 0.6144686341285706, "learning_rate": 9.934989215453523e-05, "loss": 0.0937, "step": 5930 }, { "grad_norm": 0.8169775009155273, "learning_rate": 9.934545519267139e-05, "loss": 0.1206, "step": 5940 }, { "grad_norm": 0.7301484942436218, "learning_rate": 9.934100324096117e-05, "loss": 0.1124, "step": 5950 }, { "grad_norm": 0.9132269620895386, "learning_rate": 9.933653630075692e-05, "loss": 0.1249, "step": 5960 }, { "grad_norm": 1.2228574752807617, "learning_rate": 9.93320543734156e-05, "loss": 0.1078, "step": 5970 }, { "grad_norm": 1.0036585330963135, "learning_rate": 9.932755746029871e-05, "loss": 0.1605, "step": 5980 }, { "grad_norm": 1.005914568901062, "learning_rate": 9.932304556277228e-05, "loss": 0.1617, "step": 5990 }, { "grad_norm": 1.015267014503479, "learning_rate": 9.93185186822069e-05, "loss": 0.1228, "step": 6000 }, { "grad_norm": 0.7487809658050537, "learning_rate": 9.931397681997773e-05, "loss": 0.1137, "step": 6010 }, { "grad_norm": 0.9365728497505188, "learning_rate": 9.930941997746446e-05, "loss": 0.1131, "step": 6020 }, { "grad_norm": 0.7732081413269043, "learning_rate": 9.930484815605134e-05, "loss": 0.1347, "step": 6030 }, { "grad_norm": 0.7713011503219604, "learning_rate": 9.930026135712717e-05, "loss": 0.1082, "step": 6040 }, { "grad_norm": 0.7377663850784302, "learning_rate": 9.92956595820853e-05, "loss": 0.1374, "step": 6050 }, { "grad_norm": 0.6614827513694763, "learning_rate": 9.929104283232362e-05, "loss": 0.1368, "step": 6060 }, { "grad_norm": 0.770519495010376, "learning_rate": 9.92864111092446e-05, "loss": 0.1209, "step": 6070 }, { "grad_norm": 1.615349531173706, "learning_rate": 9.92817644142552e-05, "loss": 0.1471, "step": 6080 }, { "grad_norm": 0.9667331576347351, "learning_rate": 9.927710274876698e-05, "loss": 0.1238, "step": 6090 }, { "grad_norm": 0.6318171620368958, "learning_rate": 9.927242611419603e-05, "loss": 0.1312, "step": 6100 }, { "grad_norm": 0.9530181884765625, "learning_rate": 9.926773451196301e-05, "loss": 0.1243, "step": 6110 }, { "grad_norm": 0.784501850605011, "learning_rate": 9.926302794349306e-05, "loss": 0.1199, "step": 6120 }, { "grad_norm": 0.7425724267959595, "learning_rate": 9.925830641021594e-05, "loss": 0.1065, "step": 6130 }, { "grad_norm": 1.1164346933364868, "learning_rate": 9.925356991356593e-05, "loss": 0.1229, "step": 6140 }, { "grad_norm": 0.727760374546051, "learning_rate": 9.924881845498184e-05, "loss": 0.1115, "step": 6150 }, { "grad_norm": 0.7613076567649841, "learning_rate": 9.924405203590705e-05, "loss": 0.1346, "step": 6160 }, { "grad_norm": 0.9525797963142395, "learning_rate": 9.923927065778946e-05, "loss": 0.1394, "step": 6170 }, { "grad_norm": 0.8115614056587219, "learning_rate": 9.923447432208154e-05, "loss": 0.1499, "step": 6180 }, { "grad_norm": 0.8674775958061218, "learning_rate": 9.922966303024027e-05, "loss": 0.1235, "step": 6190 }, { "grad_norm": 0.8428709506988525, "learning_rate": 9.922483678372721e-05, "loss": 0.1294, "step": 6200 }, { "grad_norm": 1.0486034154891968, "learning_rate": 9.921999558400845e-05, "loss": 0.1093, "step": 6210 }, { "grad_norm": 0.7817873358726501, "learning_rate": 9.92151394325546e-05, "loss": 0.1066, "step": 6220 }, { "grad_norm": 0.8640316724777222, "learning_rate": 9.921026833084084e-05, "loss": 0.1154, "step": 6230 }, { "grad_norm": 0.7998485565185547, "learning_rate": 9.920538228034689e-05, "loss": 0.0959, "step": 6240 }, { "grad_norm": 0.8617517948150635, "learning_rate": 9.920048128255699e-05, "loss": 0.1216, "step": 6250 }, { "grad_norm": 1.5218642950057983, "learning_rate": 9.919556533895995e-05, "loss": 0.142, "step": 6260 }, { "grad_norm": 1.1396366357803345, "learning_rate": 9.919063445104907e-05, "loss": 0.1206, "step": 6270 }, { "grad_norm": 1.0647180080413818, "learning_rate": 9.918568862032227e-05, "loss": 0.136, "step": 6280 }, { "grad_norm": 1.5272080898284912, "learning_rate": 9.918072784828194e-05, "loss": 0.1352, "step": 6290 }, { "grad_norm": 0.9865410327911377, "learning_rate": 9.917575213643501e-05, "loss": 0.1316, "step": 6300 }, { "grad_norm": 0.9096260070800781, "learning_rate": 9.917076148629302e-05, "loss": 0.1159, "step": 6310 }, { "grad_norm": 1.1018093824386597, "learning_rate": 9.916575589937196e-05, "loss": 0.1375, "step": 6320 }, { "grad_norm": 0.6824197173118591, "learning_rate": 9.916073537719239e-05, "loss": 0.127, "step": 6330 }, { "grad_norm": 0.7503131628036499, "learning_rate": 9.915569992127944e-05, "loss": 0.1205, "step": 6340 }, { "grad_norm": 0.8993796110153198, "learning_rate": 9.915064953316273e-05, "loss": 0.1385, "step": 6350 }, { "grad_norm": 0.6589654684066772, "learning_rate": 9.914558421437645e-05, "loss": 0.1489, "step": 6360 }, { "grad_norm": 0.7789879441261292, "learning_rate": 9.914050396645929e-05, "loss": 0.1256, "step": 6370 }, { "grad_norm": 0.9773585796356201, "learning_rate": 9.913540879095452e-05, "loss": 0.1269, "step": 6380 }, { "grad_norm": 1.051119327545166, "learning_rate": 9.913029868940987e-05, "loss": 0.1236, "step": 6390 }, { "grad_norm": 0.679543137550354, "learning_rate": 9.912517366337772e-05, "loss": 0.1363, "step": 6400 }, { "grad_norm": 0.8014342188835144, "learning_rate": 9.912003371441487e-05, "loss": 0.1231, "step": 6410 }, { "grad_norm": 0.8237544298171997, "learning_rate": 9.911487884408271e-05, "loss": 0.1214, "step": 6420 }, { "grad_norm": 1.3490140438079834, "learning_rate": 9.910970905394719e-05, "loss": 0.1453, "step": 6430 }, { "grad_norm": 0.6084827184677124, "learning_rate": 9.91045243455787e-05, "loss": 0.1514, "step": 6440 }, { "grad_norm": 0.7846445441246033, "learning_rate": 9.909932472055225e-05, "loss": 0.131, "step": 6450 }, { "grad_norm": 0.7659568190574646, "learning_rate": 9.909411018044734e-05, "loss": 0.1196, "step": 6460 }, { "grad_norm": 1.1461881399154663, "learning_rate": 9.908888072684802e-05, "loss": 0.1339, "step": 6470 }, { "grad_norm": 0.7566052675247192, "learning_rate": 9.908363636134285e-05, "loss": 0.1068, "step": 6480 }, { "grad_norm": 0.8777204155921936, "learning_rate": 9.907837708552493e-05, "loss": 0.0987, "step": 6490 }, { "grad_norm": 0.7091094851493835, "learning_rate": 9.90731029009919e-05, "loss": 0.1536, "step": 6500 }, { "grad_norm": 0.6528358459472656, "learning_rate": 9.906781380934589e-05, "loss": 0.1179, "step": 6510 }, { "grad_norm": 0.7966002821922302, "learning_rate": 9.906250981219362e-05, "loss": 0.1198, "step": 6520 }, { "grad_norm": 0.9503476619720459, "learning_rate": 9.905719091114628e-05, "loss": 0.1277, "step": 6530 }, { "grad_norm": 1.0134141445159912, "learning_rate": 9.905185710781964e-05, "loss": 0.1314, "step": 6540 }, { "grad_norm": 1.0670223236083984, "learning_rate": 9.904650840383392e-05, "loss": 0.122, "step": 6550 }, { "grad_norm": 0.8720384836196899, "learning_rate": 9.904114480081397e-05, "loss": 0.1122, "step": 6560 }, { "grad_norm": 1.0348410606384277, "learning_rate": 9.903576630038906e-05, "loss": 0.1205, "step": 6570 }, { "grad_norm": 0.8876850605010986, "learning_rate": 9.903037290419309e-05, "loss": 0.124, "step": 6580 }, { "grad_norm": 0.7204495668411255, "learning_rate": 9.902496461386439e-05, "loss": 0.1212, "step": 6590 }, { "grad_norm": 0.5837070941925049, "learning_rate": 9.901954143104588e-05, "loss": 0.1037, "step": 6600 }, { "grad_norm": 0.7536510229110718, "learning_rate": 9.901410335738496e-05, "loss": 0.1374, "step": 6610 }, { "grad_norm": 0.6961902976036072, "learning_rate": 9.900865039453358e-05, "loss": 0.1195, "step": 6620 }, { "grad_norm": 1.008029580116272, "learning_rate": 9.900318254414821e-05, "loss": 0.1431, "step": 6630 }, { "grad_norm": 0.8721669912338257, "learning_rate": 9.899769980788985e-05, "loss": 0.1402, "step": 6640 }, { "grad_norm": 0.9353023767471313, "learning_rate": 9.899220218742398e-05, "loss": 0.12, "step": 6650 }, { "grad_norm": 0.8972859978675842, "learning_rate": 9.898668968442066e-05, "loss": 0.1255, "step": 6660 }, { "grad_norm": 0.9485423564910889, "learning_rate": 9.898116230055443e-05, "loss": 0.1224, "step": 6670 }, { "grad_norm": 0.8911267518997192, "learning_rate": 9.897562003750437e-05, "loss": 0.1247, "step": 6680 }, { "grad_norm": 0.8482899069786072, "learning_rate": 9.897006289695407e-05, "loss": 0.135, "step": 6690 }, { "grad_norm": 0.8534388542175293, "learning_rate": 9.896449088059164e-05, "loss": 0.1118, "step": 6700 }, { "grad_norm": 0.8999696373939514, "learning_rate": 9.89589039901097e-05, "loss": 0.1156, "step": 6710 }, { "grad_norm": 0.7200745344161987, "learning_rate": 9.895330222720542e-05, "loss": 0.1191, "step": 6720 }, { "grad_norm": 0.9088776707649231, "learning_rate": 9.894768559358047e-05, "loss": 0.1123, "step": 6730 }, { "grad_norm": 0.8024901747703552, "learning_rate": 9.894205409094101e-05, "loss": 0.132, "step": 6740 }, { "grad_norm": 0.9468099474906921, "learning_rate": 9.893640772099777e-05, "loss": 0.1603, "step": 6750 }, { "grad_norm": 1.0181752443313599, "learning_rate": 9.893074648546595e-05, "loss": 0.1756, "step": 6760 }, { "grad_norm": 1.0648884773254395, "learning_rate": 9.892507038606528e-05, "loss": 0.1281, "step": 6770 }, { "grad_norm": 0.7279801964759827, "learning_rate": 9.891937942452003e-05, "loss": 0.0997, "step": 6780 }, { "grad_norm": 0.801932692527771, "learning_rate": 9.891367360255895e-05, "loss": 0.1411, "step": 6790 }, { "grad_norm": 0.8149898052215576, "learning_rate": 9.890795292191532e-05, "loss": 0.1163, "step": 6800 }, { "grad_norm": 0.9333888292312622, "learning_rate": 9.890221738432694e-05, "loss": 0.1289, "step": 6810 }, { "grad_norm": 0.8221701979637146, "learning_rate": 9.88964669915361e-05, "loss": 0.103, "step": 6820 }, { "grad_norm": 0.6619411706924438, "learning_rate": 9.889070174528963e-05, "loss": 0.1051, "step": 6830 }, { "grad_norm": 0.8145730495452881, "learning_rate": 9.888492164733883e-05, "loss": 0.1249, "step": 6840 }, { "grad_norm": 0.7604629993438721, "learning_rate": 9.88791266994396e-05, "loss": 0.1145, "step": 6850 }, { "grad_norm": 0.7816104888916016, "learning_rate": 9.887331690335223e-05, "loss": 0.143, "step": 6860 }, { "grad_norm": 0.8819064497947693, "learning_rate": 9.886749226084163e-05, "loss": 0.1063, "step": 6870 }, { "grad_norm": 0.6918976306915283, "learning_rate": 9.886165277367714e-05, "loss": 0.1623, "step": 6880 }, { "grad_norm": 0.8834580183029175, "learning_rate": 9.885579844363265e-05, "loss": 0.1139, "step": 6890 }, { "grad_norm": 0.761034369468689, "learning_rate": 9.884992927248656e-05, "loss": 0.1086, "step": 6900 }, { "grad_norm": 0.8502677083015442, "learning_rate": 9.884404526202178e-05, "loss": 0.1063, "step": 6910 }, { "grad_norm": 0.7461807727813721, "learning_rate": 9.883814641402568e-05, "loss": 0.1284, "step": 6920 }, { "grad_norm": 1.2581504583358765, "learning_rate": 9.88322327302902e-05, "loss": 0.1499, "step": 6930 }, { "grad_norm": 0.6954284310340881, "learning_rate": 9.882630421261176e-05, "loss": 0.1201, "step": 6940 }, { "grad_norm": 1.1339693069458008, "learning_rate": 9.88203608627913e-05, "loss": 0.1369, "step": 6950 }, { "grad_norm": 0.8447139263153076, "learning_rate": 9.881440268263422e-05, "loss": 0.1215, "step": 6960 }, { "grad_norm": 1.0088653564453125, "learning_rate": 9.880842967395048e-05, "loss": 0.137, "step": 6970 }, { "grad_norm": 0.8280324935913086, "learning_rate": 9.880244183855452e-05, "loss": 0.1291, "step": 6980 }, { "grad_norm": 0.7147833704948425, "learning_rate": 9.879643917826527e-05, "loss": 0.1132, "step": 6990 }, { "grad_norm": 0.6523711085319519, "learning_rate": 9.87904216949062e-05, "loss": 0.096, "step": 7000 }, { "grad_norm": 0.8191735744476318, "learning_rate": 9.878438939030526e-05, "loss": 0.1123, "step": 7010 }, { "grad_norm": 0.9663703441619873, "learning_rate": 9.877834226629489e-05, "loss": 0.117, "step": 7020 }, { "grad_norm": 0.870854377746582, "learning_rate": 9.877228032471206e-05, "loss": 0.1085, "step": 7030 }, { "grad_norm": 0.8056763410568237, "learning_rate": 9.876620356739823e-05, "loss": 0.1364, "step": 7040 }, { "grad_norm": 0.8767446279525757, "learning_rate": 9.876011199619935e-05, "loss": 0.1241, "step": 7050 }, { "grad_norm": 0.6687363386154175, "learning_rate": 9.875400561296589e-05, "loss": 0.1155, "step": 7060 }, { "grad_norm": 1.0570520162582397, "learning_rate": 9.874788441955278e-05, "loss": 0.1126, "step": 7070 }, { "grad_norm": 0.6787680983543396, "learning_rate": 9.874174841781951e-05, "loss": 0.1219, "step": 7080 }, { "grad_norm": 0.7253695130348206, "learning_rate": 9.873559760963003e-05, "loss": 0.1226, "step": 7090 }, { "grad_norm": 0.9771643280982971, "learning_rate": 9.872943199685278e-05, "loss": 0.1371, "step": 7100 }, { "grad_norm": 0.7709919810295105, "learning_rate": 9.872325158136071e-05, "loss": 0.1, "step": 7110 }, { "grad_norm": 0.7705621719360352, "learning_rate": 9.871705636503128e-05, "loss": 0.1049, "step": 7120 }, { "grad_norm": 0.7588700652122498, "learning_rate": 9.871084634974641e-05, "loss": 0.1082, "step": 7130 }, { "grad_norm": 0.6941968202590942, "learning_rate": 9.870462153739257e-05, "loss": 0.1199, "step": 7140 }, { "grad_norm": 0.8162714242935181, "learning_rate": 9.869838192986067e-05, "loss": 0.147, "step": 7150 }, { "grad_norm": 0.8725212216377258, "learning_rate": 9.869212752904616e-05, "loss": 0.1122, "step": 7160 }, { "grad_norm": 0.7852557897567749, "learning_rate": 9.868585833684894e-05, "loss": 0.1392, "step": 7170 }, { "grad_norm": 1.079488754272461, "learning_rate": 9.867957435517342e-05, "loss": 0.121, "step": 7180 }, { "grad_norm": 0.6724323630332947, "learning_rate": 9.867327558592854e-05, "loss": 0.1127, "step": 7190 }, { "grad_norm": 0.8964502215385437, "learning_rate": 9.866696203102766e-05, "loss": 0.1278, "step": 7200 }, { "grad_norm": 0.8256273865699768, "learning_rate": 9.86606336923887e-05, "loss": 0.1092, "step": 7210 }, { "grad_norm": 0.6752233505249023, "learning_rate": 9.865429057193403e-05, "loss": 0.1187, "step": 7220 }, { "grad_norm": 0.9941809773445129, "learning_rate": 9.864793267159053e-05, "loss": 0.1056, "step": 7230 }, { "grad_norm": 1.0136876106262207, "learning_rate": 9.864155999328957e-05, "loss": 0.1217, "step": 7240 }, { "grad_norm": 0.5840379595756531, "learning_rate": 9.8635172538967e-05, "loss": 0.0984, "step": 7250 }, { "grad_norm": 0.8296282887458801, "learning_rate": 9.862877031056312e-05, "loss": 0.0984, "step": 7260 }, { "grad_norm": 0.8434572219848633, "learning_rate": 9.862235331002279e-05, "loss": 0.1095, "step": 7270 }, { "grad_norm": 0.7498409748077393, "learning_rate": 9.861592153929533e-05, "loss": 0.0996, "step": 7280 }, { "grad_norm": 0.8910300731658936, "learning_rate": 9.860947500033455e-05, "loss": 0.1281, "step": 7290 }, { "grad_norm": 0.6706676483154297, "learning_rate": 9.86030136950987e-05, "loss": 0.1092, "step": 7300 }, { "grad_norm": 0.7680054903030396, "learning_rate": 9.85965376255506e-05, "loss": 0.1351, "step": 7310 }, { "grad_norm": 0.7596255540847778, "learning_rate": 9.859004679365747e-05, "loss": 0.0984, "step": 7320 }, { "grad_norm": 0.9294014573097229, "learning_rate": 9.858354120139108e-05, "loss": 0.1173, "step": 7330 }, { "grad_norm": 0.8982163667678833, "learning_rate": 9.857702085072764e-05, "loss": 0.1115, "step": 7340 }, { "grad_norm": 0.8241246342658997, "learning_rate": 9.857048574364787e-05, "loss": 0.1266, "step": 7350 }, { "grad_norm": 0.9902753829956055, "learning_rate": 9.856393588213698e-05, "loss": 0.1123, "step": 7360 }, { "grad_norm": 0.7895777821540833, "learning_rate": 9.855737126818458e-05, "loss": 0.114, "step": 7370 }, { "grad_norm": 0.6778472661972046, "learning_rate": 9.855079190378491e-05, "loss": 0.1072, "step": 7380 }, { "grad_norm": 0.7812398672103882, "learning_rate": 9.854419779093655e-05, "loss": 0.1173, "step": 7390 }, { "grad_norm": 0.7398772239685059, "learning_rate": 9.853758893164264e-05, "loss": 0.1014, "step": 7400 }, { "grad_norm": 0.8888201713562012, "learning_rate": 9.853096532791078e-05, "loss": 0.1083, "step": 7410 }, { "grad_norm": 1.123327612876892, "learning_rate": 9.852432698175304e-05, "loss": 0.1065, "step": 7420 }, { "grad_norm": 0.7495812773704529, "learning_rate": 9.851767389518597e-05, "loss": 0.1187, "step": 7430 }, { "grad_norm": 0.8102077841758728, "learning_rate": 9.85110060702306e-05, "loss": 0.1047, "step": 7440 }, { "grad_norm": 0.9787280559539795, "learning_rate": 9.850432350891245e-05, "loss": 0.1176, "step": 7450 }, { "grad_norm": 0.7037767767906189, "learning_rate": 9.84976262132615e-05, "loss": 0.0977, "step": 7460 }, { "grad_norm": 0.7110582590103149, "learning_rate": 9.849091418531222e-05, "loss": 0.107, "step": 7470 }, { "grad_norm": 0.6903412938117981, "learning_rate": 9.848418742710353e-05, "loss": 0.0951, "step": 7480 }, { "grad_norm": 0.6781321167945862, "learning_rate": 9.847744594067885e-05, "loss": 0.1153, "step": 7490 }, { "grad_norm": 0.8369008898735046, "learning_rate": 9.847068972808607e-05, "loss": 0.1408, "step": 7500 }, { "grad_norm": 0.7371344566345215, "learning_rate": 9.846391879137756e-05, "loss": 0.1045, "step": 7510 }, { "grad_norm": 0.5606982111930847, "learning_rate": 9.845713313261012e-05, "loss": 0.0887, "step": 7520 }, { "grad_norm": 0.8661860823631287, "learning_rate": 9.845033275384505e-05, "loss": 0.1141, "step": 7530 }, { "grad_norm": 1.0788849592208862, "learning_rate": 9.844351765714818e-05, "loss": 0.1308, "step": 7540 }, { "grad_norm": 0.8692513108253479, "learning_rate": 9.843668784458971e-05, "loss": 0.1211, "step": 7550 }, { "grad_norm": 0.8081322312355042, "learning_rate": 9.842984331824437e-05, "loss": 0.1274, "step": 7560 }, { "grad_norm": 0.8628252744674683, "learning_rate": 9.842298408019133e-05, "loss": 0.1157, "step": 7570 }, { "grad_norm": 0.720597505569458, "learning_rate": 9.841611013251429e-05, "loss": 0.1009, "step": 7580 }, { "grad_norm": 0.991450846195221, "learning_rate": 9.840922147730133e-05, "loss": 0.1363, "step": 7590 }, { "grad_norm": 0.6773099303245544, "learning_rate": 9.840231811664506e-05, "loss": 0.1237, "step": 7600 }, { "grad_norm": 0.8070788383483887, "learning_rate": 9.839540005264252e-05, "loss": 0.1115, "step": 7610 }, { "grad_norm": 0.7008978724479675, "learning_rate": 9.838846728739527e-05, "loss": 0.0958, "step": 7620 }, { "grad_norm": 0.69225013256073, "learning_rate": 9.838151982300927e-05, "loss": 0.1007, "step": 7630 }, { "grad_norm": 0.6967025995254517, "learning_rate": 9.8374557661595e-05, "loss": 0.116, "step": 7640 }, { "grad_norm": 0.9032745957374573, "learning_rate": 9.836758080526735e-05, "loss": 0.1025, "step": 7650 }, { "grad_norm": 0.6689473986625671, "learning_rate": 9.836058925614575e-05, "loss": 0.0992, "step": 7660 }, { "grad_norm": 0.9991471767425537, "learning_rate": 9.8353583016354e-05, "loss": 0.0926, "step": 7670 }, { "grad_norm": 0.6147887706756592, "learning_rate": 9.834656208802044e-05, "loss": 0.1029, "step": 7680 }, { "grad_norm": 0.5881600975990295, "learning_rate": 9.833952647327784e-05, "loss": 0.1161, "step": 7690 }, { "grad_norm": 1.1676924228668213, "learning_rate": 9.833247617426342e-05, "loss": 0.1349, "step": 7700 }, { "grad_norm": 0.8404257893562317, "learning_rate": 9.832541119311889e-05, "loss": 0.0925, "step": 7710 }, { "grad_norm": 0.8665609955787659, "learning_rate": 9.83183315319904e-05, "loss": 0.1222, "step": 7720 }, { "grad_norm": 0.5863387584686279, "learning_rate": 9.831123719302855e-05, "loss": 0.1124, "step": 7730 }, { "grad_norm": 0.7798240184783936, "learning_rate": 9.830412817838842e-05, "loss": 0.0976, "step": 7740 }, { "grad_norm": 0.7843145728111267, "learning_rate": 9.829700449022956e-05, "loss": 0.1017, "step": 7750 }, { "grad_norm": 0.8692129254341125, "learning_rate": 9.828986613071593e-05, "loss": 0.0923, "step": 7760 }, { "grad_norm": 0.9727404117584229, "learning_rate": 9.828271310201601e-05, "loss": 0.1357, "step": 7770 }, { "grad_norm": 0.772467315196991, "learning_rate": 9.827554540630268e-05, "loss": 0.1046, "step": 7780 }, { "grad_norm": 0.9051814079284668, "learning_rate": 9.826836304575329e-05, "loss": 0.1259, "step": 7790 }, { "grad_norm": 0.9157164692878723, "learning_rate": 9.826116602254966e-05, "loss": 0.122, "step": 7800 }, { "grad_norm": 1.0106055736541748, "learning_rate": 9.825395433887805e-05, "loss": 0.1103, "step": 7810 }, { "grad_norm": 0.7366385459899902, "learning_rate": 9.824672799692917e-05, "loss": 0.1, "step": 7820 }, { "grad_norm": 0.7500820159912109, "learning_rate": 9.823948699889823e-05, "loss": 0.118, "step": 7830 }, { "grad_norm": 0.9047691822052002, "learning_rate": 9.823223134698483e-05, "loss": 0.1225, "step": 7840 }, { "grad_norm": 0.9162114858627319, "learning_rate": 9.822496104339303e-05, "loss": 0.1143, "step": 7850 }, { "grad_norm": 0.761970579624176, "learning_rate": 9.821767609033138e-05, "loss": 0.1019, "step": 7860 }, { "grad_norm": 1.2024885416030884, "learning_rate": 9.821037649001284e-05, "loss": 0.1425, "step": 7870 }, { "grad_norm": 0.8468419909477234, "learning_rate": 9.820306224465486e-05, "loss": 0.1357, "step": 7880 }, { "grad_norm": 0.7765203714370728, "learning_rate": 9.819573335647928e-05, "loss": 0.127, "step": 7890 }, { "grad_norm": 1.0490984916687012, "learning_rate": 9.818838982771246e-05, "loss": 0.1047, "step": 7900 }, { "grad_norm": 0.5486226081848145, "learning_rate": 9.818103166058514e-05, "loss": 0.1328, "step": 7910 }, { "grad_norm": 0.9463914036750793, "learning_rate": 9.817365885733254e-05, "loss": 0.1025, "step": 7920 }, { "grad_norm": 0.6209417581558228, "learning_rate": 9.816627142019434e-05, "loss": 0.1011, "step": 7930 }, { "grad_norm": 0.8162684440612793, "learning_rate": 9.815886935141463e-05, "loss": 0.1179, "step": 7940 }, { "grad_norm": 9.126652717590332, "learning_rate": 9.8151452653242e-05, "loss": 0.1295, "step": 7950 }, { "grad_norm": 0.697386622428894, "learning_rate": 9.814402132792939e-05, "loss": 0.0958, "step": 7960 }, { "grad_norm": 0.6615158319473267, "learning_rate": 9.813657537773428e-05, "loss": 0.124, "step": 7970 }, { "grad_norm": 1.0788935422897339, "learning_rate": 9.812911480491854e-05, "loss": 0.1233, "step": 7980 }, { "grad_norm": 0.8621317148208618, "learning_rate": 9.81216396117485e-05, "loss": 0.1271, "step": 7990 }, { "grad_norm": 1.0739328861236572, "learning_rate": 9.811414980049491e-05, "loss": 0.1204, "step": 8000 }, { "grad_norm": 0.7204447388648987, "learning_rate": 9.810664537343301e-05, "loss": 0.1188, "step": 8010 }, { "grad_norm": 0.7373979091644287, "learning_rate": 9.809912633284243e-05, "loss": 0.141, "step": 8020 }, { "grad_norm": 1.6704127788543701, "learning_rate": 9.809159268100725e-05, "loss": 0.1335, "step": 8030 }, { "grad_norm": 0.9144304990768433, "learning_rate": 9.808404442021599e-05, "loss": 0.1142, "step": 8040 }, { "grad_norm": 0.826422929763794, "learning_rate": 9.807648155276163e-05, "loss": 0.0968, "step": 8050 }, { "grad_norm": 0.82502681016922, "learning_rate": 9.806890408094156e-05, "loss": 0.1315, "step": 8060 }, { "grad_norm": 0.7801705598831177, "learning_rate": 9.806131200705761e-05, "loss": 0.0882, "step": 8070 }, { "grad_norm": 0.6019753813743591, "learning_rate": 9.805370533341605e-05, "loss": 0.1065, "step": 8080 }, { "grad_norm": 0.830460786819458, "learning_rate": 9.804608406232762e-05, "loss": 0.124, "step": 8090 }, { "grad_norm": 0.6502386331558228, "learning_rate": 9.803844819610741e-05, "loss": 0.1066, "step": 8100 }, { "grad_norm": 0.7292042970657349, "learning_rate": 9.803079773707504e-05, "loss": 0.1036, "step": 8110 }, { "grad_norm": 0.7874075174331665, "learning_rate": 9.802313268755447e-05, "loss": 0.1147, "step": 8120 }, { "grad_norm": 0.7408947348594666, "learning_rate": 9.801545304987419e-05, "loss": 0.1087, "step": 8130 }, { "grad_norm": 0.6854400038719177, "learning_rate": 9.800775882636704e-05, "loss": 0.1041, "step": 8140 }, { "grad_norm": 0.91585373878479, "learning_rate": 9.800005001937034e-05, "loss": 0.1205, "step": 8150 }, { "grad_norm": 0.8077234625816345, "learning_rate": 9.79923266312258e-05, "loss": 0.1121, "step": 8160 }, { "grad_norm": 0.87063068151474, "learning_rate": 9.79845886642796e-05, "loss": 0.1148, "step": 8170 }, { "grad_norm": 0.8147907853126526, "learning_rate": 9.797683612088233e-05, "loss": 0.1056, "step": 8180 }, { "grad_norm": 0.8672631978988647, "learning_rate": 9.796906900338898e-05, "loss": 0.1071, "step": 8190 }, { "grad_norm": 0.7767663598060608, "learning_rate": 9.796128731415903e-05, "loss": 0.1098, "step": 8200 }, { "grad_norm": 0.8404273986816406, "learning_rate": 9.795349105555634e-05, "loss": 0.0867, "step": 8210 }, { "grad_norm": 0.8839234113693237, "learning_rate": 9.794568022994922e-05, "loss": 0.1278, "step": 8220 }, { "grad_norm": 1.112586498260498, "learning_rate": 9.793785483971034e-05, "loss": 0.1152, "step": 8230 }, { "grad_norm": 0.9334195852279663, "learning_rate": 9.793001488721691e-05, "loss": 0.1117, "step": 8240 }, { "grad_norm": 0.7020570039749146, "learning_rate": 9.792216037485047e-05, "loss": 0.1076, "step": 8250 }, { "grad_norm": 0.8044388890266418, "learning_rate": 9.791429130499704e-05, "loss": 0.1145, "step": 8260 }, { "grad_norm": 1.0647181272506714, "learning_rate": 9.790640768004698e-05, "loss": 0.1304, "step": 8270 }, { "grad_norm": 0.8318389654159546, "learning_rate": 9.789850950239518e-05, "loss": 0.1034, "step": 8280 }, { "grad_norm": 0.7403506636619568, "learning_rate": 9.789059677444089e-05, "loss": 0.1261, "step": 8290 }, { "grad_norm": 1.0913132429122925, "learning_rate": 9.788266949858776e-05, "loss": 0.1218, "step": 8300 }, { "grad_norm": 1.309130072593689, "learning_rate": 9.787472767724392e-05, "loss": 0.1175, "step": 8310 }, { "grad_norm": 0.8422965407371521, "learning_rate": 9.786677131282185e-05, "loss": 0.1192, "step": 8320 }, { "grad_norm": 0.7529466152191162, "learning_rate": 9.785880040773853e-05, "loss": 0.0949, "step": 8330 }, { "grad_norm": 0.7305835485458374, "learning_rate": 9.785081496441527e-05, "loss": 0.1034, "step": 8340 }, { "grad_norm": 0.8053068518638611, "learning_rate": 9.784281498527785e-05, "loss": 0.1196, "step": 8350 }, { "grad_norm": 0.6806154251098633, "learning_rate": 9.783480047275646e-05, "loss": 0.1073, "step": 8360 }, { "grad_norm": 0.8008338212966919, "learning_rate": 9.78267714292857e-05, "loss": 0.1216, "step": 8370 }, { "grad_norm": 0.9894343614578247, "learning_rate": 9.781872785730454e-05, "loss": 0.0946, "step": 8380 }, { "grad_norm": 0.9375789761543274, "learning_rate": 9.781066975925646e-05, "loss": 0.1132, "step": 8390 }, { "grad_norm": 0.5873976945877075, "learning_rate": 9.780259713758928e-05, "loss": 0.1093, "step": 8400 }, { "grad_norm": 0.6629438400268555, "learning_rate": 9.779450999475524e-05, "loss": 0.1181, "step": 8410 }, { "grad_norm": 0.6053526401519775, "learning_rate": 9.7786408333211e-05, "loss": 0.1121, "step": 8420 }, { "grad_norm": 0.5780242681503296, "learning_rate": 9.777829215541764e-05, "loss": 0.1281, "step": 8430 }, { "grad_norm": 0.959517240524292, "learning_rate": 9.777016146384064e-05, "loss": 0.1373, "step": 8440 }, { "grad_norm": 0.9093196392059326, "learning_rate": 9.776201626094988e-05, "loss": 0.1269, "step": 8450 }, { "grad_norm": 0.7094097137451172, "learning_rate": 9.775385654921965e-05, "loss": 0.1346, "step": 8460 }, { "grad_norm": 0.8692582249641418, "learning_rate": 9.774568233112868e-05, "loss": 0.1019, "step": 8470 }, { "grad_norm": 0.9477279782295227, "learning_rate": 9.773749360916007e-05, "loss": 0.1363, "step": 8480 }, { "grad_norm": 0.9363481402397156, "learning_rate": 9.772929038580134e-05, "loss": 0.1211, "step": 8490 }, { "grad_norm": 0.925882875919342, "learning_rate": 9.772107266354439e-05, "loss": 0.1135, "step": 8500 }, { "grad_norm": 0.8312253952026367, "learning_rate": 9.77128404448856e-05, "loss": 0.1238, "step": 8510 }, { "grad_norm": 0.8586586117744446, "learning_rate": 9.770459373232565e-05, "loss": 0.1237, "step": 8520 }, { "grad_norm": 0.8822264075279236, "learning_rate": 9.769633252836969e-05, "loss": 0.1097, "step": 8530 }, { "grad_norm": 0.6909860968589783, "learning_rate": 9.768805683552724e-05, "loss": 0.1119, "step": 8540 }, { "grad_norm": 0.6131134629249573, "learning_rate": 9.767976665631228e-05, "loss": 0.0983, "step": 8550 }, { "grad_norm": 0.8567954301834106, "learning_rate": 9.767146199324311e-05, "loss": 0.1263, "step": 8560 }, { "grad_norm": 0.8126462697982788, "learning_rate": 9.766314284884249e-05, "loss": 0.1235, "step": 8570 }, { "grad_norm": 0.8734514713287354, "learning_rate": 9.765480922563752e-05, "loss": 0.1228, "step": 8580 }, { "grad_norm": 0.8481717705726624, "learning_rate": 9.764646112615978e-05, "loss": 0.118, "step": 8590 }, { "grad_norm": 1.1893682479858398, "learning_rate": 9.763809855294517e-05, "loss": 0.1023, "step": 8600 }, { "grad_norm": 0.8877421617507935, "learning_rate": 9.762972150853404e-05, "loss": 0.1242, "step": 8610 }, { "grad_norm": 0.8281400203704834, "learning_rate": 9.762132999547111e-05, "loss": 0.1259, "step": 8620 }, { "grad_norm": 1.0149271488189697, "learning_rate": 9.761292401630549e-05, "loss": 0.1701, "step": 8630 }, { "grad_norm": 0.8636648654937744, "learning_rate": 9.76045035735907e-05, "loss": 0.1054, "step": 8640 }, { "grad_norm": 0.6947084069252014, "learning_rate": 9.759606866988464e-05, "loss": 0.095, "step": 8650 }, { "grad_norm": 0.7271919846534729, "learning_rate": 9.758761930774963e-05, "loss": 0.1017, "step": 8660 }, { "grad_norm": 1.0697157382965088, "learning_rate": 9.757915548975235e-05, "loss": 0.0951, "step": 8670 }, { "grad_norm": 0.8411468267440796, "learning_rate": 9.757067721846389e-05, "loss": 0.0945, "step": 8680 }, { "grad_norm": 0.5844053030014038, "learning_rate": 9.756218449645971e-05, "loss": 0.0989, "step": 8690 }, { "grad_norm": 0.6488379836082458, "learning_rate": 9.75536773263197e-05, "loss": 0.0849, "step": 8700 }, { "grad_norm": 0.8686915636062622, "learning_rate": 9.75451557106281e-05, "loss": 0.0973, "step": 8710 }, { "grad_norm": 0.7385649085044861, "learning_rate": 9.753661965197354e-05, "loss": 0.0856, "step": 8720 }, { "grad_norm": 0.7861431837081909, "learning_rate": 9.752806915294908e-05, "loss": 0.1048, "step": 8730 }, { "grad_norm": 0.8074831366539001, "learning_rate": 9.75195042161521e-05, "loss": 0.1185, "step": 8740 }, { "grad_norm": 0.8328621983528137, "learning_rate": 9.751092484418442e-05, "loss": 0.0987, "step": 8750 }, { "grad_norm": 0.8449028730392456, "learning_rate": 9.750233103965224e-05, "loss": 0.1105, "step": 8760 }, { "grad_norm": 0.5689380764961243, "learning_rate": 9.749372280516611e-05, "loss": 0.0947, "step": 8770 }, { "grad_norm": 0.8034319281578064, "learning_rate": 9.748510014334097e-05, "loss": 0.1036, "step": 8780 }, { "grad_norm": 0.722968339920044, "learning_rate": 9.747646305679621e-05, "loss": 0.1266, "step": 8790 }, { "grad_norm": 0.7165214419364929, "learning_rate": 9.74678115481555e-05, "loss": 0.0973, "step": 8800 }, { "grad_norm": 0.8074265718460083, "learning_rate": 9.745914562004696e-05, "loss": 0.109, "step": 8810 }, { "grad_norm": 0.7828789353370667, "learning_rate": 9.745046527510307e-05, "loss": 0.1171, "step": 8820 }, { "grad_norm": 0.8219668865203857, "learning_rate": 9.744177051596068e-05, "loss": 0.1289, "step": 8830 }, { "grad_norm": 0.698023796081543, "learning_rate": 9.743306134526105e-05, "loss": 0.125, "step": 8840 }, { "grad_norm": 0.5724928379058838, "learning_rate": 9.742433776564977e-05, "loss": 0.1109, "step": 8850 }, { "grad_norm": 0.8210878968238831, "learning_rate": 9.741559977977683e-05, "loss": 0.0893, "step": 8860 }, { "grad_norm": 0.6348305940628052, "learning_rate": 9.740684739029661e-05, "loss": 0.1164, "step": 8870 }, { "grad_norm": 0.7690865993499756, "learning_rate": 9.739808059986789e-05, "loss": 0.099, "step": 8880 }, { "grad_norm": 0.8051338195800781, "learning_rate": 9.738929941115373e-05, "loss": 0.1336, "step": 8890 }, { "grad_norm": 0.7910470962524414, "learning_rate": 9.738050382682167e-05, "loss": 0.1228, "step": 8900 }, { "grad_norm": 0.7348703145980835, "learning_rate": 9.737169384954355e-05, "loss": 0.1358, "step": 8910 }, { "grad_norm": 0.7288306951522827, "learning_rate": 9.736286948199562e-05, "loss": 0.0923, "step": 8920 }, { "grad_norm": 0.7566800117492676, "learning_rate": 9.735403072685848e-05, "loss": 0.1219, "step": 8930 }, { "grad_norm": 0.8122377395629883, "learning_rate": 9.734517758681712e-05, "loss": 0.0965, "step": 8940 }, { "grad_norm": 1.0334062576293945, "learning_rate": 9.733631006456088e-05, "loss": 0.0904, "step": 8950 }, { "grad_norm": 0.922827959060669, "learning_rate": 9.732742816278348e-05, "loss": 0.1286, "step": 8960 }, { "grad_norm": 1.030263900756836, "learning_rate": 9.731853188418302e-05, "loss": 0.1192, "step": 8970 }, { "grad_norm": 0.844610333442688, "learning_rate": 9.730962123146194e-05, "loss": 0.1096, "step": 8980 }, { "grad_norm": 1.1251323223114014, "learning_rate": 9.730069620732709e-05, "loss": 0.1221, "step": 8990 }, { "grad_norm": 0.9523710012435913, "learning_rate": 9.72917568144896e-05, "loss": 0.1064, "step": 9000 }, { "grad_norm": 1.3260389566421509, "learning_rate": 9.728280305566509e-05, "loss": 0.1467, "step": 9010 }, { "grad_norm": 0.7717998027801514, "learning_rate": 9.727383493357343e-05, "loss": 0.1251, "step": 9020 }, { "grad_norm": 0.7679126262664795, "learning_rate": 9.726485245093891e-05, "loss": 0.1028, "step": 9030 }, { "grad_norm": 0.7331946492195129, "learning_rate": 9.725585561049018e-05, "loss": 0.1245, "step": 9040 }, { "grad_norm": 0.9736688733100891, "learning_rate": 9.724684441496022e-05, "loss": 0.1153, "step": 9050 }, { "grad_norm": 0.8956596851348877, "learning_rate": 9.72378188670864e-05, "loss": 0.1147, "step": 9060 }, { "grad_norm": 0.9593555331230164, "learning_rate": 9.722877896961047e-05, "loss": 0.118, "step": 9070 }, { "grad_norm": 1.0225554704666138, "learning_rate": 9.721972472527848e-05, "loss": 0.1208, "step": 9080 }, { "grad_norm": 0.6499040126800537, "learning_rate": 9.721065613684089e-05, "loss": 0.1389, "step": 9090 }, { "grad_norm": 0.8907834887504578, "learning_rate": 9.72015732070525e-05, "loss": 0.1078, "step": 9100 }, { "grad_norm": 0.8183223605155945, "learning_rate": 9.719247593867244e-05, "loss": 0.0998, "step": 9110 }, { "grad_norm": 0.6865338683128357, "learning_rate": 9.718336433446423e-05, "loss": 0.1247, "step": 9120 }, { "grad_norm": 0.9747229218482971, "learning_rate": 9.717423839719574e-05, "loss": 0.1174, "step": 9130 }, { "grad_norm": 0.771809458732605, "learning_rate": 9.71650981296392e-05, "loss": 0.1327, "step": 9140 }, { "grad_norm": 0.7101731300354004, "learning_rate": 9.715594353457118e-05, "loss": 0.1113, "step": 9150 }, { "grad_norm": 0.9703447818756104, "learning_rate": 9.714677461477257e-05, "loss": 0.1233, "step": 9160 }, { "grad_norm": 1.1303101778030396, "learning_rate": 9.713759137302869e-05, "loss": 0.1156, "step": 9170 }, { "grad_norm": 0.6790053248405457, "learning_rate": 9.712839381212914e-05, "loss": 0.1025, "step": 9180 }, { "grad_norm": 0.9554124474525452, "learning_rate": 9.71191819348679e-05, "loss": 0.1288, "step": 9190 }, { "grad_norm": 0.7322344183921814, "learning_rate": 9.710995574404331e-05, "loss": 0.0928, "step": 9200 }, { "grad_norm": 0.7221456170082092, "learning_rate": 9.710071524245802e-05, "loss": 0.1013, "step": 9210 }, { "grad_norm": 0.7621352672576904, "learning_rate": 9.709146043291906e-05, "loss": 0.1101, "step": 9220 }, { "grad_norm": 0.6445536613464355, "learning_rate": 9.70821913182378e-05, "loss": 0.0835, "step": 9230 }, { "grad_norm": 0.9266310334205627, "learning_rate": 9.707290790122995e-05, "loss": 0.1253, "step": 9240 }, { "grad_norm": 0.824367344379425, "learning_rate": 9.706361018471557e-05, "loss": 0.1034, "step": 9250 }, { "grad_norm": 0.971007227897644, "learning_rate": 9.705429817151906e-05, "loss": 0.1324, "step": 9260 }, { "grad_norm": 0.6129474639892578, "learning_rate": 9.704497186446917e-05, "loss": 0.102, "step": 9270 }, { "grad_norm": 0.6781779527664185, "learning_rate": 9.703563126639896e-05, "loss": 0.1131, "step": 9280 }, { "grad_norm": 0.6848224997520447, "learning_rate": 9.70262763801459e-05, "loss": 0.0926, "step": 9290 }, { "grad_norm": 0.7931645512580872, "learning_rate": 9.701690720855171e-05, "loss": 0.0994, "step": 9300 }, { "grad_norm": 0.6838724613189697, "learning_rate": 9.700752375446253e-05, "loss": 0.1413, "step": 9310 }, { "grad_norm": 0.8234416842460632, "learning_rate": 9.69981260207288e-05, "loss": 0.1038, "step": 9320 }, { "grad_norm": 1.0015647411346436, "learning_rate": 9.698871401020529e-05, "loss": 0.1106, "step": 9330 }, { "grad_norm": 1.2240266799926758, "learning_rate": 9.697928772575112e-05, "loss": 0.108, "step": 9340 }, { "grad_norm": 0.6993211507797241, "learning_rate": 9.696984717022976e-05, "loss": 0.1013, "step": 9350 }, { "grad_norm": 0.712963879108429, "learning_rate": 9.6960392346509e-05, "loss": 0.0969, "step": 9360 }, { "grad_norm": 1.0144227743148804, "learning_rate": 9.695092325746097e-05, "loss": 0.1141, "step": 9370 }, { "grad_norm": 0.734596312046051, "learning_rate": 9.694143990596211e-05, "loss": 0.1263, "step": 9380 }, { "grad_norm": 0.6476377844810486, "learning_rate": 9.693194229489325e-05, "loss": 0.0976, "step": 9390 }, { "grad_norm": 0.9219314455986023, "learning_rate": 9.692243042713944e-05, "loss": 0.1014, "step": 9400 }, { "grad_norm": 0.723304271697998, "learning_rate": 9.691290430559022e-05, "loss": 0.0994, "step": 9410 }, { "grad_norm": 0.7624348998069763, "learning_rate": 9.690336393313932e-05, "loss": 0.1183, "step": 9420 }, { "grad_norm": 0.6821195483207703, "learning_rate": 9.689380931268487e-05, "loss": 0.0951, "step": 9430 }, { "grad_norm": 0.8260982632637024, "learning_rate": 9.688424044712932e-05, "loss": 0.1085, "step": 9440 }, { "grad_norm": 0.7290518283843994, "learning_rate": 9.687465733937942e-05, "loss": 0.0991, "step": 9450 }, { "grad_norm": 0.7103854417800903, "learning_rate": 9.686505999234627e-05, "loss": 0.1366, "step": 9460 }, { "grad_norm": 0.7903345823287964, "learning_rate": 9.685544840894529e-05, "loss": 0.1124, "step": 9470 }, { "grad_norm": 0.7286682724952698, "learning_rate": 9.684582259209624e-05, "loss": 0.096, "step": 9480 }, { "grad_norm": 0.7760915756225586, "learning_rate": 9.683618254472317e-05, "loss": 0.1, "step": 9490 }, { "grad_norm": 0.6150763034820557, "learning_rate": 9.682652826975449e-05, "loss": 0.0931, "step": 9500 }, { "grad_norm": 0.7404559254646301, "learning_rate": 9.681685977012291e-05, "loss": 0.1124, "step": 9510 }, { "grad_norm": 0.923324704170227, "learning_rate": 9.680717704876546e-05, "loss": 0.1034, "step": 9520 }, { "grad_norm": 0.7714402675628662, "learning_rate": 9.679748010862349e-05, "loss": 0.1111, "step": 9530 }, { "grad_norm": 1.2141863107681274, "learning_rate": 9.678776895264267e-05, "loss": 0.1111, "step": 9540 }, { "grad_norm": 0.7625934481620789, "learning_rate": 9.6778043583773e-05, "loss": 0.1259, "step": 9550 }, { "grad_norm": 1.251234769821167, "learning_rate": 9.67683040049688e-05, "loss": 0.1167, "step": 9560 }, { "grad_norm": 0.7555075883865356, "learning_rate": 9.675855021918869e-05, "loss": 0.0915, "step": 9570 }, { "grad_norm": 0.5661584138870239, "learning_rate": 9.674878222939561e-05, "loss": 0.1034, "step": 9580 }, { "grad_norm": 0.7736393213272095, "learning_rate": 9.673900003855681e-05, "loss": 0.0996, "step": 9590 }, { "grad_norm": 0.6958341598510742, "learning_rate": 9.672920364964389e-05, "loss": 0.1159, "step": 9600 }, { "grad_norm": 0.6698607802391052, "learning_rate": 9.671939306563269e-05, "loss": 0.0873, "step": 9610 }, { "grad_norm": 0.618735134601593, "learning_rate": 9.670956828950345e-05, "loss": 0.0935, "step": 9620 }, { "grad_norm": 0.8735418319702148, "learning_rate": 9.669972932424065e-05, "loss": 0.1024, "step": 9630 }, { "grad_norm": 0.7312784194946289, "learning_rate": 9.668987617283312e-05, "loss": 0.1333, "step": 9640 }, { "grad_norm": 0.9032554626464844, "learning_rate": 9.668000883827397e-05, "loss": 0.1023, "step": 9650 }, { "grad_norm": 0.7201107740402222, "learning_rate": 9.667012732356067e-05, "loss": 0.0934, "step": 9660 }, { "grad_norm": 0.7086207270622253, "learning_rate": 9.666023163169493e-05, "loss": 0.1089, "step": 9670 }, { "grad_norm": 0.7582955360412598, "learning_rate": 9.665032176568281e-05, "loss": 0.1082, "step": 9680 }, { "grad_norm": 0.9025633931159973, "learning_rate": 9.664039772853469e-05, "loss": 0.1069, "step": 9690 }, { "grad_norm": 0.8722148537635803, "learning_rate": 9.663045952326518e-05, "loss": 0.1063, "step": 9700 }, { "grad_norm": 0.5853986144065857, "learning_rate": 9.662050715289328e-05, "loss": 0.0881, "step": 9710 }, { "grad_norm": 0.6449203491210938, "learning_rate": 9.661054062044226e-05, "loss": 0.0854, "step": 9720 }, { "grad_norm": 0.8504125475883484, "learning_rate": 9.660055992893968e-05, "loss": 0.1035, "step": 9730 }, { "grad_norm": 0.6503901481628418, "learning_rate": 9.659056508141739e-05, "loss": 0.0979, "step": 9740 }, { "grad_norm": 1.1788185834884644, "learning_rate": 9.658055608091161e-05, "loss": 0.1067, "step": 9750 }, { "grad_norm": 0.8643325567245483, "learning_rate": 9.657053293046276e-05, "loss": 0.1059, "step": 9760 }, { "grad_norm": 0.6117591857910156, "learning_rate": 9.656049563311564e-05, "loss": 0.1214, "step": 9770 }, { "grad_norm": 0.7058812379837036, "learning_rate": 9.655044419191929e-05, "loss": 0.1192, "step": 9780 }, { "grad_norm": 0.9594596028327942, "learning_rate": 9.654037860992711e-05, "loss": 0.1095, "step": 9790 }, { "grad_norm": 0.961826741695404, "learning_rate": 9.653029889019672e-05, "loss": 0.091, "step": 9800 }, { "grad_norm": 0.9655978679656982, "learning_rate": 9.65202050357901e-05, "loss": 0.0948, "step": 9810 }, { "grad_norm": 0.6303826570510864, "learning_rate": 9.651009704977347e-05, "loss": 0.105, "step": 9820 }, { "grad_norm": 0.9957102537155151, "learning_rate": 9.649997493521738e-05, "loss": 0.1245, "step": 9830 }, { "grad_norm": 0.6634133458137512, "learning_rate": 9.64898386951967e-05, "loss": 0.0908, "step": 9840 }, { "grad_norm": 0.678835928440094, "learning_rate": 9.647968833279049e-05, "loss": 0.1038, "step": 9850 }, { "grad_norm": 0.6902616620063782, "learning_rate": 9.646952385108218e-05, "loss": 0.0917, "step": 9860 }, { "grad_norm": 0.993213951587677, "learning_rate": 9.645934525315951e-05, "loss": 0.1049, "step": 9870 }, { "grad_norm": 0.749019205570221, "learning_rate": 9.644915254211442e-05, "loss": 0.0948, "step": 9880 }, { "grad_norm": 0.7002012729644775, "learning_rate": 9.643894572104321e-05, "loss": 0.1054, "step": 9890 }, { "grad_norm": 0.5761085152626038, "learning_rate": 9.642872479304644e-05, "loss": 0.0872, "step": 9900 }, { "grad_norm": 0.7063408493995667, "learning_rate": 9.641848976122895e-05, "loss": 0.1009, "step": 9910 }, { "grad_norm": 0.7192111015319824, "learning_rate": 9.64082406286999e-05, "loss": 0.0895, "step": 9920 }, { "grad_norm": 1.0777944326400757, "learning_rate": 9.639797739857269e-05, "loss": 0.1107, "step": 9930 }, { "grad_norm": 0.8311973810195923, "learning_rate": 9.638770007396498e-05, "loss": 0.1108, "step": 9940 }, { "grad_norm": 0.8895646333694458, "learning_rate": 9.63774086579988e-05, "loss": 0.1172, "step": 9950 }, { "grad_norm": 0.9203358888626099, "learning_rate": 9.63671031538004e-05, "loss": 0.1099, "step": 9960 }, { "grad_norm": 0.8642525672912598, "learning_rate": 9.635678356450031e-05, "loss": 0.0968, "step": 9970 }, { "grad_norm": 0.6886183023452759, "learning_rate": 9.634644989323336e-05, "loss": 0.1023, "step": 9980 }, { "grad_norm": 0.6968928575515747, "learning_rate": 9.633610214313861e-05, "loss": 0.1035, "step": 9990 }, { "grad_norm": 0.916830837726593, "learning_rate": 9.632574031735951e-05, "loss": 0.1322, "step": 10000 }, { "grad_norm": 0.6608489155769348, "learning_rate": 9.631536441904364e-05, "loss": 0.098, "step": 10010 }, { "grad_norm": 0.7738320827484131, "learning_rate": 9.630497445134293e-05, "loss": 0.0973, "step": 10020 }, { "grad_norm": 0.581135094165802, "learning_rate": 9.62945704174136e-05, "loss": 0.0945, "step": 10030 }, { "grad_norm": 1.1426177024841309, "learning_rate": 9.628415232041612e-05, "loss": 0.111, "step": 10040 }, { "grad_norm": 1.1348474025726318, "learning_rate": 9.627372016351524e-05, "loss": 0.1182, "step": 10050 }, { "grad_norm": 0.7364993691444397, "learning_rate": 9.626327394987995e-05, "loss": 0.0851, "step": 10060 }, { "grad_norm": 0.9821736216545105, "learning_rate": 9.625281368268355e-05, "loss": 0.1065, "step": 10070 }, { "grad_norm": 1.2623438835144043, "learning_rate": 9.624233936510357e-05, "loss": 0.1239, "step": 10080 }, { "grad_norm": 1.1646506786346436, "learning_rate": 9.623185100032187e-05, "loss": 0.1325, "step": 10090 }, { "grad_norm": 0.8964385390281677, "learning_rate": 9.62213485915245e-05, "loss": 0.1106, "step": 10100 }, { "grad_norm": 0.7774184346199036, "learning_rate": 9.621083214190186e-05, "loss": 0.103, "step": 10110 }, { "grad_norm": 0.4765988290309906, "learning_rate": 9.62003016546485e-05, "loss": 0.0887, "step": 10120 }, { "grad_norm": 0.6231943368911743, "learning_rate": 9.618975713296339e-05, "loss": 0.1276, "step": 10130 }, { "grad_norm": 0.5990045070648193, "learning_rate": 9.61791985800496e-05, "loss": 0.1193, "step": 10140 }, { "grad_norm": 0.798305869102478, "learning_rate": 9.616862599911458e-05, "loss": 0.1427, "step": 10150 }, { "grad_norm": 0.8156343102455139, "learning_rate": 9.615803939337e-05, "loss": 0.1033, "step": 10160 }, { "grad_norm": 0.7141534090042114, "learning_rate": 9.614743876603178e-05, "loss": 0.0946, "step": 10170 }, { "grad_norm": 1.2834110260009766, "learning_rate": 9.613682412032013e-05, "loss": 0.1099, "step": 10180 }, { "grad_norm": 0.6567717790603638, "learning_rate": 9.612619545945947e-05, "loss": 0.0899, "step": 10190 }, { "grad_norm": 1.0036795139312744, "learning_rate": 9.611555278667852e-05, "loss": 0.1102, "step": 10200 }, { "grad_norm": 0.7972432374954224, "learning_rate": 9.610489610521024e-05, "loss": 0.1024, "step": 10210 }, { "grad_norm": 0.7764559984207153, "learning_rate": 9.609422541829187e-05, "loss": 0.132, "step": 10220 }, { "grad_norm": 0.6292955875396729, "learning_rate": 9.608354072916486e-05, "loss": 0.12, "step": 10230 }, { "grad_norm": 0.8699120879173279, "learning_rate": 9.607284204107493e-05, "loss": 0.1214, "step": 10240 }, { "grad_norm": 0.9175477027893066, "learning_rate": 9.606212935727208e-05, "loss": 0.1134, "step": 10250 }, { "grad_norm": 0.6216536164283752, "learning_rate": 9.605140268101052e-05, "loss": 0.1157, "step": 10260 }, { "grad_norm": 0.9184012413024902, "learning_rate": 9.604066201554875e-05, "loss": 0.1153, "step": 10270 }, { "grad_norm": 0.66749507188797, "learning_rate": 9.60299073641495e-05, "loss": 0.1039, "step": 10280 }, { "grad_norm": 0.7786386013031006, "learning_rate": 9.601913873007974e-05, "loss": 0.11, "step": 10290 }, { "grad_norm": 1.179758906364441, "learning_rate": 9.60083561166107e-05, "loss": 0.1015, "step": 10300 }, { "grad_norm": 0.5376681089401245, "learning_rate": 9.599755952701783e-05, "loss": 0.1213, "step": 10310 }, { "grad_norm": 0.6571853756904602, "learning_rate": 9.598674896458089e-05, "loss": 0.1015, "step": 10320 }, { "grad_norm": 0.86484694480896, "learning_rate": 9.597592443258383e-05, "loss": 0.1076, "step": 10330 }, { "grad_norm": 0.9011909365653992, "learning_rate": 9.596508593431483e-05, "loss": 0.0972, "step": 10340 }, { "grad_norm": 0.6803378462791443, "learning_rate": 9.59542334730664e-05, "loss": 0.0949, "step": 10350 }, { "grad_norm": 0.9325493574142456, "learning_rate": 9.594336705213516e-05, "loss": 0.0986, "step": 10360 }, { "grad_norm": 1.1699159145355225, "learning_rate": 9.593248667482208e-05, "loss": 0.1333, "step": 10370 }, { "grad_norm": 0.6711845397949219, "learning_rate": 9.592159234443233e-05, "loss": 0.1406, "step": 10380 }, { "grad_norm": 0.6163390278816223, "learning_rate": 9.59106840642753e-05, "loss": 0.1001, "step": 10390 }, { "grad_norm": 1.2792834043502808, "learning_rate": 9.589976183766467e-05, "loss": 0.1172, "step": 10400 }, { "grad_norm": 0.6997532248497009, "learning_rate": 9.58888256679183e-05, "loss": 0.109, "step": 10410 }, { "grad_norm": 0.7119566798210144, "learning_rate": 9.587787555835832e-05, "loss": 0.1122, "step": 10420 }, { "grad_norm": 0.7047100067138672, "learning_rate": 9.586691151231107e-05, "loss": 0.0902, "step": 10430 }, { "grad_norm": 0.7804931998252869, "learning_rate": 9.585593353310715e-05, "loss": 0.1024, "step": 10440 }, { "grad_norm": 0.765004575252533, "learning_rate": 9.58449416240814e-05, "loss": 0.0897, "step": 10450 }, { "grad_norm": 0.741997241973877, "learning_rate": 9.583393578857283e-05, "loss": 0.1077, "step": 10460 }, { "grad_norm": 0.8351094722747803, "learning_rate": 9.582291602992474e-05, "loss": 0.0977, "step": 10470 }, { "grad_norm": 0.9141425490379333, "learning_rate": 9.581188235148466e-05, "loss": 0.1007, "step": 10480 }, { "grad_norm": 0.5333566069602966, "learning_rate": 9.58008347566043e-05, "loss": 0.1447, "step": 10490 }, { "grad_norm": 0.8538241386413574, "learning_rate": 9.578977324863965e-05, "loss": 0.1026, "step": 10500 }, { "grad_norm": 0.8747405409812927, "learning_rate": 9.577869783095089e-05, "loss": 0.0846, "step": 10510 }, { "grad_norm": 1.1663649082183838, "learning_rate": 9.576760850690245e-05, "loss": 0.1135, "step": 10520 }, { "grad_norm": 0.7089765071868896, "learning_rate": 9.575650527986298e-05, "loss": 0.1096, "step": 10530 }, { "grad_norm": 0.7524533867835999, "learning_rate": 9.574538815320531e-05, "loss": 0.1023, "step": 10540 }, { "grad_norm": 0.6567497253417969, "learning_rate": 9.573425713030656e-05, "loss": 0.1022, "step": 10550 }, { "grad_norm": 0.92160564661026, "learning_rate": 9.572311221454806e-05, "loss": 0.1042, "step": 10560 }, { "grad_norm": 0.7704942226409912, "learning_rate": 9.57119534093153e-05, "loss": 0.1428, "step": 10570 }, { "grad_norm": 0.8260449767112732, "learning_rate": 9.570078071799806e-05, "loss": 0.1066, "step": 10580 }, { "grad_norm": 0.7565892338752747, "learning_rate": 9.568959414399028e-05, "loss": 0.1016, "step": 10590 }, { "grad_norm": 0.8288909792900085, "learning_rate": 9.567839369069018e-05, "loss": 0.0896, "step": 10600 }, { "grad_norm": 0.6274757981300354, "learning_rate": 9.566717936150013e-05, "loss": 0.0893, "step": 10610 }, { "grad_norm": 0.5729643702507019, "learning_rate": 9.565595115982678e-05, "loss": 0.0856, "step": 10620 }, { "grad_norm": 0.7603257894515991, "learning_rate": 9.564470908908094e-05, "loss": 0.0927, "step": 10630 }, { "grad_norm": 0.6455250978469849, "learning_rate": 9.563345315267764e-05, "loss": 0.089, "step": 10640 }, { "grad_norm": 0.7810919284820557, "learning_rate": 9.562218335403616e-05, "loss": 0.1282, "step": 10650 }, { "grad_norm": 0.8530184030532837, "learning_rate": 9.561089969657999e-05, "loss": 0.104, "step": 10660 }, { "grad_norm": 0.6723816394805908, "learning_rate": 9.559960218373673e-05, "loss": 0.1, "step": 10670 }, { "grad_norm": 0.9027137756347656, "learning_rate": 9.558829081893836e-05, "loss": 0.1022, "step": 10680 }, { "grad_norm": 0.9945937395095825, "learning_rate": 9.55769656056209e-05, "loss": 0.0957, "step": 10690 }, { "grad_norm": 0.6150497198104858, "learning_rate": 9.556562654722469e-05, "loss": 0.0908, "step": 10700 }, { "grad_norm": 0.8704168200492859, "learning_rate": 9.555427364719422e-05, "loss": 0.0962, "step": 10710 }, { "grad_norm": 0.8691695928573608, "learning_rate": 9.55429069089782e-05, "loss": 0.1087, "step": 10720 }, { "grad_norm": 0.6640902161598206, "learning_rate": 9.553152633602956e-05, "loss": 0.092, "step": 10730 }, { "grad_norm": 0.7775123715400696, "learning_rate": 9.552013193180543e-05, "loss": 0.1091, "step": 10740 }, { "grad_norm": 1.0121511220932007, "learning_rate": 9.550872369976707e-05, "loss": 0.0994, "step": 10750 }, { "grad_norm": 0.6790451407432556, "learning_rate": 9.549730164338007e-05, "loss": 0.0953, "step": 10760 }, { "grad_norm": 0.6264231204986572, "learning_rate": 9.548586576611408e-05, "loss": 0.0903, "step": 10770 }, { "grad_norm": 1.0103304386138916, "learning_rate": 9.54744160714431e-05, "loss": 0.0965, "step": 10780 }, { "grad_norm": 0.8147310018539429, "learning_rate": 9.546295256284516e-05, "loss": 0.1057, "step": 10790 }, { "grad_norm": 0.6591886878013611, "learning_rate": 9.545147524380265e-05, "loss": 0.0942, "step": 10800 }, { "grad_norm": 0.9782715439796448, "learning_rate": 9.543998411780201e-05, "loss": 0.0878, "step": 10810 }, { "grad_norm": 0.6510726809501648, "learning_rate": 9.542847918833397e-05, "loss": 0.0836, "step": 10820 }, { "grad_norm": 0.6079398393630981, "learning_rate": 9.541696045889343e-05, "loss": 0.1093, "step": 10830 }, { "grad_norm": 0.5506206750869751, "learning_rate": 9.540542793297947e-05, "loss": 0.0955, "step": 10840 }, { "grad_norm": 0.7793743014335632, "learning_rate": 9.539388161409537e-05, "loss": 0.098, "step": 10850 }, { "grad_norm": 0.7172420024871826, "learning_rate": 9.538232150574857e-05, "loss": 0.1012, "step": 10860 }, { "grad_norm": 0.8308109045028687, "learning_rate": 9.537074761145076e-05, "loss": 0.1365, "step": 10870 }, { "grad_norm": 0.7163622379302979, "learning_rate": 9.535915993471778e-05, "loss": 0.1082, "step": 10880 }, { "grad_norm": 0.5566398501396179, "learning_rate": 9.534755847906964e-05, "loss": 0.0736, "step": 10890 }, { "grad_norm": 0.7216687202453613, "learning_rate": 9.533594324803057e-05, "loss": 0.0881, "step": 10900 }, { "grad_norm": 0.5537539124488831, "learning_rate": 9.532431424512895e-05, "loss": 0.0864, "step": 10910 }, { "grad_norm": 0.6061203479766846, "learning_rate": 9.531267147389741e-05, "loss": 0.0794, "step": 10920 }, { "grad_norm": 0.47310277819633484, "learning_rate": 9.530101493787266e-05, "loss": 0.1005, "step": 10930 }, { "grad_norm": 0.7560266256332397, "learning_rate": 9.528934464059571e-05, "loss": 0.0909, "step": 10940 }, { "grad_norm": 0.8985103368759155, "learning_rate": 9.527766058561163e-05, "loss": 0.0934, "step": 10950 }, { "grad_norm": 0.905853271484375, "learning_rate": 9.526596277646976e-05, "loss": 0.1123, "step": 10960 }, { "grad_norm": 0.7601210474967957, "learning_rate": 9.525425121672358e-05, "loss": 0.0963, "step": 10970 }, { "grad_norm": 1.0515323877334595, "learning_rate": 9.524252590993074e-05, "loss": 0.0943, "step": 10980 }, { "grad_norm": 0.9127128720283508, "learning_rate": 9.523078685965309e-05, "loss": 0.1032, "step": 10990 }, { "grad_norm": 0.8448425531387329, "learning_rate": 9.521903406945664e-05, "loss": 0.0938, "step": 11000 }, { "grad_norm": 0.6513290405273438, "learning_rate": 9.520726754291158e-05, "loss": 0.1124, "step": 11010 }, { "grad_norm": 0.8350521326065063, "learning_rate": 9.519548728359227e-05, "loss": 0.108, "step": 11020 }, { "grad_norm": 1.0848088264465332, "learning_rate": 9.518369329507726e-05, "loss": 0.0986, "step": 11030 }, { "grad_norm": 0.564542293548584, "learning_rate": 9.51718855809492e-05, "loss": 0.0822, "step": 11040 }, { "grad_norm": 0.7504306435585022, "learning_rate": 9.516006414479502e-05, "loss": 0.0993, "step": 11050 }, { "grad_norm": 0.9644858241081238, "learning_rate": 9.514822899020572e-05, "loss": 0.0852, "step": 11060 }, { "grad_norm": 0.790071964263916, "learning_rate": 9.513638012077654e-05, "loss": 0.0796, "step": 11070 }, { "grad_norm": 0.761962354183197, "learning_rate": 9.512451754010683e-05, "loss": 0.0851, "step": 11080 }, { "grad_norm": 1.0144405364990234, "learning_rate": 9.511264125180013e-05, "loss": 0.0982, "step": 11090 }, { "grad_norm": 0.6447202563285828, "learning_rate": 9.510075125946414e-05, "loss": 0.09, "step": 11100 }, { "grad_norm": 0.8852084279060364, "learning_rate": 9.508884756671075e-05, "loss": 0.0944, "step": 11110 }, { "grad_norm": 0.8056500554084778, "learning_rate": 9.507693017715596e-05, "loss": 0.0871, "step": 11120 }, { "grad_norm": 0.7039442658424377, "learning_rate": 9.506499909441997e-05, "loss": 0.1162, "step": 11130 }, { "grad_norm": 0.5214377045631409, "learning_rate": 9.505305432212713e-05, "loss": 0.0937, "step": 11140 }, { "grad_norm": 0.5591301321983337, "learning_rate": 9.504109586390595e-05, "loss": 0.1129, "step": 11150 }, { "grad_norm": 1.2282638549804688, "learning_rate": 9.502912372338908e-05, "loss": 0.1081, "step": 11160 }, { "grad_norm": 0.6916815638542175, "learning_rate": 9.501713790421335e-05, "loss": 0.0906, "step": 11170 }, { "grad_norm": 0.5370979309082031, "learning_rate": 9.500513841001974e-05, "loss": 0.0877, "step": 11180 }, { "grad_norm": 0.5769765377044678, "learning_rate": 9.499312524445336e-05, "loss": 0.0945, "step": 11190 }, { "grad_norm": 0.6200330853462219, "learning_rate": 9.498109841116351e-05, "loss": 0.1114, "step": 11200 }, { "grad_norm": 0.7552255392074585, "learning_rate": 9.496905791380363e-05, "loss": 0.0942, "step": 11210 }, { "grad_norm": 0.7738712430000305, "learning_rate": 9.495700375603129e-05, "loss": 0.0867, "step": 11220 }, { "grad_norm": 0.8059350252151489, "learning_rate": 9.494493594150822e-05, "loss": 0.0909, "step": 11230 }, { "grad_norm": 0.8562090992927551, "learning_rate": 9.493285447390032e-05, "loss": 0.1039, "step": 11240 }, { "grad_norm": 0.8997350931167603, "learning_rate": 9.492075935687761e-05, "loss": 0.0972, "step": 11250 }, { "grad_norm": 0.5295472741127014, "learning_rate": 9.490865059411427e-05, "loss": 0.0768, "step": 11260 }, { "grad_norm": 0.7607792019844055, "learning_rate": 9.489652818928863e-05, "loss": 0.0916, "step": 11270 }, { "grad_norm": 1.0160609483718872, "learning_rate": 9.488439214608315e-05, "loss": 0.0853, "step": 11280 }, { "grad_norm": 0.6344143152236938, "learning_rate": 9.487224246818444e-05, "loss": 0.0938, "step": 11290 }, { "grad_norm": 0.5558145046234131, "learning_rate": 9.486007915928325e-05, "loss": 0.12, "step": 11300 }, { "grad_norm": 0.6708557605743408, "learning_rate": 9.484790222307448e-05, "loss": 0.1079, "step": 11310 }, { "grad_norm": 0.7927095890045166, "learning_rate": 9.483571166325716e-05, "loss": 0.0999, "step": 11320 }, { "grad_norm": 0.6329466104507446, "learning_rate": 9.482350748353444e-05, "loss": 0.0938, "step": 11330 }, { "grad_norm": 1.182403802871704, "learning_rate": 9.481128968761363e-05, "loss": 0.1102, "step": 11340 }, { "grad_norm": 0.554328441619873, "learning_rate": 9.479905827920621e-05, "loss": 0.0825, "step": 11350 }, { "grad_norm": 0.6306438446044922, "learning_rate": 9.478681326202773e-05, "loss": 0.1151, "step": 11360 }, { "grad_norm": 0.8814279437065125, "learning_rate": 9.477455463979791e-05, "loss": 0.0803, "step": 11370 }, { "grad_norm": 0.7431371808052063, "learning_rate": 9.476228241624059e-05, "loss": 0.1192, "step": 11380 }, { "grad_norm": 0.6297891736030579, "learning_rate": 9.474999659508374e-05, "loss": 0.082, "step": 11390 }, { "grad_norm": 1.0731163024902344, "learning_rate": 9.47376971800595e-05, "loss": 0.1044, "step": 11400 }, { "grad_norm": 0.838602602481842, "learning_rate": 9.472538417490409e-05, "loss": 0.1002, "step": 11410 }, { "grad_norm": 1.393725872039795, "learning_rate": 9.471305758335784e-05, "loss": 0.1159, "step": 11420 }, { "grad_norm": 0.7255215048789978, "learning_rate": 9.47007174091653e-05, "loss": 0.1034, "step": 11430 }, { "grad_norm": 0.759509265422821, "learning_rate": 9.468836365607507e-05, "loss": 0.1025, "step": 11440 }, { "grad_norm": 0.938933253288269, "learning_rate": 9.467599632783988e-05, "loss": 0.1037, "step": 11450 }, { "grad_norm": 0.9160049557685852, "learning_rate": 9.466361542821662e-05, "loss": 0.0959, "step": 11460 }, { "grad_norm": 0.7950766086578369, "learning_rate": 9.465122096096625e-05, "loss": 0.1191, "step": 11470 }, { "grad_norm": 0.6111190319061279, "learning_rate": 9.463881292985391e-05, "loss": 0.1147, "step": 11480 }, { "grad_norm": 0.6490433812141418, "learning_rate": 9.462639133864881e-05, "loss": 0.1112, "step": 11490 }, { "grad_norm": 0.538110613822937, "learning_rate": 9.461395619112432e-05, "loss": 0.0839, "step": 11500 }, { "grad_norm": 0.710127592086792, "learning_rate": 9.460150749105791e-05, "loss": 0.0909, "step": 11510 }, { "grad_norm": 0.9259569644927979, "learning_rate": 9.458904524223116e-05, "loss": 0.1072, "step": 11520 }, { "grad_norm": 0.7261127829551697, "learning_rate": 9.457656944842976e-05, "loss": 0.0804, "step": 11530 }, { "grad_norm": 0.7718995809555054, "learning_rate": 9.456408011344353e-05, "loss": 0.1159, "step": 11540 }, { "grad_norm": 0.7370561957359314, "learning_rate": 9.455157724106643e-05, "loss": 0.1194, "step": 11550 }, { "grad_norm": 0.7511985898017883, "learning_rate": 9.453906083509647e-05, "loss": 0.1083, "step": 11560 }, { "grad_norm": 0.7349106669425964, "learning_rate": 9.45265308993358e-05, "loss": 0.0862, "step": 11570 }, { "grad_norm": 0.9144174456596375, "learning_rate": 9.451398743759071e-05, "loss": 0.0933, "step": 11580 }, { "grad_norm": 0.7816289663314819, "learning_rate": 9.450143045367156e-05, "loss": 0.1138, "step": 11590 }, { "grad_norm": 0.8345934748649597, "learning_rate": 9.448885995139283e-05, "loss": 0.0914, "step": 11600 }, { "grad_norm": 0.6178365349769592, "learning_rate": 9.44762759345731e-05, "loss": 0.1045, "step": 11610 }, { "grad_norm": 0.851370096206665, "learning_rate": 9.446367840703509e-05, "loss": 0.085, "step": 11620 }, { "grad_norm": 0.8791214823722839, "learning_rate": 9.445106737260556e-05, "loss": 0.0958, "step": 11630 }, { "grad_norm": 0.7529547214508057, "learning_rate": 9.443844283511543e-05, "loss": 0.0956, "step": 11640 }, { "grad_norm": 0.6718201637268066, "learning_rate": 9.442580479839968e-05, "loss": 0.1075, "step": 11650 }, { "grad_norm": 0.732272207736969, "learning_rate": 9.441315326629745e-05, "loss": 0.1016, "step": 11660 }, { "grad_norm": 0.730674147605896, "learning_rate": 9.44004882426519e-05, "loss": 0.0952, "step": 11670 }, { "grad_norm": 0.7749148607254028, "learning_rate": 9.438780973131037e-05, "loss": 0.0899, "step": 11680 }, { "grad_norm": 0.909260630607605, "learning_rate": 9.437511773612423e-05, "loss": 0.1031, "step": 11690 }, { "grad_norm": 0.6382336616516113, "learning_rate": 9.436241226094896e-05, "loss": 0.0939, "step": 11700 }, { "grad_norm": 0.8495974540710449, "learning_rate": 9.434969330964418e-05, "loss": 0.1253, "step": 11710 }, { "grad_norm": 0.7533050179481506, "learning_rate": 9.433696088607356e-05, "loss": 0.0969, "step": 11720 }, { "grad_norm": 1.1244479417800903, "learning_rate": 9.432421499410486e-05, "loss": 0.1037, "step": 11730 }, { "grad_norm": 0.693206250667572, "learning_rate": 9.431145563760998e-05, "loss": 0.0943, "step": 11740 }, { "grad_norm": 1.1021201610565186, "learning_rate": 9.429868282046484e-05, "loss": 0.1268, "step": 11750 }, { "grad_norm": 0.8850135803222656, "learning_rate": 9.428589654654951e-05, "loss": 0.1076, "step": 11760 }, { "grad_norm": 0.8337645530700684, "learning_rate": 9.42730968197481e-05, "loss": 0.1143, "step": 11770 }, { "grad_norm": 0.601527750492096, "learning_rate": 9.426028364394883e-05, "loss": 0.0858, "step": 11780 }, { "grad_norm": 0.5720808506011963, "learning_rate": 9.424745702304402e-05, "loss": 0.0877, "step": 11790 }, { "grad_norm": 0.5230599641799927, "learning_rate": 9.423461696093006e-05, "loss": 0.1291, "step": 11800 }, { "grad_norm": 0.993819534778595, "learning_rate": 9.422176346150741e-05, "loss": 0.1092, "step": 11810 }, { "grad_norm": 0.9256030917167664, "learning_rate": 9.420889652868063e-05, "loss": 0.111, "step": 11820 }, { "grad_norm": 0.6954436898231506, "learning_rate": 9.419601616635836e-05, "loss": 0.0955, "step": 11830 }, { "grad_norm": 0.7501406669616699, "learning_rate": 9.418312237845331e-05, "loss": 0.0857, "step": 11840 }, { "grad_norm": 0.6971440315246582, "learning_rate": 9.417021516888225e-05, "loss": 0.0858, "step": 11850 }, { "grad_norm": 0.753267765045166, "learning_rate": 9.415729454156608e-05, "loss": 0.0911, "step": 11860 }, { "grad_norm": 0.596595048904419, "learning_rate": 9.414436050042973e-05, "loss": 0.0916, "step": 11870 }, { "grad_norm": 0.621789813041687, "learning_rate": 9.413141304940223e-05, "loss": 0.1076, "step": 11880 }, { "grad_norm": 0.7751087546348572, "learning_rate": 9.411845219241666e-05, "loss": 0.1069, "step": 11890 }, { "grad_norm": 0.7473658323287964, "learning_rate": 9.410547793341021e-05, "loss": 0.1027, "step": 11900 }, { "grad_norm": 0.8429346084594727, "learning_rate": 9.409249027632408e-05, "loss": 0.0818, "step": 11910 }, { "grad_norm": 0.5119603276252747, "learning_rate": 9.407948922510362e-05, "loss": 0.0817, "step": 11920 }, { "grad_norm": 0.7941814064979553, "learning_rate": 9.406647478369817e-05, "loss": 0.0826, "step": 11930 }, { "grad_norm": 1.0556139945983887, "learning_rate": 9.405344695606118e-05, "loss": 0.1097, "step": 11940 }, { "grad_norm": 0.6849089860916138, "learning_rate": 9.404040574615018e-05, "loss": 0.0982, "step": 11950 }, { "grad_norm": 0.6690095067024231, "learning_rate": 9.402735115792674e-05, "loss": 0.1097, "step": 11960 }, { "grad_norm": 0.9636114835739136, "learning_rate": 9.401428319535649e-05, "loss": 0.1033, "step": 11970 }, { "grad_norm": 0.8232889771461487, "learning_rate": 9.400120186240912e-05, "loss": 0.1382, "step": 11980 }, { "grad_norm": 0.7245526909828186, "learning_rate": 9.398810716305844e-05, "loss": 0.1009, "step": 11990 }, { "grad_norm": 0.6199687123298645, "learning_rate": 9.397499910128222e-05, "loss": 0.1477, "step": 12000 }, { "grad_norm": 0.9879696369171143, "learning_rate": 9.396187768106237e-05, "loss": 0.1031, "step": 12010 }, { "grad_norm": 0.775917649269104, "learning_rate": 9.394874290638482e-05, "loss": 0.0952, "step": 12020 }, { "grad_norm": 0.6514952778816223, "learning_rate": 9.393559478123959e-05, "loss": 0.0915, "step": 12030 }, { "grad_norm": 0.8196951150894165, "learning_rate": 9.39224333096207e-05, "loss": 0.1009, "step": 12040 }, { "grad_norm": 0.6379609704017639, "learning_rate": 9.390925849552629e-05, "loss": 0.1012, "step": 12050 }, { "grad_norm": 0.5990712642669678, "learning_rate": 9.389607034295849e-05, "loss": 0.0935, "step": 12060 }, { "grad_norm": 0.7385344505310059, "learning_rate": 9.388286885592355e-05, "loss": 0.107, "step": 12070 }, { "grad_norm": 0.5796132683753967, "learning_rate": 9.386965403843168e-05, "loss": 0.1105, "step": 12080 }, { "grad_norm": 1.0226298570632935, "learning_rate": 9.385642589449726e-05, "loss": 0.1177, "step": 12090 }, { "grad_norm": 0.6585622429847717, "learning_rate": 9.38431844281386e-05, "loss": 0.0942, "step": 12100 }, { "grad_norm": 0.9146554470062256, "learning_rate": 9.38299296433781e-05, "loss": 0.0966, "step": 12110 }, { "grad_norm": 0.7661169767379761, "learning_rate": 9.381666154424226e-05, "loss": 0.0738, "step": 12120 }, { "grad_norm": 0.7752992510795593, "learning_rate": 9.380338013476157e-05, "loss": 0.1033, "step": 12130 }, { "grad_norm": 0.859798789024353, "learning_rate": 9.379008541897054e-05, "loss": 0.107, "step": 12140 }, { "grad_norm": 0.8270056843757629, "learning_rate": 9.377677740090777e-05, "loss": 0.1104, "step": 12150 }, { "grad_norm": 0.8169299364089966, "learning_rate": 9.376345608461588e-05, "loss": 0.1029, "step": 12160 }, { "grad_norm": 0.648333728313446, "learning_rate": 9.375012147414155e-05, "loss": 0.1039, "step": 12170 }, { "grad_norm": 0.8400606513023376, "learning_rate": 9.373677357353545e-05, "loss": 0.1078, "step": 12180 }, { "grad_norm": 0.7368009686470032, "learning_rate": 9.372341238685237e-05, "loss": 0.1096, "step": 12190 }, { "grad_norm": 0.6344829797744751, "learning_rate": 9.371003791815102e-05, "loss": 0.1053, "step": 12200 }, { "grad_norm": 0.7001718878746033, "learning_rate": 9.369665017149429e-05, "loss": 0.0738, "step": 12210 }, { "grad_norm": 0.8492374420166016, "learning_rate": 9.368324915094895e-05, "loss": 0.0917, "step": 12220 }, { "grad_norm": 0.7770529389381409, "learning_rate": 9.366983486058591e-05, "loss": 0.0781, "step": 12230 }, { "grad_norm": 0.6355231404304504, "learning_rate": 9.365640730448009e-05, "loss": 0.1014, "step": 12240 }, { "grad_norm": 0.8858745694160461, "learning_rate": 9.36429664867104e-05, "loss": 0.0914, "step": 12250 }, { "grad_norm": 0.4828411042690277, "learning_rate": 9.362951241135982e-05, "loss": 0.0801, "step": 12260 }, { "grad_norm": 0.8627265095710754, "learning_rate": 9.361604508251534e-05, "loss": 0.0908, "step": 12270 }, { "grad_norm": 0.6835442185401917, "learning_rate": 9.360256450426799e-05, "loss": 0.098, "step": 12280 }, { "grad_norm": 0.772235095500946, "learning_rate": 9.358907068071279e-05, "loss": 0.1052, "step": 12290 }, { "grad_norm": 1.2165919542312622, "learning_rate": 9.357556361594882e-05, "loss": 0.1225, "step": 12300 }, { "grad_norm": 0.7912402153015137, "learning_rate": 9.356204331407917e-05, "loss": 0.1136, "step": 12310 }, { "grad_norm": 0.9940614104270935, "learning_rate": 9.354850977921094e-05, "loss": 0.1322, "step": 12320 }, { "grad_norm": 0.8173584938049316, "learning_rate": 9.353496301545529e-05, "loss": 0.1387, "step": 12330 }, { "grad_norm": 0.9950727224349976, "learning_rate": 9.352140302692733e-05, "loss": 0.1137, "step": 12340 }, { "grad_norm": 0.879923403263092, "learning_rate": 9.350782981774627e-05, "loss": 0.1009, "step": 12350 }, { "grad_norm": 0.6332003474235535, "learning_rate": 9.349424339203526e-05, "loss": 0.0877, "step": 12360 }, { "grad_norm": 0.9931042790412903, "learning_rate": 9.34806437539215e-05, "loss": 0.1194, "step": 12370 }, { "grad_norm": 0.9474413990974426, "learning_rate": 9.346703090753622e-05, "loss": 0.1054, "step": 12380 }, { "grad_norm": 0.7336586117744446, "learning_rate": 9.345340485701461e-05, "loss": 0.1071, "step": 12390 }, { "grad_norm": 0.728599488735199, "learning_rate": 9.343976560649595e-05, "loss": 0.0775, "step": 12400 }, { "grad_norm": 0.5907045006752014, "learning_rate": 9.342611316012344e-05, "loss": 0.1022, "step": 12410 }, { "grad_norm": 0.921181321144104, "learning_rate": 9.341244752204437e-05, "loss": 0.1116, "step": 12420 }, { "grad_norm": 0.6798943877220154, "learning_rate": 9.339876869640995e-05, "loss": 0.0865, "step": 12430 }, { "grad_norm": 0.8208454847335815, "learning_rate": 9.33850766873755e-05, "loss": 0.0827, "step": 12440 }, { "grad_norm": 0.7580721378326416, "learning_rate": 9.337137149910028e-05, "loss": 0.1097, "step": 12450 }, { "grad_norm": 0.9095668792724609, "learning_rate": 9.335765313574753e-05, "loss": 0.0856, "step": 12460 }, { "grad_norm": 0.7264552116394043, "learning_rate": 9.334392160148457e-05, "loss": 0.1126, "step": 12470 }, { "grad_norm": 0.7766967415809631, "learning_rate": 9.333017690048264e-05, "loss": 0.0926, "step": 12480 }, { "grad_norm": 0.7139021158218384, "learning_rate": 9.331641903691706e-05, "loss": 0.0958, "step": 12490 }, { "grad_norm": 0.6237538456916809, "learning_rate": 9.330264801496707e-05, "loss": 0.1108, "step": 12500 }, { "grad_norm": 0.4743254482746124, "learning_rate": 9.328886383881594e-05, "loss": 0.0706, "step": 12510 }, { "grad_norm": 0.7403242588043213, "learning_rate": 9.327506651265095e-05, "loss": 0.0837, "step": 12520 }, { "grad_norm": 0.9796172380447388, "learning_rate": 9.326125604066338e-05, "loss": 0.0849, "step": 12530 }, { "grad_norm": 0.589729905128479, "learning_rate": 9.324743242704847e-05, "loss": 0.1033, "step": 12540 }, { "grad_norm": 0.6063310503959656, "learning_rate": 9.323359567600546e-05, "loss": 0.0923, "step": 12550 }, { "grad_norm": 0.7266538143157959, "learning_rate": 9.321974579173761e-05, "loss": 0.09, "step": 12560 }, { "grad_norm": 0.8285888433456421, "learning_rate": 9.320588277845213e-05, "loss": 0.101, "step": 12570 }, { "grad_norm": 0.6244341135025024, "learning_rate": 9.319200664036026e-05, "loss": 0.0974, "step": 12580 }, { "grad_norm": 0.8302909731864929, "learning_rate": 9.31781173816772e-05, "loss": 0.1111, "step": 12590 }, { "grad_norm": 0.7320884466171265, "learning_rate": 9.316421500662212e-05, "loss": 0.0846, "step": 12600 }, { "grad_norm": 0.6567819714546204, "learning_rate": 9.31502995194182e-05, "loss": 0.0862, "step": 12610 }, { "grad_norm": 0.6038808226585388, "learning_rate": 9.31363709242926e-05, "loss": 0.0806, "step": 12620 }, { "grad_norm": 0.7791721820831299, "learning_rate": 9.312242922547647e-05, "loss": 0.0969, "step": 12630 }, { "grad_norm": 1.1741491556167603, "learning_rate": 9.310847442720492e-05, "loss": 0.0983, "step": 12640 }, { "grad_norm": 1.0010426044464111, "learning_rate": 9.309450653371706e-05, "loss": 0.1043, "step": 12650 }, { "grad_norm": 0.8243852257728577, "learning_rate": 9.308052554925595e-05, "loss": 0.0751, "step": 12660 }, { "grad_norm": 1.2376649379730225, "learning_rate": 9.306653147806867e-05, "loss": 0.1204, "step": 12670 }, { "grad_norm": 1.2467361688613892, "learning_rate": 9.305252432440622e-05, "loss": 0.0963, "step": 12680 }, { "grad_norm": 0.7603247165679932, "learning_rate": 9.303850409252361e-05, "loss": 0.1001, "step": 12690 }, { "grad_norm": 0.9036640524864197, "learning_rate": 9.302447078667985e-05, "loss": 0.0868, "step": 12700 }, { "grad_norm": 0.8865518569946289, "learning_rate": 9.301042441113783e-05, "loss": 0.1071, "step": 12710 }, { "grad_norm": 0.6987447738647461, "learning_rate": 9.299636497016451e-05, "loss": 0.0833, "step": 12720 }, { "grad_norm": 0.712550699710846, "learning_rate": 9.298229246803076e-05, "loss": 0.076, "step": 12730 }, { "grad_norm": 0.9395150542259216, "learning_rate": 9.296820690901144e-05, "loss": 0.0926, "step": 12740 }, { "grad_norm": 0.572746992111206, "learning_rate": 9.295410829738539e-05, "loss": 0.0804, "step": 12750 }, { "grad_norm": 0.5711652636528015, "learning_rate": 9.293999663743535e-05, "loss": 0.0884, "step": 12760 }, { "grad_norm": 0.7259470224380493, "learning_rate": 9.292587193344813e-05, "loss": 0.1151, "step": 12770 }, { "grad_norm": 1.282049536705017, "learning_rate": 9.291173418971437e-05, "loss": 0.1117, "step": 12780 }, { "grad_norm": 0.6449275016784668, "learning_rate": 9.28975834105288e-05, "loss": 0.08, "step": 12790 }, { "grad_norm": 0.7346086502075195, "learning_rate": 9.288341960019004e-05, "loss": 0.1166, "step": 12800 }, { "grad_norm": 0.7392499446868896, "learning_rate": 9.286924276300067e-05, "loss": 0.0819, "step": 12810 }, { "grad_norm": 0.8217024803161621, "learning_rate": 9.285505290326726e-05, "loss": 0.0885, "step": 12820 }, { "grad_norm": 0.6341871023178101, "learning_rate": 9.284085002530027e-05, "loss": 0.1083, "step": 12830 }, { "grad_norm": 0.5730892419815063, "learning_rate": 9.282663413341422e-05, "loss": 0.0721, "step": 12840 }, { "grad_norm": 0.7511049509048462, "learning_rate": 9.281240523192747e-05, "loss": 0.0744, "step": 12850 }, { "grad_norm": 0.6141087412834167, "learning_rate": 9.279816332516242e-05, "loss": 0.0834, "step": 12860 }, { "grad_norm": 0.7876451015472412, "learning_rate": 9.278390841744536e-05, "loss": 0.0932, "step": 12870 }, { "grad_norm": 0.5259876251220703, "learning_rate": 9.276964051310658e-05, "loss": 0.0878, "step": 12880 }, { "grad_norm": 0.5560954809188843, "learning_rate": 9.275535961648027e-05, "loss": 0.0859, "step": 12890 }, { "grad_norm": 0.8579637408256531, "learning_rate": 9.274106573190459e-05, "loss": 0.0985, "step": 12900 }, { "grad_norm": 0.8020234107971191, "learning_rate": 9.272675886372168e-05, "loss": 0.0892, "step": 12910 }, { "grad_norm": 0.6462441682815552, "learning_rate": 9.271243901627754e-05, "loss": 0.0896, "step": 12920 }, { "grad_norm": 0.8273811340332031, "learning_rate": 9.269810619392219e-05, "loss": 0.095, "step": 12930 }, { "grad_norm": 0.9241808652877808, "learning_rate": 9.268376040100955e-05, "loss": 0.0917, "step": 12940 }, { "grad_norm": 0.6110383868217468, "learning_rate": 9.266940164189752e-05, "loss": 0.085, "step": 12950 }, { "grad_norm": 0.8096863031387329, "learning_rate": 9.265502992094787e-05, "loss": 0.0963, "step": 12960 }, { "grad_norm": 0.7668428421020508, "learning_rate": 9.264064524252638e-05, "loss": 0.0998, "step": 12970 }, { "grad_norm": 0.8346124887466431, "learning_rate": 9.262624761100271e-05, "loss": 0.1189, "step": 12980 }, { "grad_norm": 0.6298662424087524, "learning_rate": 9.261183703075051e-05, "loss": 0.0826, "step": 12990 }, { "grad_norm": 0.5915015935897827, "learning_rate": 9.259741350614733e-05, "loss": 0.0735, "step": 13000 }, { "grad_norm": 0.7308691740036011, "learning_rate": 9.258297704157464e-05, "loss": 0.0848, "step": 13010 }, { "grad_norm": 0.7880702614784241, "learning_rate": 9.256852764141786e-05, "loss": 0.0968, "step": 13020 }, { "grad_norm": 0.6595432162284851, "learning_rate": 9.255406531006634e-05, "loss": 0.0919, "step": 13030 }, { "grad_norm": 0.7524335384368896, "learning_rate": 9.253959005191335e-05, "loss": 0.0931, "step": 13040 }, { "grad_norm": 0.8755455613136292, "learning_rate": 9.25251018713561e-05, "loss": 0.0943, "step": 13050 }, { "grad_norm": 0.5788072943687439, "learning_rate": 9.251060077279571e-05, "loss": 0.0787, "step": 13060 }, { "grad_norm": 0.6479431986808777, "learning_rate": 9.249608676063724e-05, "loss": 0.101, "step": 13070 }, { "grad_norm": 0.8880367875099182, "learning_rate": 9.248155983928964e-05, "loss": 0.093, "step": 13080 }, { "grad_norm": 0.8066619634628296, "learning_rate": 9.246702001316583e-05, "loss": 0.0927, "step": 13090 }, { "grad_norm": 0.7057879567146301, "learning_rate": 9.245246728668262e-05, "loss": 0.0784, "step": 13100 }, { "grad_norm": 0.6875454783439636, "learning_rate": 9.243790166426073e-05, "loss": 0.0956, "step": 13110 }, { "grad_norm": 0.6821117997169495, "learning_rate": 9.242332315032484e-05, "loss": 0.0872, "step": 13120 }, { "grad_norm": 0.7150406241416931, "learning_rate": 9.240873174930349e-05, "loss": 0.0887, "step": 13130 }, { "grad_norm": 0.5455617308616638, "learning_rate": 9.239412746562917e-05, "loss": 0.1095, "step": 13140 }, { "grad_norm": 0.6455450654029846, "learning_rate": 9.237951030373828e-05, "loss": 0.0774, "step": 13150 }, { "grad_norm": 0.9790071845054626, "learning_rate": 9.236488026807113e-05, "loss": 0.0868, "step": 13160 }, { "grad_norm": 1.1104111671447754, "learning_rate": 9.235023736307193e-05, "loss": 0.1084, "step": 13170 }, { "grad_norm": 0.5061917304992676, "learning_rate": 9.233558159318881e-05, "loss": 0.0984, "step": 13180 }, { "grad_norm": 1.0151344537734985, "learning_rate": 9.232091296287382e-05, "loss": 0.103, "step": 13190 }, { "grad_norm": 0.6106647253036499, "learning_rate": 9.230623147658288e-05, "loss": 0.0916, "step": 13200 }, { "grad_norm": 0.5589365363121033, "learning_rate": 9.229153713877586e-05, "loss": 0.0888, "step": 13210 }, { "grad_norm": 0.6468563079833984, "learning_rate": 9.227682995391649e-05, "loss": 0.0951, "step": 13220 }, { "grad_norm": 0.8726822733879089, "learning_rate": 9.226210992647243e-05, "loss": 0.0892, "step": 13230 }, { "grad_norm": 0.768828809261322, "learning_rate": 9.224737706091525e-05, "loss": 0.0796, "step": 13240 }, { "grad_norm": 0.6195400953292847, "learning_rate": 9.223263136172039e-05, "loss": 0.0904, "step": 13250 }, { "grad_norm": 0.833642303943634, "learning_rate": 9.22178728333672e-05, "loss": 0.0828, "step": 13260 }, { "grad_norm": 0.781406581401825, "learning_rate": 9.220310148033897e-05, "loss": 0.0707, "step": 13270 }, { "grad_norm": 0.4590030610561371, "learning_rate": 9.21883173071228e-05, "loss": 0.0836, "step": 13280 }, { "grad_norm": 0.6451886296272278, "learning_rate": 9.217352031820976e-05, "loss": 0.0692, "step": 13290 }, { "grad_norm": 0.6267248392105103, "learning_rate": 9.215871051809477e-05, "loss": 0.1114, "step": 13300 }, { "grad_norm": 0.7034592628479004, "learning_rate": 9.214388791127666e-05, "loss": 0.0789, "step": 13310 }, { "grad_norm": 0.5654531121253967, "learning_rate": 9.212905250225814e-05, "loss": 0.0798, "step": 13320 }, { "grad_norm": 0.6371467709541321, "learning_rate": 9.211420429554583e-05, "loss": 0.0919, "step": 13330 }, { "grad_norm": 0.5652119517326355, "learning_rate": 9.209934329565022e-05, "loss": 0.0733, "step": 13340 }, { "grad_norm": 0.6413688063621521, "learning_rate": 9.208446950708568e-05, "loss": 0.0928, "step": 13350 }, { "grad_norm": 0.5711184144020081, "learning_rate": 9.20695829343705e-05, "loss": 0.0919, "step": 13360 }, { "grad_norm": 0.9997236728668213, "learning_rate": 9.205468358202678e-05, "loss": 0.1058, "step": 13370 }, { "grad_norm": 0.7386050820350647, "learning_rate": 9.203977145458059e-05, "loss": 0.0895, "step": 13380 }, { "grad_norm": 0.7350773215293884, "learning_rate": 9.202484655656182e-05, "loss": 0.0871, "step": 13390 }, { "grad_norm": 0.7736497521400452, "learning_rate": 9.200990889250427e-05, "loss": 0.1028, "step": 13400 }, { "grad_norm": 0.5030760765075684, "learning_rate": 9.19949584669456e-05, "loss": 0.0923, "step": 13410 }, { "grad_norm": 0.8589800596237183, "learning_rate": 9.197999528442738e-05, "loss": 0.0999, "step": 13420 }, { "grad_norm": 0.5906085968017578, "learning_rate": 9.196501934949499e-05, "loss": 0.1067, "step": 13430 }, { "grad_norm": 0.7144001722335815, "learning_rate": 9.195003066669776e-05, "loss": 0.0859, "step": 13440 }, { "grad_norm": 0.7752034068107605, "learning_rate": 9.193502924058884e-05, "loss": 0.0879, "step": 13450 }, { "grad_norm": 0.7114759087562561, "learning_rate": 9.192001507572526e-05, "loss": 0.1006, "step": 13460 }, { "grad_norm": 1.0897845029830933, "learning_rate": 9.190498817666793e-05, "loss": 0.1303, "step": 13470 }, { "grad_norm": 0.5131651163101196, "learning_rate": 9.188994854798163e-05, "loss": 0.0848, "step": 13480 }, { "grad_norm": 0.8113803267478943, "learning_rate": 9.187489619423499e-05, "loss": 0.0808, "step": 13490 }, { "grad_norm": 0.7021291255950928, "learning_rate": 9.185983112000056e-05, "loss": 0.0799, "step": 13500 }, { "grad_norm": 0.5695791244506836, "learning_rate": 9.184475332985464e-05, "loss": 0.0848, "step": 13510 }, { "grad_norm": 0.6823028922080994, "learning_rate": 9.182966282837754e-05, "loss": 0.0881, "step": 13520 }, { "grad_norm": 0.8276146054267883, "learning_rate": 9.18145596201533e-05, "loss": 0.0927, "step": 13530 }, { "grad_norm": 0.7317036390304565, "learning_rate": 9.179944370976991e-05, "loss": 0.0632, "step": 13540 }, { "grad_norm": 1.1433619260787964, "learning_rate": 9.178431510181918e-05, "loss": 0.0945, "step": 13550 }, { "grad_norm": 0.7389175891876221, "learning_rate": 9.176917380089675e-05, "loss": 0.0973, "step": 13560 }, { "grad_norm": 0.7098069787025452, "learning_rate": 9.175401981160219e-05, "loss": 0.0799, "step": 13570 }, { "grad_norm": 0.6829767823219299, "learning_rate": 9.173885313853885e-05, "loss": 0.0737, "step": 13580 }, { "grad_norm": 0.6697173118591309, "learning_rate": 9.172367378631398e-05, "loss": 0.0872, "step": 13590 }, { "grad_norm": 0.6079223155975342, "learning_rate": 9.170848175953866e-05, "loss": 0.1144, "step": 13600 }, { "grad_norm": 0.6502634286880493, "learning_rate": 9.169327706282784e-05, "loss": 0.0773, "step": 13610 }, { "grad_norm": 0.8580983281135559, "learning_rate": 9.167805970080029e-05, "loss": 0.0759, "step": 13620 }, { "grad_norm": 0.8535483479499817, "learning_rate": 9.166282967807864e-05, "loss": 0.0772, "step": 13630 }, { "grad_norm": 0.7256141304969788, "learning_rate": 9.16475869992894e-05, "loss": 0.1041, "step": 13640 }, { "grad_norm": 0.6952700614929199, "learning_rate": 9.163233166906284e-05, "loss": 0.0812, "step": 13650 }, { "grad_norm": 0.8930968642234802, "learning_rate": 9.161706369203317e-05, "loss": 0.0848, "step": 13660 }, { "grad_norm": 0.707869291305542, "learning_rate": 9.16017830728384e-05, "loss": 0.1101, "step": 13670 }, { "grad_norm": 0.9177502989768982, "learning_rate": 9.158648981612035e-05, "loss": 0.0892, "step": 13680 }, { "grad_norm": 0.8978441953659058, "learning_rate": 9.157118392652472e-05, "loss": 0.1045, "step": 13690 }, { "grad_norm": 0.8379814028739929, "learning_rate": 9.155586540870104e-05, "loss": 0.0917, "step": 13700 }, { "grad_norm": 0.9865835905075073, "learning_rate": 9.154053426730267e-05, "loss": 0.1016, "step": 13710 }, { "grad_norm": 0.7001112103462219, "learning_rate": 9.15251905069868e-05, "loss": 0.0705, "step": 13720 }, { "grad_norm": 0.6882014870643616, "learning_rate": 9.150983413241446e-05, "loss": 0.1165, "step": 13730 }, { "grad_norm": 0.7157098054885864, "learning_rate": 9.149446514825051e-05, "loss": 0.0932, "step": 13740 }, { "grad_norm": 0.7328441739082336, "learning_rate": 9.147908355916365e-05, "loss": 0.082, "step": 13750 }, { "grad_norm": 0.7568864822387695, "learning_rate": 9.146368936982642e-05, "loss": 0.1104, "step": 13760 }, { "grad_norm": 0.9075926542282104, "learning_rate": 9.144828258491511e-05, "loss": 0.0972, "step": 13770 }, { "grad_norm": 0.6714442372322083, "learning_rate": 9.143286320910996e-05, "loss": 0.0818, "step": 13780 }, { "grad_norm": 0.711791455745697, "learning_rate": 9.141743124709491e-05, "loss": 0.0834, "step": 13790 }, { "grad_norm": 0.6848009824752808, "learning_rate": 9.140198670355784e-05, "loss": 0.1187, "step": 13800 }, { "grad_norm": 0.6797389984130859, "learning_rate": 9.138652958319034e-05, "loss": 0.0945, "step": 13810 }, { "grad_norm": 0.6716344356536865, "learning_rate": 9.137105989068791e-05, "loss": 0.0934, "step": 13820 }, { "grad_norm": 0.7537602186203003, "learning_rate": 9.135557763074983e-05, "loss": 0.0978, "step": 13830 }, { "grad_norm": 0.609402060508728, "learning_rate": 9.13400828080792e-05, "loss": 0.1067, "step": 13840 }, { "grad_norm": 0.9204389452934265, "learning_rate": 9.132457542738292e-05, "loss": 0.1022, "step": 13850 }, { "grad_norm": 0.708209753036499, "learning_rate": 9.130905549337174e-05, "loss": 0.0935, "step": 13860 }, { "grad_norm": 0.8013960719108582, "learning_rate": 9.129352301076021e-05, "loss": 0.0841, "step": 13870 }, { "grad_norm": 0.7758584022521973, "learning_rate": 9.127797798426668e-05, "loss": 0.0766, "step": 13880 }, { "grad_norm": 0.6129173040390015, "learning_rate": 9.126242041861333e-05, "loss": 0.07, "step": 13890 }, { "grad_norm": 0.6873789429664612, "learning_rate": 9.124685031852611e-05, "loss": 0.0805, "step": 13900 }, { "grad_norm": 0.808345377445221, "learning_rate": 9.123126768873482e-05, "loss": 0.104, "step": 13910 }, { "grad_norm": 0.8625933527946472, "learning_rate": 9.121567253397308e-05, "loss": 0.0898, "step": 13920 }, { "grad_norm": 0.5391258001327515, "learning_rate": 9.120006485897824e-05, "loss": 0.0925, "step": 13930 }, { "grad_norm": 0.5603607296943665, "learning_rate": 9.118444466849152e-05, "loss": 0.0837, "step": 13940 }, { "grad_norm": 0.8036087155342102, "learning_rate": 9.116881196725793e-05, "loss": 0.076, "step": 13950 }, { "grad_norm": 0.9855889081954956, "learning_rate": 9.115316676002627e-05, "loss": 0.1162, "step": 13960 }, { "grad_norm": 1.2401658296585083, "learning_rate": 9.113750905154911e-05, "loss": 0.0767, "step": 13970 }, { "grad_norm": 0.7729992866516113, "learning_rate": 9.112183884658289e-05, "loss": 0.0849, "step": 13980 }, { "grad_norm": 0.9797981381416321, "learning_rate": 9.11061561498878e-05, "loss": 0.101, "step": 13990 }, { "grad_norm": 0.806107223033905, "learning_rate": 9.109046096622779e-05, "loss": 0.0809, "step": 14000 }, { "grad_norm": 0.6574611663818359, "learning_rate": 9.107475330037069e-05, "loss": 0.0938, "step": 14010 }, { "grad_norm": 0.8083263635635376, "learning_rate": 9.105903315708806e-05, "loss": 0.1152, "step": 14020 }, { "grad_norm": 0.6154211759567261, "learning_rate": 9.104330054115524e-05, "loss": 0.0861, "step": 14030 }, { "grad_norm": 0.8314677476882935, "learning_rate": 9.102755545735141e-05, "loss": 0.0808, "step": 14040 }, { "grad_norm": 0.7212948203086853, "learning_rate": 9.10117979104595e-05, "loss": 0.13, "step": 14050 }, { "grad_norm": 0.7897972464561462, "learning_rate": 9.099602790526624e-05, "loss": 0.0954, "step": 14060 }, { "grad_norm": 0.849248468875885, "learning_rate": 9.098024544656212e-05, "loss": 0.072, "step": 14070 }, { "grad_norm": 0.9068009257316589, "learning_rate": 9.096445053914148e-05, "loss": 0.091, "step": 14080 }, { "grad_norm": 0.6360738277435303, "learning_rate": 9.094864318780236e-05, "loss": 0.0761, "step": 14090 }, { "grad_norm": 0.6737626791000366, "learning_rate": 9.093282339734663e-05, "loss": 0.0779, "step": 14100 }, { "grad_norm": 0.6551205515861511, "learning_rate": 9.091699117257992e-05, "loss": 0.0754, "step": 14110 }, { "grad_norm": 0.6933133006095886, "learning_rate": 9.090114651831163e-05, "loss": 0.0917, "step": 14120 }, { "grad_norm": 0.6535260081291199, "learning_rate": 9.088528943935497e-05, "loss": 0.0787, "step": 14130 }, { "grad_norm": 0.5819027423858643, "learning_rate": 9.086941994052689e-05, "loss": 0.0784, "step": 14140 }, { "grad_norm": 0.7737126350402832, "learning_rate": 9.085353802664813e-05, "loss": 0.0942, "step": 14150 }, { "grad_norm": 0.5747708082199097, "learning_rate": 9.08376437025432e-05, "loss": 0.0951, "step": 14160 }, { "grad_norm": 0.8276579976081848, "learning_rate": 9.082173697304035e-05, "loss": 0.098, "step": 14170 }, { "grad_norm": 1.1537160873413086, "learning_rate": 9.080581784297166e-05, "loss": 0.1063, "step": 14180 }, { "grad_norm": 0.7805328369140625, "learning_rate": 9.078988631717291e-05, "loss": 0.0821, "step": 14190 }, { "grad_norm": 1.077366590499878, "learning_rate": 9.077394240048369e-05, "loss": 0.093, "step": 14200 }, { "grad_norm": 0.7952777743339539, "learning_rate": 9.075798609774736e-05, "loss": 0.1122, "step": 14210 }, { "grad_norm": 0.6132530570030212, "learning_rate": 9.0742017413811e-05, "loss": 0.0875, "step": 14220 }, { "grad_norm": 0.7609317898750305, "learning_rate": 9.072603635352548e-05, "loss": 0.0776, "step": 14230 }, { "grad_norm": 0.7482858896255493, "learning_rate": 9.071004292174541e-05, "loss": 0.0795, "step": 14240 }, { "grad_norm": 0.5777071714401245, "learning_rate": 9.06940371233292e-05, "loss": 0.0786, "step": 14250 }, { "grad_norm": 0.6076545715332031, "learning_rate": 9.067801896313898e-05, "loss": 0.0763, "step": 14260 }, { "grad_norm": 0.6668615937232971, "learning_rate": 9.066198844604064e-05, "loss": 0.0876, "step": 14270 }, { "grad_norm": 0.6284846663475037, "learning_rate": 9.06459455769038e-05, "loss": 0.0916, "step": 14280 }, { "grad_norm": 0.6990781426429749, "learning_rate": 9.062989036060193e-05, "loss": 0.099, "step": 14290 }, { "grad_norm": 0.8151722550392151, "learning_rate": 9.061382280201212e-05, "loss": 0.1038, "step": 14300 }, { "grad_norm": 0.6961491107940674, "learning_rate": 9.059774290601528e-05, "loss": 0.0837, "step": 14310 }, { "grad_norm": 1.1836434602737427, "learning_rate": 9.058165067749606e-05, "loss": 0.1127, "step": 14320 }, { "grad_norm": 1.1622449159622192, "learning_rate": 9.056554612134288e-05, "loss": 0.0968, "step": 14330 }, { "grad_norm": 0.7732072472572327, "learning_rate": 9.054942924244785e-05, "loss": 0.0902, "step": 14340 }, { "grad_norm": 0.5789279937744141, "learning_rate": 9.053330004570686e-05, "loss": 0.0708, "step": 14350 }, { "grad_norm": 0.832394003868103, "learning_rate": 9.051715853601955e-05, "loss": 0.0922, "step": 14360 }, { "grad_norm": 0.8385879397392273, "learning_rate": 9.050100471828926e-05, "loss": 0.104, "step": 14370 }, { "grad_norm": 0.8028297424316406, "learning_rate": 9.048483859742311e-05, "loss": 0.0887, "step": 14380 }, { "grad_norm": 0.8504419922828674, "learning_rate": 9.046866017833193e-05, "loss": 0.0776, "step": 14390 }, { "grad_norm": 0.8825477957725525, "learning_rate": 9.045246946593029e-05, "loss": 0.0706, "step": 14400 }, { "grad_norm": 0.7058652639389038, "learning_rate": 9.043626646513652e-05, "loss": 0.0746, "step": 14410 }, { "grad_norm": 0.7185536026954651, "learning_rate": 9.042005118087267e-05, "loss": 0.0984, "step": 14420 }, { "grad_norm": 0.8395110964775085, "learning_rate": 9.040382361806448e-05, "loss": 0.0932, "step": 14430 }, { "grad_norm": 0.7518756985664368, "learning_rate": 9.038758378164148e-05, "loss": 0.1027, "step": 14440 }, { "grad_norm": 0.651936411857605, "learning_rate": 9.037133167653691e-05, "loss": 0.087, "step": 14450 }, { "grad_norm": 0.6090784072875977, "learning_rate": 9.035506730768771e-05, "loss": 0.0812, "step": 14460 }, { "grad_norm": 0.9885725378990173, "learning_rate": 9.033879068003458e-05, "loss": 0.0977, "step": 14470 }, { "grad_norm": 0.6902058124542236, "learning_rate": 9.032250179852193e-05, "loss": 0.0959, "step": 14480 }, { "grad_norm": 0.7562060356140137, "learning_rate": 9.030620066809787e-05, "loss": 0.1004, "step": 14490 }, { "grad_norm": 0.6319054365158081, "learning_rate": 9.028988729371428e-05, "loss": 0.0839, "step": 14500 }, { "grad_norm": 0.6504964828491211, "learning_rate": 9.027356168032673e-05, "loss": 0.0861, "step": 14510 }, { "grad_norm": 0.5568788647651672, "learning_rate": 9.02572238328945e-05, "loss": 0.0687, "step": 14520 }, { "grad_norm": 0.7978841066360474, "learning_rate": 9.02408737563806e-05, "loss": 0.0897, "step": 14530 }, { "grad_norm": 0.527666449546814, "learning_rate": 9.022451145575174e-05, "loss": 0.0679, "step": 14540 }, { "grad_norm": 0.659881591796875, "learning_rate": 9.02081369359784e-05, "loss": 0.0908, "step": 14550 }, { "grad_norm": 0.7337923645973206, "learning_rate": 9.019175020203465e-05, "loss": 0.0788, "step": 14560 }, { "grad_norm": 0.4801911413669586, "learning_rate": 9.017535125889842e-05, "loss": 0.0736, "step": 14570 }, { "grad_norm": 0.720556914806366, "learning_rate": 9.015894011155124e-05, "loss": 0.0935, "step": 14580 }, { "grad_norm": 0.5343217253684998, "learning_rate": 9.014251676497838e-05, "loss": 0.0711, "step": 14590 }, { "grad_norm": 0.7077155113220215, "learning_rate": 9.012608122416884e-05, "loss": 0.0876, "step": 14600 }, { "grad_norm": 0.766619086265564, "learning_rate": 9.010963349411529e-05, "loss": 0.0959, "step": 14610 }, { "grad_norm": 0.6413426995277405, "learning_rate": 9.00931735798141e-05, "loss": 0.0855, "step": 14620 }, { "grad_norm": 0.8532230257987976, "learning_rate": 9.00767014862654e-05, "loss": 0.0827, "step": 14630 }, { "grad_norm": 0.6720548272132874, "learning_rate": 9.006021721847295e-05, "loss": 0.087, "step": 14640 }, { "grad_norm": 0.7225130796432495, "learning_rate": 9.004372078144423e-05, "loss": 0.0793, "step": 14650 }, { "grad_norm": 0.7635659575462341, "learning_rate": 9.002721218019043e-05, "loss": 0.0764, "step": 14660 }, { "grad_norm": 0.7220643162727356, "learning_rate": 9.001069141972642e-05, "loss": 0.0823, "step": 14670 }, { "grad_norm": 0.8236568570137024, "learning_rate": 8.99941585050708e-05, "loss": 0.083, "step": 14680 }, { "grad_norm": 0.6461641788482666, "learning_rate": 8.997761344124578e-05, "loss": 0.0761, "step": 14690 }, { "grad_norm": 0.6860876083374023, "learning_rate": 8.996105623327737e-05, "loss": 0.0807, "step": 14700 }, { "grad_norm": 0.7142871022224426, "learning_rate": 8.994448688619517e-05, "loss": 0.0849, "step": 14710 }, { "grad_norm": 0.7794959545135498, "learning_rate": 8.992790540503253e-05, "loss": 0.0864, "step": 14720 }, { "grad_norm": 0.625065267086029, "learning_rate": 8.991131179482648e-05, "loss": 0.0785, "step": 14730 }, { "grad_norm": 0.6562982201576233, "learning_rate": 8.989470606061768e-05, "loss": 0.0948, "step": 14740 }, { "grad_norm": 0.5251091718673706, "learning_rate": 8.987808820745056e-05, "loss": 0.0734, "step": 14750 }, { "grad_norm": 1.1422183513641357, "learning_rate": 8.986145824037315e-05, "loss": 0.0879, "step": 14760 }, { "grad_norm": 0.7371772527694702, "learning_rate": 8.984481616443721e-05, "loss": 0.0901, "step": 14770 }, { "grad_norm": 0.77792888879776, "learning_rate": 8.982816198469815e-05, "loss": 0.0847, "step": 14780 }, { "grad_norm": 0.6415791511535645, "learning_rate": 8.98114957062151e-05, "loss": 0.0843, "step": 14790 }, { "grad_norm": 0.8700986504554749, "learning_rate": 8.97948173340508e-05, "loss": 0.0871, "step": 14800 }, { "grad_norm": 0.6617854833602905, "learning_rate": 8.977812687327172e-05, "loss": 0.0897, "step": 14810 }, { "grad_norm": 0.46777087450027466, "learning_rate": 8.976142432894798e-05, "loss": 0.0927, "step": 14820 }, { "grad_norm": 0.8570370078086853, "learning_rate": 8.974470970615336e-05, "loss": 0.0936, "step": 14830 }, { "grad_norm": 1.0881195068359375, "learning_rate": 8.972798300996534e-05, "loss": 0.1082, "step": 14840 }, { "grad_norm": 0.596121072769165, "learning_rate": 8.971124424546504e-05, "loss": 0.1031, "step": 14850 }, { "grad_norm": 0.5887495875358582, "learning_rate": 8.969449341773724e-05, "loss": 0.0911, "step": 14860 }, { "grad_norm": 0.5507463216781616, "learning_rate": 8.967773053187042e-05, "loss": 0.0934, "step": 14870 }, { "grad_norm": 0.5744767189025879, "learning_rate": 8.966095559295668e-05, "loss": 0.0751, "step": 14880 }, { "grad_norm": 0.7315704822540283, "learning_rate": 8.964416860609184e-05, "loss": 0.0769, "step": 14890 }, { "grad_norm": 0.7252019047737122, "learning_rate": 8.962736957637532e-05, "loss": 0.0787, "step": 14900 }, { "grad_norm": 0.9250659942626953, "learning_rate": 8.96105585089102e-05, "loss": 0.093, "step": 14910 }, { "grad_norm": 0.6260889768600464, "learning_rate": 8.959373540880329e-05, "loss": 0.0863, "step": 14920 }, { "grad_norm": 0.7378466129302979, "learning_rate": 8.957690028116495e-05, "loss": 0.0944, "step": 14930 }, { "grad_norm": 0.4876152276992798, "learning_rate": 8.956005313110928e-05, "loss": 0.0923, "step": 14940 }, { "grad_norm": 0.8763590455055237, "learning_rate": 8.9543193963754e-05, "loss": 0.1024, "step": 14950 }, { "grad_norm": 0.5730733275413513, "learning_rate": 8.952632278422048e-05, "loss": 0.09, "step": 14960 }, { "grad_norm": 0.9238808751106262, "learning_rate": 8.95094395976337e-05, "loss": 0.0918, "step": 14970 }, { "grad_norm": 1.2421318292617798, "learning_rate": 8.949254440912239e-05, "loss": 0.0946, "step": 14980 }, { "grad_norm": 0.6714566946029663, "learning_rate": 8.94756372238188e-05, "loss": 0.0747, "step": 14990 }, { "grad_norm": 0.7747244238853455, "learning_rate": 8.945871804685892e-05, "loss": 0.0872, "step": 15000 }, { "grad_norm": 0.8294615149497986, "learning_rate": 8.944178688338236e-05, "loss": 0.0678, "step": 15010 }, { "grad_norm": 0.5479936003684998, "learning_rate": 8.942484373853233e-05, "loss": 0.0759, "step": 15020 }, { "grad_norm": 0.6880900859832764, "learning_rate": 8.940788861745572e-05, "loss": 0.0754, "step": 15030 }, { "grad_norm": 0.4816497564315796, "learning_rate": 8.939092152530308e-05, "loss": 0.0752, "step": 15040 }, { "grad_norm": 0.4674496054649353, "learning_rate": 8.937394246722853e-05, "loss": 0.0682, "step": 15050 }, { "grad_norm": 0.5885239243507385, "learning_rate": 8.935695144838984e-05, "loss": 0.0794, "step": 15060 }, { "grad_norm": 0.7386784553527832, "learning_rate": 8.933994847394849e-05, "loss": 0.0734, "step": 15070 }, { "grad_norm": 0.8484346866607666, "learning_rate": 8.932293354906949e-05, "loss": 0.0896, "step": 15080 }, { "grad_norm": 0.6817585229873657, "learning_rate": 8.930590667892153e-05, "loss": 0.0844, "step": 15090 }, { "grad_norm": 0.8194221258163452, "learning_rate": 8.928886786867696e-05, "loss": 0.0972, "step": 15100 }, { "grad_norm": 0.791200578212738, "learning_rate": 8.927181712351168e-05, "loss": 0.0758, "step": 15110 }, { "grad_norm": 0.5818436741828918, "learning_rate": 8.925475444860527e-05, "loss": 0.0938, "step": 15120 }, { "grad_norm": 1.089716911315918, "learning_rate": 8.923767984914092e-05, "loss": 0.1066, "step": 15130 }, { "grad_norm": 0.8416954874992371, "learning_rate": 8.922059333030545e-05, "loss": 0.0824, "step": 15140 }, { "grad_norm": 0.7270142436027527, "learning_rate": 8.920349489728928e-05, "loss": 0.0732, "step": 15150 }, { "grad_norm": 0.8615787625312805, "learning_rate": 8.918638455528646e-05, "loss": 0.1038, "step": 15160 }, { "grad_norm": 0.6612411737442017, "learning_rate": 8.916926230949468e-05, "loss": 0.0993, "step": 15170 }, { "grad_norm": 0.8749944567680359, "learning_rate": 8.915212816511522e-05, "loss": 0.0847, "step": 15180 }, { "grad_norm": 0.893699049949646, "learning_rate": 8.913498212735296e-05, "loss": 0.1202, "step": 15190 }, { "grad_norm": 0.49247559905052185, "learning_rate": 8.911782420141643e-05, "loss": 0.0853, "step": 15200 }, { "grad_norm": 0.950262725353241, "learning_rate": 8.910065439251775e-05, "loss": 0.1096, "step": 15210 }, { "grad_norm": 0.6484801769256592, "learning_rate": 8.908347270587268e-05, "loss": 0.0712, "step": 15220 }, { "grad_norm": 0.7240500450134277, "learning_rate": 8.906627914670054e-05, "loss": 0.1086, "step": 15230 }, { "grad_norm": 0.7179484963417053, "learning_rate": 8.904907372022427e-05, "loss": 0.072, "step": 15240 }, { "grad_norm": 0.6703804731369019, "learning_rate": 8.903185643167042e-05, "loss": 0.0864, "step": 15250 }, { "grad_norm": 0.6359685659408569, "learning_rate": 8.901462728626919e-05, "loss": 0.0849, "step": 15260 }, { "grad_norm": 0.5738243460655212, "learning_rate": 8.899738628925429e-05, "loss": 0.0693, "step": 15270 }, { "grad_norm": 0.5759138464927673, "learning_rate": 8.898013344586312e-05, "loss": 0.0928, "step": 15280 }, { "grad_norm": 0.8607297539710999, "learning_rate": 8.896286876133661e-05, "loss": 0.0935, "step": 15290 }, { "grad_norm": 0.7186563611030579, "learning_rate": 8.894559224091933e-05, "loss": 0.0774, "step": 15300 }, { "grad_norm": 0.7945495247840881, "learning_rate": 8.892830388985942e-05, "loss": 0.0772, "step": 15310 }, { "grad_norm": 1.0649679899215698, "learning_rate": 8.891100371340864e-05, "loss": 0.0807, "step": 15320 }, { "grad_norm": 0.6216043829917908, "learning_rate": 8.889369171682231e-05, "loss": 0.0938, "step": 15330 }, { "grad_norm": 0.7855075597763062, "learning_rate": 8.887636790535936e-05, "loss": 0.0902, "step": 15340 }, { "grad_norm": 0.567712128162384, "learning_rate": 8.885903228428231e-05, "loss": 0.0791, "step": 15350 }, { "grad_norm": 0.7706348896026611, "learning_rate": 8.884168485885727e-05, "loss": 0.0913, "step": 15360 }, { "grad_norm": 0.7713733315467834, "learning_rate": 8.882432563435393e-05, "loss": 0.0875, "step": 15370 }, { "grad_norm": 0.8461950421333313, "learning_rate": 8.880695461604556e-05, "loss": 0.0866, "step": 15380 }, { "grad_norm": 0.7796589136123657, "learning_rate": 8.878957180920901e-05, "loss": 0.0685, "step": 15390 }, { "grad_norm": 0.6641393899917603, "learning_rate": 8.877217721912473e-05, "loss": 0.0746, "step": 15400 }, { "grad_norm": 1.0047498941421509, "learning_rate": 8.875477085107673e-05, "loss": 0.0829, "step": 15410 }, { "grad_norm": 0.7947528958320618, "learning_rate": 8.87373527103526e-05, "loss": 0.1026, "step": 15420 }, { "grad_norm": 0.5342550873756409, "learning_rate": 8.871992280224353e-05, "loss": 0.0982, "step": 15430 }, { "grad_norm": 0.935849666595459, "learning_rate": 8.870248113204422e-05, "loss": 0.0819, "step": 15440 }, { "grad_norm": 0.5477079153060913, "learning_rate": 8.868502770505306e-05, "loss": 0.0876, "step": 15450 }, { "grad_norm": 0.8767722845077515, "learning_rate": 8.86675625265719e-05, "loss": 0.0888, "step": 15460 }, { "grad_norm": 0.6680763959884644, "learning_rate": 8.865008560190618e-05, "loss": 0.082, "step": 15470 }, { "grad_norm": 0.9954163432121277, "learning_rate": 8.863259693636496e-05, "loss": 0.0875, "step": 15480 }, { "grad_norm": 0.9421848654747009, "learning_rate": 8.861509653526083e-05, "loss": 0.0653, "step": 15490 }, { "grad_norm": 0.6158773899078369, "learning_rate": 8.859758440390993e-05, "loss": 0.0637, "step": 15500 }, { "grad_norm": 0.7402524352073669, "learning_rate": 8.858006054763202e-05, "loss": 0.0742, "step": 15510 }, { "grad_norm": 0.7100924849510193, "learning_rate": 8.856252497175035e-05, "loss": 0.0869, "step": 15520 }, { "grad_norm": 0.7681096792221069, "learning_rate": 8.854497768159178e-05, "loss": 0.0823, "step": 15530 }, { "grad_norm": 0.7164523601531982, "learning_rate": 8.852741868248671e-05, "loss": 0.0839, "step": 15540 }, { "grad_norm": 0.671438455581665, "learning_rate": 8.85098479797691e-05, "loss": 0.0859, "step": 15550 }, { "grad_norm": 0.8413930535316467, "learning_rate": 8.849226557877646e-05, "loss": 0.0873, "step": 15560 }, { "grad_norm": 0.8361978530883789, "learning_rate": 8.84746714848499e-05, "loss": 0.0747, "step": 15570 }, { "grad_norm": 0.8147082924842834, "learning_rate": 8.845706570333397e-05, "loss": 0.0848, "step": 15580 }, { "grad_norm": 0.9384058117866516, "learning_rate": 8.84394482395769e-05, "loss": 0.074, "step": 15590 }, { "grad_norm": 0.771336555480957, "learning_rate": 8.842181909893038e-05, "loss": 0.085, "step": 15600 }, { "grad_norm": 0.9505167603492737, "learning_rate": 8.840417828674969e-05, "loss": 0.0876, "step": 15610 }, { "grad_norm": 0.6577292680740356, "learning_rate": 8.838652580839364e-05, "loss": 0.0738, "step": 15620 }, { "grad_norm": 0.6692731976509094, "learning_rate": 8.836886166922458e-05, "loss": 0.074, "step": 15630 }, { "grad_norm": 0.6634111404418945, "learning_rate": 8.835118587460844e-05, "loss": 0.0773, "step": 15640 }, { "grad_norm": 0.6962167024612427, "learning_rate": 8.83334984299146e-05, "loss": 0.0743, "step": 15650 }, { "grad_norm": 0.6426434516906738, "learning_rate": 8.83157993405161e-05, "loss": 0.0858, "step": 15660 }, { "grad_norm": 0.5320380330085754, "learning_rate": 8.829808861178943e-05, "loss": 0.0796, "step": 15670 }, { "grad_norm": 0.7469257116317749, "learning_rate": 8.828036624911464e-05, "loss": 0.0745, "step": 15680 }, { "grad_norm": 0.6475929021835327, "learning_rate": 8.826263225787532e-05, "loss": 0.0882, "step": 15690 }, { "grad_norm": 0.982716977596283, "learning_rate": 8.824488664345858e-05, "loss": 0.0739, "step": 15700 }, { "grad_norm": 0.7815120220184326, "learning_rate": 8.822712941125508e-05, "loss": 0.093, "step": 15710 }, { "grad_norm": 0.8639106750488281, "learning_rate": 8.820936056665898e-05, "loss": 0.0981, "step": 15720 }, { "grad_norm": 0.8129995465278625, "learning_rate": 8.819158011506801e-05, "loss": 0.0838, "step": 15730 }, { "grad_norm": 0.5739174485206604, "learning_rate": 8.81737880618834e-05, "loss": 0.1068, "step": 15740 }, { "grad_norm": 0.6277855038642883, "learning_rate": 8.815598441250987e-05, "loss": 0.0929, "step": 15750 }, { "grad_norm": 0.7727562189102173, "learning_rate": 8.813816917235576e-05, "loss": 0.0835, "step": 15760 }, { "grad_norm": 0.5069063305854797, "learning_rate": 8.812034234683282e-05, "loss": 0.064, "step": 15770 }, { "grad_norm": 0.5655542016029358, "learning_rate": 8.810250394135637e-05, "loss": 0.0787, "step": 15780 }, { "grad_norm": 0.5797998309135437, "learning_rate": 8.808465396134529e-05, "loss": 0.0904, "step": 15790 }, { "grad_norm": 0.8392467498779297, "learning_rate": 8.806679241222189e-05, "loss": 0.0906, "step": 15800 }, { "grad_norm": 0.7035959362983704, "learning_rate": 8.804891929941203e-05, "loss": 0.0989, "step": 15810 }, { "grad_norm": 0.5133164525032043, "learning_rate": 8.803103462834514e-05, "loss": 0.0884, "step": 15820 }, { "grad_norm": 0.8088290095329285, "learning_rate": 8.801313840445408e-05, "loss": 0.1051, "step": 15830 }, { "grad_norm": 0.8234735727310181, "learning_rate": 8.799523063317524e-05, "loss": 0.1109, "step": 15840 }, { "grad_norm": 0.5688288807868958, "learning_rate": 8.797731131994854e-05, "loss": 0.0836, "step": 15850 }, { "grad_norm": 0.7051399946212769, "learning_rate": 8.795938047021739e-05, "loss": 0.0772, "step": 15860 }, { "grad_norm": 0.6419736742973328, "learning_rate": 8.794143808942872e-05, "loss": 0.0653, "step": 15870 }, { "grad_norm": 0.5638424754142761, "learning_rate": 8.792348418303296e-05, "loss": 0.0867, "step": 15880 }, { "grad_norm": 0.4943941831588745, "learning_rate": 8.790551875648398e-05, "loss": 0.0727, "step": 15890 }, { "grad_norm": 0.9752152562141418, "learning_rate": 8.788754181523926e-05, "loss": 0.0794, "step": 15900 }, { "grad_norm": 0.7800365686416626, "learning_rate": 8.78695533647597e-05, "loss": 0.0716, "step": 15910 }, { "grad_norm": 0.7163779735565186, "learning_rate": 8.785155341050972e-05, "loss": 0.0823, "step": 15920 }, { "grad_norm": 0.5775902271270752, "learning_rate": 8.783354195795721e-05, "loss": 0.0743, "step": 15930 }, { "grad_norm": 0.6866976618766785, "learning_rate": 8.78155190125736e-05, "loss": 0.0771, "step": 15940 }, { "grad_norm": 0.7832014560699463, "learning_rate": 8.779748457983378e-05, "loss": 0.0819, "step": 15950 }, { "grad_norm": 0.8526631593704224, "learning_rate": 8.777943866521612e-05, "loss": 0.0911, "step": 15960 }, { "grad_norm": 0.6923834681510925, "learning_rate": 8.77613812742025e-05, "loss": 0.0669, "step": 15970 }, { "grad_norm": 0.7832556366920471, "learning_rate": 8.774331241227829e-05, "loss": 0.0824, "step": 15980 }, { "grad_norm": 0.5644892454147339, "learning_rate": 8.772523208493232e-05, "loss": 0.0834, "step": 15990 }, { "grad_norm": 0.6610400080680847, "learning_rate": 8.770714029765692e-05, "loss": 0.072, "step": 16000 }, { "grad_norm": 0.6642505526542664, "learning_rate": 8.768903705594789e-05, "loss": 0.0626, "step": 16010 }, { "grad_norm": 0.531834602355957, "learning_rate": 8.767092236530453e-05, "loss": 0.0647, "step": 16020 }, { "grad_norm": 1.123783826828003, "learning_rate": 8.76527962312296e-05, "loss": 0.0864, "step": 16030 }, { "grad_norm": 0.9823497533798218, "learning_rate": 8.763465865922934e-05, "loss": 0.0725, "step": 16040 }, { "grad_norm": 0.6602157950401306, "learning_rate": 8.761650965481347e-05, "loss": 0.0734, "step": 16050 }, { "grad_norm": 0.8621131777763367, "learning_rate": 8.759834922349516e-05, "loss": 0.081, "step": 16060 }, { "grad_norm": 0.9087852239608765, "learning_rate": 8.758017737079108e-05, "loss": 0.0699, "step": 16070 }, { "grad_norm": 0.6122088432312012, "learning_rate": 8.756199410222137e-05, "loss": 0.0697, "step": 16080 }, { "grad_norm": 0.5136193633079529, "learning_rate": 8.754379942330963e-05, "loss": 0.0753, "step": 16090 }, { "grad_norm": 0.5751240849494934, "learning_rate": 8.75255933395829e-05, "loss": 0.0763, "step": 16100 }, { "grad_norm": 0.8228462338447571, "learning_rate": 8.750737585657171e-05, "loss": 0.0716, "step": 16110 }, { "grad_norm": 0.8120549321174622, "learning_rate": 8.748914697981008e-05, "loss": 0.0667, "step": 16120 }, { "grad_norm": 0.8640583157539368, "learning_rate": 8.747090671483542e-05, "loss": 0.0853, "step": 16130 }, { "grad_norm": 1.0116454362869263, "learning_rate": 8.745265506718869e-05, "loss": 0.0753, "step": 16140 }, { "grad_norm": 0.9195190072059631, "learning_rate": 8.74343920424142e-05, "loss": 0.102, "step": 16150 }, { "grad_norm": 0.7043737769126892, "learning_rate": 8.741611764605982e-05, "loss": 0.0889, "step": 16160 }, { "grad_norm": 0.7547170519828796, "learning_rate": 8.739783188367682e-05, "loss": 0.0935, "step": 16170 }, { "grad_norm": 0.6852645874023438, "learning_rate": 8.737953476081991e-05, "loss": 0.0781, "step": 16180 }, { "grad_norm": 0.7127507925033569, "learning_rate": 8.73612262830473e-05, "loss": 0.1075, "step": 16190 }, { "grad_norm": 0.517964243888855, "learning_rate": 8.734290645592061e-05, "loss": 0.0691, "step": 16200 }, { "grad_norm": 0.45784810185432434, "learning_rate": 8.732457528500493e-05, "loss": 0.0959, "step": 16210 }, { "grad_norm": 0.8660680651664734, "learning_rate": 8.730623277586875e-05, "loss": 0.0684, "step": 16220 }, { "grad_norm": 0.6944331526756287, "learning_rate": 8.72878789340841e-05, "loss": 0.0772, "step": 16230 }, { "grad_norm": 0.5173137784004211, "learning_rate": 8.726951376522635e-05, "loss": 0.0696, "step": 16240 }, { "grad_norm": 0.6863482594490051, "learning_rate": 8.725113727487435e-05, "loss": 0.0711, "step": 16250 }, { "grad_norm": 0.8157569169998169, "learning_rate": 8.723274946861042e-05, "loss": 0.1055, "step": 16260 }, { "grad_norm": 0.7558162212371826, "learning_rate": 8.721435035202026e-05, "loss": 0.1022, "step": 16270 }, { "grad_norm": 0.6100829839706421, "learning_rate": 8.719593993069306e-05, "loss": 0.0734, "step": 16280 }, { "grad_norm": 0.7190330028533936, "learning_rate": 8.717751821022139e-05, "loss": 0.0644, "step": 16290 }, { "grad_norm": 0.8423972129821777, "learning_rate": 8.715908519620134e-05, "loss": 0.0888, "step": 16300 }, { "grad_norm": 0.765821635723114, "learning_rate": 8.71406408942323e-05, "loss": 0.0936, "step": 16310 }, { "grad_norm": 0.7157899737358093, "learning_rate": 8.712218530991723e-05, "loss": 0.0763, "step": 16320 }, { "grad_norm": 0.5196389555931091, "learning_rate": 8.710371844886241e-05, "loss": 0.0858, "step": 16330 }, { "grad_norm": 1.036952018737793, "learning_rate": 8.708524031667758e-05, "loss": 0.083, "step": 16340 }, { "grad_norm": 0.5238617062568665, "learning_rate": 8.706675091897592e-05, "loss": 0.0772, "step": 16350 }, { "grad_norm": 0.5001530647277832, "learning_rate": 8.704825026137404e-05, "loss": 0.0681, "step": 16360 }, { "grad_norm": 0.847054660320282, "learning_rate": 8.702973834949192e-05, "loss": 0.0636, "step": 16370 }, { "grad_norm": 0.6358325481414795, "learning_rate": 8.701121518895301e-05, "loss": 0.0891, "step": 16380 }, { "grad_norm": 0.9520848989486694, "learning_rate": 8.699268078538414e-05, "loss": 0.0789, "step": 16390 }, { "grad_norm": 0.5865267515182495, "learning_rate": 8.69741351444156e-05, "loss": 0.0816, "step": 16400 }, { "grad_norm": 0.6082649827003479, "learning_rate": 8.695557827168101e-05, "loss": 0.0728, "step": 16410 }, { "grad_norm": 0.6793395280838013, "learning_rate": 8.693701017281753e-05, "loss": 0.0615, "step": 16420 }, { "grad_norm": 0.8518043756484985, "learning_rate": 8.691843085346563e-05, "loss": 0.0879, "step": 16430 }, { "grad_norm": 0.7567982077598572, "learning_rate": 8.689984031926919e-05, "loss": 0.0751, "step": 16440 }, { "grad_norm": 0.5985616445541382, "learning_rate": 8.688123857587555e-05, "loss": 0.0829, "step": 16450 }, { "grad_norm": 0.7106831669807434, "learning_rate": 8.686262562893544e-05, "loss": 0.0803, "step": 16460 }, { "grad_norm": 0.8833368420600891, "learning_rate": 8.684400148410294e-05, "loss": 0.0916, "step": 16470 }, { "grad_norm": 0.6364809274673462, "learning_rate": 8.682536614703562e-05, "loss": 0.0797, "step": 16480 }, { "grad_norm": 0.9657999277114868, "learning_rate": 8.680671962339437e-05, "loss": 0.0773, "step": 16490 }, { "grad_norm": 0.8166150450706482, "learning_rate": 8.678806191884352e-05, "loss": 0.0813, "step": 16500 }, { "grad_norm": 0.8726226091384888, "learning_rate": 8.67693930390508e-05, "loss": 0.0638, "step": 16510 }, { "grad_norm": 0.47871068120002747, "learning_rate": 8.67507129896873e-05, "loss": 0.0754, "step": 16520 }, { "grad_norm": 0.5977247357368469, "learning_rate": 8.673202177642757e-05, "loss": 0.0715, "step": 16530 }, { "grad_norm": 0.9510893225669861, "learning_rate": 8.671331940494945e-05, "loss": 0.0734, "step": 16540 }, { "grad_norm": 0.5705345273017883, "learning_rate": 8.669460588093427e-05, "loss": 0.0848, "step": 16550 }, { "grad_norm": 0.8139265179634094, "learning_rate": 8.667588121006667e-05, "loss": 0.1046, "step": 16560 }, { "grad_norm": 0.9831705689430237, "learning_rate": 8.665714539803475e-05, "loss": 0.0679, "step": 16570 }, { "grad_norm": 0.5587022304534912, "learning_rate": 8.663839845052993e-05, "loss": 0.0778, "step": 16580 }, { "grad_norm": 0.9718136191368103, "learning_rate": 8.661964037324703e-05, "loss": 0.0837, "step": 16590 }, { "grad_norm": 0.8317979574203491, "learning_rate": 8.660087117188427e-05, "loss": 0.0957, "step": 16600 }, { "grad_norm": 0.84015291929245, "learning_rate": 8.658209085214325e-05, "loss": 0.0729, "step": 16610 }, { "grad_norm": 0.8070387840270996, "learning_rate": 8.656329941972891e-05, "loss": 0.0794, "step": 16620 }, { "grad_norm": 0.6806045770645142, "learning_rate": 8.654449688034963e-05, "loss": 0.0693, "step": 16630 }, { "grad_norm": 0.6998480558395386, "learning_rate": 8.652568323971706e-05, "loss": 0.0543, "step": 16640 }, { "grad_norm": 0.5973631739616394, "learning_rate": 8.650685850354636e-05, "loss": 0.0699, "step": 16650 }, { "grad_norm": 0.6941776275634766, "learning_rate": 8.648802267755593e-05, "loss": 0.0692, "step": 16660 }, { "grad_norm": 0.7596254348754883, "learning_rate": 8.646917576746764e-05, "loss": 0.0669, "step": 16670 }, { "grad_norm": 0.8699783086776733, "learning_rate": 8.645031777900666e-05, "loss": 0.0706, "step": 16680 }, { "grad_norm": 0.6108158230781555, "learning_rate": 8.643144871790154e-05, "loss": 0.0823, "step": 16690 }, { "grad_norm": 0.649633526802063, "learning_rate": 8.641256858988424e-05, "loss": 0.0675, "step": 16700 }, { "grad_norm": 0.5115476250648499, "learning_rate": 8.639367740069e-05, "loss": 0.0723, "step": 16710 }, { "grad_norm": 0.7403807044029236, "learning_rate": 8.63747751560575e-05, "loss": 0.0808, "step": 16720 }, { "grad_norm": 0.7216556668281555, "learning_rate": 8.635586186172871e-05, "loss": 0.0551, "step": 16730 }, { "grad_norm": 0.5666940808296204, "learning_rate": 8.633693752344902e-05, "loss": 0.0732, "step": 16740 }, { "grad_norm": 0.5483543872833252, "learning_rate": 8.631800214696713e-05, "loss": 0.0712, "step": 16750 }, { "grad_norm": 0.6838043332099915, "learning_rate": 8.629905573803511e-05, "loss": 0.0834, "step": 16760 }, { "grad_norm": 0.6555301547050476, "learning_rate": 8.628009830240839e-05, "loss": 0.0716, "step": 16770 }, { "grad_norm": 0.5582286715507507, "learning_rate": 8.626112984584571e-05, "loss": 0.0743, "step": 16780 }, { "grad_norm": 0.5496573448181152, "learning_rate": 8.62421503741092e-05, "loss": 0.0639, "step": 16790 }, { "grad_norm": 0.5194265842437744, "learning_rate": 8.622315989296432e-05, "loss": 0.0794, "step": 16800 }, { "grad_norm": 0.8341193795204163, "learning_rate": 8.62041584081799e-05, "loss": 0.0871, "step": 16810 }, { "grad_norm": 0.721199631690979, "learning_rate": 8.618514592552807e-05, "loss": 0.0726, "step": 16820 }, { "grad_norm": 0.5691670179367065, "learning_rate": 8.616612245078431e-05, "loss": 0.0646, "step": 16830 }, { "grad_norm": 0.5854786038398743, "learning_rate": 8.614708798972746e-05, "loss": 0.0657, "step": 16840 }, { "grad_norm": 0.6420395374298096, "learning_rate": 8.61280425481397e-05, "loss": 0.0517, "step": 16850 }, { "grad_norm": 0.7021713256835938, "learning_rate": 8.61089861318065e-05, "loss": 0.0836, "step": 16860 }, { "grad_norm": 0.5344057083129883, "learning_rate": 8.608991874651673e-05, "loss": 0.0745, "step": 16870 }, { "grad_norm": 0.6424691677093506, "learning_rate": 8.607084039806255e-05, "loss": 0.0685, "step": 16880 }, { "grad_norm": 0.8813813924789429, "learning_rate": 8.605175109223944e-05, "loss": 0.0785, "step": 16890 }, { "grad_norm": 0.46196499466896057, "learning_rate": 8.603265083484624e-05, "loss": 0.0618, "step": 16900 }, { "grad_norm": 0.6111968159675598, "learning_rate": 8.60135396316851e-05, "loss": 0.0679, "step": 16910 }, { "grad_norm": 0.4323829114437103, "learning_rate": 8.599441748856152e-05, "loss": 0.0773, "step": 16920 }, { "grad_norm": 0.7031299471855164, "learning_rate": 8.597528441128427e-05, "loss": 0.072, "step": 16930 }, { "grad_norm": 0.6895526051521301, "learning_rate": 8.595614040566549e-05, "loss": 0.0897, "step": 16940 }, { "grad_norm": 0.6644770503044128, "learning_rate": 8.593698547752063e-05, "loss": 0.0739, "step": 16950 }, { "grad_norm": 0.6755679249763489, "learning_rate": 8.591781963266843e-05, "loss": 0.0789, "step": 16960 }, { "grad_norm": 0.8465325832366943, "learning_rate": 8.5898642876931e-05, "loss": 0.0972, "step": 16970 }, { "grad_norm": 0.6566012501716614, "learning_rate": 8.587945521613369e-05, "loss": 0.0799, "step": 16980 }, { "grad_norm": 0.6591241955757141, "learning_rate": 8.586025665610524e-05, "loss": 0.0892, "step": 16990 }, { "grad_norm": 0.7525947093963623, "learning_rate": 8.584104720267765e-05, "loss": 0.0877, "step": 17000 }, { "grad_norm": 0.5044746994972229, "learning_rate": 8.582182686168625e-05, "loss": 0.0763, "step": 17010 }, { "grad_norm": 1.0132057666778564, "learning_rate": 8.580259563896967e-05, "loss": 0.076, "step": 17020 }, { "grad_norm": 0.6495361328125, "learning_rate": 8.578335354036983e-05, "loss": 0.0654, "step": 17030 }, { "grad_norm": 0.6298885941505432, "learning_rate": 8.576410057173201e-05, "loss": 0.0689, "step": 17040 }, { "grad_norm": 0.9964661598205566, "learning_rate": 8.574483673890474e-05, "loss": 0.0657, "step": 17050 }, { "grad_norm": 0.6233921647071838, "learning_rate": 8.572556204773983e-05, "loss": 0.0804, "step": 17060 }, { "grad_norm": 0.5122942328453064, "learning_rate": 8.570627650409246e-05, "loss": 0.0713, "step": 17070 }, { "grad_norm": 0.5374937057495117, "learning_rate": 8.568698011382107e-05, "loss": 0.0745, "step": 17080 }, { "grad_norm": 0.644218921661377, "learning_rate": 8.566767288278738e-05, "loss": 0.0723, "step": 17090 }, { "grad_norm": 0.8014484643936157, "learning_rate": 8.56483548168564e-05, "loss": 0.0741, "step": 17100 }, { "grad_norm": 0.7557458877563477, "learning_rate": 8.562902592189648e-05, "loss": 0.0683, "step": 17110 }, { "grad_norm": 0.7441030740737915, "learning_rate": 8.560968620377921e-05, "loss": 0.0764, "step": 17120 }, { "grad_norm": 0.7892253994941711, "learning_rate": 8.559033566837951e-05, "loss": 0.107, "step": 17130 }, { "grad_norm": 0.6004026532173157, "learning_rate": 8.557097432157551e-05, "loss": 0.0732, "step": 17140 }, { "grad_norm": 0.46048828959465027, "learning_rate": 8.555160216924872e-05, "loss": 0.0602, "step": 17150 }, { "grad_norm": 0.6072617769241333, "learning_rate": 8.55322192172839e-05, "loss": 0.0832, "step": 17160 }, { "grad_norm": 0.9720436334609985, "learning_rate": 8.551282547156902e-05, "loss": 0.089, "step": 17170 }, { "grad_norm": 0.7633448243141174, "learning_rate": 8.549342093799544e-05, "loss": 0.072, "step": 17180 }, { "grad_norm": 0.5015411376953125, "learning_rate": 8.547400562245773e-05, "loss": 0.056, "step": 17190 }, { "grad_norm": 0.6732174158096313, "learning_rate": 8.545457953085374e-05, "loss": 0.0749, "step": 17200 }, { "grad_norm": 0.5280580520629883, "learning_rate": 8.543514266908463e-05, "loss": 0.0683, "step": 17210 }, { "grad_norm": 0.9202122688293457, "learning_rate": 8.541569504305478e-05, "loss": 0.075, "step": 17220 }, { "grad_norm": 1.6254152059555054, "learning_rate": 8.539623665867187e-05, "loss": 0.0802, "step": 17230 }, { "grad_norm": 0.5883209109306335, "learning_rate": 8.537676752184685e-05, "loss": 0.0878, "step": 17240 }, { "grad_norm": 0.8054414391517639, "learning_rate": 8.53572876384939e-05, "loss": 0.079, "step": 17250 }, { "grad_norm": 0.6838610172271729, "learning_rate": 8.533779701453056e-05, "loss": 0.0872, "step": 17260 }, { "grad_norm": 0.7963960766792297, "learning_rate": 8.53182956558775e-05, "loss": 0.0773, "step": 17270 }, { "grad_norm": 0.6611281037330627, "learning_rate": 8.529878356845877e-05, "loss": 0.0918, "step": 17280 }, { "grad_norm": 0.598971426486969, "learning_rate": 8.527926075820158e-05, "loss": 0.0664, "step": 17290 }, { "grad_norm": 0.9163659811019897, "learning_rate": 8.525972723103648e-05, "loss": 0.0854, "step": 17300 }, { "grad_norm": 0.637027382850647, "learning_rate": 8.524018299289722e-05, "loss": 0.0652, "step": 17310 }, { "grad_norm": 0.5070129632949829, "learning_rate": 8.522062804972083e-05, "loss": 0.0628, "step": 17320 }, { "grad_norm": 0.46604111790657043, "learning_rate": 8.520106240744759e-05, "loss": 0.0589, "step": 17330 }, { "grad_norm": 0.7520110011100769, "learning_rate": 8.518148607202102e-05, "loss": 0.0672, "step": 17340 }, { "grad_norm": 0.7830360531806946, "learning_rate": 8.51618990493879e-05, "loss": 0.0659, "step": 17350 }, { "grad_norm": 0.8430642485618591, "learning_rate": 8.514230134549823e-05, "loss": 0.0783, "step": 17360 }, { "grad_norm": 0.7021442651748657, "learning_rate": 8.51226929663053e-05, "loss": 0.0555, "step": 17370 }, { "grad_norm": 0.6530439257621765, "learning_rate": 8.51030739177656e-05, "loss": 0.0624, "step": 17380 }, { "grad_norm": 0.7905057668685913, "learning_rate": 8.508344420583889e-05, "loss": 0.07, "step": 17390 }, { "grad_norm": 0.7060890197753906, "learning_rate": 8.506380383648816e-05, "loss": 0.0669, "step": 17400 }, { "grad_norm": 0.5311899781227112, "learning_rate": 8.504415281567963e-05, "loss": 0.0767, "step": 17410 }, { "grad_norm": 0.6847178339958191, "learning_rate": 8.502449114938275e-05, "loss": 0.0948, "step": 17420 }, { "grad_norm": 0.4662735164165497, "learning_rate": 8.500481884357025e-05, "loss": 0.0617, "step": 17430 }, { "grad_norm": 1.1125587224960327, "learning_rate": 8.498513590421801e-05, "loss": 0.1102, "step": 17440 }, { "grad_norm": 0.9330596923828125, "learning_rate": 8.496544233730522e-05, "loss": 0.0715, "step": 17450 }, { "grad_norm": 0.4781123101711273, "learning_rate": 8.494573814881426e-05, "loss": 0.0655, "step": 17460 }, { "grad_norm": 0.6192917823791504, "learning_rate": 8.492602334473074e-05, "loss": 0.0789, "step": 17470 }, { "grad_norm": 0.8039998412132263, "learning_rate": 8.49062979310435e-05, "loss": 0.0646, "step": 17480 }, { "grad_norm": 0.6127128601074219, "learning_rate": 8.488656191374458e-05, "loss": 0.07, "step": 17490 }, { "grad_norm": 0.49162882566452026, "learning_rate": 8.48668152988293e-05, "loss": 0.0821, "step": 17500 }, { "grad_norm": 0.5245164036750793, "learning_rate": 8.484705809229612e-05, "loss": 0.0533, "step": 17510 }, { "grad_norm": 0.896036684513092, "learning_rate": 8.482729030014677e-05, "loss": 0.0805, "step": 17520 }, { "grad_norm": 0.7045820951461792, "learning_rate": 8.48075119283862e-05, "loss": 0.0786, "step": 17530 }, { "grad_norm": 0.719273567199707, "learning_rate": 8.478772298302254e-05, "loss": 0.0884, "step": 17540 }, { "grad_norm": 0.8307862877845764, "learning_rate": 8.476792347006716e-05, "loss": 0.0616, "step": 17550 }, { "grad_norm": 0.5477255582809448, "learning_rate": 8.474811339553462e-05, "loss": 0.0985, "step": 17560 }, { "grad_norm": 0.7828502058982849, "learning_rate": 8.47282927654427e-05, "loss": 0.0759, "step": 17570 }, { "grad_norm": 0.696103036403656, "learning_rate": 8.470846158581238e-05, "loss": 0.0577, "step": 17580 }, { "grad_norm": 0.8828123807907104, "learning_rate": 8.468861986266787e-05, "loss": 0.0658, "step": 17590 }, { "grad_norm": 0.5588876008987427, "learning_rate": 8.466876760203654e-05, "loss": 0.0675, "step": 17600 }, { "grad_norm": 0.7462535500526428, "learning_rate": 8.464890480994898e-05, "loss": 0.0668, "step": 17610 }, { "grad_norm": 0.6963565349578857, "learning_rate": 8.462903149243899e-05, "loss": 0.0674, "step": 17620 }, { "grad_norm": 0.6277569532394409, "learning_rate": 8.460914765554357e-05, "loss": 0.0607, "step": 17630 }, { "grad_norm": 0.8646305799484253, "learning_rate": 8.458925330530288e-05, "loss": 0.0779, "step": 17640 }, { "grad_norm": 0.5729181170463562, "learning_rate": 8.456934844776032e-05, "loss": 0.0782, "step": 17650 }, { "grad_norm": 0.6947152018547058, "learning_rate": 8.454943308896246e-05, "loss": 0.0736, "step": 17660 }, { "grad_norm": 0.7301270961761475, "learning_rate": 8.452950723495905e-05, "loss": 0.074, "step": 17670 }, { "grad_norm": 0.8765786290168762, "learning_rate": 8.450957089180303e-05, "loss": 0.0753, "step": 17680 }, { "grad_norm": 0.620013415813446, "learning_rate": 8.448962406555055e-05, "loss": 0.0634, "step": 17690 }, { "grad_norm": 0.5551254153251648, "learning_rate": 8.446966676226093e-05, "loss": 0.0613, "step": 17700 }, { "grad_norm": 0.5267893671989441, "learning_rate": 8.444969898799667e-05, "loss": 0.0742, "step": 17710 }, { "grad_norm": 0.45019176602363586, "learning_rate": 8.442972074882343e-05, "loss": 0.0662, "step": 17720 }, { "grad_norm": 0.6380565166473389, "learning_rate": 8.44097320508101e-05, "loss": 0.0675, "step": 17730 }, { "grad_norm": 0.7419494390487671, "learning_rate": 8.43897329000287e-05, "loss": 0.0718, "step": 17740 }, { "grad_norm": 0.6664040088653564, "learning_rate": 8.436972330255448e-05, "loss": 0.0918, "step": 17750 }, { "grad_norm": 0.4614785611629486, "learning_rate": 8.434970326446579e-05, "loss": 0.064, "step": 17760 }, { "grad_norm": 0.6190007328987122, "learning_rate": 8.432967279184418e-05, "loss": 0.0809, "step": 17770 }, { "grad_norm": 0.4707062542438507, "learning_rate": 8.430963189077441e-05, "loss": 0.0537, "step": 17780 }, { "grad_norm": 0.6942821741104126, "learning_rate": 8.428958056734437e-05, "loss": 0.0715, "step": 17790 }, { "grad_norm": 0.6141427159309387, "learning_rate": 8.426951882764513e-05, "loss": 0.0689, "step": 17800 }, { "grad_norm": 0.6711402535438538, "learning_rate": 8.424944667777089e-05, "loss": 0.0685, "step": 17810 }, { "grad_norm": 0.5399754643440247, "learning_rate": 8.422936412381905e-05, "loss": 0.0608, "step": 17820 }, { "grad_norm": 0.8833020329475403, "learning_rate": 8.420927117189017e-05, "loss": 0.0915, "step": 17830 }, { "grad_norm": 0.620413601398468, "learning_rate": 8.418916782808795e-05, "loss": 0.0855, "step": 17840 }, { "grad_norm": 0.6537781953811646, "learning_rate": 8.416905409851926e-05, "loss": 0.0618, "step": 17850 }, { "grad_norm": 0.6815183162689209, "learning_rate": 8.41489299892941e-05, "loss": 0.0712, "step": 17860 }, { "grad_norm": 0.6252518892288208, "learning_rate": 8.412879550652566e-05, "loss": 0.0573, "step": 17870 }, { "grad_norm": 0.8491523861885071, "learning_rate": 8.410865065633029e-05, "loss": 0.0854, "step": 17880 }, { "grad_norm": 0.5943813920021057, "learning_rate": 8.408849544482742e-05, "loss": 0.0779, "step": 17890 }, { "grad_norm": 0.6469469666481018, "learning_rate": 8.406832987813968e-05, "loss": 0.0745, "step": 17900 }, { "grad_norm": 0.4374494254589081, "learning_rate": 8.404815396239286e-05, "loss": 0.0551, "step": 17910 }, { "grad_norm": 0.4658002257347107, "learning_rate": 8.402796770371587e-05, "loss": 0.0629, "step": 17920 }, { "grad_norm": 0.317950576543808, "learning_rate": 8.400777110824071e-05, "loss": 0.0584, "step": 17930 }, { "grad_norm": 0.4415615200996399, "learning_rate": 8.398756418210263e-05, "loss": 0.0525, "step": 17940 }, { "grad_norm": 0.7139857411384583, "learning_rate": 8.396734693143993e-05, "loss": 0.0644, "step": 17950 }, { "grad_norm": 0.9052291512489319, "learning_rate": 8.39471193623941e-05, "loss": 0.0708, "step": 17960 }, { "grad_norm": 0.7059046626091003, "learning_rate": 8.392688148110974e-05, "loss": 0.0905, "step": 17970 }, { "grad_norm": 0.48555445671081543, "learning_rate": 8.390663329373456e-05, "loss": 0.0996, "step": 17980 }, { "grad_norm": 0.8015718460083008, "learning_rate": 8.388637480641944e-05, "loss": 0.0869, "step": 17990 }, { "grad_norm": 0.6670337319374084, "learning_rate": 8.386610602531837e-05, "loss": 0.07, "step": 18000 }, { "grad_norm": 0.7140462398529053, "learning_rate": 8.384582695658847e-05, "loss": 0.0625, "step": 18010 }, { "grad_norm": 0.6925010085105896, "learning_rate": 8.382553760638999e-05, "loss": 0.0652, "step": 18020 }, { "grad_norm": 0.41882166266441345, "learning_rate": 8.380523798088631e-05, "loss": 0.0613, "step": 18030 }, { "grad_norm": 0.728325366973877, "learning_rate": 8.378492808624389e-05, "loss": 0.0867, "step": 18040 }, { "grad_norm": 0.8296850919723511, "learning_rate": 8.376460792863237e-05, "loss": 0.0913, "step": 18050 }, { "grad_norm": 0.4838344156742096, "learning_rate": 8.374427751422444e-05, "loss": 0.0775, "step": 18060 }, { "grad_norm": 0.6804161667823792, "learning_rate": 8.3723936849196e-05, "loss": 0.0695, "step": 18070 }, { "grad_norm": 0.3769875168800354, "learning_rate": 8.370358593972595e-05, "loss": 0.0701, "step": 18080 }, { "grad_norm": 0.6862947940826416, "learning_rate": 8.36832247919964e-05, "loss": 0.0753, "step": 18090 }, { "grad_norm": 0.6166612505912781, "learning_rate": 8.36628534121925e-05, "loss": 0.0726, "step": 18100 }, { "grad_norm": 0.6253165006637573, "learning_rate": 8.364247180650254e-05, "loss": 0.0768, "step": 18110 }, { "grad_norm": 0.5244354009628296, "learning_rate": 8.362207998111794e-05, "loss": 0.0901, "step": 18120 }, { "grad_norm": 0.510158121585846, "learning_rate": 8.360167794223318e-05, "loss": 0.0529, "step": 18130 }, { "grad_norm": 0.5913280248641968, "learning_rate": 8.358126569604586e-05, "loss": 0.0665, "step": 18140 }, { "grad_norm": 0.5083522796630859, "learning_rate": 8.356084324875668e-05, "loss": 0.0747, "step": 18150 }, { "grad_norm": 0.5507321357727051, "learning_rate": 8.354041060656945e-05, "loss": 0.0683, "step": 18160 }, { "grad_norm": 0.7145259976387024, "learning_rate": 8.351996777569106e-05, "loss": 0.0813, "step": 18170 }, { "grad_norm": 0.6007261276245117, "learning_rate": 8.349951476233148e-05, "loss": 0.073, "step": 18180 }, { "grad_norm": 0.8927158713340759, "learning_rate": 8.347905157270386e-05, "loss": 0.0899, "step": 18190 }, { "grad_norm": 0.7059577107429504, "learning_rate": 8.345857821302432e-05, "loss": 0.0753, "step": 18200 }, { "grad_norm": 0.7000967860221863, "learning_rate": 8.343809468951213e-05, "loss": 0.0852, "step": 18210 }, { "grad_norm": 0.9205093383789062, "learning_rate": 8.341760100838965e-05, "loss": 0.0707, "step": 18220 }, { "grad_norm": 0.44144248962402344, "learning_rate": 8.339709717588233e-05, "loss": 0.0703, "step": 18230 }, { "grad_norm": 0.7732656002044678, "learning_rate": 8.33765831982187e-05, "loss": 0.0847, "step": 18240 }, { "grad_norm": 0.6567316055297852, "learning_rate": 8.335605908163035e-05, "loss": 0.0821, "step": 18250 }, { "grad_norm": 0.6816288232803345, "learning_rate": 8.333552483235196e-05, "loss": 0.0695, "step": 18260 }, { "grad_norm": 0.6377179622650146, "learning_rate": 8.33149804566213e-05, "loss": 0.0671, "step": 18270 }, { "grad_norm": 0.633642852306366, "learning_rate": 8.329442596067921e-05, "loss": 0.0591, "step": 18280 }, { "grad_norm": 0.8194890022277832, "learning_rate": 8.32738613507696e-05, "loss": 0.091, "step": 18290 }, { "grad_norm": 0.6855754852294922, "learning_rate": 8.325328663313946e-05, "loss": 0.0783, "step": 18300 }, { "grad_norm": 0.7769573330879211, "learning_rate": 8.323270181403884e-05, "loss": 0.0635, "step": 18310 }, { "grad_norm": 0.7017245888710022, "learning_rate": 8.321210689972086e-05, "loss": 0.0732, "step": 18320 }, { "grad_norm": 0.73322594165802, "learning_rate": 8.319150189644174e-05, "loss": 0.066, "step": 18330 }, { "grad_norm": 0.850037693977356, "learning_rate": 8.31708868104607e-05, "loss": 0.0711, "step": 18340 }, { "grad_norm": 0.6666229367256165, "learning_rate": 8.315026164804007e-05, "loss": 0.0681, "step": 18350 }, { "grad_norm": 0.538086473941803, "learning_rate": 8.312962641544524e-05, "loss": 0.0588, "step": 18360 }, { "grad_norm": 0.6492592692375183, "learning_rate": 8.310898111894465e-05, "loss": 0.054, "step": 18370 }, { "grad_norm": 0.602301836013794, "learning_rate": 8.308832576480977e-05, "loss": 0.0509, "step": 18380 }, { "grad_norm": 1.2094069719314575, "learning_rate": 8.306766035931519e-05, "loss": 0.0791, "step": 18390 }, { "grad_norm": 0.6771414875984192, "learning_rate": 8.304698490873847e-05, "loss": 0.0697, "step": 18400 }, { "grad_norm": 0.8157739043235779, "learning_rate": 8.30262994193603e-05, "loss": 0.0788, "step": 18410 }, { "grad_norm": 0.8124664425849915, "learning_rate": 8.300560389746438e-05, "loss": 0.0868, "step": 18420 }, { "grad_norm": 0.5529941320419312, "learning_rate": 8.298489834933745e-05, "loss": 0.0935, "step": 18430 }, { "grad_norm": 0.5383126735687256, "learning_rate": 8.296418278126934e-05, "loss": 0.0741, "step": 18440 }, { "grad_norm": 0.5744771957397461, "learning_rate": 8.294345719955284e-05, "loss": 0.0684, "step": 18450 }, { "grad_norm": 0.8459459543228149, "learning_rate": 8.29227216104839e-05, "loss": 0.0788, "step": 18460 }, { "grad_norm": 0.5898786187171936, "learning_rate": 8.290197602036137e-05, "loss": 0.0549, "step": 18470 }, { "grad_norm": 0.595887303352356, "learning_rate": 8.288122043548725e-05, "loss": 0.0873, "step": 18480 }, { "grad_norm": 0.5601022243499756, "learning_rate": 8.286045486216657e-05, "loss": 0.0645, "step": 18490 }, { "grad_norm": 0.5243886113166809, "learning_rate": 8.283967930670733e-05, "loss": 0.0718, "step": 18500 }, { "grad_norm": 0.477109432220459, "learning_rate": 8.281889377542058e-05, "loss": 0.0742, "step": 18510 }, { "grad_norm": 0.4588690996170044, "learning_rate": 8.279809827462045e-05, "loss": 0.066, "step": 18520 }, { "grad_norm": 0.7151319980621338, "learning_rate": 8.277729281062402e-05, "loss": 0.071, "step": 18530 }, { "grad_norm": 0.5172391533851624, "learning_rate": 8.27564773897515e-05, "loss": 0.076, "step": 18540 }, { "grad_norm": 0.6456280946731567, "learning_rate": 8.273565201832602e-05, "loss": 0.0604, "step": 18550 }, { "grad_norm": 0.6082733273506165, "learning_rate": 8.27148167026738e-05, "loss": 0.0764, "step": 18560 }, { "grad_norm": 0.6612033843994141, "learning_rate": 8.269397144912405e-05, "loss": 0.08, "step": 18570 }, { "grad_norm": 1.0896947383880615, "learning_rate": 8.267311626400899e-05, "loss": 0.0819, "step": 18580 }, { "grad_norm": 0.7372398972511292, "learning_rate": 8.26522511536639e-05, "loss": 0.0629, "step": 18590 }, { "grad_norm": 0.5780072808265686, "learning_rate": 8.263137612442706e-05, "loss": 0.0896, "step": 18600 }, { "grad_norm": 0.8477794528007507, "learning_rate": 8.261049118263971e-05, "loss": 0.0714, "step": 18610 }, { "grad_norm": 0.7064101099967957, "learning_rate": 8.258959633464619e-05, "loss": 0.0669, "step": 18620 }, { "grad_norm": 0.6858994364738464, "learning_rate": 8.256869158679377e-05, "loss": 0.083, "step": 18630 }, { "grad_norm": 0.9047589302062988, "learning_rate": 8.254777694543278e-05, "loss": 0.0961, "step": 18640 }, { "grad_norm": 0.9138543009757996, "learning_rate": 8.252685241691651e-05, "loss": 0.079, "step": 18650 }, { "grad_norm": 1.0378787517547607, "learning_rate": 8.250591800760133e-05, "loss": 0.074, "step": 18660 }, { "grad_norm": 0.670203447341919, "learning_rate": 8.248497372384649e-05, "loss": 0.0909, "step": 18670 }, { "grad_norm": 0.5859519839286804, "learning_rate": 8.246401957201437e-05, "loss": 0.0647, "step": 18680 }, { "grad_norm": 0.7937480807304382, "learning_rate": 8.244305555847027e-05, "loss": 0.0696, "step": 18690 }, { "grad_norm": 0.7387683987617493, "learning_rate": 8.24220816895825e-05, "loss": 0.0603, "step": 18700 }, { "grad_norm": 0.6579334139823914, "learning_rate": 8.240109797172237e-05, "loss": 0.075, "step": 18710 }, { "grad_norm": 0.5827521085739136, "learning_rate": 8.238010441126416e-05, "loss": 0.073, "step": 18720 }, { "grad_norm": 0.8603880405426025, "learning_rate": 8.23591010145852e-05, "loss": 0.0766, "step": 18730 }, { "grad_norm": 0.563439667224884, "learning_rate": 8.233808778806571e-05, "loss": 0.0895, "step": 18740 }, { "grad_norm": 0.5964698195457458, "learning_rate": 8.231706473808903e-05, "loss": 0.098, "step": 18750 }, { "grad_norm": 0.6693893671035767, "learning_rate": 8.229603187104133e-05, "loss": 0.0657, "step": 18760 }, { "grad_norm": 0.7164173126220703, "learning_rate": 8.22749891933119e-05, "loss": 0.0759, "step": 18770 }, { "grad_norm": 0.8697961568832397, "learning_rate": 8.225393671129291e-05, "loss": 0.0648, "step": 18780 }, { "grad_norm": 0.8154591917991638, "learning_rate": 8.223287443137957e-05, "loss": 0.0781, "step": 18790 }, { "grad_norm": 0.8388005495071411, "learning_rate": 8.221180235997004e-05, "loss": 0.0888, "step": 18800 }, { "grad_norm": 0.7544431090354919, "learning_rate": 8.219072050346544e-05, "loss": 0.0808, "step": 18810 }, { "grad_norm": 0.7073110938072205, "learning_rate": 8.216962886826992e-05, "loss": 0.0623, "step": 18820 }, { "grad_norm": 0.9166061878204346, "learning_rate": 8.214852746079054e-05, "loss": 0.0672, "step": 18830 }, { "grad_norm": 0.8703492879867554, "learning_rate": 8.212741628743732e-05, "loss": 0.0739, "step": 18840 }, { "grad_norm": 0.954918384552002, "learning_rate": 8.210629535462333e-05, "loss": 0.0681, "step": 18850 }, { "grad_norm": 0.737109363079071, "learning_rate": 8.208516466876453e-05, "loss": 0.0658, "step": 18860 }, { "grad_norm": 0.5497350096702576, "learning_rate": 8.206402423627986e-05, "loss": 0.0773, "step": 18870 }, { "grad_norm": 0.6637060642242432, "learning_rate": 8.204287406359124e-05, "loss": 0.0647, "step": 18880 }, { "grad_norm": 0.6894040107727051, "learning_rate": 8.20217141571235e-05, "loss": 0.0629, "step": 18890 }, { "grad_norm": 0.8620993494987488, "learning_rate": 8.200054452330449e-05, "loss": 0.07, "step": 18900 }, { "grad_norm": 0.6829098463058472, "learning_rate": 8.197936516856499e-05, "loss": 0.0643, "step": 18910 }, { "grad_norm": 0.6968793869018555, "learning_rate": 8.195817609933871e-05, "loss": 0.068, "step": 18920 }, { "grad_norm": 0.5615761876106262, "learning_rate": 8.193697732206233e-05, "loss": 0.0846, "step": 18930 }, { "grad_norm": 0.5496521592140198, "learning_rate": 8.19157688431755e-05, "loss": 0.1042, "step": 18940 }, { "grad_norm": 1.0804857015609741, "learning_rate": 8.189455066912077e-05, "loss": 0.0787, "step": 18950 }, { "grad_norm": 0.7736998796463013, "learning_rate": 8.187332280634369e-05, "loss": 0.0662, "step": 18960 }, { "grad_norm": 0.614160418510437, "learning_rate": 8.18520852612927e-05, "loss": 0.0565, "step": 18970 }, { "grad_norm": 0.3712298572063446, "learning_rate": 8.183083804041921e-05, "loss": 0.068, "step": 18980 }, { "grad_norm": 0.5334725379943848, "learning_rate": 8.180958115017757e-05, "loss": 0.0669, "step": 18990 }, { "grad_norm": 0.5205767750740051, "learning_rate": 8.178831459702505e-05, "loss": 0.0589, "step": 19000 }, { "grad_norm": 0.6239930987358093, "learning_rate": 8.17670383874219e-05, "loss": 0.0576, "step": 19010 }, { "grad_norm": 0.7175000309944153, "learning_rate": 8.174575252783124e-05, "loss": 0.0817, "step": 19020 }, { "grad_norm": 0.520465612411499, "learning_rate": 8.172445702471914e-05, "loss": 0.0599, "step": 19030 }, { "grad_norm": 0.6231836676597595, "learning_rate": 8.170315188455466e-05, "loss": 0.0772, "step": 19040 }, { "grad_norm": 0.6375508308410645, "learning_rate": 8.168183711380969e-05, "loss": 0.0585, "step": 19050 }, { "grad_norm": 0.6042803525924683, "learning_rate": 8.166051271895913e-05, "loss": 0.0617, "step": 19060 }, { "grad_norm": 0.6941260099411011, "learning_rate": 8.163917870648075e-05, "loss": 0.0671, "step": 19070 }, { "grad_norm": 0.9616419672966003, "learning_rate": 8.161783508285526e-05, "loss": 0.0881, "step": 19080 }, { "grad_norm": 0.6913079023361206, "learning_rate": 8.159648185456628e-05, "loss": 0.0754, "step": 19090 }, { "grad_norm": 0.6445378065109253, "learning_rate": 8.157511902810038e-05, "loss": 0.0608, "step": 19100 }, { "grad_norm": 0.7533419728279114, "learning_rate": 8.155374660994701e-05, "loss": 0.0817, "step": 19110 }, { "grad_norm": 0.5170310139656067, "learning_rate": 8.153236460659857e-05, "loss": 0.0653, "step": 19120 }, { "grad_norm": 0.7603201270103455, "learning_rate": 8.151097302455031e-05, "loss": 0.0697, "step": 19130 }, { "grad_norm": 0.7436984181404114, "learning_rate": 8.148957187030044e-05, "loss": 0.0853, "step": 19140 }, { "grad_norm": 0.6109397411346436, "learning_rate": 8.146816115035006e-05, "loss": 0.0628, "step": 19150 }, { "grad_norm": 0.6687734723091125, "learning_rate": 8.14467408712032e-05, "loss": 0.0668, "step": 19160 }, { "grad_norm": 0.6977255344390869, "learning_rate": 8.142531103936678e-05, "loss": 0.0597, "step": 19170 }, { "grad_norm": 0.496600866317749, "learning_rate": 8.14038716613506e-05, "loss": 0.0731, "step": 19180 }, { "grad_norm": 0.4306409955024719, "learning_rate": 8.138242274366736e-05, "loss": 0.0575, "step": 19190 }, { "grad_norm": 0.8170785307884216, "learning_rate": 8.136096429283271e-05, "loss": 0.0737, "step": 19200 }, { "grad_norm": 0.7045538425445557, "learning_rate": 8.133949631536515e-05, "loss": 0.0595, "step": 19210 }, { "grad_norm": 0.7353094220161438, "learning_rate": 8.131801881778607e-05, "loss": 0.0677, "step": 19220 }, { "grad_norm": 0.4703497588634491, "learning_rate": 8.129653180661978e-05, "loss": 0.0659, "step": 19230 }, { "grad_norm": 0.516627311706543, "learning_rate": 8.127503528839346e-05, "loss": 0.0781, "step": 19240 }, { "grad_norm": 0.8496378064155579, "learning_rate": 8.125352926963721e-05, "loss": 0.0721, "step": 19250 }, { "grad_norm": 0.46289992332458496, "learning_rate": 8.123201375688395e-05, "loss": 0.0562, "step": 19260 }, { "grad_norm": 0.5314409732818604, "learning_rate": 8.121048875666954e-05, "loss": 0.0468, "step": 19270 }, { "grad_norm": 0.6993576288223267, "learning_rate": 8.118895427553274e-05, "loss": 0.074, "step": 19280 }, { "grad_norm": 0.678082287311554, "learning_rate": 8.116741032001511e-05, "loss": 0.0881, "step": 19290 }, { "grad_norm": 0.5608174204826355, "learning_rate": 8.114585689666114e-05, "loss": 0.0572, "step": 19300 }, { "grad_norm": 0.6911395192146301, "learning_rate": 8.112429401201821e-05, "loss": 0.076, "step": 19310 }, { "grad_norm": 0.6034897565841675, "learning_rate": 8.110272167263656e-05, "loss": 0.0589, "step": 19320 }, { "grad_norm": 0.5365114808082581, "learning_rate": 8.108113988506929e-05, "loss": 0.0802, "step": 19330 }, { "grad_norm": 0.6580524444580078, "learning_rate": 8.105954865587235e-05, "loss": 0.0751, "step": 19340 }, { "grad_norm": 0.5647140741348267, "learning_rate": 8.103794799160463e-05, "loss": 0.0616, "step": 19350 }, { "grad_norm": 0.7414378523826599, "learning_rate": 8.101633789882781e-05, "loss": 0.0628, "step": 19360 }, { "grad_norm": 0.7357096076011658, "learning_rate": 8.099471838410648e-05, "loss": 0.0563, "step": 19370 }, { "grad_norm": 0.5996799468994141, "learning_rate": 8.097308945400806e-05, "loss": 0.0677, "step": 19380 }, { "grad_norm": 0.6035972833633423, "learning_rate": 8.095145111510288e-05, "loss": 0.0585, "step": 19390 }, { "grad_norm": 0.7127842307090759, "learning_rate": 8.092980337396406e-05, "loss": 0.0673, "step": 19400 }, { "grad_norm": 0.6205645799636841, "learning_rate": 8.090814623716763e-05, "loss": 0.0671, "step": 19410 }, { "grad_norm": 0.6946377754211426, "learning_rate": 8.088647971129246e-05, "loss": 0.0482, "step": 19420 }, { "grad_norm": 0.35201796889305115, "learning_rate": 8.086480380292026e-05, "loss": 0.0597, "step": 19430 }, { "grad_norm": 0.6519982814788818, "learning_rate": 8.084311851863562e-05, "loss": 0.0725, "step": 19440 }, { "grad_norm": 0.5337324738502502, "learning_rate": 8.082142386502591e-05, "loss": 0.0801, "step": 19450 }, { "grad_norm": 0.5961397290229797, "learning_rate": 8.079971984868145e-05, "loss": 0.0728, "step": 19460 }, { "grad_norm": 0.9999836087226868, "learning_rate": 8.077800647619532e-05, "loss": 0.0738, "step": 19470 }, { "grad_norm": 0.7167994379997253, "learning_rate": 8.075628375416345e-05, "loss": 0.0792, "step": 19480 }, { "grad_norm": 0.6151668429374695, "learning_rate": 8.073455168918464e-05, "loss": 0.0737, "step": 19490 }, { "grad_norm": 0.7242158055305481, "learning_rate": 8.071281028786055e-05, "loss": 0.0619, "step": 19500 }, { "grad_norm": 0.6152495741844177, "learning_rate": 8.069105955679562e-05, "loss": 0.0737, "step": 19510 }, { "grad_norm": 0.5106921195983887, "learning_rate": 8.066929950259713e-05, "loss": 0.067, "step": 19520 }, { "grad_norm": 0.7909271717071533, "learning_rate": 8.064753013187522e-05, "loss": 0.0625, "step": 19530 }, { "grad_norm": 0.8159493207931519, "learning_rate": 8.062575145124289e-05, "loss": 0.0632, "step": 19540 }, { "grad_norm": 0.8115736842155457, "learning_rate": 8.060396346731587e-05, "loss": 0.0785, "step": 19550 }, { "grad_norm": 0.6611064076423645, "learning_rate": 8.058216618671281e-05, "loss": 0.0587, "step": 19560 }, { "grad_norm": 0.49922874569892883, "learning_rate": 8.056035961605514e-05, "loss": 0.0727, "step": 19570 }, { "grad_norm": 0.6108146905899048, "learning_rate": 8.05385437619671e-05, "loss": 0.0574, "step": 19580 }, { "grad_norm": 0.6649414896965027, "learning_rate": 8.05167186310758e-05, "loss": 0.0569, "step": 19590 }, { "grad_norm": 0.7906359434127808, "learning_rate": 8.049488423001113e-05, "loss": 0.0619, "step": 19600 }, { "grad_norm": 0.6095308065414429, "learning_rate": 8.047304056540581e-05, "loss": 0.0605, "step": 19610 }, { "grad_norm": 0.8990007042884827, "learning_rate": 8.045118764389534e-05, "loss": 0.0692, "step": 19620 }, { "grad_norm": 0.7191029191017151, "learning_rate": 8.042932547211809e-05, "loss": 0.1012, "step": 19630 }, { "grad_norm": 0.5796219110488892, "learning_rate": 8.04074540567152e-05, "loss": 0.067, "step": 19640 }, { "grad_norm": 0.7363657355308533, "learning_rate": 8.038557340433063e-05, "loss": 0.0756, "step": 19650 }, { "grad_norm": 0.6582059860229492, "learning_rate": 8.036368352161115e-05, "loss": 0.0559, "step": 19660 }, { "grad_norm": 0.8240092396736145, "learning_rate": 8.034178441520633e-05, "loss": 0.088, "step": 19670 }, { "grad_norm": 0.758538544178009, "learning_rate": 8.031987609176852e-05, "loss": 0.0749, "step": 19680 }, { "grad_norm": 0.6149095892906189, "learning_rate": 8.02979585579529e-05, "loss": 0.0874, "step": 19690 }, { "grad_norm": 0.6683826446533203, "learning_rate": 8.027603182041745e-05, "loss": 0.0658, "step": 19700 }, { "grad_norm": 0.5267385244369507, "learning_rate": 8.025409588582292e-05, "loss": 0.0574, "step": 19710 }, { "grad_norm": 0.507832407951355, "learning_rate": 8.023215076083288e-05, "loss": 0.0576, "step": 19720 }, { "grad_norm": 0.7452993392944336, "learning_rate": 8.021019645211367e-05, "loss": 0.0729, "step": 19730 }, { "grad_norm": 0.6452960968017578, "learning_rate": 8.018823296633441e-05, "loss": 0.0591, "step": 19740 }, { "grad_norm": 0.8263060450553894, "learning_rate": 8.016626031016708e-05, "loss": 0.0763, "step": 19750 }, { "grad_norm": 0.8191331028938293, "learning_rate": 8.014427849028636e-05, "loss": 0.0906, "step": 19760 }, { "grad_norm": 0.8972970247268677, "learning_rate": 8.012228751336974e-05, "loss": 0.0625, "step": 19770 }, { "grad_norm": 0.3946194350719452, "learning_rate": 8.01002873860975e-05, "loss": 0.0614, "step": 19780 }, { "grad_norm": 0.5952431559562683, "learning_rate": 8.00782781151527e-05, "loss": 0.0748, "step": 19790 }, { "grad_norm": 0.5978491902351379, "learning_rate": 8.005625970722119e-05, "loss": 0.0648, "step": 19800 }, { "grad_norm": 0.6063401103019714, "learning_rate": 8.003423216899158e-05, "loss": 0.0688, "step": 19810 }, { "grad_norm": 0.626435399055481, "learning_rate": 8.001219550715522e-05, "loss": 0.0668, "step": 19820 }, { "grad_norm": 0.5650373101234436, "learning_rate": 7.999014972840632e-05, "loss": 0.0764, "step": 19830 }, { "grad_norm": 0.5548434257507324, "learning_rate": 7.996809483944174e-05, "loss": 0.0788, "step": 19840 }, { "grad_norm": 0.635301411151886, "learning_rate": 7.994603084696124e-05, "loss": 0.0628, "step": 19850 }, { "grad_norm": 0.5530869960784912, "learning_rate": 7.992395775766724e-05, "loss": 0.0526, "step": 19860 }, { "grad_norm": 0.6883628964424133, "learning_rate": 7.990187557826497e-05, "loss": 0.066, "step": 19870 }, { "grad_norm": 0.44549328088760376, "learning_rate": 7.987978431546242e-05, "loss": 0.064, "step": 19880 }, { "grad_norm": 0.5876850485801697, "learning_rate": 7.985768397597031e-05, "loss": 0.0607, "step": 19890 }, { "grad_norm": 0.5099275708198547, "learning_rate": 7.983557456650216e-05, "loss": 0.0703, "step": 19900 }, { "grad_norm": 0.5948911309242249, "learning_rate": 7.981345609377422e-05, "loss": 0.0654, "step": 19910 }, { "grad_norm": 0.43198201060295105, "learning_rate": 7.97913285645055e-05, "loss": 0.0481, "step": 19920 }, { "grad_norm": 0.6449263095855713, "learning_rate": 7.976919198541776e-05, "loss": 0.0673, "step": 19930 }, { "grad_norm": 0.6759973168373108, "learning_rate": 7.974704636323548e-05, "loss": 0.0738, "step": 19940 }, { "grad_norm": 0.9147946834564209, "learning_rate": 7.972489170468597e-05, "loss": 0.0654, "step": 19950 }, { "grad_norm": 0.7706419825553894, "learning_rate": 7.970272801649918e-05, "loss": 0.0553, "step": 19960 }, { "grad_norm": 0.5284444689750671, "learning_rate": 7.96805553054079e-05, "loss": 0.0706, "step": 19970 }, { "grad_norm": 0.42311525344848633, "learning_rate": 7.965837357814756e-05, "loss": 0.0534, "step": 19980 }, { "grad_norm": 0.4826006293296814, "learning_rate": 7.963618284145643e-05, "loss": 0.0605, "step": 19990 }, { "grad_norm": 0.8698768019676208, "learning_rate": 7.961398310207544e-05, "loss": 0.0754, "step": 20000 }, { "grad_norm": 0.5369033813476562, "learning_rate": 7.95917743667483e-05, "loss": 0.0783, "step": 20010 }, { "grad_norm": 0.7382476329803467, "learning_rate": 7.956955664222144e-05, "loss": 0.0572, "step": 20020 }, { "grad_norm": 0.4948105812072754, "learning_rate": 7.954732993524399e-05, "loss": 0.0601, "step": 20030 }, { "grad_norm": 0.531665027141571, "learning_rate": 7.952509425256786e-05, "loss": 0.0698, "step": 20040 }, { "grad_norm": 0.7118204832077026, "learning_rate": 7.950284960094767e-05, "loss": 0.0679, "step": 20050 }, { "grad_norm": 0.5803144574165344, "learning_rate": 7.948059598714076e-05, "loss": 0.0627, "step": 20060 }, { "grad_norm": 0.6845124959945679, "learning_rate": 7.945833341790717e-05, "loss": 0.078, "step": 20070 }, { "grad_norm": 0.5456127524375916, "learning_rate": 7.94360619000097e-05, "loss": 0.0682, "step": 20080 }, { "grad_norm": 0.7218425273895264, "learning_rate": 7.941378144021381e-05, "loss": 0.0738, "step": 20090 }, { "grad_norm": 0.579727292060852, "learning_rate": 7.939149204528777e-05, "loss": 0.058, "step": 20100 }, { "grad_norm": 0.48929235339164734, "learning_rate": 7.936919372200246e-05, "loss": 0.07, "step": 20110 }, { "grad_norm": 0.47924667596817017, "learning_rate": 7.934688647713158e-05, "loss": 0.0558, "step": 20120 }, { "grad_norm": 0.4169023931026459, "learning_rate": 7.932457031745143e-05, "loss": 0.0471, "step": 20130 }, { "grad_norm": 0.5192099213600159, "learning_rate": 7.930224524974108e-05, "loss": 0.0485, "step": 20140 }, { "grad_norm": 0.6962177157402039, "learning_rate": 7.927991128078232e-05, "loss": 0.0657, "step": 20150 }, { "grad_norm": 0.4698057472705841, "learning_rate": 7.925756841735958e-05, "loss": 0.0646, "step": 20160 }, { "grad_norm": 0.2934754490852356, "learning_rate": 7.923521666626008e-05, "loss": 0.067, "step": 20170 }, { "grad_norm": 0.7604890465736389, "learning_rate": 7.921285603427366e-05, "loss": 0.0705, "step": 20180 }, { "grad_norm": 0.67522794008255, "learning_rate": 7.91904865281929e-05, "loss": 0.061, "step": 20190 }, { "grad_norm": 0.6138363480567932, "learning_rate": 7.916810815481307e-05, "loss": 0.0766, "step": 20200 }, { "grad_norm": 0.7149645686149597, "learning_rate": 7.914572092093211e-05, "loss": 0.0519, "step": 20210 }, { "grad_norm": 0.47578445076942444, "learning_rate": 7.912332483335068e-05, "loss": 0.0569, "step": 20220 }, { "grad_norm": 0.8004193305969238, "learning_rate": 7.910091989887213e-05, "loss": 0.0523, "step": 20230 }, { "grad_norm": 0.5816022157669067, "learning_rate": 7.907850612430248e-05, "loss": 0.0717, "step": 20240 }, { "grad_norm": 0.474821537733078, "learning_rate": 7.905608351645044e-05, "loss": 0.0492, "step": 20250 }, { "grad_norm": 0.8359928131103516, "learning_rate": 7.90336520821274e-05, "loss": 0.0837, "step": 20260 }, { "grad_norm": 0.45682692527770996, "learning_rate": 7.901121182814746e-05, "loss": 0.0532, "step": 20270 }, { "grad_norm": 0.7137346863746643, "learning_rate": 7.898876276132736e-05, "loss": 0.0575, "step": 20280 }, { "grad_norm": 0.41443151235580444, "learning_rate": 7.896630488848654e-05, "loss": 0.0731, "step": 20290 }, { "grad_norm": 0.5451486110687256, "learning_rate": 7.89438382164471e-05, "loss": 0.0809, "step": 20300 }, { "grad_norm": 0.5327691435813904, "learning_rate": 7.892136275203383e-05, "loss": 0.0687, "step": 20310 }, { "grad_norm": 0.5119234323501587, "learning_rate": 7.889887850207418e-05, "loss": 0.0587, "step": 20320 }, { "grad_norm": 0.5591288805007935, "learning_rate": 7.887638547339827e-05, "loss": 0.058, "step": 20330 }, { "grad_norm": 0.5918354988098145, "learning_rate": 7.885388367283891e-05, "loss": 0.0705, "step": 20340 }, { "grad_norm": 0.9254708886146545, "learning_rate": 7.88313731072315e-05, "loss": 0.0714, "step": 20350 }, { "grad_norm": 0.6657266616821289, "learning_rate": 7.88088537834142e-05, "loss": 0.0829, "step": 20360 }, { "grad_norm": 0.8026009202003479, "learning_rate": 7.878632570822778e-05, "loss": 0.0789, "step": 20370 }, { "grad_norm": 0.7402452230453491, "learning_rate": 7.876378888851567e-05, "loss": 0.099, "step": 20380 }, { "grad_norm": 0.6725495457649231, "learning_rate": 7.874124333112396e-05, "loss": 0.0695, "step": 20390 }, { "grad_norm": 0.6775646209716797, "learning_rate": 7.871868904290138e-05, "loss": 0.0649, "step": 20400 }, { "grad_norm": 0.7361243367195129, "learning_rate": 7.869612603069935e-05, "loss": 0.0659, "step": 20410 }, { "grad_norm": 0.7022765874862671, "learning_rate": 7.867355430137192e-05, "loss": 0.0667, "step": 20420 }, { "grad_norm": 0.5838765501976013, "learning_rate": 7.865097386177577e-05, "loss": 0.0547, "step": 20430 }, { "grad_norm": 0.4191904366016388, "learning_rate": 7.862838471877023e-05, "loss": 0.0623, "step": 20440 }, { "grad_norm": 1.1888298988342285, "learning_rate": 7.860578687921731e-05, "loss": 0.0603, "step": 20450 }, { "grad_norm": 1.0971417427062988, "learning_rate": 7.858318034998164e-05, "loss": 0.0772, "step": 20460 }, { "grad_norm": 0.8485752940177917, "learning_rate": 7.856056513793046e-05, "loss": 0.0717, "step": 20470 }, { "grad_norm": 0.907538890838623, "learning_rate": 7.85379412499337e-05, "loss": 0.0606, "step": 20480 }, { "grad_norm": 0.5107941031455994, "learning_rate": 7.851530869286389e-05, "loss": 0.0729, "step": 20490 }, { "grad_norm": 0.6972208023071289, "learning_rate": 7.849266747359619e-05, "loss": 0.0629, "step": 20500 }, { "grad_norm": 0.6265513300895691, "learning_rate": 7.847001759900843e-05, "loss": 0.0545, "step": 20510 }, { "grad_norm": 0.7231782674789429, "learning_rate": 7.844735907598102e-05, "loss": 0.0579, "step": 20520 }, { "grad_norm": 0.4701542258262634, "learning_rate": 7.842469191139703e-05, "loss": 0.0579, "step": 20530 }, { "grad_norm": 0.5025255084037781, "learning_rate": 7.840201611214215e-05, "loss": 0.0657, "step": 20540 }, { "grad_norm": 0.5681331753730774, "learning_rate": 7.837933168510469e-05, "loss": 0.0716, "step": 20550 }, { "grad_norm": 0.8299402594566345, "learning_rate": 7.835663863717559e-05, "loss": 0.0722, "step": 20560 }, { "grad_norm": 0.8408659100532532, "learning_rate": 7.833393697524838e-05, "loss": 0.0945, "step": 20570 }, { "grad_norm": 0.4982859492301941, "learning_rate": 7.831122670621922e-05, "loss": 0.0528, "step": 20580 }, { "grad_norm": 0.9316030740737915, "learning_rate": 7.82885078369869e-05, "loss": 0.0714, "step": 20590 }, { "grad_norm": 0.7074949145317078, "learning_rate": 7.826578037445283e-05, "loss": 0.072, "step": 20600 }, { "grad_norm": 0.5849635601043701, "learning_rate": 7.824304432552097e-05, "loss": 0.0511, "step": 20610 }, { "grad_norm": 0.6416986584663391, "learning_rate": 7.822029969709798e-05, "loss": 0.0583, "step": 20620 }, { "grad_norm": 0.6454067230224609, "learning_rate": 7.819754649609306e-05, "loss": 0.0733, "step": 20630 }, { "grad_norm": 1.126392126083374, "learning_rate": 7.817478472941802e-05, "loss": 0.0627, "step": 20640 }, { "grad_norm": 0.7151906490325928, "learning_rate": 7.815201440398727e-05, "loss": 0.0598, "step": 20650 }, { "grad_norm": 0.623033881187439, "learning_rate": 7.812923552671789e-05, "loss": 0.0595, "step": 20660 }, { "grad_norm": 0.5730457305908203, "learning_rate": 7.810644810452945e-05, "loss": 0.0417, "step": 20670 }, { "grad_norm": 0.574713945388794, "learning_rate": 7.808365214434417e-05, "loss": 0.0572, "step": 20680 }, { "grad_norm": 0.5940136313438416, "learning_rate": 7.80608476530869e-05, "loss": 0.0626, "step": 20690 }, { "grad_norm": 0.5124941468238831, "learning_rate": 7.8038034637685e-05, "loss": 0.0689, "step": 20700 }, { "grad_norm": 0.7583538889884949, "learning_rate": 7.801521310506848e-05, "loss": 0.0601, "step": 20710 }, { "grad_norm": 0.6424245834350586, "learning_rate": 7.799238306216994e-05, "loss": 0.0616, "step": 20720 }, { "grad_norm": 0.6294963955879211, "learning_rate": 7.796954451592448e-05, "loss": 0.0583, "step": 20730 }, { "grad_norm": 0.43303540349006653, "learning_rate": 7.794669747326992e-05, "loss": 0.051, "step": 20740 }, { "grad_norm": 0.7627994418144226, "learning_rate": 7.792384194114654e-05, "loss": 0.0568, "step": 20750 }, { "grad_norm": 0.678819477558136, "learning_rate": 7.790097792649729e-05, "loss": 0.0701, "step": 20760 }, { "grad_norm": 0.64703369140625, "learning_rate": 7.787810543626762e-05, "loss": 0.05, "step": 20770 }, { "grad_norm": 1.2693992853164673, "learning_rate": 7.785522447740558e-05, "loss": 0.067, "step": 20780 }, { "grad_norm": 0.7586674690246582, "learning_rate": 7.783233505686182e-05, "loss": 0.0699, "step": 20790 }, { "grad_norm": 0.5031360387802124, "learning_rate": 7.780943718158955e-05, "loss": 0.0524, "step": 20800 }, { "grad_norm": 0.6937921047210693, "learning_rate": 7.778653085854453e-05, "loss": 0.0669, "step": 20810 }, { "grad_norm": 0.7896915078163147, "learning_rate": 7.77636160946851e-05, "loss": 0.0638, "step": 20820 }, { "grad_norm": 0.49752625823020935, "learning_rate": 7.774069289697215e-05, "loss": 0.0743, "step": 20830 }, { "grad_norm": 0.49750104546546936, "learning_rate": 7.771776127236913e-05, "loss": 0.0782, "step": 20840 }, { "grad_norm": 0.7593702673912048, "learning_rate": 7.769482122784212e-05, "loss": 0.0634, "step": 20850 }, { "grad_norm": 0.5500180721282959, "learning_rate": 7.767187277035963e-05, "loss": 0.067, "step": 20860 }, { "grad_norm": 0.7015213370323181, "learning_rate": 7.764891590689285e-05, "loss": 0.0513, "step": 20870 }, { "grad_norm": 0.4333689212799072, "learning_rate": 7.762595064441542e-05, "loss": 0.0606, "step": 20880 }, { "grad_norm": 0.8058313727378845, "learning_rate": 7.760297698990362e-05, "loss": 0.0626, "step": 20890 }, { "grad_norm": 0.6407642960548401, "learning_rate": 7.757999495033623e-05, "loss": 0.0704, "step": 20900 }, { "grad_norm": 0.6233401298522949, "learning_rate": 7.755700453269456e-05, "loss": 0.0698, "step": 20910 }, { "grad_norm": 0.7978671193122864, "learning_rate": 7.753400574396254e-05, "loss": 0.0591, "step": 20920 }, { "grad_norm": 0.5833495259284973, "learning_rate": 7.751099859112655e-05, "loss": 0.0687, "step": 20930 }, { "grad_norm": 0.797045886516571, "learning_rate": 7.748798308117557e-05, "loss": 0.0568, "step": 20940 }, { "grad_norm": 0.49547719955444336, "learning_rate": 7.746495922110112e-05, "loss": 0.06, "step": 20950 }, { "grad_norm": 1.2402112483978271, "learning_rate": 7.744192701789723e-05, "loss": 0.0639, "step": 20960 }, { "grad_norm": 0.5892848968505859, "learning_rate": 7.741888647856046e-05, "loss": 0.0544, "step": 20970 }, { "grad_norm": 0.639600396156311, "learning_rate": 7.739583761008994e-05, "loss": 0.065, "step": 20980 }, { "grad_norm": 0.7296360731124878, "learning_rate": 7.73727804194873e-05, "loss": 0.0638, "step": 20990 }, { "grad_norm": 0.6579421162605286, "learning_rate": 7.734971491375671e-05, "loss": 0.0576, "step": 21000 }, { "grad_norm": 0.6308386921882629, "learning_rate": 7.732664109990485e-05, "loss": 0.0671, "step": 21010 }, { "grad_norm": 0.692832887172699, "learning_rate": 7.730355898494095e-05, "loss": 0.0683, "step": 21020 }, { "grad_norm": 0.5660350322723389, "learning_rate": 7.728046857587673e-05, "loss": 0.0595, "step": 21030 }, { "grad_norm": 0.679065465927124, "learning_rate": 7.725736987972647e-05, "loss": 0.0599, "step": 21040 }, { "grad_norm": 0.8575834035873413, "learning_rate": 7.723426290350691e-05, "loss": 0.0729, "step": 21050 }, { "grad_norm": 0.5477249622344971, "learning_rate": 7.721114765423736e-05, "loss": 0.0578, "step": 21060 }, { "grad_norm": 0.5584307312965393, "learning_rate": 7.718802413893963e-05, "loss": 0.0503, "step": 21070 }, { "grad_norm": 0.7056741714477539, "learning_rate": 7.716489236463802e-05, "loss": 0.0649, "step": 21080 }, { "grad_norm": 0.5661779642105103, "learning_rate": 7.714175233835936e-05, "loss": 0.0612, "step": 21090 }, { "grad_norm": 1.310306429862976, "learning_rate": 7.711860406713299e-05, "loss": 0.0662, "step": 21100 }, { "grad_norm": 0.5027204155921936, "learning_rate": 7.70954475579907e-05, "loss": 0.0542, "step": 21110 }, { "grad_norm": 0.5070560574531555, "learning_rate": 7.707228281796688e-05, "loss": 0.0544, "step": 21120 }, { "grad_norm": 0.6766663193702698, "learning_rate": 7.704910985409833e-05, "loss": 0.057, "step": 21130 }, { "grad_norm": 0.673031747341156, "learning_rate": 7.702592867342439e-05, "loss": 0.0522, "step": 21140 }, { "grad_norm": 0.8213121891021729, "learning_rate": 7.700273928298691e-05, "loss": 0.0727, "step": 21150 }, { "grad_norm": 0.33464667201042175, "learning_rate": 7.697954168983021e-05, "loss": 0.057, "step": 21160 }, { "grad_norm": 0.5257903337478638, "learning_rate": 7.695633590100109e-05, "loss": 0.0641, "step": 21170 }, { "grad_norm": 0.5815262198448181, "learning_rate": 7.693312192354886e-05, "loss": 0.0668, "step": 21180 }, { "grad_norm": 0.7471060752868652, "learning_rate": 7.690989976452532e-05, "loss": 0.0652, "step": 21190 }, { "grad_norm": 0.4685947895050049, "learning_rate": 7.688666943098475e-05, "loss": 0.07, "step": 21200 }, { "grad_norm": 0.6186369061470032, "learning_rate": 7.686343092998389e-05, "loss": 0.1017, "step": 21210 }, { "grad_norm": 0.5597102046012878, "learning_rate": 7.684018426858202e-05, "loss": 0.0667, "step": 21220 }, { "grad_norm": 0.8337017297744751, "learning_rate": 7.681692945384084e-05, "loss": 0.0759, "step": 21230 }, { "grad_norm": 0.758658230304718, "learning_rate": 7.679366649282456e-05, "loss": 0.0753, "step": 21240 }, { "grad_norm": 0.5761226415634155, "learning_rate": 7.677039539259983e-05, "loss": 0.0532, "step": 21250 }, { "grad_norm": 0.6398540139198303, "learning_rate": 7.674711616023581e-05, "loss": 0.0499, "step": 21260 }, { "grad_norm": 0.5114226937294006, "learning_rate": 7.672382880280413e-05, "loss": 0.0577, "step": 21270 }, { "grad_norm": 0.4426962435245514, "learning_rate": 7.670053332737885e-05, "loss": 0.0661, "step": 21280 }, { "grad_norm": 0.4801621735095978, "learning_rate": 7.667722974103654e-05, "loss": 0.0742, "step": 21290 }, { "grad_norm": 0.41410109400749207, "learning_rate": 7.66539180508562e-05, "loss": 0.0537, "step": 21300 }, { "grad_norm": 0.8756566047668457, "learning_rate": 7.663059826391932e-05, "loss": 0.0802, "step": 21310 }, { "grad_norm": 0.588733434677124, "learning_rate": 7.660727038730981e-05, "loss": 0.0593, "step": 21320 }, { "grad_norm": 0.8663751482963562, "learning_rate": 7.65839344281141e-05, "loss": 0.0676, "step": 21330 }, { "grad_norm": 0.5890693664550781, "learning_rate": 7.656059039342101e-05, "loss": 0.0604, "step": 21340 }, { "grad_norm": 0.3044498562812805, "learning_rate": 7.653723829032187e-05, "loss": 0.0529, "step": 21350 }, { "grad_norm": 1.1151108741760254, "learning_rate": 7.65138781259104e-05, "loss": 0.0668, "step": 21360 }, { "grad_norm": 0.5095704197883606, "learning_rate": 7.649050990728279e-05, "loss": 0.0605, "step": 21370 }, { "grad_norm": 0.4823833107948303, "learning_rate": 7.646713364153774e-05, "loss": 0.0429, "step": 21380 }, { "grad_norm": 0.6536568999290466, "learning_rate": 7.64437493357763e-05, "loss": 0.0603, "step": 21390 }, { "grad_norm": 0.5206348896026611, "learning_rate": 7.642035699710202e-05, "loss": 0.1163, "step": 21400 }, { "grad_norm": 0.4610430896282196, "learning_rate": 7.639695663262089e-05, "loss": 0.0799, "step": 21410 }, { "grad_norm": 1.3323787450790405, "learning_rate": 7.637354824944128e-05, "loss": 0.0732, "step": 21420 }, { "grad_norm": 0.8006385564804077, "learning_rate": 7.635013185467408e-05, "loss": 0.066, "step": 21430 }, { "grad_norm": 0.7383692264556885, "learning_rate": 7.632670745543256e-05, "loss": 0.0708, "step": 21440 }, { "grad_norm": 0.5302392244338989, "learning_rate": 7.630327505883242e-05, "loss": 0.0607, "step": 21450 }, { "grad_norm": 0.5554313659667969, "learning_rate": 7.627983467199182e-05, "loss": 0.0406, "step": 21460 }, { "grad_norm": 0.6855206489562988, "learning_rate": 7.625638630203132e-05, "loss": 0.0634, "step": 21470 }, { "grad_norm": 0.609388530254364, "learning_rate": 7.623292995607394e-05, "loss": 0.0602, "step": 21480 }, { "grad_norm": 0.6729419827461243, "learning_rate": 7.620946564124507e-05, "loss": 0.0649, "step": 21490 }, { "grad_norm": 0.6863114237785339, "learning_rate": 7.618599336467256e-05, "loss": 0.0631, "step": 21500 }, { "grad_norm": 0.5648500323295593, "learning_rate": 7.616251313348666e-05, "loss": 0.0749, "step": 21510 }, { "grad_norm": 0.6610344648361206, "learning_rate": 7.613902495482005e-05, "loss": 0.0615, "step": 21520 }, { "grad_norm": 0.4306272566318512, "learning_rate": 7.611552883580784e-05, "loss": 0.051, "step": 21530 }, { "grad_norm": 0.7037889361381531, "learning_rate": 7.609202478358748e-05, "loss": 0.0733, "step": 21540 }, { "grad_norm": 0.6669933199882507, "learning_rate": 7.606851280529895e-05, "loss": 0.0553, "step": 21550 }, { "grad_norm": 0.5325101613998413, "learning_rate": 7.604499290808449e-05, "loss": 0.0661, "step": 21560 }, { "grad_norm": 0.6732141971588135, "learning_rate": 7.602146509908888e-05, "loss": 0.0521, "step": 21570 }, { "grad_norm": 0.5870396494865417, "learning_rate": 7.599792938545921e-05, "loss": 0.0632, "step": 21580 }, { "grad_norm": 0.5554564595222473, "learning_rate": 7.597438577434506e-05, "loss": 0.067, "step": 21590 }, { "grad_norm": 0.5756967663764954, "learning_rate": 7.595083427289831e-05, "loss": 0.0717, "step": 21600 }, { "grad_norm": 0.4447110891342163, "learning_rate": 7.59272748882733e-05, "loss": 0.0611, "step": 21610 }, { "grad_norm": 0.7461786866188049, "learning_rate": 7.590370762762675e-05, "loss": 0.0645, "step": 21620 }, { "grad_norm": 0.5831683278083801, "learning_rate": 7.588013249811777e-05, "loss": 0.075, "step": 21630 }, { "grad_norm": 0.8924147486686707, "learning_rate": 7.585654950690786e-05, "loss": 0.0535, "step": 21640 }, { "grad_norm": 0.5976592898368835, "learning_rate": 7.583295866116091e-05, "loss": 0.0627, "step": 21650 }, { "grad_norm": 0.5371516346931458, "learning_rate": 7.580935996804321e-05, "loss": 0.0614, "step": 21660 }, { "grad_norm": 0.8827683925628662, "learning_rate": 7.57857534347234e-05, "loss": 0.096, "step": 21670 }, { "grad_norm": 0.4436342418193817, "learning_rate": 7.576213906837254e-05, "loss": 0.0515, "step": 21680 }, { "grad_norm": 0.5951741337776184, "learning_rate": 7.573851687616403e-05, "loss": 0.0548, "step": 21690 }, { "grad_norm": 0.679869532585144, "learning_rate": 7.571488686527368e-05, "loss": 0.064, "step": 21700 }, { "grad_norm": 0.8054651021957397, "learning_rate": 7.569124904287968e-05, "loss": 0.055, "step": 21710 }, { "grad_norm": 0.6117942929267883, "learning_rate": 7.566760341616254e-05, "loss": 0.0654, "step": 21720 }, { "grad_norm": 0.8348686099052429, "learning_rate": 7.564394999230519e-05, "loss": 0.0601, "step": 21730 }, { "grad_norm": 0.6368081569671631, "learning_rate": 7.562028877849294e-05, "loss": 0.0695, "step": 21740 }, { "grad_norm": 0.46211913228034973, "learning_rate": 7.559661978191341e-05, "loss": 0.0817, "step": 21750 }, { "grad_norm": 0.5242652297019958, "learning_rate": 7.557294300975664e-05, "loss": 0.0545, "step": 21760 }, { "grad_norm": 0.5962632298469543, "learning_rate": 7.554925846921499e-05, "loss": 0.0633, "step": 21770 }, { "grad_norm": 1.054306983947754, "learning_rate": 7.552556616748321e-05, "loss": 0.0652, "step": 21780 }, { "grad_norm": 0.5161319971084595, "learning_rate": 7.550186611175838e-05, "loss": 0.0804, "step": 21790 }, { "grad_norm": 0.5403336882591248, "learning_rate": 7.547815830923998e-05, "loss": 0.0625, "step": 21800 }, { "grad_norm": 0.47338610887527466, "learning_rate": 7.54544427671298e-05, "loss": 0.0568, "step": 21810 }, { "grad_norm": 0.4371338486671448, "learning_rate": 7.543071949263198e-05, "loss": 0.0635, "step": 21820 }, { "grad_norm": 0.6717756986618042, "learning_rate": 7.540698849295305e-05, "loss": 0.0568, "step": 21830 }, { "grad_norm": 0.6221994757652283, "learning_rate": 7.538324977530183e-05, "loss": 0.0606, "step": 21840 }, { "grad_norm": 0.4579251706600189, "learning_rate": 7.535950334688955e-05, "loss": 0.0679, "step": 21850 }, { "grad_norm": 1.0115599632263184, "learning_rate": 7.533574921492972e-05, "loss": 0.0758, "step": 21860 }, { "grad_norm": 0.6702033281326294, "learning_rate": 7.531198738663824e-05, "loss": 0.0623, "step": 21870 }, { "grad_norm": 0.7279088497161865, "learning_rate": 7.528821786923333e-05, "loss": 0.0781, "step": 21880 }, { "grad_norm": 0.5420592427253723, "learning_rate": 7.52644406699355e-05, "loss": 0.0558, "step": 21890 }, { "grad_norm": 0.8586335778236389, "learning_rate": 7.524065579596766e-05, "loss": 0.0701, "step": 21900 }, { "grad_norm": 0.9226605296134949, "learning_rate": 7.521686325455506e-05, "loss": 0.051, "step": 21910 }, { "grad_norm": 0.45868754386901855, "learning_rate": 7.51930630529252e-05, "loss": 0.0676, "step": 21920 }, { "grad_norm": 0.5761677622795105, "learning_rate": 7.516925519830797e-05, "loss": 0.0586, "step": 21930 }, { "grad_norm": 0.4312590956687927, "learning_rate": 7.514543969793557e-05, "loss": 0.0668, "step": 21940 }, { "grad_norm": 0.6977737545967102, "learning_rate": 7.512161655904251e-05, "loss": 0.0749, "step": 21950 }, { "grad_norm": 0.4736560583114624, "learning_rate": 7.509778578886563e-05, "loss": 0.043, "step": 21960 }, { "grad_norm": 0.514630138874054, "learning_rate": 7.507394739464412e-05, "loss": 0.0612, "step": 21970 }, { "grad_norm": 0.5187046527862549, "learning_rate": 7.50501013836194e-05, "loss": 0.065, "step": 21980 }, { "grad_norm": 0.6025691032409668, "learning_rate": 7.50262477630353e-05, "loss": 0.0628, "step": 21990 }, { "grad_norm": 0.6461264491081238, "learning_rate": 7.500238654013794e-05, "loss": 0.0675, "step": 22000 }, { "grad_norm": 0.6680965423583984, "learning_rate": 7.497851772217566e-05, "loss": 0.0703, "step": 22010 }, { "grad_norm": 0.6630335450172424, "learning_rate": 7.495464131639924e-05, "loss": 0.0545, "step": 22020 }, { "grad_norm": 0.7760348320007324, "learning_rate": 7.493075733006166e-05, "loss": 0.0635, "step": 22030 }, { "grad_norm": 0.8156224489212036, "learning_rate": 7.490686577041828e-05, "loss": 0.069, "step": 22040 }, { "grad_norm": 0.6005292534828186, "learning_rate": 7.488296664472668e-05, "loss": 0.0611, "step": 22050 }, { "grad_norm": 0.7792485356330872, "learning_rate": 7.485905996024682e-05, "loss": 0.0629, "step": 22060 }, { "grad_norm": 0.5492357015609741, "learning_rate": 7.483514572424093e-05, "loss": 0.0533, "step": 22070 }, { "grad_norm": 0.46555545926094055, "learning_rate": 7.481122394397349e-05, "loss": 0.0645, "step": 22080 }, { "grad_norm": 0.7687682509422302, "learning_rate": 7.478729462671131e-05, "loss": 0.0638, "step": 22090 }, { "grad_norm": 0.8554268479347229, "learning_rate": 7.47633577797235e-05, "loss": 0.0537, "step": 22100 }, { "grad_norm": 0.7551859021186829, "learning_rate": 7.473941341028144e-05, "loss": 0.0579, "step": 22110 }, { "grad_norm": 0.527718722820282, "learning_rate": 7.471546152565879e-05, "loss": 0.0822, "step": 22120 }, { "grad_norm": 0.49963584542274475, "learning_rate": 7.46915021331315e-05, "loss": 0.0599, "step": 22130 }, { "grad_norm": 0.8473708629608154, "learning_rate": 7.466753523997778e-05, "loss": 0.0701, "step": 22140 }, { "grad_norm": 0.5947211980819702, "learning_rate": 7.464356085347819e-05, "loss": 0.0542, "step": 22150 }, { "grad_norm": 0.6175183653831482, "learning_rate": 7.461957898091548e-05, "loss": 0.0642, "step": 22160 }, { "grad_norm": 0.46479538083076477, "learning_rate": 7.459558962957473e-05, "loss": 0.0425, "step": 22170 }, { "grad_norm": 0.6338525414466858, "learning_rate": 7.457159280674326e-05, "loss": 0.0638, "step": 22180 }, { "grad_norm": 0.5738877654075623, "learning_rate": 7.454758851971066e-05, "loss": 0.0729, "step": 22190 }, { "grad_norm": 0.4121954143047333, "learning_rate": 7.45235767757688e-05, "loss": 0.0784, "step": 22200 }, { "grad_norm": 0.41950544714927673, "learning_rate": 7.449955758221183e-05, "loss": 0.0566, "step": 22210 }, { "grad_norm": 0.5850544571876526, "learning_rate": 7.447553094633615e-05, "loss": 0.0738, "step": 22220 }, { "grad_norm": 0.643558919429779, "learning_rate": 7.445149687544039e-05, "loss": 0.0564, "step": 22230 }, { "grad_norm": 0.6952292323112488, "learning_rate": 7.44274553768255e-05, "loss": 0.0669, "step": 22240 }, { "grad_norm": 0.5024678707122803, "learning_rate": 7.440340645779464e-05, "loss": 0.0657, "step": 22250 }, { "grad_norm": 0.6250094771385193, "learning_rate": 7.437935012565322e-05, "loss": 0.0624, "step": 22260 }, { "grad_norm": 0.6154235601425171, "learning_rate": 7.435528638770893e-05, "loss": 0.0561, "step": 22270 }, { "grad_norm": 0.5271241664886475, "learning_rate": 7.433121525127171e-05, "loss": 0.0558, "step": 22280 }, { "grad_norm": 0.6364156007766724, "learning_rate": 7.430713672365371e-05, "loss": 0.0465, "step": 22290 }, { "grad_norm": 0.4311450719833374, "learning_rate": 7.428305081216938e-05, "loss": 0.0463, "step": 22300 }, { "grad_norm": 0.5906243324279785, "learning_rate": 7.425895752413536e-05, "loss": 0.0638, "step": 22310 }, { "grad_norm": 0.8508905172348022, "learning_rate": 7.423485686687057e-05, "loss": 0.0591, "step": 22320 }, { "grad_norm": 0.7160120606422424, "learning_rate": 7.421074884769616e-05, "loss": 0.0475, "step": 22330 }, { "grad_norm": 0.8574193120002747, "learning_rate": 7.418663347393548e-05, "loss": 0.0612, "step": 22340 }, { "grad_norm": 0.8265910744667053, "learning_rate": 7.416251075291418e-05, "loss": 0.0643, "step": 22350 }, { "grad_norm": 0.6044592261314392, "learning_rate": 7.413838069196007e-05, "loss": 0.0551, "step": 22360 }, { "grad_norm": 0.8225632309913635, "learning_rate": 7.411424329840324e-05, "loss": 0.0765, "step": 22370 }, { "grad_norm": 0.6064283847808838, "learning_rate": 7.409009857957601e-05, "loss": 0.0574, "step": 22380 }, { "grad_norm": 0.6664906144142151, "learning_rate": 7.40659465428129e-05, "loss": 0.0735, "step": 22390 }, { "grad_norm": 0.7653631567955017, "learning_rate": 7.404178719545063e-05, "loss": 0.063, "step": 22400 }, { "grad_norm": 0.38112467527389526, "learning_rate": 7.401762054482822e-05, "loss": 0.0538, "step": 22410 }, { "grad_norm": 0.6257686018943787, "learning_rate": 7.39934465982868e-05, "loss": 0.0696, "step": 22420 }, { "grad_norm": 0.7084353566169739, "learning_rate": 7.396926536316984e-05, "loss": 0.0534, "step": 22430 }, { "grad_norm": 0.5213550925254822, "learning_rate": 7.394507684682293e-05, "loss": 0.0629, "step": 22440 }, { "grad_norm": 0.5827367305755615, "learning_rate": 7.392088105659393e-05, "loss": 0.0762, "step": 22450 }, { "grad_norm": 0.7359413504600525, "learning_rate": 7.389667799983284e-05, "loss": 0.0584, "step": 22460 }, { "grad_norm": 0.6111510396003723, "learning_rate": 7.387246768389193e-05, "loss": 0.0605, "step": 22470 }, { "grad_norm": 0.49318617582321167, "learning_rate": 7.384825011612563e-05, "loss": 0.0537, "step": 22480 }, { "grad_norm": 0.5730351805686951, "learning_rate": 7.382402530389066e-05, "loss": 0.0551, "step": 22490 }, { "grad_norm": 0.6102268695831299, "learning_rate": 7.379979325454582e-05, "loss": 0.0482, "step": 22500 }, { "grad_norm": 0.4515620470046997, "learning_rate": 7.37755539754522e-05, "loss": 0.0422, "step": 22510 }, { "grad_norm": 0.48048868775367737, "learning_rate": 7.375130747397302e-05, "loss": 0.0672, "step": 22520 }, { "grad_norm": 0.6644967794418335, "learning_rate": 7.372705375747377e-05, "loss": 0.0598, "step": 22530 }, { "grad_norm": 0.671216607093811, "learning_rate": 7.370279283332205e-05, "loss": 0.0556, "step": 22540 }, { "grad_norm": 0.6401969194412231, "learning_rate": 7.36785247088877e-05, "loss": 0.0591, "step": 22550 }, { "grad_norm": 0.896141529083252, "learning_rate": 7.365424939154275e-05, "loss": 0.0947, "step": 22560 }, { "grad_norm": 0.8252336382865906, "learning_rate": 7.362996688866138e-05, "loss": 0.0763, "step": 22570 }, { "grad_norm": 0.5316601395606995, "learning_rate": 7.360567720761999e-05, "loss": 0.0669, "step": 22580 }, { "grad_norm": 0.6285518407821655, "learning_rate": 7.358138035579711e-05, "loss": 0.0579, "step": 22590 }, { "grad_norm": 0.46587085723876953, "learning_rate": 7.355707634057354e-05, "loss": 0.0614, "step": 22600 }, { "grad_norm": 0.6270586252212524, "learning_rate": 7.353276516933215e-05, "loss": 0.0643, "step": 22610 }, { "grad_norm": 0.7595579624176025, "learning_rate": 7.350844684945806e-05, "loss": 0.0674, "step": 22620 }, { "grad_norm": 0.6169339418411255, "learning_rate": 7.348412138833851e-05, "loss": 0.0584, "step": 22630 }, { "grad_norm": 0.7216416001319885, "learning_rate": 7.345978879336295e-05, "loss": 0.0602, "step": 22640 }, { "grad_norm": 0.7474215626716614, "learning_rate": 7.343544907192296e-05, "loss": 0.0545, "step": 22650 }, { "grad_norm": 0.5933284163475037, "learning_rate": 7.341110223141235e-05, "loss": 0.0548, "step": 22660 }, { "grad_norm": 0.6655194163322449, "learning_rate": 7.3386748279227e-05, "loss": 0.0487, "step": 22670 }, { "grad_norm": 0.5497560501098633, "learning_rate": 7.336238722276501e-05, "loss": 0.0485, "step": 22680 }, { "grad_norm": 0.5816964507102966, "learning_rate": 7.333801906942663e-05, "loss": 0.0803, "step": 22690 }, { "grad_norm": 0.7210990786552429, "learning_rate": 7.331364382661428e-05, "loss": 0.0656, "step": 22700 }, { "grad_norm": 0.6432394981384277, "learning_rate": 7.328926150173248e-05, "loss": 0.0604, "step": 22710 }, { "grad_norm": 0.682384192943573, "learning_rate": 7.326487210218795e-05, "loss": 0.0602, "step": 22720 }, { "grad_norm": 0.5068738460540771, "learning_rate": 7.324047563538955e-05, "loss": 0.0432, "step": 22730 }, { "grad_norm": 0.4075212776660919, "learning_rate": 7.321607210874828e-05, "loss": 0.0464, "step": 22740 }, { "grad_norm": 0.6132088899612427, "learning_rate": 7.31916615296773e-05, "loss": 0.058, "step": 22750 }, { "grad_norm": 0.4163596034049988, "learning_rate": 7.316724390559188e-05, "loss": 0.0503, "step": 22760 }, { "grad_norm": 0.581365168094635, "learning_rate": 7.314281924390946e-05, "loss": 0.0765, "step": 22770 }, { "grad_norm": 0.6166781783103943, "learning_rate": 7.311838755204959e-05, "loss": 0.0547, "step": 22780 }, { "grad_norm": 0.7833077311515808, "learning_rate": 7.3093948837434e-05, "loss": 0.0538, "step": 22790 }, { "grad_norm": 0.9271983504295349, "learning_rate": 7.306950310748651e-05, "loss": 0.0664, "step": 22800 }, { "grad_norm": 0.7816934585571289, "learning_rate": 7.304505036963311e-05, "loss": 0.0569, "step": 22810 }, { "grad_norm": 0.7262480854988098, "learning_rate": 7.302059063130186e-05, "loss": 0.0645, "step": 22820 }, { "grad_norm": 0.7353392243385315, "learning_rate": 7.2996123899923e-05, "loss": 0.0876, "step": 22830 }, { "grad_norm": 0.4566977322101593, "learning_rate": 7.297165018292886e-05, "loss": 0.0526, "step": 22840 }, { "grad_norm": 0.6117882132530212, "learning_rate": 7.294716948775396e-05, "loss": 0.0578, "step": 22850 }, { "grad_norm": 0.5866395831108093, "learning_rate": 7.292268182183484e-05, "loss": 0.0533, "step": 22860 }, { "grad_norm": 0.5306499004364014, "learning_rate": 7.28981871926102e-05, "loss": 0.0582, "step": 22870 }, { "grad_norm": 0.7079401016235352, "learning_rate": 7.28736856075209e-05, "loss": 0.0538, "step": 22880 }, { "grad_norm": 0.7818320393562317, "learning_rate": 7.284917707400985e-05, "loss": 0.0561, "step": 22890 }, { "grad_norm": 0.8392692804336548, "learning_rate": 7.282466159952212e-05, "loss": 0.0691, "step": 22900 }, { "grad_norm": 0.6850519180297852, "learning_rate": 7.280013919150483e-05, "loss": 0.0813, "step": 22910 }, { "grad_norm": 0.5098279118537903, "learning_rate": 7.277560985740728e-05, "loss": 0.0584, "step": 22920 }, { "grad_norm": 0.7357016205787659, "learning_rate": 7.275107360468079e-05, "loss": 0.0483, "step": 22930 }, { "grad_norm": 0.6798924803733826, "learning_rate": 7.272653044077885e-05, "loss": 0.0563, "step": 22940 }, { "grad_norm": 1.0033750534057617, "learning_rate": 7.270198037315703e-05, "loss": 0.06, "step": 22950 }, { "grad_norm": 0.5840441584587097, "learning_rate": 7.267742340927297e-05, "loss": 0.0595, "step": 22960 }, { "grad_norm": 0.6806386709213257, "learning_rate": 7.265285955658645e-05, "loss": 0.0635, "step": 22970 }, { "grad_norm": 0.7036235332489014, "learning_rate": 7.26282888225593e-05, "loss": 0.0617, "step": 22980 }, { "grad_norm": 0.6128553748130798, "learning_rate": 7.260371121465548e-05, "loss": 0.0413, "step": 22990 }, { "grad_norm": 0.6523773670196533, "learning_rate": 7.2579126740341e-05, "loss": 0.0608, "step": 23000 }, { "grad_norm": 0.7903918027877808, "learning_rate": 7.2554535407084e-05, "loss": 0.0465, "step": 23010 }, { "grad_norm": 0.928288996219635, "learning_rate": 7.252993722235464e-05, "loss": 0.0627, "step": 23020 }, { "grad_norm": 0.5496142506599426, "learning_rate": 7.250533219362523e-05, "loss": 0.049, "step": 23030 }, { "grad_norm": 0.4551919996738434, "learning_rate": 7.248072032837012e-05, "loss": 0.0589, "step": 23040 }, { "grad_norm": 0.4875103235244751, "learning_rate": 7.245610163406575e-05, "loss": 0.0491, "step": 23050 }, { "grad_norm": 0.3776809573173523, "learning_rate": 7.243147611819061e-05, "loss": 0.0635, "step": 23060 }, { "grad_norm": 0.5706813335418701, "learning_rate": 7.240684378822531e-05, "loss": 0.0609, "step": 23070 }, { "grad_norm": 0.4160725772380829, "learning_rate": 7.238220465165248e-05, "loss": 0.0554, "step": 23080 }, { "grad_norm": 0.5060400366783142, "learning_rate": 7.235755871595684e-05, "loss": 0.0632, "step": 23090 }, { "grad_norm": 0.79752117395401, "learning_rate": 7.233290598862517e-05, "loss": 0.0754, "step": 23100 }, { "grad_norm": 0.8573857545852661, "learning_rate": 7.230824647714635e-05, "loss": 0.0657, "step": 23110 }, { "grad_norm": 0.6249211430549622, "learning_rate": 7.228358018901124e-05, "loss": 0.0598, "step": 23120 }, { "grad_norm": 0.9326733946800232, "learning_rate": 7.225890713171286e-05, "loss": 0.055, "step": 23130 }, { "grad_norm": 0.6601552963256836, "learning_rate": 7.223422731274618e-05, "loss": 0.0431, "step": 23140 }, { "grad_norm": 0.6139488220214844, "learning_rate": 7.220954073960832e-05, "loss": 0.0531, "step": 23150 }, { "grad_norm": 0.7813953161239624, "learning_rate": 7.218484741979838e-05, "loss": 0.0725, "step": 23160 }, { "grad_norm": 0.600694477558136, "learning_rate": 7.216014736081756e-05, "loss": 0.0516, "step": 23170 }, { "grad_norm": 0.5798155665397644, "learning_rate": 7.213544057016906e-05, "loss": 0.0551, "step": 23180 }, { "grad_norm": 0.5388294458389282, "learning_rate": 7.211072705535819e-05, "loss": 0.0568, "step": 23190 }, { "grad_norm": 0.7960654497146606, "learning_rate": 7.208600682389224e-05, "loss": 0.0508, "step": 23200 }, { "grad_norm": 0.4978260397911072, "learning_rate": 7.206127988328055e-05, "loss": 0.0839, "step": 23210 }, { "grad_norm": 1.0118571519851685, "learning_rate": 7.203654624103453e-05, "loss": 0.0636, "step": 23220 }, { "grad_norm": 0.6566668152809143, "learning_rate": 7.201180590466761e-05, "loss": 0.0733, "step": 23230 }, { "grad_norm": 0.7450770139694214, "learning_rate": 7.198705888169523e-05, "loss": 0.0695, "step": 23240 }, { "grad_norm": 0.6877014636993408, "learning_rate": 7.196230517963491e-05, "loss": 0.0609, "step": 23250 }, { "grad_norm": 0.5616300106048584, "learning_rate": 7.193754480600615e-05, "loss": 0.0867, "step": 23260 }, { "grad_norm": 0.8734957575798035, "learning_rate": 7.19127777683305e-05, "loss": 0.0569, "step": 23270 }, { "grad_norm": 0.6403398513793945, "learning_rate": 7.188800407413156e-05, "loss": 0.0683, "step": 23280 }, { "grad_norm": 0.4915678799152374, "learning_rate": 7.186322373093489e-05, "loss": 0.0619, "step": 23290 }, { "grad_norm": 0.6062947511672974, "learning_rate": 7.18384367462681e-05, "loss": 0.06, "step": 23300 }, { "grad_norm": 0.6277279257774353, "learning_rate": 7.181364312766085e-05, "loss": 0.0566, "step": 23310 }, { "grad_norm": 0.52444988489151, "learning_rate": 7.178884288264477e-05, "loss": 0.0643, "step": 23320 }, { "grad_norm": 0.8670199513435364, "learning_rate": 7.176403601875353e-05, "loss": 0.0641, "step": 23330 }, { "grad_norm": 0.43632954359054565, "learning_rate": 7.173922254352279e-05, "loss": 0.0405, "step": 23340 }, { "grad_norm": 0.8013917207717896, "learning_rate": 7.171440246449024e-05, "loss": 0.0766, "step": 23350 }, { "grad_norm": 0.32203349471092224, "learning_rate": 7.168957578919555e-05, "loss": 0.0537, "step": 23360 }, { "grad_norm": 0.9714429974555969, "learning_rate": 7.16647425251804e-05, "loss": 0.0559, "step": 23370 }, { "grad_norm": 0.6008636355400085, "learning_rate": 7.163990267998852e-05, "loss": 0.0559, "step": 23380 }, { "grad_norm": 0.46100303530693054, "learning_rate": 7.161505626116556e-05, "loss": 0.0645, "step": 23390 }, { "grad_norm": 0.49934840202331543, "learning_rate": 7.159020327625923e-05, "loss": 0.0572, "step": 23400 }, { "grad_norm": 0.8620927929878235, "learning_rate": 7.15653437328192e-05, "loss": 0.067, "step": 23410 }, { "grad_norm": 0.6615041494369507, "learning_rate": 7.154047763839713e-05, "loss": 0.0526, "step": 23420 }, { "grad_norm": 0.5963443517684937, "learning_rate": 7.15156050005467e-05, "loss": 0.0642, "step": 23430 }, { "grad_norm": 0.8504436612129211, "learning_rate": 7.149072582682357e-05, "loss": 0.0588, "step": 23440 }, { "grad_norm": 0.9030956029891968, "learning_rate": 7.146584012478535e-05, "loss": 0.0583, "step": 23450 }, { "grad_norm": 0.79917311668396, "learning_rate": 7.144094790199169e-05, "loss": 0.0612, "step": 23460 }, { "grad_norm": 0.5858286619186401, "learning_rate": 7.141604916600415e-05, "loss": 0.0591, "step": 23470 }, { "grad_norm": 0.6871239542961121, "learning_rate": 7.139114392438635e-05, "loss": 0.0608, "step": 23480 }, { "grad_norm": 0.5617266297340393, "learning_rate": 7.136623218470382e-05, "loss": 0.0458, "step": 23490 }, { "grad_norm": 0.5017833113670349, "learning_rate": 7.13413139545241e-05, "loss": 0.0798, "step": 23500 }, { "grad_norm": 0.41838136315345764, "learning_rate": 7.131638924141668e-05, "loss": 0.0577, "step": 23510 }, { "grad_norm": 1.0235583782196045, "learning_rate": 7.129145805295304e-05, "loss": 0.0594, "step": 23520 }, { "grad_norm": 0.9780168533325195, "learning_rate": 7.126652039670661e-05, "loss": 0.0663, "step": 23530 }, { "grad_norm": 0.8003699779510498, "learning_rate": 7.124157628025278e-05, "loss": 0.0652, "step": 23540 }, { "grad_norm": 0.5743614435195923, "learning_rate": 7.121662571116894e-05, "loss": 0.06, "step": 23550 }, { "grad_norm": 0.9889276027679443, "learning_rate": 7.119166869703441e-05, "loss": 0.065, "step": 23560 }, { "grad_norm": 0.4329019784927368, "learning_rate": 7.116670524543044e-05, "loss": 0.0516, "step": 23570 }, { "grad_norm": 0.616698682308197, "learning_rate": 7.114173536394032e-05, "loss": 0.0602, "step": 23580 }, { "grad_norm": 0.6506422758102417, "learning_rate": 7.111675906014917e-05, "loss": 0.072, "step": 23590 }, { "grad_norm": 0.5236632227897644, "learning_rate": 7.109177634164421e-05, "loss": 0.062, "step": 23600 }, { "grad_norm": 0.7336872816085815, "learning_rate": 7.106678721601449e-05, "loss": 0.074, "step": 23610 }, { "grad_norm": 0.4745640754699707, "learning_rate": 7.104179169085103e-05, "loss": 0.0514, "step": 23620 }, { "grad_norm": 0.5329028367996216, "learning_rate": 7.101678977374683e-05, "loss": 0.0594, "step": 23630 }, { "grad_norm": 0.6551546454429626, "learning_rate": 7.099178147229685e-05, "loss": 0.056, "step": 23640 }, { "grad_norm": 0.6384695172309875, "learning_rate": 7.096676679409789e-05, "loss": 0.0505, "step": 23650 }, { "grad_norm": 0.7491097450256348, "learning_rate": 7.094174574674877e-05, "loss": 0.0964, "step": 23660 }, { "grad_norm": 0.4994812309741974, "learning_rate": 7.091671833785025e-05, "loss": 0.0454, "step": 23670 }, { "grad_norm": 0.6800205707550049, "learning_rate": 7.089168457500493e-05, "loss": 0.0566, "step": 23680 }, { "grad_norm": 0.7037068605422974, "learning_rate": 7.086664446581747e-05, "loss": 0.0526, "step": 23690 }, { "grad_norm": 0.5564907193183899, "learning_rate": 7.084159801789438e-05, "loss": 0.0564, "step": 23700 }, { "grad_norm": 0.7723528146743774, "learning_rate": 7.081654523884411e-05, "loss": 0.0734, "step": 23710 }, { "grad_norm": 0.6184577345848083, "learning_rate": 7.0791486136277e-05, "loss": 0.0516, "step": 23720 }, { "grad_norm": 0.8468604683876038, "learning_rate": 7.07664207178054e-05, "loss": 0.0553, "step": 23730 }, { "grad_norm": 0.6393225193023682, "learning_rate": 7.074134899104345e-05, "loss": 0.0479, "step": 23740 }, { "grad_norm": 0.377490371465683, "learning_rate": 7.071627096360735e-05, "loss": 0.0837, "step": 23750 }, { "grad_norm": 0.6468838453292847, "learning_rate": 7.069118664311511e-05, "loss": 0.0563, "step": 23760 }, { "grad_norm": 0.47418177127838135, "learning_rate": 7.06660960371867e-05, "loss": 0.0378, "step": 23770 }, { "grad_norm": 0.7357994318008423, "learning_rate": 7.064099915344396e-05, "loss": 0.0543, "step": 23780 }, { "grad_norm": 0.8660827875137329, "learning_rate": 7.061589599951066e-05, "loss": 0.0528, "step": 23790 }, { "grad_norm": 0.5211430788040161, "learning_rate": 7.05907865830125e-05, "loss": 0.0506, "step": 23800 }, { "grad_norm": 0.7539572715759277, "learning_rate": 7.056567091157703e-05, "loss": 0.0551, "step": 23810 }, { "grad_norm": 0.6959972381591797, "learning_rate": 7.054054899283375e-05, "loss": 0.0483, "step": 23820 }, { "grad_norm": 0.5721155405044556, "learning_rate": 7.051542083441403e-05, "loss": 0.0496, "step": 23830 }, { "grad_norm": 0.5185775756835938, "learning_rate": 7.049028644395113e-05, "loss": 0.0543, "step": 23840 }, { "grad_norm": 0.710403323173523, "learning_rate": 7.046514582908024e-05, "loss": 0.0545, "step": 23850 }, { "grad_norm": 0.5954424142837524, "learning_rate": 7.043999899743838e-05, "loss": 0.0541, "step": 23860 }, { "grad_norm": 0.6560906171798706, "learning_rate": 7.041484595666451e-05, "loss": 0.0618, "step": 23870 }, { "grad_norm": 0.6001397967338562, "learning_rate": 7.038968671439948e-05, "loss": 0.0549, "step": 23880 }, { "grad_norm": 0.6724897027015686, "learning_rate": 7.036452127828596e-05, "loss": 0.0492, "step": 23890 }, { "grad_norm": 0.6052905917167664, "learning_rate": 7.033934965596859e-05, "loss": 0.0451, "step": 23900 }, { "grad_norm": 0.5795575380325317, "learning_rate": 7.031417185509381e-05, "loss": 0.0465, "step": 23910 }, { "grad_norm": 0.6924315690994263, "learning_rate": 7.028898788331e-05, "loss": 0.0597, "step": 23920 }, { "grad_norm": 0.604254424571991, "learning_rate": 7.026379774826736e-05, "loss": 0.0587, "step": 23930 }, { "grad_norm": 0.4950799345970154, "learning_rate": 7.0238601457618e-05, "loss": 0.0542, "step": 23940 }, { "grad_norm": 0.8124343752861023, "learning_rate": 7.02133990190159e-05, "loss": 0.0696, "step": 23950 }, { "grad_norm": 0.8637648224830627, "learning_rate": 7.018819044011687e-05, "loss": 0.0618, "step": 23960 }, { "grad_norm": 0.6274315714836121, "learning_rate": 7.016297572857863e-05, "loss": 0.0558, "step": 23970 }, { "grad_norm": 0.8694181442260742, "learning_rate": 7.013775489206072e-05, "loss": 0.0666, "step": 23980 }, { "grad_norm": 0.501493513584137, "learning_rate": 7.01125279382246e-05, "loss": 0.0765, "step": 23990 }, { "grad_norm": 0.9049879312515259, "learning_rate": 7.008729487473351e-05, "loss": 0.0619, "step": 24000 }, { "grad_norm": 0.9387375712394714, "learning_rate": 7.006205570925263e-05, "loss": 0.0614, "step": 24010 }, { "grad_norm": 0.7630560994148254, "learning_rate": 7.003681044944892e-05, "loss": 0.0531, "step": 24020 }, { "grad_norm": 0.5737645030021667, "learning_rate": 7.001155910299126e-05, "loss": 0.0481, "step": 24030 }, { "grad_norm": 0.7408298850059509, "learning_rate": 6.99863016775503e-05, "loss": 0.0626, "step": 24040 }, { "grad_norm": 0.6114224195480347, "learning_rate": 6.996103818079859e-05, "loss": 0.0427, "step": 24050 }, { "grad_norm": 0.6437655091285706, "learning_rate": 6.993576862041054e-05, "loss": 0.0442, "step": 24060 }, { "grad_norm": 0.729150116443634, "learning_rate": 6.991049300406235e-05, "loss": 0.0614, "step": 24070 }, { "grad_norm": 0.7160778641700745, "learning_rate": 6.988521133943209e-05, "loss": 0.064, "step": 24080 }, { "grad_norm": 0.7164499163627625, "learning_rate": 6.985992363419966e-05, "loss": 0.0543, "step": 24090 }, { "grad_norm": 0.5155709981918335, "learning_rate": 6.983462989604682e-05, "loss": 0.0762, "step": 24100 }, { "grad_norm": 0.5107656717300415, "learning_rate": 6.980933013265709e-05, "loss": 0.0476, "step": 24110 }, { "grad_norm": 0.8063297867774963, "learning_rate": 6.978402435171592e-05, "loss": 0.0615, "step": 24120 }, { "grad_norm": 0.4268251061439514, "learning_rate": 6.975871256091052e-05, "loss": 0.0673, "step": 24130 }, { "grad_norm": 0.5057892203330994, "learning_rate": 6.973339476792995e-05, "loss": 0.0744, "step": 24140 }, { "grad_norm": 0.7367579340934753, "learning_rate": 6.970807098046505e-05, "loss": 0.0575, "step": 24150 }, { "grad_norm": 0.467750608921051, "learning_rate": 6.968274120620858e-05, "loss": 0.0655, "step": 24160 }, { "grad_norm": 0.3549362123012543, "learning_rate": 6.965740545285499e-05, "loss": 0.0449, "step": 24170 }, { "grad_norm": 0.6758797764778137, "learning_rate": 6.963206372810068e-05, "loss": 0.0577, "step": 24180 }, { "grad_norm": 0.7578173875808716, "learning_rate": 6.960671603964375e-05, "loss": 0.0543, "step": 24190 }, { "grad_norm": 0.4973852336406708, "learning_rate": 6.958136239518418e-05, "loss": 0.0643, "step": 24200 }, { "grad_norm": 0.5441197752952576, "learning_rate": 6.955600280242371e-05, "loss": 0.0549, "step": 24210 }, { "grad_norm": 0.9610100984573364, "learning_rate": 6.953063726906596e-05, "loss": 0.061, "step": 24220 }, { "grad_norm": 1.3355762958526611, "learning_rate": 6.950526580281626e-05, "loss": 0.0491, "step": 24230 }, { "grad_norm": 0.5656462907791138, "learning_rate": 6.947988841138184e-05, "loss": 0.0537, "step": 24240 }, { "grad_norm": 0.8859825134277344, "learning_rate": 6.945450510247165e-05, "loss": 0.0559, "step": 24250 }, { "grad_norm": 0.4195842444896698, "learning_rate": 6.942911588379647e-05, "loss": 0.0624, "step": 24260 }, { "grad_norm": 0.5830894112586975, "learning_rate": 6.940372076306888e-05, "loss": 0.0525, "step": 24270 }, { "grad_norm": 0.5326490998268127, "learning_rate": 6.937831974800326e-05, "loss": 0.0478, "step": 24280 }, { "grad_norm": 0.6736019849777222, "learning_rate": 6.935291284631574e-05, "loss": 0.0541, "step": 24290 }, { "grad_norm": 0.49964824318885803, "learning_rate": 6.932750006572428e-05, "loss": 0.0498, "step": 24300 }, { "grad_norm": 0.5319270491600037, "learning_rate": 6.930208141394863e-05, "loss": 0.046, "step": 24310 }, { "grad_norm": 0.8407612442970276, "learning_rate": 6.927665689871026e-05, "loss": 0.0485, "step": 24320 }, { "grad_norm": 0.8040627837181091, "learning_rate": 6.925122652773253e-05, "loss": 0.055, "step": 24330 }, { "grad_norm": 0.552007257938385, "learning_rate": 6.922579030874046e-05, "loss": 0.0473, "step": 24340 }, { "grad_norm": 0.8753889799118042, "learning_rate": 6.920034824946093e-05, "loss": 0.0642, "step": 24350 }, { "grad_norm": 0.6230931878089905, "learning_rate": 6.917490035762255e-05, "loss": 0.0805, "step": 24360 }, { "grad_norm": 0.6692032217979431, "learning_rate": 6.914944664095573e-05, "loss": 0.0548, "step": 24370 }, { "grad_norm": 0.6793791651725769, "learning_rate": 6.912398710719264e-05, "loss": 0.0524, "step": 24380 }, { "grad_norm": 0.7718807458877563, "learning_rate": 6.90985217640672e-05, "loss": 0.0697, "step": 24390 }, { "grad_norm": 0.7136959433555603, "learning_rate": 6.90730506193151e-05, "loss": 0.049, "step": 24400 }, { "grad_norm": 0.5187110304832458, "learning_rate": 6.904757368067384e-05, "loss": 0.0984, "step": 24410 }, { "grad_norm": 0.9893154501914978, "learning_rate": 6.90220909558826e-05, "loss": 0.0544, "step": 24420 }, { "grad_norm": 0.8141559958457947, "learning_rate": 6.899660245268237e-05, "loss": 0.0571, "step": 24430 }, { "grad_norm": 0.498773455619812, "learning_rate": 6.897110817881592e-05, "loss": 0.0583, "step": 24440 }, { "grad_norm": 0.5945766568183899, "learning_rate": 6.894560814202769e-05, "loss": 0.0516, "step": 24450 }, { "grad_norm": 0.5740294456481934, "learning_rate": 6.892010235006394e-05, "loss": 0.0483, "step": 24460 }, { "grad_norm": 0.5815576314926147, "learning_rate": 6.889459081067264e-05, "loss": 0.0463, "step": 24470 }, { "grad_norm": 0.5425485372543335, "learning_rate": 6.886907353160356e-05, "loss": 0.0458, "step": 24480 }, { "grad_norm": 0.6611071825027466, "learning_rate": 6.884355052060814e-05, "loss": 0.0847, "step": 24490 }, { "grad_norm": 0.6405183672904968, "learning_rate": 6.88180217854396e-05, "loss": 0.0537, "step": 24500 }, { "grad_norm": 0.6598913073539734, "learning_rate": 6.87924873338529e-05, "loss": 0.0773, "step": 24510 }, { "grad_norm": 0.449797660112381, "learning_rate": 6.876694717360475e-05, "loss": 0.059, "step": 24520 }, { "grad_norm": 0.6302838921546936, "learning_rate": 6.874140131245355e-05, "loss": 0.0686, "step": 24530 }, { "grad_norm": 0.6651034951210022, "learning_rate": 6.871584975815948e-05, "loss": 0.0597, "step": 24540 }, { "grad_norm": 0.9853631854057312, "learning_rate": 6.86902925184844e-05, "loss": 0.0714, "step": 24550 }, { "grad_norm": 0.556399405002594, "learning_rate": 6.866472960119195e-05, "loss": 0.0545, "step": 24560 }, { "grad_norm": 0.4837063252925873, "learning_rate": 6.863916101404748e-05, "loss": 0.07, "step": 24570 }, { "grad_norm": 0.7494713068008423, "learning_rate": 6.8613586764818e-05, "loss": 0.0584, "step": 24580 }, { "grad_norm": 0.5818883776664734, "learning_rate": 6.858800686127233e-05, "loss": 0.0516, "step": 24590 }, { "grad_norm": 0.545595109462738, "learning_rate": 6.856242131118097e-05, "loss": 0.055, "step": 24600 }, { "grad_norm": 0.4352235198020935, "learning_rate": 6.853683012231614e-05, "loss": 0.0585, "step": 24610 }, { "grad_norm": 0.8178169131278992, "learning_rate": 6.851123330245173e-05, "loss": 0.0508, "step": 24620 }, { "grad_norm": 0.514870285987854, "learning_rate": 6.848563085936343e-05, "loss": 0.0635, "step": 24630 }, { "grad_norm": 0.6247340440750122, "learning_rate": 6.846002280082853e-05, "loss": 0.0614, "step": 24640 }, { "grad_norm": 0.9632478952407837, "learning_rate": 6.843440913462614e-05, "loss": 0.0565, "step": 24650 }, { "grad_norm": 0.5392687916755676, "learning_rate": 6.840878986853698e-05, "loss": 0.0568, "step": 24660 }, { "grad_norm": 0.5714540481567383, "learning_rate": 6.838316501034352e-05, "loss": 0.0529, "step": 24670 }, { "grad_norm": 0.9044665098190308, "learning_rate": 6.83575345678299e-05, "loss": 0.0682, "step": 24680 }, { "grad_norm": 0.6406399607658386, "learning_rate": 6.833189854878196e-05, "loss": 0.0573, "step": 24690 }, { "grad_norm": 0.6109222173690796, "learning_rate": 6.83062569609873e-05, "loss": 0.051, "step": 24700 }, { "grad_norm": 0.5410897135734558, "learning_rate": 6.828060981223512e-05, "loss": 0.0806, "step": 24710 }, { "grad_norm": 0.613184928894043, "learning_rate": 6.825495711031634e-05, "loss": 0.0413, "step": 24720 }, { "grad_norm": 0.6805538535118103, "learning_rate": 6.822929886302359e-05, "loss": 0.055, "step": 24730 }, { "grad_norm": 0.44007518887519836, "learning_rate": 6.820363507815116e-05, "loss": 0.0459, "step": 24740 }, { "grad_norm": 0.8172946572303772, "learning_rate": 6.817796576349501e-05, "loss": 0.0526, "step": 24750 }, { "grad_norm": 0.7200448513031006, "learning_rate": 6.815229092685285e-05, "loss": 0.0578, "step": 24760 }, { "grad_norm": 0.8102459907531738, "learning_rate": 6.812661057602399e-05, "loss": 0.0616, "step": 24770 }, { "grad_norm": 0.446368008852005, "learning_rate": 6.810092471880943e-05, "loss": 0.0526, "step": 24780 }, { "grad_norm": 0.5898268818855286, "learning_rate": 6.807523336301187e-05, "loss": 0.058, "step": 24790 }, { "grad_norm": 0.5935202836990356, "learning_rate": 6.804953651643566e-05, "loss": 0.0537, "step": 24800 }, { "grad_norm": 0.5048432350158691, "learning_rate": 6.802383418688685e-05, "loss": 0.0557, "step": 24810 }, { "grad_norm": 0.5212246179580688, "learning_rate": 6.799812638217309e-05, "loss": 0.0573, "step": 24820 }, { "grad_norm": 0.41915827989578247, "learning_rate": 6.797241311010373e-05, "loss": 0.0456, "step": 24830 }, { "grad_norm": 0.6479530930519104, "learning_rate": 6.794669437848982e-05, "loss": 0.0446, "step": 24840 }, { "grad_norm": 0.6561416983604431, "learning_rate": 6.792097019514402e-05, "loss": 0.0542, "step": 24850 }, { "grad_norm": 0.7728612422943115, "learning_rate": 6.789524056788064e-05, "loss": 0.0767, "step": 24860 }, { "grad_norm": 0.8217223882675171, "learning_rate": 6.786950550451567e-05, "loss": 0.0745, "step": 24870 }, { "grad_norm": 0.6085757613182068, "learning_rate": 6.784376501286676e-05, "loss": 0.0559, "step": 24880 }, { "grad_norm": 0.5769059062004089, "learning_rate": 6.781801910075316e-05, "loss": 0.054, "step": 24890 }, { "grad_norm": 0.6749511957168579, "learning_rate": 6.779226777599581e-05, "loss": 0.0625, "step": 24900 }, { "grad_norm": 0.4416142404079437, "learning_rate": 6.776651104641729e-05, "loss": 0.0516, "step": 24910 }, { "grad_norm": 0.5071907043457031, "learning_rate": 6.774074891984183e-05, "loss": 0.0825, "step": 24920 }, { "grad_norm": 0.37446314096450806, "learning_rate": 6.771498140409526e-05, "loss": 0.0552, "step": 24930 }, { "grad_norm": 0.6852632164955139, "learning_rate": 6.768920850700506e-05, "loss": 0.0538, "step": 24940 }, { "grad_norm": 0.5290052890777588, "learning_rate": 6.766343023640039e-05, "loss": 0.042, "step": 24950 }, { "grad_norm": 0.41908013820648193, "learning_rate": 6.763764660011198e-05, "loss": 0.0497, "step": 24960 }, { "grad_norm": 0.5955412983894348, "learning_rate": 6.761185760597223e-05, "loss": 0.0436, "step": 24970 }, { "grad_norm": 0.48287203907966614, "learning_rate": 6.758606326181515e-05, "loss": 0.0568, "step": 24980 }, { "grad_norm": 0.48907747864723206, "learning_rate": 6.75602635754764e-05, "loss": 0.0662, "step": 24990 }, { "grad_norm": 0.569651186466217, "learning_rate": 6.75344585547932e-05, "loss": 0.0455, "step": 25000 }, { "grad_norm": 0.8618650436401367, "learning_rate": 6.750864820760449e-05, "loss": 0.0659, "step": 25010 }, { "grad_norm": 0.6721720099449158, "learning_rate": 6.748283254175072e-05, "loss": 0.0834, "step": 25020 }, { "grad_norm": 0.38320210576057434, "learning_rate": 6.745701156507404e-05, "loss": 0.0601, "step": 25030 }, { "grad_norm": 0.5743666291236877, "learning_rate": 6.743118528541818e-05, "loss": 0.0651, "step": 25040 }, { "grad_norm": 0.550365149974823, "learning_rate": 6.740535371062846e-05, "loss": 0.0704, "step": 25050 }, { "grad_norm": 0.7227051258087158, "learning_rate": 6.737951684855185e-05, "loss": 0.0601, "step": 25060 }, { "grad_norm": 0.8139155507087708, "learning_rate": 6.735367470703691e-05, "loss": 0.0539, "step": 25070 }, { "grad_norm": 0.6102705001831055, "learning_rate": 6.732782729393379e-05, "loss": 0.0579, "step": 25080 }, { "grad_norm": 1.0555191040039062, "learning_rate": 6.730197461709425e-05, "loss": 0.0669, "step": 25090 }, { "grad_norm": 0.7630133032798767, "learning_rate": 6.727611668437164e-05, "loss": 0.0542, "step": 25100 }, { "grad_norm": 0.5342068672180176, "learning_rate": 6.725025350362094e-05, "loss": 0.0626, "step": 25110 }, { "grad_norm": 0.7731459736824036, "learning_rate": 6.72243850826987e-05, "loss": 0.0606, "step": 25120 }, { "grad_norm": 0.48654335737228394, "learning_rate": 6.719851142946305e-05, "loss": 0.0576, "step": 25130 }, { "grad_norm": 0.728100597858429, "learning_rate": 6.717263255177372e-05, "loss": 0.0693, "step": 25140 }, { "grad_norm": 0.7312489151954651, "learning_rate": 6.714674845749205e-05, "loss": 0.0581, "step": 25150 }, { "grad_norm": 0.9274638295173645, "learning_rate": 6.712085915448092e-05, "loss": 0.0559, "step": 25160 }, { "grad_norm": 0.8335204720497131, "learning_rate": 6.709496465060486e-05, "loss": 0.0737, "step": 25170 }, { "grad_norm": 0.5289890766143799, "learning_rate": 6.706906495372987e-05, "loss": 0.0487, "step": 25180 }, { "grad_norm": 1.091695785522461, "learning_rate": 6.704316007172365e-05, "loss": 0.0658, "step": 25190 }, { "grad_norm": 0.459676593542099, "learning_rate": 6.701725001245539e-05, "loss": 0.0596, "step": 25200 }, { "grad_norm": 0.7618143558502197, "learning_rate": 6.699133478379588e-05, "loss": 0.0563, "step": 25210 }, { "grad_norm": 0.6826978921890259, "learning_rate": 6.69654143936175e-05, "loss": 0.0528, "step": 25220 }, { "grad_norm": 0.603641927242279, "learning_rate": 6.693948884979419e-05, "loss": 0.0512, "step": 25230 }, { "grad_norm": 0.5053703784942627, "learning_rate": 6.691355816020142e-05, "loss": 0.0565, "step": 25240 }, { "grad_norm": 0.35608890652656555, "learning_rate": 6.688762233271624e-05, "loss": 0.0444, "step": 25250 }, { "grad_norm": 0.754951000213623, "learning_rate": 6.68616813752173e-05, "loss": 0.0485, "step": 25260 }, { "grad_norm": 0.5915809869766235, "learning_rate": 6.683573529558477e-05, "loss": 0.0583, "step": 25270 }, { "grad_norm": 0.6834119558334351, "learning_rate": 6.680978410170037e-05, "loss": 0.0768, "step": 25280 }, { "grad_norm": 0.626686155796051, "learning_rate": 6.678382780144741e-05, "loss": 0.0528, "step": 25290 }, { "grad_norm": 0.580740213394165, "learning_rate": 6.675786640271071e-05, "loss": 0.0576, "step": 25300 }, { "grad_norm": 0.5660983920097351, "learning_rate": 6.673189991337665e-05, "loss": 0.0434, "step": 25310 }, { "grad_norm": 0.6116959452629089, "learning_rate": 6.670592834133317e-05, "loss": 0.0682, "step": 25320 }, { "grad_norm": 0.6133612990379333, "learning_rate": 6.667995169446979e-05, "loss": 0.0579, "step": 25330 }, { "grad_norm": 0.7114620804786682, "learning_rate": 6.665396998067747e-05, "loss": 0.0457, "step": 25340 }, { "grad_norm": 0.46052825450897217, "learning_rate": 6.66279832078488e-05, "loss": 0.0464, "step": 25350 }, { "grad_norm": 0.6025497913360596, "learning_rate": 6.660199138387786e-05, "loss": 0.0734, "step": 25360 }, { "grad_norm": 0.5583966970443726, "learning_rate": 6.65759945166603e-05, "loss": 0.0488, "step": 25370 }, { "grad_norm": 0.5691313147544861, "learning_rate": 6.654999261409326e-05, "loss": 0.0647, "step": 25380 }, { "grad_norm": 0.776589035987854, "learning_rate": 6.652398568407544e-05, "loss": 0.0591, "step": 25390 }, { "grad_norm": 0.5661314725875854, "learning_rate": 6.649797373450707e-05, "loss": 0.0512, "step": 25400 }, { "grad_norm": 1.3330086469650269, "learning_rate": 6.647195677328988e-05, "loss": 0.0453, "step": 25410 }, { "grad_norm": 0.6834153532981873, "learning_rate": 6.644593480832712e-05, "loss": 0.047, "step": 25420 }, { "grad_norm": 0.5015169382095337, "learning_rate": 6.641990784752363e-05, "loss": 0.0548, "step": 25430 }, { "grad_norm": 0.4559153914451599, "learning_rate": 6.639387589878566e-05, "loss": 0.0485, "step": 25440 }, { "grad_norm": 0.371011346578598, "learning_rate": 6.636783897002103e-05, "loss": 0.0498, "step": 25450 }, { "grad_norm": 0.5706992149353027, "learning_rate": 6.63417970691391e-05, "loss": 0.0564, "step": 25460 }, { "grad_norm": 0.7918767333030701, "learning_rate": 6.63157502040507e-05, "loss": 0.0589, "step": 25470 }, { "grad_norm": 0.7464714646339417, "learning_rate": 6.628969838266819e-05, "loss": 0.0617, "step": 25480 }, { "grad_norm": 0.6603574752807617, "learning_rate": 6.626364161290541e-05, "loss": 0.0537, "step": 25490 }, { "grad_norm": 0.3313160538673401, "learning_rate": 6.623757990267774e-05, "loss": 0.0415, "step": 25500 }, { "grad_norm": 0.4829399287700653, "learning_rate": 6.621151325990201e-05, "loss": 0.0674, "step": 25510 }, { "grad_norm": 0.6523845195770264, "learning_rate": 6.618544169249657e-05, "loss": 0.065, "step": 25520 }, { "grad_norm": 0.6155836582183838, "learning_rate": 6.615936520838133e-05, "loss": 0.0784, "step": 25530 }, { "grad_norm": 0.5626888871192932, "learning_rate": 6.613328381547759e-05, "loss": 0.0526, "step": 25540 }, { "grad_norm": 0.7429402470588684, "learning_rate": 6.610719752170821e-05, "loss": 0.0613, "step": 25550 }, { "grad_norm": 0.74092036485672, "learning_rate": 6.60811063349975e-05, "loss": 0.0611, "step": 25560 }, { "grad_norm": 0.9209029674530029, "learning_rate": 6.605501026327127e-05, "loss": 0.0727, "step": 25570 }, { "grad_norm": 0.5448964834213257, "learning_rate": 6.602890931445685e-05, "loss": 0.0483, "step": 25580 }, { "grad_norm": 0.48267316818237305, "learning_rate": 6.6002803496483e-05, "loss": 0.0506, "step": 25590 }, { "grad_norm": 0.7092859745025635, "learning_rate": 6.597669281727997e-05, "loss": 0.0634, "step": 25600 }, { "grad_norm": 0.7777819633483887, "learning_rate": 6.595057728477949e-05, "loss": 0.062, "step": 25610 }, { "grad_norm": 0.3823172152042389, "learning_rate": 6.59244569069148e-05, "loss": 0.0649, "step": 25620 }, { "grad_norm": 0.5846081972122192, "learning_rate": 6.589833169162054e-05, "loss": 0.0613, "step": 25630 }, { "grad_norm": 0.3824916183948517, "learning_rate": 6.587220164683291e-05, "loss": 0.051, "step": 25640 }, { "grad_norm": 0.7858004570007324, "learning_rate": 6.58460667804895e-05, "loss": 0.0568, "step": 25650 }, { "grad_norm": 0.8527355790138245, "learning_rate": 6.581992710052938e-05, "loss": 0.0684, "step": 25660 }, { "grad_norm": 0.6022881865501404, "learning_rate": 6.579378261489311e-05, "loss": 0.0485, "step": 25670 }, { "grad_norm": 0.3021657466888428, "learning_rate": 6.576763333152268e-05, "loss": 0.0492, "step": 25680 }, { "grad_norm": 0.49712976813316345, "learning_rate": 6.574147925836159e-05, "loss": 0.0414, "step": 25690 }, { "grad_norm": 0.3650001287460327, "learning_rate": 6.571532040335472e-05, "loss": 0.0479, "step": 25700 }, { "grad_norm": 0.6563081741333008, "learning_rate": 6.568915677444845e-05, "loss": 0.0654, "step": 25710 }, { "grad_norm": 0.3313564360141754, "learning_rate": 6.56629883795906e-05, "loss": 0.0574, "step": 25720 }, { "grad_norm": 0.4641467034816742, "learning_rate": 6.563681522673043e-05, "loss": 0.0511, "step": 25730 }, { "grad_norm": 0.6188220977783203, "learning_rate": 6.561063732381867e-05, "loss": 0.0539, "step": 25740 }, { "grad_norm": 0.6114196181297302, "learning_rate": 6.558445467880745e-05, "loss": 0.0427, "step": 25750 }, { "grad_norm": 0.4561634361743927, "learning_rate": 6.55582672996504e-05, "loss": 0.0498, "step": 25760 }, { "grad_norm": 0.5142156481742859, "learning_rate": 6.553207519430253e-05, "loss": 0.0505, "step": 25770 }, { "grad_norm": 0.43580660223960876, "learning_rate": 6.550587837072032e-05, "loss": 0.0522, "step": 25780 }, { "grad_norm": 0.43853631615638733, "learning_rate": 6.547967683686166e-05, "loss": 0.0654, "step": 25790 }, { "grad_norm": 0.39781343936920166, "learning_rate": 6.545347060068591e-05, "loss": 0.0634, "step": 25800 }, { "grad_norm": 0.7153623104095459, "learning_rate": 6.542725967015382e-05, "loss": 0.0602, "step": 25810 }, { "grad_norm": 0.8164191246032715, "learning_rate": 6.540104405322757e-05, "loss": 0.0478, "step": 25820 }, { "grad_norm": 0.4742793142795563, "learning_rate": 6.537482375787077e-05, "loss": 0.0422, "step": 25830 }, { "grad_norm": 0.3310747742652893, "learning_rate": 6.534859879204845e-05, "loss": 0.0495, "step": 25840 }, { "grad_norm": 0.6157658100128174, "learning_rate": 6.532236916372709e-05, "loss": 0.0452, "step": 25850 }, { "grad_norm": 0.9600095152854919, "learning_rate": 6.529613488087454e-05, "loss": 0.074, "step": 25860 }, { "grad_norm": 0.9733326435089111, "learning_rate": 6.526989595146009e-05, "loss": 0.0536, "step": 25870 }, { "grad_norm": 0.8289823532104492, "learning_rate": 6.524365238345441e-05, "loss": 0.0567, "step": 25880 }, { "grad_norm": 0.438699334859848, "learning_rate": 6.521740418482964e-05, "loss": 0.0444, "step": 25890 }, { "grad_norm": 0.5305780172348022, "learning_rate": 6.519115136355925e-05, "loss": 0.0438, "step": 25900 }, { "grad_norm": 0.6337984800338745, "learning_rate": 6.51648939276182e-05, "loss": 0.0445, "step": 25910 }, { "grad_norm": 0.629861056804657, "learning_rate": 6.513863188498277e-05, "loss": 0.0519, "step": 25920 }, { "grad_norm": 0.4113304913043976, "learning_rate": 6.511236524363068e-05, "loss": 0.0531, "step": 25930 }, { "grad_norm": 0.5100784301757812, "learning_rate": 6.508609401154104e-05, "loss": 0.0525, "step": 25940 }, { "grad_norm": 0.4423154294490814, "learning_rate": 6.505981819669439e-05, "loss": 0.0471, "step": 25950 }, { "grad_norm": 0.39601632952690125, "learning_rate": 6.503353780707258e-05, "loss": 0.0511, "step": 25960 }, { "grad_norm": 0.5324782133102417, "learning_rate": 6.500725285065895e-05, "loss": 0.0574, "step": 25970 }, { "grad_norm": 0.5385173559188843, "learning_rate": 6.498096333543813e-05, "loss": 0.0388, "step": 25980 }, { "grad_norm": 0.24984033405780792, "learning_rate": 6.49546692693962e-05, "loss": 0.0498, "step": 25990 }, { "grad_norm": 0.5347210168838501, "learning_rate": 6.492837066052059e-05, "loss": 0.0432, "step": 26000 }, { "grad_norm": 0.9931535124778748, "learning_rate": 6.490206751680014e-05, "loss": 0.0618, "step": 26010 }, { "grad_norm": 0.5912700891494751, "learning_rate": 6.487575984622505e-05, "loss": 0.076, "step": 26020 }, { "grad_norm": 0.7838676571846008, "learning_rate": 6.484944765678689e-05, "loss": 0.0547, "step": 26030 }, { "grad_norm": 0.6506220102310181, "learning_rate": 6.482313095647861e-05, "loss": 0.0614, "step": 26040 }, { "grad_norm": 0.5117701888084412, "learning_rate": 6.479680975329451e-05, "loss": 0.0761, "step": 26050 }, { "grad_norm": 0.6890639066696167, "learning_rate": 6.477048405523031e-05, "loss": 0.0562, "step": 26060 }, { "grad_norm": 0.5404845476150513, "learning_rate": 6.474415387028304e-05, "loss": 0.0375, "step": 26070 }, { "grad_norm": 0.5449622869491577, "learning_rate": 6.471781920645114e-05, "loss": 0.0474, "step": 26080 }, { "grad_norm": 0.5344012379646301, "learning_rate": 6.469148007173434e-05, "loss": 0.0475, "step": 26090 }, { "grad_norm": 0.5555412769317627, "learning_rate": 6.466513647413381e-05, "loss": 0.0385, "step": 26100 }, { "grad_norm": 0.6138212084770203, "learning_rate": 6.463878842165203e-05, "loss": 0.0467, "step": 26110 }, { "grad_norm": 0.634340226650238, "learning_rate": 6.461243592229286e-05, "loss": 0.0524, "step": 26120 }, { "grad_norm": 0.6716986298561096, "learning_rate": 6.458607898406146e-05, "loss": 0.0502, "step": 26130 }, { "grad_norm": 0.7136326432228088, "learning_rate": 6.455971761496439e-05, "loss": 0.0772, "step": 26140 }, { "grad_norm": 0.5983393788337708, "learning_rate": 6.453335182300953e-05, "loss": 0.0505, "step": 26150 }, { "grad_norm": 0.6540200710296631, "learning_rate": 6.450698161620612e-05, "loss": 0.0483, "step": 26160 }, { "grad_norm": 0.4467940628528595, "learning_rate": 6.448060700256473e-05, "loss": 0.0435, "step": 26170 }, { "grad_norm": 0.7553106546401978, "learning_rate": 6.445422799009726e-05, "loss": 0.0649, "step": 26180 }, { "grad_norm": 0.7089672088623047, "learning_rate": 6.442784458681699e-05, "loss": 0.0547, "step": 26190 }, { "grad_norm": 0.46432170271873474, "learning_rate": 6.440145680073847e-05, "loss": 0.045, "step": 26200 }, { "grad_norm": 0.6264537572860718, "learning_rate": 6.437506463987762e-05, "loss": 0.0398, "step": 26210 }, { "grad_norm": 0.7212665677070618, "learning_rate": 6.434866811225168e-05, "loss": 0.0557, "step": 26220 }, { "grad_norm": 0.7179176807403564, "learning_rate": 6.432226722587923e-05, "loss": 0.0544, "step": 26230 }, { "grad_norm": 0.7304664254188538, "learning_rate": 6.429586198878015e-05, "loss": 0.0492, "step": 26240 }, { "grad_norm": 0.5858041048049927, "learning_rate": 6.426945240897566e-05, "loss": 0.0639, "step": 26250 }, { "grad_norm": 0.39044713973999023, "learning_rate": 6.424303849448829e-05, "loss": 0.046, "step": 26260 }, { "grad_norm": 0.5402190089225769, "learning_rate": 6.42166202533419e-05, "loss": 0.0445, "step": 26270 }, { "grad_norm": 0.4856659770011902, "learning_rate": 6.419019769356164e-05, "loss": 0.0527, "step": 26280 }, { "grad_norm": 0.5514198541641235, "learning_rate": 6.416377082317398e-05, "loss": 0.0433, "step": 26290 }, { "grad_norm": 0.443473219871521, "learning_rate": 6.413733965020674e-05, "loss": 0.0559, "step": 26300 }, { "grad_norm": 0.4119541049003601, "learning_rate": 6.411090418268896e-05, "loss": 0.0348, "step": 26310 }, { "grad_norm": 0.3124011158943176, "learning_rate": 6.408446442865109e-05, "loss": 0.0397, "step": 26320 }, { "grad_norm": 0.6865003108978271, "learning_rate": 6.405802039612479e-05, "loss": 0.0571, "step": 26330 }, { "grad_norm": 0.6326753497123718, "learning_rate": 6.403157209314308e-05, "loss": 0.0434, "step": 26340 }, { "grad_norm": 0.5365711450576782, "learning_rate": 6.400511952774024e-05, "loss": 0.058, "step": 26350 }, { "grad_norm": 0.43223416805267334, "learning_rate": 6.397866270795187e-05, "loss": 0.0436, "step": 26360 }, { "grad_norm": 0.5863932967185974, "learning_rate": 6.395220164181489e-05, "loss": 0.0488, "step": 26370 }, { "grad_norm": 0.3780985474586487, "learning_rate": 6.39257363373674e-05, "loss": 0.0445, "step": 26380 }, { "grad_norm": 0.7260715365409851, "learning_rate": 6.389926680264892e-05, "loss": 0.0419, "step": 26390 }, { "grad_norm": 0.5963817834854126, "learning_rate": 6.387279304570017e-05, "loss": 0.0727, "step": 26400 }, { "grad_norm": 0.5931375026702881, "learning_rate": 6.384631507456319e-05, "loss": 0.0427, "step": 26410 }, { "grad_norm": 0.7918265461921692, "learning_rate": 6.381983289728126e-05, "loss": 0.0418, "step": 26420 }, { "grad_norm": 0.49632301926612854, "learning_rate": 6.3793346521899e-05, "loss": 0.0545, "step": 26430 }, { "grad_norm": 0.46386003494262695, "learning_rate": 6.376685595646226e-05, "loss": 0.0444, "step": 26440 }, { "grad_norm": 0.4767666459083557, "learning_rate": 6.374036120901816e-05, "loss": 0.0434, "step": 26450 }, { "grad_norm": 0.6186562180519104, "learning_rate": 6.371386228761514e-05, "loss": 0.0449, "step": 26460 }, { "grad_norm": 0.9290311932563782, "learning_rate": 6.368735920030283e-05, "loss": 0.0889, "step": 26470 }, { "grad_norm": 0.6548288464546204, "learning_rate": 6.366085195513218e-05, "loss": 0.05, "step": 26480 }, { "grad_norm": 0.559697151184082, "learning_rate": 6.363434056015543e-05, "loss": 0.0471, "step": 26490 }, { "grad_norm": 0.5270122289657593, "learning_rate": 6.360782502342599e-05, "loss": 0.0434, "step": 26500 }, { "grad_norm": 0.6539007425308228, "learning_rate": 6.358130535299862e-05, "loss": 0.0478, "step": 26510 }, { "grad_norm": 0.7119054198265076, "learning_rate": 6.355478155692926e-05, "loss": 0.0646, "step": 26520 }, { "grad_norm": 0.5916361808776855, "learning_rate": 6.352825364327517e-05, "loss": 0.0569, "step": 26530 }, { "grad_norm": 0.8404974937438965, "learning_rate": 6.350172162009482e-05, "loss": 0.0494, "step": 26540 }, { "grad_norm": 0.6843665242195129, "learning_rate": 6.347518549544793e-05, "loss": 0.0489, "step": 26550 }, { "grad_norm": 0.5313747525215149, "learning_rate": 6.344864527739547e-05, "loss": 0.0414, "step": 26560 }, { "grad_norm": 0.8456918001174927, "learning_rate": 6.342210097399966e-05, "loss": 0.0413, "step": 26570 }, { "grad_norm": 0.4144016206264496, "learning_rate": 6.339555259332398e-05, "loss": 0.0486, "step": 26580 }, { "grad_norm": 0.501737117767334, "learning_rate": 6.33690001434331e-05, "loss": 0.0399, "step": 26590 }, { "grad_norm": 0.5087623596191406, "learning_rate": 6.334244363239296e-05, "loss": 0.042, "step": 26600 }, { "grad_norm": 0.8149189352989197, "learning_rate": 6.331588306827073e-05, "loss": 0.0506, "step": 26610 }, { "grad_norm": 0.519636332988739, "learning_rate": 6.328931845913483e-05, "loss": 0.0596, "step": 26620 }, { "grad_norm": 0.5172142386436462, "learning_rate": 6.326274981305484e-05, "loss": 0.0587, "step": 26630 }, { "grad_norm": 0.6138646006584167, "learning_rate": 6.323617713810166e-05, "loss": 0.0627, "step": 26640 }, { "grad_norm": 0.5137553811073303, "learning_rate": 6.320960044234734e-05, "loss": 0.0478, "step": 26650 }, { "grad_norm": 0.6260169148445129, "learning_rate": 6.318301973386518e-05, "loss": 0.044, "step": 26660 }, { "grad_norm": 0.4720858037471771, "learning_rate": 6.315643502072971e-05, "loss": 0.064, "step": 26670 }, { "grad_norm": 0.5125299096107483, "learning_rate": 6.312984631101667e-05, "loss": 0.0393, "step": 26680 }, { "grad_norm": 0.5351711511611938, "learning_rate": 6.310325361280297e-05, "loss": 0.0521, "step": 26690 }, { "grad_norm": 0.6028822064399719, "learning_rate": 6.30766569341668e-05, "loss": 0.0411, "step": 26700 }, { "grad_norm": 0.528744637966156, "learning_rate": 6.305005628318753e-05, "loss": 0.0513, "step": 26710 }, { "grad_norm": 0.34974610805511475, "learning_rate": 6.302345166794572e-05, "loss": 0.0456, "step": 26720 }, { "grad_norm": 0.9245039224624634, "learning_rate": 6.299684309652316e-05, "loss": 0.0549, "step": 26730 }, { "grad_norm": 0.5274889469146729, "learning_rate": 6.297023057700283e-05, "loss": 0.0626, "step": 26740 }, { "grad_norm": 0.6402085423469543, "learning_rate": 6.294361411746891e-05, "loss": 0.0437, "step": 26750 }, { "grad_norm": 0.5056595802307129, "learning_rate": 6.291699372600677e-05, "loss": 0.0535, "step": 26760 }, { "grad_norm": 0.6700159311294556, "learning_rate": 6.2890369410703e-05, "loss": 0.0537, "step": 26770 }, { "grad_norm": 0.7659383416175842, "learning_rate": 6.286374117964534e-05, "loss": 0.0552, "step": 26780 }, { "grad_norm": 0.6044502258300781, "learning_rate": 6.283710904092277e-05, "loss": 0.0613, "step": 26790 }, { "grad_norm": 0.3745923936367035, "learning_rate": 6.281047300262542e-05, "loss": 0.0491, "step": 26800 }, { "grad_norm": 0.9643851518630981, "learning_rate": 6.278383307284461e-05, "loss": 0.0518, "step": 26810 }, { "grad_norm": 0.4028518795967102, "learning_rate": 6.275718925967284e-05, "loss": 0.0442, "step": 26820 }, { "grad_norm": 0.43697643280029297, "learning_rate": 6.273054157120382e-05, "loss": 0.0615, "step": 26830 }, { "grad_norm": 0.5079065561294556, "learning_rate": 6.270389001553238e-05, "loss": 0.047, "step": 26840 }, { "grad_norm": 0.6427322626113892, "learning_rate": 6.26772346007546e-05, "loss": 0.0519, "step": 26850 }, { "grad_norm": 0.5184283256530762, "learning_rate": 6.265057533496767e-05, "loss": 0.0421, "step": 26860 }, { "grad_norm": 0.3974684476852417, "learning_rate": 6.262391222626997e-05, "loss": 0.0505, "step": 26870 }, { "grad_norm": 1.1656032800674438, "learning_rate": 6.259724528276106e-05, "loss": 0.0597, "step": 26880 }, { "grad_norm": 0.5613616704940796, "learning_rate": 6.257057451254162e-05, "loss": 0.0509, "step": 26890 }, { "grad_norm": 0.7650153040885925, "learning_rate": 6.254389992371357e-05, "loss": 0.0679, "step": 26900 }, { "grad_norm": 0.40461263060569763, "learning_rate": 6.25172215243799e-05, "loss": 0.0733, "step": 26910 }, { "grad_norm": 0.7547531723976135, "learning_rate": 6.249053932264486e-05, "loss": 0.0487, "step": 26920 }, { "grad_norm": 0.5632423162460327, "learning_rate": 6.246385332661376e-05, "loss": 0.0654, "step": 26930 }, { "grad_norm": 0.5522407293319702, "learning_rate": 6.24371635443931e-05, "loss": 0.071, "step": 26940 }, { "grad_norm": 0.5838381052017212, "learning_rate": 6.241046998409054e-05, "loss": 0.0471, "step": 26950 }, { "grad_norm": 0.6203344464302063, "learning_rate": 6.238377265381489e-05, "loss": 0.0696, "step": 26960 }, { "grad_norm": 0.538800835609436, "learning_rate": 6.235707156167607e-05, "loss": 0.0635, "step": 26970 }, { "grad_norm": 0.515163004398346, "learning_rate": 6.233036671578519e-05, "loss": 0.0548, "step": 26980 }, { "grad_norm": 0.35526135563850403, "learning_rate": 6.230365812425445e-05, "loss": 0.0383, "step": 26990 }, { "grad_norm": 0.5614968538284302, "learning_rate": 6.227694579519724e-05, "loss": 0.0543, "step": 27000 }, { "grad_norm": 0.8947904706001282, "learning_rate": 6.225022973672805e-05, "loss": 0.0567, "step": 27010 }, { "grad_norm": 0.34701669216156006, "learning_rate": 6.222350995696253e-05, "loss": 0.0483, "step": 27020 }, { "grad_norm": 0.7763736248016357, "learning_rate": 6.21967864640174e-05, "loss": 0.072, "step": 27030 }, { "grad_norm": 0.4249142110347748, "learning_rate": 6.217005926601059e-05, "loss": 0.0603, "step": 27040 }, { "grad_norm": 0.5436002612113953, "learning_rate": 6.214332837106111e-05, "loss": 0.0722, "step": 27050 }, { "grad_norm": 0.6789377927780151, "learning_rate": 6.21165937872891e-05, "loss": 0.0596, "step": 27060 }, { "grad_norm": 0.44202062487602234, "learning_rate": 6.208985552281582e-05, "loss": 0.0781, "step": 27070 }, { "grad_norm": 0.40363776683807373, "learning_rate": 6.206311358576364e-05, "loss": 0.0424, "step": 27080 }, { "grad_norm": 0.46312204003334045, "learning_rate": 6.203636798425608e-05, "loss": 0.0444, "step": 27090 }, { "grad_norm": 0.40268516540527344, "learning_rate": 6.20096187264177e-05, "loss": 0.0521, "step": 27100 }, { "grad_norm": 0.5494117140769958, "learning_rate": 6.198286582037425e-05, "loss": 0.0595, "step": 27110 }, { "grad_norm": 0.5455650687217712, "learning_rate": 6.195610927425256e-05, "loss": 0.0497, "step": 27120 }, { "grad_norm": 0.4313790500164032, "learning_rate": 6.192934909618056e-05, "loss": 0.05, "step": 27130 }, { "grad_norm": 0.5438379049301147, "learning_rate": 6.190258529428728e-05, "loss": 0.044, "step": 27140 }, { "grad_norm": 0.6590949296951294, "learning_rate": 6.187581787670285e-05, "loss": 0.0405, "step": 27150 }, { "grad_norm": 0.5797727108001709, "learning_rate": 6.184904685155852e-05, "loss": 0.0488, "step": 27160 }, { "grad_norm": 0.7515206336975098, "learning_rate": 6.18222722269866e-05, "loss": 0.0691, "step": 27170 }, { "grad_norm": 0.5606809854507446, "learning_rate": 6.179549401112053e-05, "loss": 0.0478, "step": 27180 }, { "grad_norm": 0.504827618598938, "learning_rate": 6.176871221209482e-05, "loss": 0.0837, "step": 27190 }, { "grad_norm": 0.8329652547836304, "learning_rate": 6.174192683804508e-05, "loss": 0.058, "step": 27200 }, { "grad_norm": 0.6342533826828003, "learning_rate": 6.1715137897108e-05, "loss": 0.0424, "step": 27210 }, { "grad_norm": 0.4587891101837158, "learning_rate": 6.168834539742134e-05, "loss": 0.0472, "step": 27220 }, { "grad_norm": 0.4679805338382721, "learning_rate": 6.166154934712397e-05, "loss": 0.0376, "step": 27230 }, { "grad_norm": 0.48306846618652344, "learning_rate": 6.163474975435581e-05, "loss": 0.0698, "step": 27240 }, { "grad_norm": 0.40790361166000366, "learning_rate": 6.160794662725787e-05, "loss": 0.0406, "step": 27250 }, { "grad_norm": 0.7069532871246338, "learning_rate": 6.158113997397222e-05, "loss": 0.0437, "step": 27260 }, { "grad_norm": 0.4305925667285919, "learning_rate": 6.155432980264205e-05, "loss": 0.0357, "step": 27270 }, { "grad_norm": 0.4344123601913452, "learning_rate": 6.152751612141156e-05, "loss": 0.0341, "step": 27280 }, { "grad_norm": 0.5393272638320923, "learning_rate": 6.150069893842602e-05, "loss": 0.0465, "step": 27290 }, { "grad_norm": 0.3668077290058136, "learning_rate": 6.147387826183182e-05, "loss": 0.0568, "step": 27300 }, { "grad_norm": 0.47121256589889526, "learning_rate": 6.144705409977635e-05, "loss": 0.0615, "step": 27310 }, { "grad_norm": 0.42164698243141174, "learning_rate": 6.142022646040808e-05, "loss": 0.0566, "step": 27320 }, { "grad_norm": 0.31017717719078064, "learning_rate": 6.139339535187653e-05, "loss": 0.0399, "step": 27330 }, { "grad_norm": 0.5401390194892883, "learning_rate": 6.136656078233232e-05, "loss": 0.0506, "step": 27340 }, { "grad_norm": 0.8385655879974365, "learning_rate": 6.133972275992707e-05, "loss": 0.0421, "step": 27350 }, { "grad_norm": 0.5379586815834045, "learning_rate": 6.131288129281342e-05, "loss": 0.0391, "step": 27360 }, { "grad_norm": 0.4193815290927887, "learning_rate": 6.128603638914516e-05, "loss": 0.0531, "step": 27370 }, { "grad_norm": 0.6303272247314453, "learning_rate": 6.125918805707704e-05, "loss": 0.0418, "step": 27380 }, { "grad_norm": 0.5592548251152039, "learning_rate": 6.123233630476485e-05, "loss": 0.0599, "step": 27390 }, { "grad_norm": 0.777833104133606, "learning_rate": 6.120548114036547e-05, "loss": 0.0685, "step": 27400 }, { "grad_norm": 0.619719922542572, "learning_rate": 6.117862257203679e-05, "loss": 0.0582, "step": 27410 }, { "grad_norm": 0.576230525970459, "learning_rate": 6.115176060793771e-05, "loss": 0.0536, "step": 27420 }, { "grad_norm": 0.45700162649154663, "learning_rate": 6.112489525622822e-05, "loss": 0.046, "step": 27430 }, { "grad_norm": 0.4165976047515869, "learning_rate": 6.109802652506928e-05, "loss": 0.047, "step": 27440 }, { "grad_norm": 0.6612272262573242, "learning_rate": 6.107115442262291e-05, "loss": 0.0487, "step": 27450 }, { "grad_norm": 0.6991482377052307, "learning_rate": 6.104427895705214e-05, "loss": 0.0459, "step": 27460 }, { "grad_norm": 0.5191430449485779, "learning_rate": 6.101740013652103e-05, "loss": 0.0372, "step": 27470 }, { "grad_norm": 0.2975722849369049, "learning_rate": 6.099051796919465e-05, "loss": 0.0442, "step": 27480 }, { "grad_norm": 0.6026555299758911, "learning_rate": 6.096363246323911e-05, "loss": 0.0559, "step": 27490 }, { "grad_norm": 0.7113026976585388, "learning_rate": 6.0936743626821504e-05, "loss": 0.044, "step": 27500 }, { "grad_norm": 0.4630691707134247, "learning_rate": 6.090985146810996e-05, "loss": 0.0596, "step": 27510 }, { "grad_norm": 0.6367729306221008, "learning_rate": 6.088295599527357e-05, "loss": 0.084, "step": 27520 }, { "grad_norm": 0.6678121089935303, "learning_rate": 6.085605721648252e-05, "loss": 0.0473, "step": 27530 }, { "grad_norm": 0.4857844412326813, "learning_rate": 6.082915513990792e-05, "loss": 0.0468, "step": 27540 }, { "grad_norm": 0.4740521311759949, "learning_rate": 6.080224977372192e-05, "loss": 0.0527, "step": 27550 }, { "grad_norm": 0.6159349083900452, "learning_rate": 6.0775341126097666e-05, "loss": 0.0485, "step": 27560 }, { "grad_norm": 0.6239722371101379, "learning_rate": 6.074842920520926e-05, "loss": 0.0634, "step": 27570 }, { "grad_norm": 0.8902099132537842, "learning_rate": 6.072151401923186e-05, "loss": 0.0456, "step": 27580 }, { "grad_norm": 0.7189630270004272, "learning_rate": 6.069459557634159e-05, "loss": 0.0428, "step": 27590 }, { "grad_norm": 0.4995422661304474, "learning_rate": 6.066767388471557e-05, "loss": 0.0561, "step": 27600 }, { "grad_norm": 0.5093445777893066, "learning_rate": 6.064074895253188e-05, "loss": 0.0645, "step": 27610 }, { "grad_norm": 0.6883164644241333, "learning_rate": 6.061382078796961e-05, "loss": 0.054, "step": 27620 }, { "grad_norm": 0.732742428779602, "learning_rate": 6.0586889399208814e-05, "loss": 0.0472, "step": 27630 }, { "grad_norm": 0.4396074116230011, "learning_rate": 6.0559954794430565e-05, "loss": 0.0591, "step": 27640 }, { "grad_norm": 0.6008154153823853, "learning_rate": 6.053301698181687e-05, "loss": 0.0705, "step": 27650 }, { "grad_norm": 0.8479161262512207, "learning_rate": 6.0506075969550725e-05, "loss": 0.0583, "step": 27660 }, { "grad_norm": 0.6310752034187317, "learning_rate": 6.047913176581609e-05, "loss": 0.054, "step": 27670 }, { "grad_norm": 0.705268144607544, "learning_rate": 6.0452184378797904e-05, "loss": 0.0569, "step": 27680 }, { "grad_norm": 0.4688934087753296, "learning_rate": 6.042523381668209e-05, "loss": 0.0336, "step": 27690 }, { "grad_norm": 0.8299901485443115, "learning_rate": 6.03982800876555e-05, "loss": 0.0508, "step": 27700 }, { "grad_norm": 0.5923662781715393, "learning_rate": 6.0371323199905975e-05, "loss": 0.0542, "step": 27710 }, { "grad_norm": 0.605591893196106, "learning_rate": 6.03443631616223e-05, "loss": 0.0448, "step": 27720 }, { "grad_norm": 0.5497490763664246, "learning_rate": 6.031739998099421e-05, "loss": 0.0732, "step": 27730 }, { "grad_norm": 0.7046129703521729, "learning_rate": 6.029043366621243e-05, "loss": 0.0527, "step": 27740 }, { "grad_norm": 0.44197842478752136, "learning_rate": 6.0263464225468615e-05, "loss": 0.0535, "step": 27750 }, { "grad_norm": 0.5590490102767944, "learning_rate": 6.023649166695534e-05, "loss": 0.0337, "step": 27760 }, { "grad_norm": 0.5187345743179321, "learning_rate": 6.0209515998866186e-05, "loss": 0.0499, "step": 27770 }, { "grad_norm": 0.7214081287384033, "learning_rate": 6.018253722939563e-05, "loss": 0.0767, "step": 27780 }, { "grad_norm": 0.5786367058753967, "learning_rate": 6.015555536673914e-05, "loss": 0.0445, "step": 27790 }, { "grad_norm": 0.5488693118095398, "learning_rate": 6.0128570419093054e-05, "loss": 0.0542, "step": 27800 }, { "grad_norm": 0.9604530334472656, "learning_rate": 6.010158239465471e-05, "loss": 0.0601, "step": 27810 }, { "grad_norm": 0.8603129386901855, "learning_rate": 6.007459130162235e-05, "loss": 0.0498, "step": 27820 }, { "grad_norm": 0.5584774613380432, "learning_rate": 6.004759714819516e-05, "loss": 0.0702, "step": 27830 }, { "grad_norm": 0.4264543354511261, "learning_rate": 6.002059994257323e-05, "loss": 0.0414, "step": 27840 }, { "grad_norm": 0.6211913228034973, "learning_rate": 5.999359969295764e-05, "loss": 0.04, "step": 27850 }, { "grad_norm": 0.6049425005912781, "learning_rate": 5.9966596407550314e-05, "loss": 0.056, "step": 27860 }, { "grad_norm": 0.5645686984062195, "learning_rate": 5.993959009455416e-05, "loss": 0.0547, "step": 27870 }, { "grad_norm": 0.6415889859199524, "learning_rate": 5.991258076217298e-05, "loss": 0.0402, "step": 27880 }, { "grad_norm": 0.4261781871318817, "learning_rate": 5.988556841861147e-05, "loss": 0.0518, "step": 27890 }, { "grad_norm": 0.5936372876167297, "learning_rate": 5.985855307207531e-05, "loss": 0.0519, "step": 27900 }, { "grad_norm": 0.48289206624031067, "learning_rate": 5.9831534730771e-05, "loss": 0.0492, "step": 27910 }, { "grad_norm": 0.497627854347229, "learning_rate": 5.980451340290605e-05, "loss": 0.0555, "step": 27920 }, { "grad_norm": 0.5641434788703918, "learning_rate": 5.97774890966888e-05, "loss": 0.0711, "step": 27930 }, { "grad_norm": 1.0671380758285522, "learning_rate": 5.975046182032851e-05, "loss": 0.0801, "step": 27940 }, { "grad_norm": 0.5749000906944275, "learning_rate": 5.972343158203537e-05, "loss": 0.0528, "step": 27950 }, { "grad_norm": 0.538180410861969, "learning_rate": 5.969639839002045e-05, "loss": 0.0556, "step": 27960 }, { "grad_norm": 0.7903569340705872, "learning_rate": 5.966936225249572e-05, "loss": 0.0545, "step": 27970 }, { "grad_norm": 0.5314232707023621, "learning_rate": 5.9642323177674044e-05, "loss": 0.0452, "step": 27980 }, { "grad_norm": 0.6738035082817078, "learning_rate": 5.9615281173769154e-05, "loss": 0.0615, "step": 27990 }, { "grad_norm": 0.502201497554779, "learning_rate": 5.958823624899574e-05, "loss": 0.0464, "step": 28000 }, { "grad_norm": 0.7611238956451416, "learning_rate": 5.956118841156933e-05, "loss": 0.0534, "step": 28010 }, { "grad_norm": 0.6902185082435608, "learning_rate": 5.953413766970631e-05, "loss": 0.0541, "step": 28020 }, { "grad_norm": 0.5565411448478699, "learning_rate": 5.9507084031624e-05, "loss": 0.0594, "step": 28030 }, { "grad_norm": 0.5778388381004333, "learning_rate": 5.948002750554058e-05, "loss": 0.0483, "step": 28040 }, { "grad_norm": 0.5137740969657898, "learning_rate": 5.9452968099675124e-05, "loss": 0.0529, "step": 28050 }, { "grad_norm": 0.4413629472255707, "learning_rate": 5.9425905822247527e-05, "loss": 0.0539, "step": 28060 }, { "grad_norm": 0.7454096078872681, "learning_rate": 5.939884068147864e-05, "loss": 0.0477, "step": 28070 }, { "grad_norm": 0.4319804906845093, "learning_rate": 5.937177268559011e-05, "loss": 0.0433, "step": 28080 }, { "grad_norm": 0.6663150191307068, "learning_rate": 5.934470184280448e-05, "loss": 0.0451, "step": 28090 }, { "grad_norm": 0.48142871260643005, "learning_rate": 5.931762816134516e-05, "loss": 0.0422, "step": 28100 }, { "grad_norm": 0.7843912839889526, "learning_rate": 5.9290551649436434e-05, "loss": 0.0491, "step": 28110 }, { "grad_norm": 0.5853727459907532, "learning_rate": 5.9263472315303416e-05, "loss": 0.0467, "step": 28120 }, { "grad_norm": 0.4461943805217743, "learning_rate": 5.9236390167172096e-05, "loss": 0.056, "step": 28130 }, { "grad_norm": 0.7413589358329773, "learning_rate": 5.920930521326932e-05, "loss": 0.0528, "step": 28140 }, { "grad_norm": 0.8107931613922119, "learning_rate": 5.918221746182276e-05, "loss": 0.0631, "step": 28150 }, { "grad_norm": 0.5533984303474426, "learning_rate": 5.9155126921061e-05, "loss": 0.0446, "step": 28160 }, { "grad_norm": 0.579518735408783, "learning_rate": 5.91280335992134e-05, "loss": 0.0556, "step": 28170 }, { "grad_norm": 0.5540369153022766, "learning_rate": 5.91009375045102e-05, "loss": 0.0452, "step": 28180 }, { "grad_norm": 0.9057053327560425, "learning_rate": 5.9073838645182476e-05, "loss": 0.0523, "step": 28190 }, { "grad_norm": 0.4483063817024231, "learning_rate": 5.904673702946217e-05, "loss": 0.0475, "step": 28200 }, { "grad_norm": 0.4737318754196167, "learning_rate": 5.9019632665582004e-05, "loss": 0.0434, "step": 28210 }, { "grad_norm": 0.748706042766571, "learning_rate": 5.899252556177559e-05, "loss": 0.0593, "step": 28220 }, { "grad_norm": 0.8458240032196045, "learning_rate": 5.896541572627735e-05, "loss": 0.0422, "step": 28230 }, { "grad_norm": 0.4607914388179779, "learning_rate": 5.893830316732253e-05, "loss": 0.0419, "step": 28240 }, { "grad_norm": 0.7460883855819702, "learning_rate": 5.8911187893147214e-05, "loss": 0.0426, "step": 28250 }, { "grad_norm": 0.39254316687583923, "learning_rate": 5.888406991198828e-05, "loss": 0.0601, "step": 28260 }, { "grad_norm": 0.8147833347320557, "learning_rate": 5.885694923208349e-05, "loss": 0.0676, "step": 28270 }, { "grad_norm": 0.6285564303398132, "learning_rate": 5.882982586167138e-05, "loss": 0.0514, "step": 28280 }, { "grad_norm": 0.5265780091285706, "learning_rate": 5.880269980899131e-05, "loss": 0.0354, "step": 28290 }, { "grad_norm": 0.9615175724029541, "learning_rate": 5.8775571082283465e-05, "loss": 0.0592, "step": 28300 }, { "grad_norm": 0.9778316617012024, "learning_rate": 5.8748439689788824e-05, "loss": 0.0712, "step": 28310 }, { "grad_norm": 0.9285436272621155, "learning_rate": 5.87213056397492e-05, "loss": 0.063, "step": 28320 }, { "grad_norm": 0.5479291081428528, "learning_rate": 5.869416894040719e-05, "loss": 0.0476, "step": 28330 }, { "grad_norm": 0.7123785614967346, "learning_rate": 5.866702960000621e-05, "loss": 0.0514, "step": 28340 }, { "grad_norm": 0.4369118809700012, "learning_rate": 5.863988762679048e-05, "loss": 0.0502, "step": 28350 }, { "grad_norm": 0.48228514194488525, "learning_rate": 5.8612743029005e-05, "loss": 0.0379, "step": 28360 }, { "grad_norm": 0.5755884647369385, "learning_rate": 5.858559581489561e-05, "loss": 0.049, "step": 28370 }, { "grad_norm": 0.542680561542511, "learning_rate": 5.85584459927089e-05, "loss": 0.0481, "step": 28380 }, { "grad_norm": 0.65384441614151, "learning_rate": 5.853129357069227e-05, "loss": 0.0491, "step": 28390 }, { "grad_norm": 0.4921700656414032, "learning_rate": 5.8504138557093913e-05, "loss": 0.0392, "step": 28400 }, { "grad_norm": 0.5283504128456116, "learning_rate": 5.8476980960162784e-05, "loss": 0.0552, "step": 28410 }, { "grad_norm": 0.6527729034423828, "learning_rate": 5.844982078814868e-05, "loss": 0.0474, "step": 28420 }, { "grad_norm": 0.5048859119415283, "learning_rate": 5.842265804930211e-05, "loss": 0.0792, "step": 28430 }, { "grad_norm": 0.528708279132843, "learning_rate": 5.839549275187444e-05, "loss": 0.0319, "step": 28440 }, { "grad_norm": 0.4297647476196289, "learning_rate": 5.836832490411771e-05, "loss": 0.0506, "step": 28450 }, { "grad_norm": 0.6205086708068848, "learning_rate": 5.834115451428485e-05, "loss": 0.0326, "step": 28460 }, { "grad_norm": 0.5032621026039124, "learning_rate": 5.831398159062946e-05, "loss": 0.0468, "step": 28470 }, { "grad_norm": 0.6530228853225708, "learning_rate": 5.828680614140599e-05, "loss": 0.047, "step": 28480 }, { "grad_norm": 0.4408782124519348, "learning_rate": 5.825962817486962e-05, "loss": 0.0534, "step": 28490 }, { "grad_norm": 0.4951290786266327, "learning_rate": 5.823244769927629e-05, "loss": 0.0478, "step": 28500 }, { "grad_norm": 0.7258790731430054, "learning_rate": 5.8205264722882716e-05, "loss": 0.0485, "step": 28510 }, { "grad_norm": 0.49423977732658386, "learning_rate": 5.817807925394636e-05, "loss": 0.053, "step": 28520 }, { "grad_norm": 0.7305997610092163, "learning_rate": 5.815089130072546e-05, "loss": 0.0639, "step": 28530 }, { "grad_norm": 0.4863918721675873, "learning_rate": 5.8123700871479e-05, "loss": 0.0529, "step": 28540 }, { "grad_norm": 0.5390973687171936, "learning_rate": 5.809650797446671e-05, "loss": 0.0451, "step": 28550 }, { "grad_norm": 0.4765864908695221, "learning_rate": 5.806931261794907e-05, "loss": 0.0456, "step": 28560 }, { "grad_norm": 0.527809202671051, "learning_rate": 5.804211481018731e-05, "loss": 0.0571, "step": 28570 }, { "grad_norm": 0.5437464118003845, "learning_rate": 5.801491455944341e-05, "loss": 0.0426, "step": 28580 }, { "grad_norm": 0.5086677670478821, "learning_rate": 5.79877118739801e-05, "loss": 0.0438, "step": 28590 }, { "grad_norm": 0.5136754512786865, "learning_rate": 5.7960506762060816e-05, "loss": 0.0628, "step": 28600 }, { "grad_norm": 0.40574634075164795, "learning_rate": 5.793329923194977e-05, "loss": 0.043, "step": 28610 }, { "grad_norm": 0.4951188266277313, "learning_rate": 5.790608929191187e-05, "loss": 0.0465, "step": 28620 }, { "grad_norm": 0.4532805383205414, "learning_rate": 5.78788769502128e-05, "loss": 0.0709, "step": 28630 }, { "grad_norm": 0.4891648590564728, "learning_rate": 5.785166221511894e-05, "loss": 0.038, "step": 28640 }, { "grad_norm": 0.5188447833061218, "learning_rate": 5.7824445094897415e-05, "loss": 0.0503, "step": 28650 }, { "grad_norm": 0.5744966268539429, "learning_rate": 5.7797225597816065e-05, "loss": 0.0367, "step": 28660 }, { "grad_norm": 0.5001101493835449, "learning_rate": 5.777000373214345e-05, "loss": 0.0501, "step": 28670 }, { "grad_norm": 0.5806112885475159, "learning_rate": 5.774277950614885e-05, "loss": 0.0463, "step": 28680 }, { "grad_norm": 0.5650782585144043, "learning_rate": 5.771555292810227e-05, "loss": 0.0468, "step": 28690 }, { "grad_norm": 0.7897985577583313, "learning_rate": 5.768832400627444e-05, "loss": 0.0618, "step": 28700 }, { "grad_norm": 0.651762068271637, "learning_rate": 5.7661092748936775e-05, "loss": 0.0437, "step": 28710 }, { "grad_norm": 0.4676160514354706, "learning_rate": 5.76338591643614e-05, "loss": 0.0419, "step": 28720 }, { "grad_norm": 0.6868846416473389, "learning_rate": 5.760662326082118e-05, "loss": 0.0432, "step": 28730 }, { "grad_norm": 0.7280373573303223, "learning_rate": 5.757938504658965e-05, "loss": 0.0404, "step": 28740 }, { "grad_norm": 0.43459346890449524, "learning_rate": 5.755214452994107e-05, "loss": 0.0452, "step": 28750 }, { "grad_norm": 0.41618359088897705, "learning_rate": 5.752490171915039e-05, "loss": 0.0493, "step": 28760 }, { "grad_norm": 0.5832470655441284, "learning_rate": 5.749765662249324e-05, "loss": 0.0511, "step": 28770 }, { "grad_norm": 0.43801945447921753, "learning_rate": 5.747040924824596e-05, "loss": 0.0659, "step": 28780 }, { "grad_norm": 0.45150160789489746, "learning_rate": 5.7443159604685613e-05, "loss": 0.056, "step": 28790 }, { "grad_norm": 0.5662516951560974, "learning_rate": 5.74159077000899e-05, "loss": 0.0429, "step": 28800 }, { "grad_norm": 0.4180372357368469, "learning_rate": 5.7388653542737235e-05, "loss": 0.0416, "step": 28810 }, { "grad_norm": 0.47983691096305847, "learning_rate": 5.736139714090672e-05, "loss": 0.0475, "step": 28820 }, { "grad_norm": 0.5470103621482849, "learning_rate": 5.73341385028781e-05, "loss": 0.0433, "step": 28830 }, { "grad_norm": 0.6224489808082581, "learning_rate": 5.7306877636931855e-05, "loss": 0.0495, "step": 28840 }, { "grad_norm": 0.4943543076515198, "learning_rate": 5.7279614551349125e-05, "loss": 0.0456, "step": 28850 }, { "grad_norm": 0.7851821780204773, "learning_rate": 5.725234925441169e-05, "loss": 0.0581, "step": 28860 }, { "grad_norm": 0.968539834022522, "learning_rate": 5.7225081754402044e-05, "loss": 0.0507, "step": 28870 }, { "grad_norm": 0.5047797560691833, "learning_rate": 5.7197812059603326e-05, "loss": 0.0451, "step": 28880 }, { "grad_norm": 0.5334600806236267, "learning_rate": 5.717054017829934e-05, "loss": 0.0444, "step": 28890 }, { "grad_norm": 0.47794899344444275, "learning_rate": 5.7143266118774584e-05, "loss": 0.0443, "step": 28900 }, { "grad_norm": 0.3902881443500519, "learning_rate": 5.711598988931418e-05, "loss": 0.0507, "step": 28910 }, { "grad_norm": 0.4459350109100342, "learning_rate": 5.7088711498203954e-05, "loss": 0.0337, "step": 28920 }, { "grad_norm": 0.5593816041946411, "learning_rate": 5.706143095373033e-05, "loss": 0.0424, "step": 28930 }, { "grad_norm": 0.43162041902542114, "learning_rate": 5.703414826418042e-05, "loss": 0.0649, "step": 28940 }, { "grad_norm": 0.4647137224674225, "learning_rate": 5.7006863437842007e-05, "loss": 0.0535, "step": 28950 }, { "grad_norm": 0.5616277456283569, "learning_rate": 5.697957648300348e-05, "loss": 0.0513, "step": 28960 }, { "grad_norm": 0.5928959250450134, "learning_rate": 5.695228740795391e-05, "loss": 0.0506, "step": 28970 }, { "grad_norm": 0.5459349751472473, "learning_rate": 5.6924996220982985e-05, "loss": 0.04, "step": 28980 }, { "grad_norm": 0.6740354895591736, "learning_rate": 5.6897702930381045e-05, "loss": 0.066, "step": 28990 }, { "grad_norm": 0.4442308247089386, "learning_rate": 5.687040754443908e-05, "loss": 0.0535, "step": 29000 }, { "grad_norm": 0.6512748003005981, "learning_rate": 5.6843110071448725e-05, "loss": 0.043, "step": 29010 }, { "grad_norm": 0.41457682847976685, "learning_rate": 5.6815810519702194e-05, "loss": 0.0437, "step": 29020 }, { "grad_norm": 0.3775354027748108, "learning_rate": 5.6788508897492396e-05, "loss": 0.0542, "step": 29030 }, { "grad_norm": 0.4033072590827942, "learning_rate": 5.676120521311282e-05, "loss": 0.0437, "step": 29040 }, { "grad_norm": 0.4525412321090698, "learning_rate": 5.6733899474857634e-05, "loss": 0.0813, "step": 29050 }, { "grad_norm": 0.6664342880249023, "learning_rate": 5.670659169102157e-05, "loss": 0.0396, "step": 29060 }, { "grad_norm": 0.4048948884010315, "learning_rate": 5.6679281869900044e-05, "loss": 0.0464, "step": 29070 }, { "grad_norm": 0.8417076468467712, "learning_rate": 5.6651970019789045e-05, "loss": 0.0508, "step": 29080 }, { "grad_norm": 0.491362601518631, "learning_rate": 5.662465614898519e-05, "loss": 0.039, "step": 29090 }, { "grad_norm": 1.0266447067260742, "learning_rate": 5.6597340265785695e-05, "loss": 0.0564, "step": 29100 }, { "grad_norm": 0.4738434851169586, "learning_rate": 5.657002237848843e-05, "loss": 0.0429, "step": 29110 }, { "grad_norm": 0.4023396670818329, "learning_rate": 5.654270249539183e-05, "loss": 0.045, "step": 29120 }, { "grad_norm": 0.48854774236679077, "learning_rate": 5.651538062479498e-05, "loss": 0.0444, "step": 29130 }, { "grad_norm": 0.5320647954940796, "learning_rate": 5.648805677499751e-05, "loss": 0.0404, "step": 29140 }, { "grad_norm": 0.5219225287437439, "learning_rate": 5.646073095429969e-05, "loss": 0.0617, "step": 29150 }, { "grad_norm": 0.6245487928390503, "learning_rate": 5.643340317100241e-05, "loss": 0.047, "step": 29160 }, { "grad_norm": 0.6789108514785767, "learning_rate": 5.64060734334071e-05, "loss": 0.0564, "step": 29170 }, { "grad_norm": 0.4476436376571655, "learning_rate": 5.637874174981583e-05, "loss": 0.0567, "step": 29180 }, { "grad_norm": 0.7431433796882629, "learning_rate": 5.635140812853124e-05, "loss": 0.0419, "step": 29190 }, { "grad_norm": 0.5761498808860779, "learning_rate": 5.6324072577856544e-05, "loss": 0.0745, "step": 29200 }, { "grad_norm": 0.3691083788871765, "learning_rate": 5.629673510609559e-05, "loss": 0.0319, "step": 29210 }, { "grad_norm": 0.632802426815033, "learning_rate": 5.626939572155276e-05, "loss": 0.039, "step": 29220 }, { "grad_norm": 0.7277398109436035, "learning_rate": 5.6242054432533054e-05, "loss": 0.0415, "step": 29230 }, { "grad_norm": 0.5058076977729797, "learning_rate": 5.621471124734201e-05, "loss": 0.0492, "step": 29240 }, { "grad_norm": 0.5963249802589417, "learning_rate": 5.6187366174285794e-05, "loss": 0.037, "step": 29250 }, { "grad_norm": 0.5275521278381348, "learning_rate": 5.616001922167109e-05, "loss": 0.0505, "step": 29260 }, { "grad_norm": 0.27992066740989685, "learning_rate": 5.61326703978052e-05, "loss": 0.0397, "step": 29270 }, { "grad_norm": 0.5064265131950378, "learning_rate": 5.6105319710995964e-05, "loss": 0.0317, "step": 29280 }, { "grad_norm": 0.5098099708557129, "learning_rate": 5.60779671695518e-05, "loss": 0.0458, "step": 29290 }, { "grad_norm": 0.6325677037239075, "learning_rate": 5.6050612781781684e-05, "loss": 0.0492, "step": 29300 }, { "grad_norm": 0.44871875643730164, "learning_rate": 5.602325655599516e-05, "loss": 0.0488, "step": 29310 }, { "grad_norm": 0.7830904722213745, "learning_rate": 5.599589850050234e-05, "loss": 0.0459, "step": 29320 }, { "grad_norm": 0.6491694450378418, "learning_rate": 5.5968538623613874e-05, "loss": 0.0541, "step": 29330 }, { "grad_norm": 0.6548482179641724, "learning_rate": 5.594117693364095e-05, "loss": 0.0468, "step": 29340 }, { "grad_norm": 0.4965038597583771, "learning_rate": 5.591381343889535e-05, "loss": 0.0439, "step": 29350 }, { "grad_norm": 0.6628499627113342, "learning_rate": 5.5886448147689355e-05, "loss": 0.0413, "step": 29360 }, { "grad_norm": 0.610365629196167, "learning_rate": 5.585908106833585e-05, "loss": 0.0462, "step": 29370 }, { "grad_norm": 0.5899383425712585, "learning_rate": 5.5831712209148226e-05, "loss": 0.0391, "step": 29380 }, { "grad_norm": 0.3563321828842163, "learning_rate": 5.58043415784404e-05, "loss": 0.0401, "step": 29390 }, { "grad_norm": 0.5891003012657166, "learning_rate": 5.577696918452686e-05, "loss": 0.0531, "step": 29400 }, { "grad_norm": 0.3678502142429352, "learning_rate": 5.5749595035722604e-05, "loss": 0.0407, "step": 29410 }, { "grad_norm": 0.7675569653511047, "learning_rate": 5.5722219140343193e-05, "loss": 0.0661, "step": 29420 }, { "grad_norm": 0.4355248510837555, "learning_rate": 5.56948415067047e-05, "loss": 0.0471, "step": 29430 }, { "grad_norm": 0.33006736636161804, "learning_rate": 5.5667462143123704e-05, "loss": 0.0459, "step": 29440 }, { "grad_norm": 0.5185755491256714, "learning_rate": 5.564008105791737e-05, "loss": 0.051, "step": 29450 }, { "grad_norm": 0.4855380952358246, "learning_rate": 5.5612698259403316e-05, "loss": 0.0582, "step": 29460 }, { "grad_norm": 0.5432390570640564, "learning_rate": 5.5585313755899724e-05, "loss": 0.062, "step": 29470 }, { "grad_norm": 0.7410330772399902, "learning_rate": 5.5557927555725285e-05, "loss": 0.0456, "step": 29480 }, { "grad_norm": 0.6328222751617432, "learning_rate": 5.55305396671992e-05, "loss": 0.0456, "step": 29490 }, { "grad_norm": 0.5401435494422913, "learning_rate": 5.55031500986412e-05, "loss": 0.0515, "step": 29500 }, { "grad_norm": 0.4890204966068268, "learning_rate": 5.547575885837149e-05, "loss": 0.0545, "step": 29510 }, { "grad_norm": 0.4785020351409912, "learning_rate": 5.5448365954710825e-05, "loss": 0.0646, "step": 29520 }, { "grad_norm": 0.59397953748703, "learning_rate": 5.5420971395980446e-05, "loss": 0.0447, "step": 29530 }, { "grad_norm": 0.514378547668457, "learning_rate": 5.539357519050209e-05, "loss": 0.0415, "step": 29540 }, { "grad_norm": 0.8318538069725037, "learning_rate": 5.536617734659799e-05, "loss": 0.045, "step": 29550 }, { "grad_norm": 0.40923255681991577, "learning_rate": 5.533877787259091e-05, "loss": 0.046, "step": 29560 }, { "grad_norm": 0.5615935921669006, "learning_rate": 5.5311376776804044e-05, "loss": 0.0441, "step": 29570 }, { "grad_norm": 0.43388405442237854, "learning_rate": 5.528397406756118e-05, "loss": 0.0404, "step": 29580 }, { "grad_norm": 0.33125609159469604, "learning_rate": 5.525656975318652e-05, "loss": 0.0453, "step": 29590 }, { "grad_norm": 0.8577701449394226, "learning_rate": 5.522916384200474e-05, "loss": 0.0565, "step": 29600 }, { "grad_norm": 0.5245051980018616, "learning_rate": 5.520175634234106e-05, "loss": 0.049, "step": 29610 }, { "grad_norm": 0.5569892525672913, "learning_rate": 5.517434726252113e-05, "loss": 0.0452, "step": 29620 }, { "grad_norm": 0.7294795513153076, "learning_rate": 5.514693661087113e-05, "loss": 0.0567, "step": 29630 }, { "grad_norm": 0.6428250074386597, "learning_rate": 5.511952439571769e-05, "loss": 0.0385, "step": 29640 }, { "grad_norm": 0.674837052822113, "learning_rate": 5.509211062538791e-05, "loss": 0.077, "step": 29650 }, { "grad_norm": 0.7294267416000366, "learning_rate": 5.506469530820939e-05, "loss": 0.0399, "step": 29660 }, { "grad_norm": 0.3888942003250122, "learning_rate": 5.503727845251014e-05, "loss": 0.0855, "step": 29670 }, { "grad_norm": 0.6063898801803589, "learning_rate": 5.50098600666187e-05, "loss": 0.0457, "step": 29680 }, { "grad_norm": 0.8690393567085266, "learning_rate": 5.498244015886406e-05, "loss": 0.0407, "step": 29690 }, { "grad_norm": 0.5479191541671753, "learning_rate": 5.495501873757565e-05, "loss": 0.0891, "step": 29700 }, { "grad_norm": 0.44819387793540955, "learning_rate": 5.492759581108336e-05, "loss": 0.0428, "step": 29710 }, { "grad_norm": 0.9588799476623535, "learning_rate": 5.490017138771759e-05, "loss": 0.0525, "step": 29720 }, { "grad_norm": 0.9775363802909851, "learning_rate": 5.487274547580912e-05, "loss": 0.0445, "step": 29730 }, { "grad_norm": 0.4637380540370941, "learning_rate": 5.484531808368923e-05, "loss": 0.0424, "step": 29740 }, { "grad_norm": 0.6204479336738586, "learning_rate": 5.4817889219689656e-05, "loss": 0.0424, "step": 29750 }, { "grad_norm": 0.6487653255462646, "learning_rate": 5.4790458892142536e-05, "loss": 0.0577, "step": 29760 }, { "grad_norm": 0.5398982167243958, "learning_rate": 5.476302710938048e-05, "loss": 0.0447, "step": 29770 }, { "grad_norm": 0.44049811363220215, "learning_rate": 5.473559387973657e-05, "loss": 0.0474, "step": 29780 }, { "grad_norm": 0.49703454971313477, "learning_rate": 5.470815921154425e-05, "loss": 0.0505, "step": 29790 }, { "grad_norm": 0.625049352645874, "learning_rate": 5.468072311313749e-05, "loss": 0.0464, "step": 29800 }, { "grad_norm": 0.5756197571754456, "learning_rate": 5.465328559285063e-05, "loss": 0.0456, "step": 29810 }, { "grad_norm": 0.7582029104232788, "learning_rate": 5.462584665901849e-05, "loss": 0.0511, "step": 29820 }, { "grad_norm": 0.6809479594230652, "learning_rate": 5.4598406319976235e-05, "loss": 0.0517, "step": 29830 }, { "grad_norm": 0.5482094287872314, "learning_rate": 5.457096458405958e-05, "loss": 0.0464, "step": 29840 }, { "grad_norm": 0.8355728387832642, "learning_rate": 5.454352145960457e-05, "loss": 0.0365, "step": 29850 }, { "grad_norm": 0.6063175797462463, "learning_rate": 5.4516076954947715e-05, "loss": 0.0393, "step": 29860 }, { "grad_norm": 0.7472133636474609, "learning_rate": 5.448863107842591e-05, "loss": 0.0611, "step": 29870 }, { "grad_norm": 0.5108821392059326, "learning_rate": 5.446118383837651e-05, "loss": 0.0484, "step": 29880 }, { "grad_norm": 0.5632910132408142, "learning_rate": 5.443373524313722e-05, "loss": 0.0419, "step": 29890 }, { "grad_norm": 0.7910232543945312, "learning_rate": 5.440628530104626e-05, "loss": 0.0809, "step": 29900 }, { "grad_norm": 0.6581374406814575, "learning_rate": 5.4378834020442146e-05, "loss": 0.0453, "step": 29910 }, { "grad_norm": 0.4932384192943573, "learning_rate": 5.4351381409663884e-05, "loss": 0.0601, "step": 29920 }, { "grad_norm": 0.6719849705696106, "learning_rate": 5.432392747705084e-05, "loss": 0.0399, "step": 29930 }, { "grad_norm": 0.39153262972831726, "learning_rate": 5.429647223094278e-05, "loss": 0.0546, "step": 29940 }, { "grad_norm": 0.8663010597229004, "learning_rate": 5.4269015679679924e-05, "loss": 0.0444, "step": 29950 }, { "grad_norm": 0.5159448385238647, "learning_rate": 5.424155783160281e-05, "loss": 0.0378, "step": 29960 }, { "grad_norm": 0.6339073777198792, "learning_rate": 5.4214098695052415e-05, "loss": 0.0413, "step": 29970 }, { "grad_norm": 0.4473738670349121, "learning_rate": 5.418663827837012e-05, "loss": 0.0421, "step": 29980 }, { "grad_norm": 0.9539688229560852, "learning_rate": 5.415917658989763e-05, "loss": 0.0537, "step": 29990 }, { "grad_norm": 0.6029810309410095, "learning_rate": 5.413171363797713e-05, "loss": 0.0554, "step": 30000 }, { "grad_norm": 0.5465465784072876, "learning_rate": 5.4104249430951116e-05, "loss": 0.036, "step": 30010 }, { "grad_norm": 0.2880787253379822, "learning_rate": 5.4076783977162494e-05, "loss": 0.0325, "step": 30020 }, { "grad_norm": 0.4100145101547241, "learning_rate": 5.4049317284954525e-05, "loss": 0.0309, "step": 30030 }, { "grad_norm": 0.3877953588962555, "learning_rate": 5.4021849362670884e-05, "loss": 0.0524, "step": 30040 }, { "grad_norm": 0.5315605401992798, "learning_rate": 5.3994380218655604e-05, "loss": 0.0403, "step": 30050 }, { "grad_norm": 0.6105934381484985, "learning_rate": 5.396690986125309e-05, "loss": 0.0384, "step": 30060 }, { "grad_norm": 0.7120442986488342, "learning_rate": 5.3939438298808075e-05, "loss": 0.0797, "step": 30070 }, { "grad_norm": 0.6351486444473267, "learning_rate": 5.3911965539665744e-05, "loss": 0.0338, "step": 30080 }, { "grad_norm": 0.5463349223136902, "learning_rate": 5.388449159217156e-05, "loss": 0.0421, "step": 30090 }, { "grad_norm": 0.40844035148620605, "learning_rate": 5.3857016464671385e-05, "loss": 0.074, "step": 30100 }, { "grad_norm": 1.3174443244934082, "learning_rate": 5.382954016551146e-05, "loss": 0.0681, "step": 30110 }, { "grad_norm": 0.38147035241127014, "learning_rate": 5.380206270303835e-05, "loss": 0.0452, "step": 30120 }, { "grad_norm": 0.40802323818206787, "learning_rate": 5.377458408559897e-05, "loss": 0.0312, "step": 30130 }, { "grad_norm": 0.5667328834533691, "learning_rate": 5.374710432154061e-05, "loss": 0.0551, "step": 30140 }, { "grad_norm": 0.6060197949409485, "learning_rate": 5.3719623419210886e-05, "loss": 0.0359, "step": 30150 }, { "grad_norm": 0.6181932687759399, "learning_rate": 5.3692141386957786e-05, "loss": 0.0496, "step": 30160 }, { "grad_norm": 0.576881468296051, "learning_rate": 5.3664658233129616e-05, "loss": 0.0675, "step": 30170 }, { "grad_norm": 0.6339951157569885, "learning_rate": 5.363717396607504e-05, "loss": 0.0467, "step": 30180 }, { "grad_norm": 1.2120808362960815, "learning_rate": 5.360968859414305e-05, "loss": 0.057, "step": 30190 }, { "grad_norm": 0.33820098638534546, "learning_rate": 5.358220212568295e-05, "loss": 0.0365, "step": 30200 }, { "grad_norm": 0.3477543294429779, "learning_rate": 5.355471456904444e-05, "loss": 0.0359, "step": 30210 }, { "grad_norm": 0.7003567814826965, "learning_rate": 5.3527225932577495e-05, "loss": 0.0485, "step": 30220 }, { "grad_norm": 0.43974024057388306, "learning_rate": 5.349973622463246e-05, "loss": 0.0457, "step": 30230 }, { "grad_norm": 0.5373403429985046, "learning_rate": 5.3472245453559956e-05, "loss": 0.042, "step": 30240 }, { "grad_norm": 0.3786028027534485, "learning_rate": 5.3444753627710955e-05, "loss": 0.0376, "step": 30250 }, { "grad_norm": 0.30802515149116516, "learning_rate": 5.341726075543676e-05, "loss": 0.0347, "step": 30260 }, { "grad_norm": 0.6560201644897461, "learning_rate": 5.338976684508898e-05, "loss": 0.0313, "step": 30270 }, { "grad_norm": 0.5163845419883728, "learning_rate": 5.336227190501953e-05, "loss": 0.036, "step": 30280 }, { "grad_norm": 0.7104943990707397, "learning_rate": 5.3334775943580664e-05, "loss": 0.0425, "step": 30290 }, { "grad_norm": 0.5704562664031982, "learning_rate": 5.330727896912491e-05, "loss": 0.0415, "step": 30300 }, { "grad_norm": 0.6331015229225159, "learning_rate": 5.327978099000511e-05, "loss": 0.046, "step": 30310 }, { "grad_norm": 0.9034392833709717, "learning_rate": 5.3252282014574465e-05, "loss": 0.0571, "step": 30320 }, { "grad_norm": 0.3570694327354431, "learning_rate": 5.322478205118641e-05, "loss": 0.0429, "step": 30330 }, { "grad_norm": 0.5448889136314392, "learning_rate": 5.3197281108194704e-05, "loss": 0.0583, "step": 30340 }, { "grad_norm": 0.6252672076225281, "learning_rate": 5.316977919395342e-05, "loss": 0.0371, "step": 30350 }, { "grad_norm": 0.48762935400009155, "learning_rate": 5.314227631681691e-05, "loss": 0.0472, "step": 30360 }, { "grad_norm": 0.7759519815444946, "learning_rate": 5.311477248513982e-05, "loss": 0.0463, "step": 30370 }, { "grad_norm": 0.5971262454986572, "learning_rate": 5.30872677072771e-05, "loss": 0.0332, "step": 30380 }, { "grad_norm": 0.7368080019950867, "learning_rate": 5.3059761991583954e-05, "loss": 0.0546, "step": 30390 }, { "grad_norm": 0.5711492300033569, "learning_rate": 5.303225534641592e-05, "loss": 0.0425, "step": 30400 }, { "grad_norm": 0.3871608376502991, "learning_rate": 5.300474778012875e-05, "loss": 0.0491, "step": 30410 }, { "grad_norm": 0.5532242655754089, "learning_rate": 5.297723930107855e-05, "loss": 0.0376, "step": 30420 }, { "grad_norm": 0.7355248928070068, "learning_rate": 5.294972991762167e-05, "loss": 0.0472, "step": 30430 }, { "grad_norm": 0.7712422609329224, "learning_rate": 5.292221963811472e-05, "loss": 0.0473, "step": 30440 }, { "grad_norm": 0.5086511969566345, "learning_rate": 5.28947084709146e-05, "loss": 0.0421, "step": 30450 }, { "grad_norm": 0.35943603515625, "learning_rate": 5.2867196424378465e-05, "loss": 0.0349, "step": 30460 }, { "grad_norm": 0.5992060899734497, "learning_rate": 5.2839683506863765e-05, "loss": 0.0353, "step": 30470 }, { "grad_norm": 0.33475732803344727, "learning_rate": 5.281216972672821e-05, "loss": 0.0379, "step": 30480 }, { "grad_norm": 0.4355679154396057, "learning_rate": 5.278465509232973e-05, "loss": 0.0375, "step": 30490 }, { "grad_norm": 0.5124037861824036, "learning_rate": 5.275713961202655e-05, "loss": 0.0749, "step": 30500 }, { "grad_norm": 0.6575549840927124, "learning_rate": 5.2729623294177165e-05, "loss": 0.0438, "step": 30510 }, { "grad_norm": 0.43649303913116455, "learning_rate": 5.270210614714028e-05, "loss": 0.045, "step": 30520 }, { "grad_norm": 0.6022247076034546, "learning_rate": 5.267458817927491e-05, "loss": 0.0383, "step": 30530 }, { "grad_norm": 0.625848650932312, "learning_rate": 5.264706939894026e-05, "loss": 0.0562, "step": 30540 }, { "grad_norm": 0.6408353447914124, "learning_rate": 5.261954981449584e-05, "loss": 0.0469, "step": 30550 }, { "grad_norm": 0.3620222210884094, "learning_rate": 5.2592029434301324e-05, "loss": 0.0383, "step": 30560 }, { "grad_norm": 0.6229206323623657, "learning_rate": 5.256450826671672e-05, "loss": 0.0568, "step": 30570 }, { "grad_norm": 0.6290371417999268, "learning_rate": 5.253698632010221e-05, "loss": 0.0479, "step": 30580 }, { "grad_norm": 0.6198424696922302, "learning_rate": 5.2509463602818246e-05, "loss": 0.038, "step": 30590 }, { "grad_norm": 0.6041807532310486, "learning_rate": 5.248194012322549e-05, "loss": 0.0474, "step": 30600 }, { "grad_norm": 0.734293520450592, "learning_rate": 5.245441588968486e-05, "loss": 0.0632, "step": 30610 }, { "grad_norm": 0.45021307468414307, "learning_rate": 5.242689091055748e-05, "loss": 0.0419, "step": 30620 }, { "grad_norm": 0.7693207859992981, "learning_rate": 5.239936519420473e-05, "loss": 0.0449, "step": 30630 }, { "grad_norm": 0.6958133578300476, "learning_rate": 5.2371838748988175e-05, "loss": 0.0346, "step": 30640 }, { "grad_norm": 0.47813156247138977, "learning_rate": 5.234431158326965e-05, "loss": 0.0341, "step": 30650 }, { "grad_norm": 0.691744863986969, "learning_rate": 5.231678370541115e-05, "loss": 0.0408, "step": 30660 }, { "grad_norm": 0.5315853357315063, "learning_rate": 5.228925512377495e-05, "loss": 0.051, "step": 30670 }, { "grad_norm": 0.45707055926322937, "learning_rate": 5.2261725846723465e-05, "loss": 0.0488, "step": 30680 }, { "grad_norm": 1.0388176441192627, "learning_rate": 5.22341958826194e-05, "loss": 0.0472, "step": 30690 }, { "grad_norm": 0.6666968464851379, "learning_rate": 5.22066652398256e-05, "loss": 0.0582, "step": 30700 }, { "grad_norm": 0.6437695622444153, "learning_rate": 5.2179133926705185e-05, "loss": 0.0384, "step": 30710 }, { "grad_norm": 0.38450953364372253, "learning_rate": 5.215160195162141e-05, "loss": 0.0389, "step": 30720 }, { "grad_norm": 0.4680781662464142, "learning_rate": 5.212406932293776e-05, "loss": 0.063, "step": 30730 }, { "grad_norm": 0.5701716542243958, "learning_rate": 5.209653604901795e-05, "loss": 0.0392, "step": 30740 }, { "grad_norm": 0.5005760788917542, "learning_rate": 5.206900213822584e-05, "loss": 0.0428, "step": 30750 }, { "grad_norm": 0.35931500792503357, "learning_rate": 5.204146759892551e-05, "loss": 0.0257, "step": 30760 }, { "grad_norm": 0.30207371711730957, "learning_rate": 5.2013932439481216e-05, "loss": 0.0422, "step": 30770 }, { "grad_norm": 0.8763881325721741, "learning_rate": 5.198639666825743e-05, "loss": 0.0511, "step": 30780 }, { "grad_norm": 0.44573917984962463, "learning_rate": 5.195886029361877e-05, "loss": 0.0541, "step": 30790 }, { "grad_norm": 0.4218328297138214, "learning_rate": 5.193132332393009e-05, "loss": 0.0396, "step": 30800 }, { "grad_norm": 0.5007890462875366, "learning_rate": 5.1903785767556376e-05, "loss": 0.0374, "step": 30810 }, { "grad_norm": 0.7661132216453552, "learning_rate": 5.187624763286282e-05, "loss": 0.0385, "step": 30820 }, { "grad_norm": 0.5506207942962646, "learning_rate": 5.184870892821475e-05, "loss": 0.0458, "step": 30830 }, { "grad_norm": 0.38086310029029846, "learning_rate": 5.182116966197773e-05, "loss": 0.0438, "step": 30840 }, { "grad_norm": 0.6815128326416016, "learning_rate": 5.1793629842517466e-05, "loss": 0.0582, "step": 30850 }, { "grad_norm": 0.44727200269699097, "learning_rate": 5.17660894781998e-05, "loss": 0.0626, "step": 30860 }, { "grad_norm": 0.9397882223129272, "learning_rate": 5.173854857739079e-05, "loss": 0.0426, "step": 30870 }, { "grad_norm": 1.3236631155014038, "learning_rate": 5.171100714845661e-05, "loss": 0.0427, "step": 30880 }, { "grad_norm": 0.7499212622642517, "learning_rate": 5.1683465199763646e-05, "loss": 0.0614, "step": 30890 }, { "grad_norm": 0.4595293700695038, "learning_rate": 5.16559227396784e-05, "loss": 0.0449, "step": 30900 }, { "grad_norm": 0.5040279626846313, "learning_rate": 5.1628379776567556e-05, "loss": 0.0473, "step": 30910 }, { "grad_norm": 0.49026837944984436, "learning_rate": 5.160083631879792e-05, "loss": 0.0466, "step": 30920 }, { "grad_norm": 0.3925880193710327, "learning_rate": 5.1573292374736484e-05, "loss": 0.0379, "step": 30930 }, { "grad_norm": 0.38421517610549927, "learning_rate": 5.1545747952750356e-05, "loss": 0.0429, "step": 30940 }, { "grad_norm": 0.5175942182540894, "learning_rate": 5.151820306120682e-05, "loss": 0.0394, "step": 30950 }, { "grad_norm": 0.7901777029037476, "learning_rate": 5.149065770847328e-05, "loss": 0.0555, "step": 30960 }, { "grad_norm": 0.3156496584415436, "learning_rate": 5.1463111902917297e-05, "loss": 0.0441, "step": 30970 }, { "grad_norm": 0.49791866540908813, "learning_rate": 5.143556565290654e-05, "loss": 0.0441, "step": 30980 }, { "grad_norm": 0.3964872360229492, "learning_rate": 5.140801896680882e-05, "loss": 0.0364, "step": 30990 }, { "grad_norm": 0.661019504070282, "learning_rate": 5.1380471852992144e-05, "loss": 0.0487, "step": 31000 }, { "grad_norm": 0.5185008645057678, "learning_rate": 5.135292431982457e-05, "loss": 0.0496, "step": 31010 }, { "grad_norm": 0.507385790348053, "learning_rate": 5.1325376375674294e-05, "loss": 0.0399, "step": 31020 }, { "grad_norm": 0.685515284538269, "learning_rate": 5.129782802890968e-05, "loss": 0.039, "step": 31030 }, { "grad_norm": 0.41665998101234436, "learning_rate": 5.127027928789916e-05, "loss": 0.0351, "step": 31040 }, { "grad_norm": 0.5185707211494446, "learning_rate": 5.124273016101135e-05, "loss": 0.0446, "step": 31050 }, { "grad_norm": 0.4536466598510742, "learning_rate": 5.121518065661492e-05, "loss": 0.0454, "step": 31060 }, { "grad_norm": 0.5501371026039124, "learning_rate": 5.11876307830787e-05, "loss": 0.0449, "step": 31070 }, { "grad_norm": 0.36578765511512756, "learning_rate": 5.1160080548771596e-05, "loss": 0.0283, "step": 31080 }, { "grad_norm": 0.7339156866073608, "learning_rate": 5.1132529962062656e-05, "loss": 0.0478, "step": 31090 }, { "grad_norm": 0.41906455159187317, "learning_rate": 5.110497903132101e-05, "loss": 0.0333, "step": 31100 }, { "grad_norm": 0.41247767210006714, "learning_rate": 5.107742776491592e-05, "loss": 0.0537, "step": 31110 }, { "grad_norm": 0.6108039021492004, "learning_rate": 5.104987617121673e-05, "loss": 0.0419, "step": 31120 }, { "grad_norm": 0.5904902815818787, "learning_rate": 5.102232425859287e-05, "loss": 0.0447, "step": 31130 }, { "grad_norm": 0.46907979249954224, "learning_rate": 5.09947720354139e-05, "loss": 0.0538, "step": 31140 }, { "grad_norm": 0.46288514137268066, "learning_rate": 5.096721951004942e-05, "loss": 0.045, "step": 31150 }, { "grad_norm": 0.5195901989936829, "learning_rate": 5.0939666690869227e-05, "loss": 0.0532, "step": 31160 }, { "grad_norm": 0.8244607448577881, "learning_rate": 5.0912113586243096e-05, "loss": 0.0395, "step": 31170 }, { "grad_norm": 0.5230569839477539, "learning_rate": 5.0884560204540935e-05, "loss": 0.0384, "step": 31180 }, { "grad_norm": 0.4989451766014099, "learning_rate": 5.0857006554132736e-05, "loss": 0.0506, "step": 31190 }, { "grad_norm": 0.491314560174942, "learning_rate": 5.0829452643388575e-05, "loss": 0.0431, "step": 31200 }, { "grad_norm": 0.4421670436859131, "learning_rate": 5.08018984806786e-05, "loss": 0.0385, "step": 31210 }, { "grad_norm": 0.7348514199256897, "learning_rate": 5.0774344074373036e-05, "loss": 0.0523, "step": 31220 }, { "grad_norm": 0.7755212783813477, "learning_rate": 5.07467894328422e-05, "loss": 0.0375, "step": 31230 }, { "grad_norm": 0.6403796076774597, "learning_rate": 5.0719234564456454e-05, "loss": 0.0379, "step": 31240 }, { "grad_norm": 0.6350787281990051, "learning_rate": 5.0691679477586216e-05, "loss": 0.0449, "step": 31250 }, { "grad_norm": 0.6209282279014587, "learning_rate": 5.0664124180602035e-05, "loss": 0.0481, "step": 31260 }, { "grad_norm": 0.4144504964351654, "learning_rate": 5.063656868187447e-05, "loss": 0.0436, "step": 31270 }, { "grad_norm": 0.4751732349395752, "learning_rate": 5.060901298977413e-05, "loss": 0.0456, "step": 31280 }, { "grad_norm": 0.6715149283409119, "learning_rate": 5.0581457112671725e-05, "loss": 0.0445, "step": 31290 }, { "grad_norm": 0.44224581122398376, "learning_rate": 5.0553901058938016e-05, "loss": 0.0611, "step": 31300 }, { "grad_norm": 0.6644887328147888, "learning_rate": 5.052634483694377e-05, "loss": 0.0403, "step": 31310 }, { "grad_norm": 0.6281777620315552, "learning_rate": 5.049878845505988e-05, "loss": 0.0497, "step": 31320 }, { "grad_norm": 0.5883652567863464, "learning_rate": 5.047123192165721e-05, "loss": 0.0352, "step": 31330 }, { "grad_norm": 0.6148471832275391, "learning_rate": 5.0443675245106735e-05, "loss": 0.0519, "step": 31340 }, { "grad_norm": 0.3793698251247406, "learning_rate": 5.0416118433779426e-05, "loss": 0.0358, "step": 31350 }, { "grad_norm": 0.8390260338783264, "learning_rate": 5.038856149604633e-05, "loss": 0.0391, "step": 31360 }, { "grad_norm": 0.547360897064209, "learning_rate": 5.03610044402785e-05, "loss": 0.0506, "step": 31370 }, { "grad_norm": 0.5100196599960327, "learning_rate": 5.033344727484707e-05, "loss": 0.0482, "step": 31380 }, { "grad_norm": 0.4769405424594879, "learning_rate": 5.030589000812315e-05, "loss": 0.0347, "step": 31390 }, { "grad_norm": 0.6492520570755005, "learning_rate": 5.027833264847793e-05, "loss": 0.0417, "step": 31400 }, { "grad_norm": 0.44816648960113525, "learning_rate": 5.025077520428258e-05, "loss": 0.049, "step": 31410 }, { "grad_norm": 0.5808398723602295, "learning_rate": 5.022321768390837e-05, "loss": 0.0491, "step": 31420 }, { "grad_norm": 0.44447943568229675, "learning_rate": 5.0195660095726516e-05, "loss": 0.0405, "step": 31430 }, { "grad_norm": 0.6317142248153687, "learning_rate": 5.016810244810829e-05, "loss": 0.0436, "step": 31440 }, { "grad_norm": 0.9314597845077515, "learning_rate": 5.0140544749424976e-05, "loss": 0.0496, "step": 31450 }, { "grad_norm": 0.6711386442184448, "learning_rate": 5.0112987008047874e-05, "loss": 0.0504, "step": 31460 }, { "grad_norm": 0.48524153232574463, "learning_rate": 5.008542923234831e-05, "loss": 0.0494, "step": 31470 }, { "grad_norm": 0.41928550601005554, "learning_rate": 5.00578714306976e-05, "loss": 0.039, "step": 31480 }, { "grad_norm": 0.4772840738296509, "learning_rate": 5.0030313611467084e-05, "loss": 0.0502, "step": 31490 }, { "grad_norm": 0.5982838273048401, "learning_rate": 5.0002755783028074e-05, "loss": 0.0566, "step": 31500 }, { "grad_norm": 0.9979145526885986, "learning_rate": 4.997519795375194e-05, "loss": 0.0393, "step": 31510 }, { "grad_norm": 0.6072215437889099, "learning_rate": 4.9947640132010016e-05, "loss": 0.0441, "step": 31520 }, { "grad_norm": 0.4089731276035309, "learning_rate": 4.9920082326173625e-05, "loss": 0.0358, "step": 31530 }, { "grad_norm": 0.5646291971206665, "learning_rate": 4.9892524544614114e-05, "loss": 0.0502, "step": 31540 }, { "grad_norm": 0.676548182964325, "learning_rate": 4.986496679570283e-05, "loss": 0.053, "step": 31550 }, { "grad_norm": 0.43218839168548584, "learning_rate": 4.983740908781105e-05, "loss": 0.0355, "step": 31560 }, { "grad_norm": 0.6767707467079163, "learning_rate": 4.9809851429310116e-05, "loss": 0.0614, "step": 31570 }, { "grad_norm": 0.5634758472442627, "learning_rate": 4.9782293828571275e-05, "loss": 0.0439, "step": 31580 }, { "grad_norm": 0.5918683409690857, "learning_rate": 4.9754736293965846e-05, "loss": 0.0425, "step": 31590 }, { "grad_norm": 0.4027884304523468, "learning_rate": 4.972717883386502e-05, "loss": 0.0424, "step": 31600 }, { "grad_norm": 0.39671263098716736, "learning_rate": 4.9699621456640075e-05, "loss": 0.0482, "step": 31610 }, { "grad_norm": 0.6367741227149963, "learning_rate": 4.9672064170662214e-05, "loss": 0.0444, "step": 31620 }, { "grad_norm": 0.9253485202789307, "learning_rate": 4.9644506984302583e-05, "loss": 0.0415, "step": 31630 }, { "grad_norm": 0.5409702062606812, "learning_rate": 4.9616949905932356e-05, "loss": 0.0404, "step": 31640 }, { "grad_norm": 0.3911043107509613, "learning_rate": 4.9589392943922615e-05, "loss": 0.0395, "step": 31650 }, { "grad_norm": 0.8360609412193298, "learning_rate": 4.956183610664447e-05, "loss": 0.0364, "step": 31660 }, { "grad_norm": 0.661444365978241, "learning_rate": 4.9534279402468945e-05, "loss": 0.0335, "step": 31670 }, { "grad_norm": 0.5154592990875244, "learning_rate": 4.9506722839767036e-05, "loss": 0.0433, "step": 31680 }, { "grad_norm": 0.42220598459243774, "learning_rate": 4.947916642690972e-05, "loss": 0.0399, "step": 31690 }, { "grad_norm": 0.7773271799087524, "learning_rate": 4.9451610172267874e-05, "loss": 0.0488, "step": 31700 }, { "grad_norm": 0.6220124959945679, "learning_rate": 4.9424054084212376e-05, "loss": 0.0465, "step": 31710 }, { "grad_norm": 0.5434072613716125, "learning_rate": 4.939649817111407e-05, "loss": 0.0418, "step": 31720 }, { "grad_norm": 0.6870599389076233, "learning_rate": 4.936894244134365e-05, "loss": 0.0399, "step": 31730 }, { "grad_norm": 0.4941057562828064, "learning_rate": 4.9341386903271886e-05, "loss": 0.0542, "step": 31740 }, { "grad_norm": 0.7160769701004028, "learning_rate": 4.931383156526936e-05, "loss": 0.0588, "step": 31750 }, { "grad_norm": 0.7337652444839478, "learning_rate": 4.92862764357067e-05, "loss": 0.0407, "step": 31760 }, { "grad_norm": 0.587890088558197, "learning_rate": 4.925872152295443e-05, "loss": 0.058, "step": 31770 }, { "grad_norm": 0.5819334983825684, "learning_rate": 4.923116683538296e-05, "loss": 0.0426, "step": 31780 }, { "grad_norm": 0.5257004499435425, "learning_rate": 4.920361238136273e-05, "loss": 0.034, "step": 31790 }, { "grad_norm": 0.45959505438804626, "learning_rate": 4.9176058169264014e-05, "loss": 0.0372, "step": 31800 }, { "grad_norm": 0.40322235226631165, "learning_rate": 4.9148504207457074e-05, "loss": 0.0371, "step": 31810 }, { "grad_norm": 0.4607407748699188, "learning_rate": 4.912095050431208e-05, "loss": 0.0333, "step": 31820 }, { "grad_norm": 0.6720170974731445, "learning_rate": 4.909339706819911e-05, "loss": 0.0402, "step": 31830 }, { "grad_norm": 0.44094353914260864, "learning_rate": 4.906584390748819e-05, "loss": 0.0607, "step": 31840 }, { "grad_norm": 0.5407675504684448, "learning_rate": 4.9038291030549195e-05, "loss": 0.0393, "step": 31850 }, { "grad_norm": 0.5060336589813232, "learning_rate": 4.9010738445751995e-05, "loss": 0.0419, "step": 31860 }, { "grad_norm": 0.6453689336776733, "learning_rate": 4.8983186161466364e-05, "loss": 0.0426, "step": 31870 }, { "grad_norm": 0.8826618790626526, "learning_rate": 4.89556341860619e-05, "loss": 0.0444, "step": 31880 }, { "grad_norm": 0.6241528391838074, "learning_rate": 4.892808252790822e-05, "loss": 0.0434, "step": 31890 }, { "grad_norm": 0.7735816836357117, "learning_rate": 4.890053119537475e-05, "loss": 0.0362, "step": 31900 }, { "grad_norm": 0.3096461296081543, "learning_rate": 4.887298019683087e-05, "loss": 0.0478, "step": 31910 }, { "grad_norm": 0.5331097841262817, "learning_rate": 4.884542954064587e-05, "loss": 0.0347, "step": 31920 }, { "grad_norm": 0.415122389793396, "learning_rate": 4.881787923518887e-05, "loss": 0.0489, "step": 31930 }, { "grad_norm": 0.7958645820617676, "learning_rate": 4.879032928882896e-05, "loss": 0.0481, "step": 31940 }, { "grad_norm": 0.5675541758537292, "learning_rate": 4.876277970993505e-05, "loss": 0.0431, "step": 31950 }, { "grad_norm": 0.6145936250686646, "learning_rate": 4.873523050687602e-05, "loss": 0.0479, "step": 31960 }, { "grad_norm": 0.6419429183006287, "learning_rate": 4.870768168802056e-05, "loss": 0.0553, "step": 31970 }, { "grad_norm": 0.646269679069519, "learning_rate": 4.868013326173728e-05, "loss": 0.0549, "step": 31980 }, { "grad_norm": 0.5022913217544556, "learning_rate": 4.865258523639468e-05, "loss": 0.044, "step": 31990 }, { "grad_norm": 0.528839647769928, "learning_rate": 4.862503762036109e-05, "loss": 0.0464, "step": 32000 }, { "grad_norm": 0.7367538213729858, "learning_rate": 4.859749042200478e-05, "loss": 0.0552, "step": 32010 }, { "grad_norm": 0.620688796043396, "learning_rate": 4.856994364969384e-05, "loss": 0.0427, "step": 32020 }, { "grad_norm": 0.3820106089115143, "learning_rate": 4.854239731179625e-05, "loss": 0.0462, "step": 32030 }, { "grad_norm": 0.5209752917289734, "learning_rate": 4.85148514166799e-05, "loss": 0.042, "step": 32040 }, { "grad_norm": 0.5306393504142761, "learning_rate": 4.8487305972712456e-05, "loss": 0.0557, "step": 32050 }, { "grad_norm": 0.6201822757720947, "learning_rate": 4.8459760988261526e-05, "loss": 0.0367, "step": 32060 }, { "grad_norm": 0.7650476098060608, "learning_rate": 4.843221647169453e-05, "loss": 0.0467, "step": 32070 }, { "grad_norm": 0.3598059415817261, "learning_rate": 4.840467243137878e-05, "loss": 0.0376, "step": 32080 }, { "grad_norm": 0.42652180790901184, "learning_rate": 4.837712887568143e-05, "loss": 0.0352, "step": 32090 }, { "grad_norm": 0.3709927797317505, "learning_rate": 4.8349585812969464e-05, "loss": 0.0513, "step": 32100 }, { "grad_norm": 0.6131112575531006, "learning_rate": 4.8322043251609775e-05, "loss": 0.0366, "step": 32110 }, { "grad_norm": 0.5032520890235901, "learning_rate": 4.8294501199969015e-05, "loss": 0.0408, "step": 32120 }, { "grad_norm": 0.5683629512786865, "learning_rate": 4.826695966641376e-05, "loss": 0.0401, "step": 32130 }, { "grad_norm": 0.6000210046768188, "learning_rate": 4.823941865931043e-05, "loss": 0.0405, "step": 32140 }, { "grad_norm": 0.5755215883255005, "learning_rate": 4.82118781870252e-05, "loss": 0.0422, "step": 32150 }, { "grad_norm": 0.5877774357795715, "learning_rate": 4.8184338257924185e-05, "loss": 0.0392, "step": 32160 }, { "grad_norm": 0.4491860568523407, "learning_rate": 4.815679888037324e-05, "loss": 0.0437, "step": 32170 }, { "grad_norm": 0.46444761753082275, "learning_rate": 4.8129260062738135e-05, "loss": 0.0369, "step": 32180 }, { "grad_norm": 0.730342447757721, "learning_rate": 4.810172181338445e-05, "loss": 0.0386, "step": 32190 }, { "grad_norm": 0.3951740264892578, "learning_rate": 4.807418414067753e-05, "loss": 0.0371, "step": 32200 }, { "grad_norm": 0.5071313977241516, "learning_rate": 4.804664705298264e-05, "loss": 0.0456, "step": 32210 }, { "grad_norm": 0.6976562142372131, "learning_rate": 4.80191105586648e-05, "loss": 0.0347, "step": 32220 }, { "grad_norm": 0.6493787169456482, "learning_rate": 4.799157466608886e-05, "loss": 0.0508, "step": 32230 }, { "grad_norm": 0.5039350390434265, "learning_rate": 4.796403938361951e-05, "loss": 0.0431, "step": 32240 }, { "grad_norm": 0.3615456521511078, "learning_rate": 4.793650471962123e-05, "loss": 0.0329, "step": 32250 }, { "grad_norm": 0.4312593936920166, "learning_rate": 4.790897068245835e-05, "loss": 0.0465, "step": 32260 }, { "grad_norm": 0.41612839698791504, "learning_rate": 4.7881437280494954e-05, "loss": 0.0365, "step": 32270 }, { "grad_norm": 0.541657030582428, "learning_rate": 4.7853904522094965e-05, "loss": 0.0488, "step": 32280 }, { "grad_norm": 0.5873473882675171, "learning_rate": 4.782637241562215e-05, "loss": 0.0547, "step": 32290 }, { "grad_norm": 0.7062764167785645, "learning_rate": 4.779884096943997e-05, "loss": 0.0472, "step": 32300 }, { "grad_norm": 0.8502093553543091, "learning_rate": 4.777131019191182e-05, "loss": 0.0574, "step": 32310 }, { "grad_norm": 0.2715814411640167, "learning_rate": 4.774378009140076e-05, "loss": 0.0396, "step": 32320 }, { "grad_norm": 0.3278416395187378, "learning_rate": 4.7716250676269735e-05, "loss": 0.0378, "step": 32330 }, { "grad_norm": 0.2724023163318634, "learning_rate": 4.7688721954881485e-05, "loss": 0.0376, "step": 32340 }, { "grad_norm": 0.6166359186172485, "learning_rate": 4.7661193935598446e-05, "loss": 0.0345, "step": 32350 }, { "grad_norm": 0.39700523018836975, "learning_rate": 4.763366662678296e-05, "loss": 0.0363, "step": 32360 }, { "grad_norm": 0.40923815965652466, "learning_rate": 4.7606140036797064e-05, "loss": 0.0474, "step": 32370 }, { "grad_norm": 0.5866122245788574, "learning_rate": 4.7578614174002614e-05, "loss": 0.039, "step": 32380 }, { "grad_norm": 0.5245516300201416, "learning_rate": 4.755108904676125e-05, "loss": 0.0461, "step": 32390 }, { "grad_norm": 0.8832660913467407, "learning_rate": 4.752356466343436e-05, "loss": 0.0448, "step": 32400 }, { "grad_norm": 0.5038317441940308, "learning_rate": 4.7496041032383174e-05, "loss": 0.0549, "step": 32410 }, { "grad_norm": 0.658946692943573, "learning_rate": 4.746851816196858e-05, "loss": 0.0442, "step": 32420 }, { "grad_norm": 0.5641771554946899, "learning_rate": 4.744099606055135e-05, "loss": 0.0417, "step": 32430 }, { "grad_norm": 0.3413252532482147, "learning_rate": 4.741347473649193e-05, "loss": 0.0362, "step": 32440 }, { "grad_norm": 0.5878582000732422, "learning_rate": 4.738595419815058e-05, "loss": 0.0452, "step": 32450 }, { "grad_norm": 0.32585427165031433, "learning_rate": 4.7358434453887365e-05, "loss": 0.0537, "step": 32460 }, { "grad_norm": 0.46214163303375244, "learning_rate": 4.7330915512061976e-05, "loss": 0.0543, "step": 32470 }, { "grad_norm": 0.6697698831558228, "learning_rate": 4.730339738103402e-05, "loss": 0.0537, "step": 32480 }, { "grad_norm": 0.49717074632644653, "learning_rate": 4.727588006916271e-05, "loss": 0.0401, "step": 32490 }, { "grad_norm": 0.5266716480255127, "learning_rate": 4.724836358480711e-05, "loss": 0.0395, "step": 32500 }, { "grad_norm": 0.45744919776916504, "learning_rate": 4.722084793632601e-05, "loss": 0.0358, "step": 32510 }, { "grad_norm": 0.7633131742477417, "learning_rate": 4.719333313207792e-05, "loss": 0.05, "step": 32520 }, { "grad_norm": 0.5527465343475342, "learning_rate": 4.716581918042114e-05, "loss": 0.0385, "step": 32530 }, { "grad_norm": 0.6252866387367249, "learning_rate": 4.7138306089713636e-05, "loss": 0.0464, "step": 32540 }, { "grad_norm": 0.487419992685318, "learning_rate": 4.7110793868313183e-05, "loss": 0.0587, "step": 32550 }, { "grad_norm": 0.3812021315097809, "learning_rate": 4.708328252457729e-05, "loss": 0.0362, "step": 32560 }, { "grad_norm": 0.5437343716621399, "learning_rate": 4.7055772066863135e-05, "loss": 0.0477, "step": 32570 }, { "grad_norm": 0.3747781217098236, "learning_rate": 4.702826250352771e-05, "loss": 0.0427, "step": 32580 }, { "grad_norm": 0.5151990652084351, "learning_rate": 4.7000753842927653e-05, "loss": 0.033, "step": 32590 }, { "grad_norm": 0.561296284198761, "learning_rate": 4.6973246093419384e-05, "loss": 0.0367, "step": 32600 }, { "grad_norm": 0.6157417297363281, "learning_rate": 4.694573926335906e-05, "loss": 0.0378, "step": 32610 }, { "grad_norm": 0.5516732335090637, "learning_rate": 4.6918233361102476e-05, "loss": 0.0393, "step": 32620 }, { "grad_norm": 0.591910183429718, "learning_rate": 4.689072839500525e-05, "loss": 0.0436, "step": 32630 }, { "grad_norm": 0.5109500288963318, "learning_rate": 4.6863224373422635e-05, "loss": 0.0387, "step": 32640 }, { "grad_norm": 0.2767799198627472, "learning_rate": 4.683572130470962e-05, "loss": 0.0553, "step": 32650 }, { "grad_norm": 0.38340625166893005, "learning_rate": 4.680821919722094e-05, "loss": 0.0364, "step": 32660 }, { "grad_norm": 0.4792957007884979, "learning_rate": 4.6780718059310975e-05, "loss": 0.0334, "step": 32670 }, { "grad_norm": 0.23646371066570282, "learning_rate": 4.675321789933389e-05, "loss": 0.0411, "step": 32680 }, { "grad_norm": 0.45236507058143616, "learning_rate": 4.6725718725643464e-05, "loss": 0.0405, "step": 32690 }, { "grad_norm": 0.4070137143135071, "learning_rate": 4.669822054659323e-05, "loss": 0.0321, "step": 32700 }, { "grad_norm": 0.4723351001739502, "learning_rate": 4.667072337053644e-05, "loss": 0.0478, "step": 32710 }, { "grad_norm": 0.5872053503990173, "learning_rate": 4.6643227205825965e-05, "loss": 0.0445, "step": 32720 }, { "grad_norm": 0.4748661518096924, "learning_rate": 4.6615732060814454e-05, "loss": 0.0432, "step": 32730 }, { "grad_norm": 0.3752307593822479, "learning_rate": 4.658823794385417e-05, "loss": 0.0555, "step": 32740 }, { "grad_norm": 0.637750506401062, "learning_rate": 4.6560744863297115e-05, "loss": 0.0476, "step": 32750 }, { "grad_norm": 0.35745304822921753, "learning_rate": 4.653325282749498e-05, "loss": 0.0346, "step": 32760 }, { "grad_norm": 0.48343101143836975, "learning_rate": 4.6505761844799075e-05, "loss": 0.0443, "step": 32770 }, { "grad_norm": 0.4231046140193939, "learning_rate": 4.647827192356048e-05, "loss": 0.0518, "step": 32780 }, { "grad_norm": 0.7000221610069275, "learning_rate": 4.645078307212989e-05, "loss": 0.0452, "step": 32790 }, { "grad_norm": 0.7395123839378357, "learning_rate": 4.642329529885768e-05, "loss": 0.0522, "step": 32800 }, { "grad_norm": 0.5930688381195068, "learning_rate": 4.639580861209393e-05, "loss": 0.0505, "step": 32810 }, { "grad_norm": 0.6862329840660095, "learning_rate": 4.636832302018835e-05, "loss": 0.0488, "step": 32820 }, { "grad_norm": 0.6835291981697083, "learning_rate": 4.6340838531490365e-05, "loss": 0.0358, "step": 32830 }, { "grad_norm": 0.49716871976852417, "learning_rate": 4.6313355154349e-05, "loss": 0.0477, "step": 32840 }, { "grad_norm": 0.6233711242675781, "learning_rate": 4.6285872897113025e-05, "loss": 0.0504, "step": 32850 }, { "grad_norm": 0.7859353423118591, "learning_rate": 4.625839176813077e-05, "loss": 0.0523, "step": 32860 }, { "grad_norm": 0.379395067691803, "learning_rate": 4.623091177575031e-05, "loss": 0.0483, "step": 32870 }, { "grad_norm": 0.45758697390556335, "learning_rate": 4.620343292831936e-05, "loss": 0.0467, "step": 32880 }, { "grad_norm": 0.5351310968399048, "learning_rate": 4.6175955234185206e-05, "loss": 0.0397, "step": 32890 }, { "grad_norm": 0.7135164737701416, "learning_rate": 4.614847870169492e-05, "loss": 0.0433, "step": 32900 }, { "grad_norm": 0.9590808153152466, "learning_rate": 4.612100333919509e-05, "loss": 0.0476, "step": 32910 }, { "grad_norm": 0.5436590313911438, "learning_rate": 4.609352915503202e-05, "loss": 0.0534, "step": 32920 }, { "grad_norm": 0.4801543653011322, "learning_rate": 4.606605615755166e-05, "loss": 0.0326, "step": 32930 }, { "grad_norm": 0.43318116664886475, "learning_rate": 4.6038584355099576e-05, "loss": 0.0333, "step": 32940 }, { "grad_norm": 0.258481502532959, "learning_rate": 4.6011113756020964e-05, "loss": 0.0334, "step": 32950 }, { "grad_norm": 0.684848427772522, "learning_rate": 4.598364436866066e-05, "loss": 0.0621, "step": 32960 }, { "grad_norm": 0.5909836292266846, "learning_rate": 4.595617620136316e-05, "loss": 0.0478, "step": 32970 }, { "grad_norm": 0.4147104322910309, "learning_rate": 4.592870926247257e-05, "loss": 0.038, "step": 32980 }, { "grad_norm": 0.44576555490493774, "learning_rate": 4.5901243560332594e-05, "loss": 0.0303, "step": 32990 }, { "grad_norm": 0.6290338635444641, "learning_rate": 4.587377910328662e-05, "loss": 0.0417, "step": 33000 }, { "grad_norm": 0.564954936504364, "learning_rate": 4.5846315899677586e-05, "loss": 0.0452, "step": 33010 }, { "grad_norm": 0.7090103626251221, "learning_rate": 4.5818853957848114e-05, "loss": 0.044, "step": 33020 }, { "grad_norm": 0.4794834852218628, "learning_rate": 4.579139328614043e-05, "loss": 0.0421, "step": 33030 }, { "grad_norm": 0.5361998677253723, "learning_rate": 4.576393389289633e-05, "loss": 0.0448, "step": 33040 }, { "grad_norm": 0.9322121739387512, "learning_rate": 4.573647578645728e-05, "loss": 0.0354, "step": 33050 }, { "grad_norm": 0.36004403233528137, "learning_rate": 4.57090189751643e-05, "loss": 0.0476, "step": 33060 }, { "grad_norm": 0.5003473162651062, "learning_rate": 4.568156346735806e-05, "loss": 0.0444, "step": 33070 }, { "grad_norm": 0.6907212734222412, "learning_rate": 4.565410927137882e-05, "loss": 0.0476, "step": 33080 }, { "grad_norm": 0.4146416485309601, "learning_rate": 4.562665639556644e-05, "loss": 0.0325, "step": 33090 }, { "grad_norm": 0.410856693983078, "learning_rate": 4.559920484826037e-05, "loss": 0.0353, "step": 33100 }, { "grad_norm": 0.47791171073913574, "learning_rate": 4.5571754637799665e-05, "loss": 0.0532, "step": 33110 }, { "grad_norm": 0.5166528820991516, "learning_rate": 4.554430577252298e-05, "loss": 0.0488, "step": 33120 }, { "grad_norm": 0.33651506900787354, "learning_rate": 4.551685826076858e-05, "loss": 0.0475, "step": 33130 }, { "grad_norm": 0.46368151903152466, "learning_rate": 4.5489412110874246e-05, "loss": 0.039, "step": 33140 }, { "grad_norm": 0.3644006848335266, "learning_rate": 4.5461967331177444e-05, "loss": 0.051, "step": 33150 }, { "grad_norm": 0.39293205738067627, "learning_rate": 4.5434523930015115e-05, "loss": 0.0366, "step": 33160 }, { "grad_norm": 0.3694569170475006, "learning_rate": 4.540708191572388e-05, "loss": 0.0429, "step": 33170 }, { "grad_norm": 0.43236014246940613, "learning_rate": 4.537964129663991e-05, "loss": 0.0457, "step": 33180 }, { "grad_norm": 0.48264235258102417, "learning_rate": 4.535220208109889e-05, "loss": 0.0391, "step": 33190 }, { "grad_norm": 0.5785101652145386, "learning_rate": 4.5324764277436194e-05, "loss": 0.0517, "step": 33200 }, { "grad_norm": 0.6220090389251709, "learning_rate": 4.529732789398664e-05, "loss": 0.0399, "step": 33210 }, { "grad_norm": 0.49334943294525146, "learning_rate": 4.526989293908472e-05, "loss": 0.0343, "step": 33220 }, { "grad_norm": 0.45013320446014404, "learning_rate": 4.524245942106442e-05, "loss": 0.0322, "step": 33230 }, { "grad_norm": 0.5484622120857239, "learning_rate": 4.5215027348259345e-05, "loss": 0.0411, "step": 33240 }, { "grad_norm": 0.3609580397605896, "learning_rate": 4.5187596729002616e-05, "loss": 0.026, "step": 33250 }, { "grad_norm": 0.4309524893760681, "learning_rate": 4.516016757162693e-05, "loss": 0.0368, "step": 33260 }, { "grad_norm": 0.20797720551490784, "learning_rate": 4.513273988446457e-05, "loss": 0.0419, "step": 33270 }, { "grad_norm": 0.41750285029411316, "learning_rate": 4.5105313675847296e-05, "loss": 0.045, "step": 33280 }, { "grad_norm": 0.9425210356712341, "learning_rate": 4.5077888954106495e-05, "loss": 0.0479, "step": 33290 }, { "grad_norm": 0.5866557955741882, "learning_rate": 4.505046572757309e-05, "loss": 0.0398, "step": 33300 }, { "grad_norm": 0.4697965681552887, "learning_rate": 4.502304400457749e-05, "loss": 0.0393, "step": 33310 }, { "grad_norm": 0.424709677696228, "learning_rate": 4.499562379344973e-05, "loss": 0.0396, "step": 33320 }, { "grad_norm": 0.4617866277694702, "learning_rate": 4.4968205102519306e-05, "loss": 0.0358, "step": 33330 }, { "grad_norm": 0.5250745415687561, "learning_rate": 4.494078794011532e-05, "loss": 0.0606, "step": 33340 }, { "grad_norm": 0.35714030265808105, "learning_rate": 4.491337231456639e-05, "loss": 0.0406, "step": 33350 }, { "grad_norm": 0.49041748046875, "learning_rate": 4.4885958234200634e-05, "loss": 0.0414, "step": 33360 }, { "grad_norm": 0.4065510332584381, "learning_rate": 4.485854570734575e-05, "loss": 0.0324, "step": 33370 }, { "grad_norm": 0.6561926007270813, "learning_rate": 4.483113474232891e-05, "loss": 0.045, "step": 33380 }, { "grad_norm": 0.4116778075695038, "learning_rate": 4.480372534747688e-05, "loss": 0.0428, "step": 33390 }, { "grad_norm": 0.4857001006603241, "learning_rate": 4.477631753111588e-05, "loss": 0.0453, "step": 33400 }, { "grad_norm": 0.5186020135879517, "learning_rate": 4.4748911301571686e-05, "loss": 0.0487, "step": 33410 }, { "grad_norm": 0.3224460184574127, "learning_rate": 4.472150666716961e-05, "loss": 0.0343, "step": 33420 }, { "grad_norm": 0.9694647789001465, "learning_rate": 4.469410363623442e-05, "loss": 0.0539, "step": 33430 }, { "grad_norm": 0.6754404902458191, "learning_rate": 4.466670221709044e-05, "loss": 0.0451, "step": 33440 }, { "grad_norm": 0.5208157896995544, "learning_rate": 4.463930241806154e-05, "loss": 0.0288, "step": 33450 }, { "grad_norm": 0.5174795389175415, "learning_rate": 4.4611904247471006e-05, "loss": 0.0623, "step": 33460 }, { "grad_norm": 0.43925705552101135, "learning_rate": 4.458450771364171e-05, "loss": 0.0359, "step": 33470 }, { "grad_norm": 0.4635041356086731, "learning_rate": 4.4557112824895965e-05, "loss": 0.0389, "step": 33480 }, { "grad_norm": 0.3970741033554077, "learning_rate": 4.452971958955563e-05, "loss": 0.0347, "step": 33490 }, { "grad_norm": 0.6557695865631104, "learning_rate": 4.450232801594208e-05, "loss": 0.0506, "step": 33500 }, { "grad_norm": 0.6305457353591919, "learning_rate": 4.447493811237609e-05, "loss": 0.0392, "step": 33510 }, { "grad_norm": 1.0504220724105835, "learning_rate": 4.444754988717804e-05, "loss": 0.0374, "step": 33520 }, { "grad_norm": 0.41564956307411194, "learning_rate": 4.442016334866771e-05, "loss": 0.036, "step": 33530 }, { "grad_norm": 0.44126707315444946, "learning_rate": 4.4392778505164445e-05, "loss": 0.032, "step": 33540 }, { "grad_norm": 0.6236696243286133, "learning_rate": 4.436539536498702e-05, "loss": 0.0402, "step": 33550 }, { "grad_norm": 1.2158342599868774, "learning_rate": 4.433801393645369e-05, "loss": 0.0568, "step": 33560 }, { "grad_norm": 0.43115705251693726, "learning_rate": 4.431063422788226e-05, "loss": 0.0361, "step": 33570 }, { "grad_norm": 0.3007037043571472, "learning_rate": 4.428325624758991e-05, "loss": 0.0349, "step": 33580 }, { "grad_norm": 0.39746734499931335, "learning_rate": 4.4255880003893366e-05, "loss": 0.0437, "step": 33590 }, { "grad_norm": 0.4468780755996704, "learning_rate": 4.422850550510884e-05, "loss": 0.0479, "step": 33600 }, { "grad_norm": 0.41156551241874695, "learning_rate": 4.4201132759551934e-05, "loss": 0.0277, "step": 33610 }, { "grad_norm": 0.6656305193901062, "learning_rate": 4.4173761775537804e-05, "loss": 0.0565, "step": 33620 }, { "grad_norm": 0.34776461124420166, "learning_rate": 4.414639256138099e-05, "loss": 0.0401, "step": 33630 }, { "grad_norm": 0.38447877764701843, "learning_rate": 4.411902512539557e-05, "loss": 0.0323, "step": 33640 }, { "grad_norm": 0.4399155378341675, "learning_rate": 4.4091659475895044e-05, "loss": 0.0419, "step": 33650 }, { "grad_norm": 0.4484727382659912, "learning_rate": 4.406429562119235e-05, "loss": 0.039, "step": 33660 }, { "grad_norm": 0.42555806040763855, "learning_rate": 4.4036933569599945e-05, "loss": 0.0324, "step": 33670 }, { "grad_norm": 0.7198932766914368, "learning_rate": 4.400957332942965e-05, "loss": 0.0358, "step": 33680 }, { "grad_norm": 0.6113157272338867, "learning_rate": 4.3982214908992844e-05, "loss": 0.0313, "step": 33690 }, { "grad_norm": 0.6782222986221313, "learning_rate": 4.3954858316600235e-05, "loss": 0.0361, "step": 33700 }, { "grad_norm": 0.4430203139781952, "learning_rate": 4.392750356056205e-05, "loss": 0.0271, "step": 33710 }, { "grad_norm": 0.3503997325897217, "learning_rate": 4.390015064918798e-05, "loss": 0.0525, "step": 33720 }, { "grad_norm": 0.5207122564315796, "learning_rate": 4.387279959078705e-05, "loss": 0.0361, "step": 33730 }, { "grad_norm": 0.5592111349105835, "learning_rate": 4.384545039366786e-05, "loss": 0.0397, "step": 33740 }, { "grad_norm": 0.4247814416885376, "learning_rate": 4.381810306613831e-05, "loss": 0.0355, "step": 33750 }, { "grad_norm": 0.5673230290412903, "learning_rate": 4.3790757616505826e-05, "loss": 0.0418, "step": 33760 }, { "grad_norm": 0.4269827902317047, "learning_rate": 4.376341405307725e-05, "loss": 0.0405, "step": 33770 }, { "grad_norm": 0.6903880834579468, "learning_rate": 4.37360723841588e-05, "loss": 0.0424, "step": 33780 }, { "grad_norm": 0.5370358228683472, "learning_rate": 4.370873261805619e-05, "loss": 0.0377, "step": 33790 }, { "grad_norm": 0.6700480580329895, "learning_rate": 4.368139476307449e-05, "loss": 0.0386, "step": 33800 }, { "grad_norm": 0.3991105556488037, "learning_rate": 4.365405882751822e-05, "loss": 0.0374, "step": 33810 }, { "grad_norm": 0.3170241415500641, "learning_rate": 4.3626724819691326e-05, "loss": 0.0397, "step": 33820 }, { "grad_norm": 0.6563465595245361, "learning_rate": 4.359939274789715e-05, "loss": 0.0433, "step": 33830 }, { "grad_norm": 0.5244877338409424, "learning_rate": 4.357206262043848e-05, "loss": 0.0381, "step": 33840 }, { "grad_norm": 0.5214926600456238, "learning_rate": 4.354473444561745e-05, "loss": 0.0425, "step": 33850 }, { "grad_norm": 0.5716546177864075, "learning_rate": 4.3517408231735644e-05, "loss": 0.0299, "step": 33860 }, { "grad_norm": 0.8239344954490662, "learning_rate": 4.3490083987094086e-05, "loss": 0.0501, "step": 33870 }, { "grad_norm": 0.48297104239463806, "learning_rate": 4.34627617199931e-05, "loss": 0.0335, "step": 33880 }, { "grad_norm": 1.7105486392974854, "learning_rate": 4.3435441438732526e-05, "loss": 0.0505, "step": 33890 }, { "grad_norm": 0.5213309526443481, "learning_rate": 4.340812315161149e-05, "loss": 0.0469, "step": 33900 }, { "grad_norm": 0.4476781189441681, "learning_rate": 4.338080686692859e-05, "loss": 0.0384, "step": 33910 }, { "grad_norm": 0.736973226070404, "learning_rate": 4.3353492592981816e-05, "loss": 0.0499, "step": 33920 }, { "grad_norm": 0.5700157284736633, "learning_rate": 4.3326180338068485e-05, "loss": 0.0365, "step": 33930 }, { "grad_norm": 0.6232342720031738, "learning_rate": 4.3298870110485356e-05, "loss": 0.0358, "step": 33940 }, { "grad_norm": 0.901321530342102, "learning_rate": 4.3271561918528567e-05, "loss": 0.0423, "step": 33950 }, { "grad_norm": 0.4167889952659607, "learning_rate": 4.324425577049359e-05, "loss": 0.0371, "step": 33960 }, { "grad_norm": 0.6315491199493408, "learning_rate": 4.321695167467535e-05, "loss": 0.0381, "step": 33970 }, { "grad_norm": 0.551144003868103, "learning_rate": 4.3189649639368093e-05, "loss": 0.0525, "step": 33980 }, { "grad_norm": 0.5169941186904907, "learning_rate": 4.316234967286547e-05, "loss": 0.0293, "step": 33990 }, { "grad_norm": 0.9575517773628235, "learning_rate": 4.313505178346046e-05, "loss": 0.0478, "step": 34000 }, { "grad_norm": 0.4294770061969757, "learning_rate": 4.3107755979445465e-05, "loss": 0.0432, "step": 34010 }, { "grad_norm": 0.7880650758743286, "learning_rate": 4.308046226911224e-05, "loss": 0.0633, "step": 34020 }, { "grad_norm": 0.43732884526252747, "learning_rate": 4.305317066075185e-05, "loss": 0.0445, "step": 34030 }, { "grad_norm": 0.5855035781860352, "learning_rate": 4.302588116265482e-05, "loss": 0.0526, "step": 34040 }, { "grad_norm": 0.40366464853286743, "learning_rate": 4.299859378311094e-05, "loss": 0.0343, "step": 34050 }, { "grad_norm": 0.2817210257053375, "learning_rate": 4.2971308530409424e-05, "loss": 0.0286, "step": 34060 }, { "grad_norm": 0.4163440763950348, "learning_rate": 4.2944025412838765e-05, "loss": 0.0432, "step": 34070 }, { "grad_norm": 0.4325031340122223, "learning_rate": 4.291674443868689e-05, "loss": 0.0325, "step": 34080 }, { "grad_norm": 0.5136054158210754, "learning_rate": 4.288946561624104e-05, "loss": 0.0373, "step": 34090 }, { "grad_norm": 0.524934709072113, "learning_rate": 4.2862188953787794e-05, "loss": 0.0368, "step": 34100 }, { "grad_norm": 0.5956789255142212, "learning_rate": 4.283491445961308e-05, "loss": 0.0339, "step": 34110 }, { "grad_norm": 0.9309860467910767, "learning_rate": 4.2807642142002155e-05, "loss": 0.0507, "step": 34120 }, { "grad_norm": 0.7084258198738098, "learning_rate": 4.278037200923966e-05, "loss": 0.045, "step": 34130 }, { "grad_norm": 0.37964197993278503, "learning_rate": 4.275310406960953e-05, "loss": 0.0339, "step": 34140 }, { "grad_norm": 0.4035922884941101, "learning_rate": 4.272583833139502e-05, "loss": 0.0392, "step": 34150 }, { "grad_norm": 0.5014989376068115, "learning_rate": 4.2698574802878794e-05, "loss": 0.0278, "step": 34160 }, { "grad_norm": 0.37094634771347046, "learning_rate": 4.2671313492342734e-05, "loss": 0.0305, "step": 34170 }, { "grad_norm": 0.6020047664642334, "learning_rate": 4.264405440806813e-05, "loss": 0.0436, "step": 34180 }, { "grad_norm": 0.34942179918289185, "learning_rate": 4.26167975583356e-05, "loss": 0.0346, "step": 34190 }, { "grad_norm": 0.5034937262535095, "learning_rate": 4.2589542951425e-05, "loss": 0.0332, "step": 34200 }, { "grad_norm": 0.43112805485725403, "learning_rate": 4.2562290595615615e-05, "loss": 0.0357, "step": 34210 }, { "grad_norm": 0.3169421851634979, "learning_rate": 4.2535040499185946e-05, "loss": 0.0502, "step": 34220 }, { "grad_norm": 0.6125676035881042, "learning_rate": 4.250779267041387e-05, "loss": 0.0413, "step": 34230 }, { "grad_norm": 0.7481167912483215, "learning_rate": 4.248054711757657e-05, "loss": 0.0382, "step": 34240 }, { "grad_norm": 0.4257263243198395, "learning_rate": 4.245330384895052e-05, "loss": 0.0285, "step": 34250 }, { "grad_norm": 0.7295517325401306, "learning_rate": 4.242606287281151e-05, "loss": 0.0371, "step": 34260 }, { "grad_norm": 0.7671251893043518, "learning_rate": 4.2398824197434595e-05, "loss": 0.0374, "step": 34270 }, { "grad_norm": 0.30624544620513916, "learning_rate": 4.23715878310942e-05, "loss": 0.0278, "step": 34280 }, { "grad_norm": 0.4393518567085266, "learning_rate": 4.234435378206402e-05, "loss": 0.044, "step": 34290 }, { "grad_norm": 0.7179508209228516, "learning_rate": 4.2317122058617006e-05, "loss": 0.0414, "step": 34300 }, { "grad_norm": 0.33699095249176025, "learning_rate": 4.2289892669025485e-05, "loss": 0.03, "step": 34310 }, { "grad_norm": 0.44354936480522156, "learning_rate": 4.226266562156097e-05, "loss": 0.0446, "step": 34320 }, { "grad_norm": 0.42990607023239136, "learning_rate": 4.223544092449435e-05, "loss": 0.0578, "step": 34330 }, { "grad_norm": 0.5626803040504456, "learning_rate": 4.2208218586095784e-05, "loss": 0.0376, "step": 34340 }, { "grad_norm": 0.4843546450138092, "learning_rate": 4.218099861463466e-05, "loss": 0.0364, "step": 34350 }, { "grad_norm": 0.4564955234527588, "learning_rate": 4.215378101837972e-05, "loss": 0.0349, "step": 34360 }, { "grad_norm": 0.7841764092445374, "learning_rate": 4.2126565805598937e-05, "loss": 0.0391, "step": 34370 }, { "grad_norm": 0.4566444456577301, "learning_rate": 4.209935298455957e-05, "loss": 0.0296, "step": 34380 }, { "grad_norm": 0.5547240972518921, "learning_rate": 4.207214256352817e-05, "loss": 0.0376, "step": 34390 }, { "grad_norm": 0.4661291539669037, "learning_rate": 4.2044934550770524e-05, "loss": 0.0464, "step": 34400 }, { "grad_norm": 0.8259737491607666, "learning_rate": 4.201772895455174e-05, "loss": 0.0383, "step": 34410 }, { "grad_norm": 0.4424671530723572, "learning_rate": 4.199052578313613e-05, "loss": 0.0355, "step": 34420 }, { "grad_norm": 0.4423311650753021, "learning_rate": 4.1963325044787294e-05, "loss": 0.0445, "step": 34430 }, { "grad_norm": 0.6961237788200378, "learning_rate": 4.193612674776814e-05, "loss": 0.034, "step": 34440 }, { "grad_norm": 0.4832436144351959, "learning_rate": 4.1908930900340745e-05, "loss": 0.0406, "step": 34450 }, { "grad_norm": 0.4122052490711212, "learning_rate": 4.1881737510766536e-05, "loss": 0.034, "step": 34460 }, { "grad_norm": 0.4567112922668457, "learning_rate": 4.185454658730609e-05, "loss": 0.035, "step": 34470 }, { "grad_norm": 0.40583205223083496, "learning_rate": 4.1827358138219355e-05, "loss": 0.0416, "step": 34480 }, { "grad_norm": 0.33872467279434204, "learning_rate": 4.1800172171765404e-05, "loss": 0.0564, "step": 34490 }, { "grad_norm": 0.49886953830718994, "learning_rate": 4.177298869620264e-05, "loss": 0.0362, "step": 34500 }, { "grad_norm": 0.7692508697509766, "learning_rate": 4.1745807719788705e-05, "loss": 0.0417, "step": 34510 }, { "grad_norm": 0.5138983726501465, "learning_rate": 4.1718629250780445e-05, "loss": 0.0333, "step": 34520 }, { "grad_norm": 0.5781868696212769, "learning_rate": 4.1691453297433956e-05, "loss": 0.0341, "step": 34530 }, { "grad_norm": 0.4636389911174774, "learning_rate": 4.166427986800457e-05, "loss": 0.0381, "step": 34540 }, { "grad_norm": 0.6054379343986511, "learning_rate": 4.163710897074688e-05, "loss": 0.0449, "step": 34550 }, { "grad_norm": 0.441042959690094, "learning_rate": 4.1609940613914686e-05, "loss": 0.0415, "step": 34560 }, { "grad_norm": 0.3383660614490509, "learning_rate": 4.1582774805760996e-05, "loss": 0.0323, "step": 34570 }, { "grad_norm": 0.5419133305549622, "learning_rate": 4.155561155453809e-05, "loss": 0.0544, "step": 34580 }, { "grad_norm": 0.5721708536148071, "learning_rate": 4.15284508684974e-05, "loss": 0.0408, "step": 34590 }, { "grad_norm": 0.5545462965965271, "learning_rate": 4.1501292755889675e-05, "loss": 0.0521, "step": 34600 }, { "grad_norm": 0.32714033126831055, "learning_rate": 4.1474137224964833e-05, "loss": 0.039, "step": 34610 }, { "grad_norm": 0.3315695822238922, "learning_rate": 4.144698428397197e-05, "loss": 0.0216, "step": 34620 }, { "grad_norm": 0.4537088871002197, "learning_rate": 4.1419833941159466e-05, "loss": 0.025, "step": 34630 }, { "grad_norm": 0.42860886454582214, "learning_rate": 4.1392686204774846e-05, "loss": 0.0386, "step": 34640 }, { "grad_norm": 0.6170178055763245, "learning_rate": 4.13655410830649e-05, "loss": 0.0431, "step": 34650 }, { "grad_norm": 0.69862300157547, "learning_rate": 4.1338398584275594e-05, "loss": 0.0442, "step": 34660 }, { "grad_norm": 0.5160709023475647, "learning_rate": 4.1311258716652104e-05, "loss": 0.0473, "step": 34670 }, { "grad_norm": 0.7085931897163391, "learning_rate": 4.128412148843881e-05, "loss": 0.0363, "step": 34680 }, { "grad_norm": 0.7325635552406311, "learning_rate": 4.125698690787926e-05, "loss": 0.0363, "step": 34690 }, { "grad_norm": 0.4720636010169983, "learning_rate": 4.1229854983216245e-05, "loss": 0.0349, "step": 34700 }, { "grad_norm": 0.474533349275589, "learning_rate": 4.120272572269175e-05, "loss": 0.0452, "step": 34710 }, { "grad_norm": 0.563895583152771, "learning_rate": 4.117559913454687e-05, "loss": 0.0443, "step": 34720 }, { "grad_norm": 0.49655160307884216, "learning_rate": 4.114847522702201e-05, "loss": 0.0311, "step": 34730 }, { "grad_norm": 0.6635484099388123, "learning_rate": 4.112135400835664e-05, "loss": 0.0461, "step": 34740 }, { "grad_norm": 0.41037917137145996, "learning_rate": 4.109423548678949e-05, "loss": 0.0312, "step": 34750 }, { "grad_norm": 0.48497045040130615, "learning_rate": 4.106711967055848e-05, "loss": 0.0299, "step": 34760 }, { "grad_norm": 1.0820424556732178, "learning_rate": 4.1040006567900636e-05, "loss": 0.0445, "step": 34770 }, { "grad_norm": 0.27649644017219543, "learning_rate": 4.101289618705224e-05, "loss": 0.0245, "step": 34780 }, { "grad_norm": 0.4463048577308655, "learning_rate": 4.0985788536248675e-05, "loss": 0.047, "step": 34790 }, { "grad_norm": 0.5433073043823242, "learning_rate": 4.095868362372454e-05, "loss": 0.0485, "step": 34800 }, { "grad_norm": 0.5122371912002563, "learning_rate": 4.0931581457713614e-05, "loss": 0.0269, "step": 34810 }, { "grad_norm": 0.43182969093322754, "learning_rate": 4.09044820464488e-05, "loss": 0.0335, "step": 34820 }, { "grad_norm": 0.34321749210357666, "learning_rate": 4.087738539816219e-05, "loss": 0.0352, "step": 34830 }, { "grad_norm": 0.47977110743522644, "learning_rate": 4.085029152108501e-05, "loss": 0.0428, "step": 34840 }, { "grad_norm": 0.6480620503425598, "learning_rate": 4.0823200423447714e-05, "loss": 0.0395, "step": 34850 }, { "grad_norm": 0.3334551751613617, "learning_rate": 4.079611211347981e-05, "loss": 0.0336, "step": 34860 }, { "grad_norm": 0.24620921909809113, "learning_rate": 4.076902659941002e-05, "loss": 0.0348, "step": 34870 }, { "grad_norm": 0.4704356789588928, "learning_rate": 4.074194388946624e-05, "loss": 0.0458, "step": 34880 }, { "grad_norm": 0.44888538122177124, "learning_rate": 4.071486399187545e-05, "loss": 0.0419, "step": 34890 }, { "grad_norm": 0.45084574818611145, "learning_rate": 4.0687786914863836e-05, "loss": 0.0412, "step": 34900 }, { "grad_norm": 0.6122797727584839, "learning_rate": 4.0660712666656666e-05, "loss": 0.031, "step": 34910 }, { "grad_norm": 0.42711368203163147, "learning_rate": 4.0633641255478394e-05, "loss": 0.0393, "step": 34920 }, { "grad_norm": 0.3967357873916626, "learning_rate": 4.0606572689552624e-05, "loss": 0.0353, "step": 34930 }, { "grad_norm": 0.3840460777282715, "learning_rate": 4.0579506977102036e-05, "loss": 0.0339, "step": 34940 }, { "grad_norm": 0.5020797252655029, "learning_rate": 4.055244412634849e-05, "loss": 0.0349, "step": 34950 }, { "grad_norm": 0.4138273298740387, "learning_rate": 4.052538414551298e-05, "loss": 0.0364, "step": 34960 }, { "grad_norm": 0.35158559679985046, "learning_rate": 4.0498327042815596e-05, "loss": 0.04, "step": 34970 }, { "grad_norm": 0.5774462223052979, "learning_rate": 4.047127282647559e-05, "loss": 0.0348, "step": 34980 }, { "grad_norm": 0.6034305095672607, "learning_rate": 4.04442215047113e-05, "loss": 0.0307, "step": 34990 }, { "grad_norm": 0.41479408740997314, "learning_rate": 4.041717308574023e-05, "loss": 0.0281, "step": 35000 }, { "grad_norm": 1.1402921676635742, "learning_rate": 4.039012757777893e-05, "loss": 0.0486, "step": 35010 }, { "grad_norm": 0.37755948305130005, "learning_rate": 4.036308498904314e-05, "loss": 0.0447, "step": 35020 }, { "grad_norm": 0.48340344429016113, "learning_rate": 4.033604532774771e-05, "loss": 0.0351, "step": 35030 }, { "grad_norm": 0.5664953589439392, "learning_rate": 4.030900860210652e-05, "loss": 0.0385, "step": 35040 }, { "grad_norm": 0.5476329326629639, "learning_rate": 4.028197482033266e-05, "loss": 0.0472, "step": 35050 }, { "grad_norm": 0.3979140818119049, "learning_rate": 4.0254943990638246e-05, "loss": 0.0341, "step": 35060 }, { "grad_norm": 0.5882547497749329, "learning_rate": 4.022791612123454e-05, "loss": 0.0462, "step": 35070 }, { "grad_norm": 0.5427249073982239, "learning_rate": 4.020089122033192e-05, "loss": 0.0398, "step": 35080 }, { "grad_norm": 0.35950228571891785, "learning_rate": 4.01738692961398e-05, "loss": 0.0394, "step": 35090 }, { "grad_norm": 0.5219293832778931, "learning_rate": 4.014685035686675e-05, "loss": 0.0433, "step": 35100 }, { "grad_norm": 0.48604220151901245, "learning_rate": 4.011983441072039e-05, "loss": 0.0485, "step": 35110 }, { "grad_norm": 0.44173380732536316, "learning_rate": 4.0092821465907485e-05, "loss": 0.0528, "step": 35120 }, { "grad_norm": 0.555090069770813, "learning_rate": 4.006581153063383e-05, "loss": 0.0382, "step": 35130 }, { "grad_norm": 0.5398850440979004, "learning_rate": 4.003880461310432e-05, "loss": 0.0469, "step": 35140 }, { "grad_norm": 0.33594322204589844, "learning_rate": 4.001180072152298e-05, "loss": 0.0378, "step": 35150 }, { "grad_norm": 0.6663643717765808, "learning_rate": 3.998479986409285e-05, "loss": 0.0392, "step": 35160 }, { "grad_norm": 0.73133784532547, "learning_rate": 3.995780204901607e-05, "loss": 0.0467, "step": 35170 }, { "grad_norm": 0.2906554043292999, "learning_rate": 3.993080728449391e-05, "loss": 0.0522, "step": 35180 }, { "grad_norm": 0.4371411204338074, "learning_rate": 3.990381557872661e-05, "loss": 0.0282, "step": 35190 }, { "grad_norm": 1.0390347242355347, "learning_rate": 3.987682693991359e-05, "loss": 0.0683, "step": 35200 }, { "grad_norm": 0.9660499095916748, "learning_rate": 3.9849841376253226e-05, "loss": 0.0536, "step": 35210 }, { "grad_norm": 0.6025097966194153, "learning_rate": 3.982285889594306e-05, "loss": 0.0443, "step": 35220 }, { "grad_norm": 0.6599099636077881, "learning_rate": 3.9795879507179665e-05, "loss": 0.0405, "step": 35230 }, { "grad_norm": 0.577697217464447, "learning_rate": 3.9768903218158634e-05, "loss": 0.0446, "step": 35240 }, { "grad_norm": 0.3478442132472992, "learning_rate": 3.974193003707468e-05, "loss": 0.0417, "step": 35250 }, { "grad_norm": 0.4742845594882965, "learning_rate": 3.971495997212152e-05, "loss": 0.0403, "step": 35260 }, { "grad_norm": 0.4020459055900574, "learning_rate": 3.9687993031491985e-05, "loss": 0.0362, "step": 35270 }, { "grad_norm": 0.43447789549827576, "learning_rate": 3.966102922337787e-05, "loss": 0.0444, "step": 35280 }, { "grad_norm": 0.5408800840377808, "learning_rate": 3.963406855597009e-05, "loss": 0.0383, "step": 35290 }, { "grad_norm": 0.36541077494621277, "learning_rate": 3.960711103745861e-05, "loss": 0.0291, "step": 35300 }, { "grad_norm": 0.5556033253669739, "learning_rate": 3.958015667603237e-05, "loss": 0.0346, "step": 35310 }, { "grad_norm": 0.5440998077392578, "learning_rate": 3.955320547987943e-05, "loss": 0.0642, "step": 35320 }, { "grad_norm": 0.35394346714019775, "learning_rate": 3.952625745718681e-05, "loss": 0.0407, "step": 35330 }, { "grad_norm": 0.41301074624061584, "learning_rate": 3.949931261614064e-05, "loss": 0.0396, "step": 35340 }, { "grad_norm": 0.4026029407978058, "learning_rate": 3.947237096492605e-05, "loss": 0.0475, "step": 35350 }, { "grad_norm": 0.8742837905883789, "learning_rate": 3.944543251172719e-05, "loss": 0.0366, "step": 35360 }, { "grad_norm": 0.36563432216644287, "learning_rate": 3.941849726472725e-05, "loss": 0.0375, "step": 35370 }, { "grad_norm": 0.4116532802581787, "learning_rate": 3.939156523210846e-05, "loss": 0.0346, "step": 35380 }, { "grad_norm": 0.3132840096950531, "learning_rate": 3.9364636422052046e-05, "loss": 0.044, "step": 35390 }, { "grad_norm": 0.5793635845184326, "learning_rate": 3.933771084273828e-05, "loss": 0.0334, "step": 35400 }, { "grad_norm": 0.6353548765182495, "learning_rate": 3.931078850234643e-05, "loss": 0.0422, "step": 35410 }, { "grad_norm": 0.7860891222953796, "learning_rate": 3.928386940905483e-05, "loss": 0.0497, "step": 35420 }, { "grad_norm": 0.3480146527290344, "learning_rate": 3.925695357104073e-05, "loss": 0.0373, "step": 35430 }, { "grad_norm": 0.4390029013156891, "learning_rate": 3.923004099648049e-05, "loss": 0.0406, "step": 35440 }, { "grad_norm": 0.39444395899772644, "learning_rate": 3.920313169354944e-05, "loss": 0.0454, "step": 35450 }, { "grad_norm": 0.49191945791244507, "learning_rate": 3.9176225670421897e-05, "loss": 0.0337, "step": 35460 }, { "grad_norm": 0.3895690441131592, "learning_rate": 3.9149322935271224e-05, "loss": 0.0307, "step": 35470 }, { "grad_norm": 0.2313399463891983, "learning_rate": 3.9122423496269725e-05, "loss": 0.0519, "step": 35480 }, { "grad_norm": 0.531633198261261, "learning_rate": 3.909552736158877e-05, "loss": 0.045, "step": 35490 }, { "grad_norm": 0.7677546739578247, "learning_rate": 3.90686345393987e-05, "loss": 0.0686, "step": 35500 }, { "grad_norm": 0.6244480013847351, "learning_rate": 3.9041745037868816e-05, "loss": 0.0398, "step": 35510 }, { "grad_norm": 0.6755330562591553, "learning_rate": 3.9014858865167465e-05, "loss": 0.0534, "step": 35520 }, { "grad_norm": 0.3902726173400879, "learning_rate": 3.8987976029461935e-05, "loss": 0.03, "step": 35530 }, { "grad_norm": 0.45246002078056335, "learning_rate": 3.896109653891853e-05, "loss": 0.032, "step": 35540 }, { "grad_norm": 0.5304949879646301, "learning_rate": 3.893422040170254e-05, "loss": 0.0371, "step": 35550 }, { "grad_norm": 0.3065507709980011, "learning_rate": 3.8907347625978207e-05, "loss": 0.0554, "step": 35560 }, { "grad_norm": 0.5881665945053101, "learning_rate": 3.88804782199088e-05, "loss": 0.0218, "step": 35570 }, { "grad_norm": 0.524183988571167, "learning_rate": 3.8853612191656495e-05, "loss": 0.032, "step": 35580 }, { "grad_norm": 0.41814038157463074, "learning_rate": 3.88267495493825e-05, "loss": 0.0428, "step": 35590 }, { "grad_norm": 0.352466881275177, "learning_rate": 3.8799890301247004e-05, "loss": 0.032, "step": 35600 }, { "grad_norm": 0.47854381799697876, "learning_rate": 3.8773034455409096e-05, "loss": 0.0358, "step": 35610 }, { "grad_norm": 0.4463513195514679, "learning_rate": 3.8746182020026904e-05, "loss": 0.0495, "step": 35620 }, { "grad_norm": 0.3548593521118164, "learning_rate": 3.871933300325745e-05, "loss": 0.0325, "step": 35630 }, { "grad_norm": 0.5764238238334656, "learning_rate": 3.869248741325679e-05, "loss": 0.0418, "step": 35640 }, { "grad_norm": 0.5221849679946899, "learning_rate": 3.866564525817992e-05, "loss": 0.0385, "step": 35650 }, { "grad_norm": 0.4572702944278717, "learning_rate": 3.8638806546180725e-05, "loss": 0.036, "step": 35660 }, { "grad_norm": 0.48466813564300537, "learning_rate": 3.861197128541213e-05, "loss": 0.0373, "step": 35670 }, { "grad_norm": 0.4191665053367615, "learning_rate": 3.858513948402599e-05, "loss": 0.0369, "step": 35680 }, { "grad_norm": 0.4837014079093933, "learning_rate": 3.8558311150173077e-05, "loss": 0.0288, "step": 35690 }, { "grad_norm": 0.5486913919448853, "learning_rate": 3.853148629200312e-05, "loss": 0.0455, "step": 35700 }, { "grad_norm": 0.3509698510169983, "learning_rate": 3.850466491766482e-05, "loss": 0.0326, "step": 35710 }, { "grad_norm": 0.43506133556365967, "learning_rate": 3.847784703530583e-05, "loss": 0.0336, "step": 35720 }, { "grad_norm": 0.49506276845932007, "learning_rate": 3.845103265307266e-05, "loss": 0.0321, "step": 35730 }, { "grad_norm": 0.4623821973800659, "learning_rate": 3.842422177911086e-05, "loss": 0.0367, "step": 35740 }, { "grad_norm": 0.6004802584648132, "learning_rate": 3.8397414421564826e-05, "loss": 0.0323, "step": 35750 }, { "grad_norm": 0.6187326908111572, "learning_rate": 3.8370610588577935e-05, "loss": 0.0339, "step": 35760 }, { "grad_norm": 0.4419007897377014, "learning_rate": 3.834381028829251e-05, "loss": 0.0285, "step": 35770 }, { "grad_norm": 0.5219330191612244, "learning_rate": 3.8317013528849745e-05, "loss": 0.0321, "step": 35780 }, { "grad_norm": 0.519686222076416, "learning_rate": 3.8290220318389815e-05, "loss": 0.0366, "step": 35790 }, { "grad_norm": 0.6451826095581055, "learning_rate": 3.8263430665051746e-05, "loss": 0.0267, "step": 35800 }, { "grad_norm": 0.7163410782814026, "learning_rate": 3.8236644576973554e-05, "loss": 0.0298, "step": 35810 }, { "grad_norm": 0.6176568269729614, "learning_rate": 3.820986206229217e-05, "loss": 0.0424, "step": 35820 }, { "grad_norm": 0.5301483869552612, "learning_rate": 3.8183083129143384e-05, "loss": 0.043, "step": 35830 }, { "grad_norm": 0.6214040517807007, "learning_rate": 3.815630778566193e-05, "loss": 0.031, "step": 35840 }, { "grad_norm": 0.4572949707508087, "learning_rate": 3.812953603998145e-05, "loss": 0.0364, "step": 35850 }, { "grad_norm": 0.45162925124168396, "learning_rate": 3.8102767900234504e-05, "loss": 0.0386, "step": 35860 }, { "grad_norm": 0.8040012717247009, "learning_rate": 3.807600337455256e-05, "loss": 0.0374, "step": 35870 }, { "grad_norm": 0.4622640013694763, "learning_rate": 3.804924247106593e-05, "loss": 0.0321, "step": 35880 }, { "grad_norm": 0.4599973261356354, "learning_rate": 3.8022485197903925e-05, "loss": 0.0662, "step": 35890 }, { "grad_norm": 0.4461115598678589, "learning_rate": 3.799573156319464e-05, "loss": 0.0289, "step": 35900 }, { "grad_norm": 0.42499998211860657, "learning_rate": 3.796898157506515e-05, "loss": 0.0354, "step": 35910 }, { "grad_norm": 0.4827074110507965, "learning_rate": 3.794223524164143e-05, "loss": 0.0427, "step": 35920 }, { "grad_norm": 0.5427567362785339, "learning_rate": 3.7915492571048245e-05, "loss": 0.0343, "step": 35930 }, { "grad_norm": 0.464890718460083, "learning_rate": 3.788875357140937e-05, "loss": 0.0217, "step": 35940 }, { "grad_norm": 0.4736895263195038, "learning_rate": 3.786201825084736e-05, "loss": 0.0562, "step": 35950 }, { "grad_norm": 0.572733461856842, "learning_rate": 3.783528661748372e-05, "loss": 0.0484, "step": 35960 }, { "grad_norm": 2.0651023387908936, "learning_rate": 3.780855867943882e-05, "loss": 0.0417, "step": 35970 }, { "grad_norm": 0.6264193058013916, "learning_rate": 3.778183444483189e-05, "loss": 0.0311, "step": 35980 }, { "grad_norm": 0.3042038381099701, "learning_rate": 3.775511392178108e-05, "loss": 0.0508, "step": 35990 }, { "grad_norm": 0.4048289656639099, "learning_rate": 3.772839711840332e-05, "loss": 0.0345, "step": 36000 }, { "grad_norm": 0.3278079628944397, "learning_rate": 3.7701684042814515e-05, "loss": 0.0294, "step": 36010 }, { "grad_norm": 0.38582703471183777, "learning_rate": 3.76749747031294e-05, "loss": 0.0564, "step": 36020 }, { "grad_norm": 0.41698721051216125, "learning_rate": 3.764826910746152e-05, "loss": 0.0515, "step": 36030 }, { "grad_norm": 0.2992769181728363, "learning_rate": 3.762156726392338e-05, "loss": 0.0314, "step": 36040 }, { "grad_norm": 0.28044912219047546, "learning_rate": 3.759486918062625e-05, "loss": 0.0466, "step": 36050 }, { "grad_norm": 0.5768527984619141, "learning_rate": 3.756817486568033e-05, "loss": 0.0394, "step": 36060 }, { "grad_norm": 0.40144965052604675, "learning_rate": 3.7541484327194654e-05, "loss": 0.0478, "step": 36070 }, { "grad_norm": 0.44961774349212646, "learning_rate": 3.751479757327707e-05, "loss": 0.0357, "step": 36080 }, { "grad_norm": 0.7975692749023438, "learning_rate": 3.7488114612034345e-05, "loss": 0.0338, "step": 36090 }, { "grad_norm": 0.5523799657821655, "learning_rate": 3.7461435451572044e-05, "loss": 0.0418, "step": 36100 }, { "grad_norm": 0.49523597955703735, "learning_rate": 3.743476009999459e-05, "loss": 0.0373, "step": 36110 }, { "grad_norm": 0.6188082695007324, "learning_rate": 3.7408088565405245e-05, "loss": 0.0353, "step": 36120 }, { "grad_norm": 0.5024920701980591, "learning_rate": 3.738142085590612e-05, "loss": 0.0441, "step": 36130 }, { "grad_norm": 0.42114031314849854, "learning_rate": 3.7354756979598194e-05, "loss": 0.0339, "step": 36140 }, { "grad_norm": 0.6677346229553223, "learning_rate": 3.7328096944581187e-05, "loss": 0.035, "step": 36150 }, { "grad_norm": 0.5733512043952942, "learning_rate": 3.730144075895377e-05, "loss": 0.0283, "step": 36160 }, { "grad_norm": 0.3230975866317749, "learning_rate": 3.727478843081335e-05, "loss": 0.0337, "step": 36170 }, { "grad_norm": 0.331271231174469, "learning_rate": 3.72481399682562e-05, "loss": 0.0362, "step": 36180 }, { "grad_norm": 0.399149626493454, "learning_rate": 3.722149537937747e-05, "loss": 0.035, "step": 36190 }, { "grad_norm": 0.6805139183998108, "learning_rate": 3.7194854672271015e-05, "loss": 0.046, "step": 36200 }, { "grad_norm": 0.3544389605522156, "learning_rate": 3.7168217855029644e-05, "loss": 0.0301, "step": 36210 }, { "grad_norm": 0.4290791153907776, "learning_rate": 3.7141584935744856e-05, "loss": 0.0355, "step": 36220 }, { "grad_norm": 0.4918850064277649, "learning_rate": 3.7114955922507055e-05, "loss": 0.045, "step": 36230 }, { "grad_norm": 0.47407829761505127, "learning_rate": 3.708833082340545e-05, "loss": 0.0379, "step": 36240 }, { "grad_norm": 0.2111039012670517, "learning_rate": 3.7061709646528034e-05, "loss": 0.0322, "step": 36250 }, { "grad_norm": 0.5363543033599854, "learning_rate": 3.7035092399961604e-05, "loss": 0.0355, "step": 36260 }, { "grad_norm": 0.5319388508796692, "learning_rate": 3.700847909179177e-05, "loss": 0.0334, "step": 36270 }, { "grad_norm": 0.23087430000305176, "learning_rate": 3.698186973010297e-05, "loss": 0.0376, "step": 36280 }, { "grad_norm": 0.6100839972496033, "learning_rate": 3.695526432297844e-05, "loss": 0.0322, "step": 36290 }, { "grad_norm": 0.38783419132232666, "learning_rate": 3.692866287850017e-05, "loss": 0.0393, "step": 36300 }, { "grad_norm": 0.546977162361145, "learning_rate": 3.6902065404749006e-05, "loss": 0.039, "step": 36310 }, { "grad_norm": 0.34170451760292053, "learning_rate": 3.6875471909804516e-05, "loss": 0.0408, "step": 36320 }, { "grad_norm": 0.41356295347213745, "learning_rate": 3.6848882401745135e-05, "loss": 0.0537, "step": 36330 }, { "grad_norm": 0.49627208709716797, "learning_rate": 3.682229688864806e-05, "loss": 0.032, "step": 36340 }, { "grad_norm": 0.30804768204689026, "learning_rate": 3.6795715378589235e-05, "loss": 0.0319, "step": 36350 }, { "grad_norm": 0.44611838459968567, "learning_rate": 3.676913787964345e-05, "loss": 0.0307, "step": 36360 }, { "grad_norm": 0.7904666066169739, "learning_rate": 3.674256439988423e-05, "loss": 0.0364, "step": 36370 }, { "grad_norm": 0.43108388781547546, "learning_rate": 3.6715994947383904e-05, "loss": 0.0384, "step": 36380 }, { "grad_norm": 0.49444037675857544, "learning_rate": 3.668942953021357e-05, "loss": 0.0337, "step": 36390 }, { "grad_norm": 0.5140727758407593, "learning_rate": 3.66628681564431e-05, "loss": 0.0313, "step": 36400 }, { "grad_norm": 0.46480268239974976, "learning_rate": 3.663631083414114e-05, "loss": 0.0437, "step": 36410 }, { "grad_norm": 0.392991304397583, "learning_rate": 3.660975757137509e-05, "loss": 0.0333, "step": 36420 }, { "grad_norm": 0.47087574005126953, "learning_rate": 3.658320837621114e-05, "loss": 0.0335, "step": 36430 }, { "grad_norm": 0.5313156843185425, "learning_rate": 3.655666325671426e-05, "loss": 0.0331, "step": 36440 }, { "grad_norm": 0.4981410801410675, "learning_rate": 3.65301222209481e-05, "loss": 0.0337, "step": 36450 }, { "grad_norm": 1.0720466375350952, "learning_rate": 3.650358527697519e-05, "loss": 0.0388, "step": 36460 }, { "grad_norm": 0.6671035289764404, "learning_rate": 3.64770524328567e-05, "loss": 0.0319, "step": 36470 }, { "grad_norm": 0.7070547342300415, "learning_rate": 3.645052369665265e-05, "loss": 0.0404, "step": 36480 }, { "grad_norm": 0.5291541218757629, "learning_rate": 3.6423999076421724e-05, "loss": 0.0454, "step": 36490 }, { "grad_norm": 0.6251935362815857, "learning_rate": 3.639747858022142e-05, "loss": 0.0389, "step": 36500 }, { "grad_norm": 0.44927313923835754, "learning_rate": 3.637096221610799e-05, "loss": 0.0301, "step": 36510 }, { "grad_norm": 0.4836238622665405, "learning_rate": 3.634444999213638e-05, "loss": 0.0303, "step": 36520 }, { "grad_norm": 0.638465940952301, "learning_rate": 3.6317941916360296e-05, "loss": 0.0386, "step": 36530 }, { "grad_norm": 0.6766464114189148, "learning_rate": 3.629143799683221e-05, "loss": 0.0368, "step": 36540 }, { "grad_norm": 0.2936432361602783, "learning_rate": 3.626493824160331e-05, "loss": 0.0262, "step": 36550 }, { "grad_norm": 0.40898123383522034, "learning_rate": 3.623844265872352e-05, "loss": 0.0292, "step": 36560 }, { "grad_norm": 0.5712490677833557, "learning_rate": 3.621195125624149e-05, "loss": 0.0324, "step": 36570 }, { "grad_norm": 0.4472835063934326, "learning_rate": 3.618546404220463e-05, "loss": 0.0263, "step": 36580 }, { "grad_norm": 0.3766047954559326, "learning_rate": 3.615898102465903e-05, "loss": 0.0291, "step": 36590 }, { "grad_norm": 0.5616854429244995, "learning_rate": 3.6132502211649544e-05, "loss": 0.0347, "step": 36600 }, { "grad_norm": 0.4005572497844696, "learning_rate": 3.610602761121975e-05, "loss": 0.0319, "step": 36610 }, { "grad_norm": 0.6191664338111877, "learning_rate": 3.6079557231411897e-05, "loss": 0.0448, "step": 36620 }, { "grad_norm": 0.2910742461681366, "learning_rate": 3.6053091080267035e-05, "loss": 0.0288, "step": 36630 }, { "grad_norm": 0.5225792527198792, "learning_rate": 3.602662916582483e-05, "loss": 0.0387, "step": 36640 }, { "grad_norm": 0.4374711811542511, "learning_rate": 3.600017149612375e-05, "loss": 0.0411, "step": 36650 }, { "grad_norm": 0.2302878201007843, "learning_rate": 3.5973718079200935e-05, "loss": 0.0314, "step": 36660 }, { "grad_norm": 0.396259605884552, "learning_rate": 3.5947268923092216e-05, "loss": 0.041, "step": 36670 }, { "grad_norm": 0.5410014390945435, "learning_rate": 3.592082403583216e-05, "loss": 0.0362, "step": 36680 }, { "grad_norm": 0.5406314730644226, "learning_rate": 3.5894383425454004e-05, "loss": 0.028, "step": 36690 }, { "grad_norm": 0.31046998500823975, "learning_rate": 3.586794709998975e-05, "loss": 0.0336, "step": 36700 }, { "grad_norm": 0.533282995223999, "learning_rate": 3.584151506747002e-05, "loss": 0.0334, "step": 36710 }, { "grad_norm": 0.4695211350917816, "learning_rate": 3.581508733592418e-05, "loss": 0.0301, "step": 36720 }, { "grad_norm": 0.3299477696418762, "learning_rate": 3.5788663913380297e-05, "loss": 0.0279, "step": 36730 }, { "grad_norm": 0.4681288003921509, "learning_rate": 3.576224480786506e-05, "loss": 0.0301, "step": 36740 }, { "grad_norm": 0.5420832633972168, "learning_rate": 3.573583002740393e-05, "loss": 0.0438, "step": 36750 }, { "grad_norm": 0.5476633310317993, "learning_rate": 3.570941958002103e-05, "loss": 0.0303, "step": 36760 }, { "grad_norm": 0.3976428508758545, "learning_rate": 3.568301347373912e-05, "loss": 0.0304, "step": 36770 }, { "grad_norm": 0.3976464867591858, "learning_rate": 3.5656611716579726e-05, "loss": 0.0288, "step": 36780 }, { "grad_norm": 0.6528501510620117, "learning_rate": 3.5630214316562946e-05, "loss": 0.0461, "step": 36790 }, { "grad_norm": 0.34386447072029114, "learning_rate": 3.560382128170766e-05, "loss": 0.0442, "step": 36800 }, { "grad_norm": 0.5184855461120605, "learning_rate": 3.5577432620031374e-05, "loss": 0.0337, "step": 36810 }, { "grad_norm": 0.5367711186408997, "learning_rate": 3.5551048339550216e-05, "loss": 0.044, "step": 36820 }, { "grad_norm": 0.4537104368209839, "learning_rate": 3.55246684482791e-05, "loss": 0.0373, "step": 36830 }, { "grad_norm": 0.42161086201667786, "learning_rate": 3.5498292954231496e-05, "loss": 0.04, "step": 36840 }, { "grad_norm": 0.8647037148475647, "learning_rate": 3.54719218654196e-05, "loss": 0.0476, "step": 36850 }, { "grad_norm": 0.2961327135562897, "learning_rate": 3.544555518985425e-05, "loss": 0.0331, "step": 36860 }, { "grad_norm": 0.4240495264530182, "learning_rate": 3.541919293554494e-05, "loss": 0.0304, "step": 36870 }, { "grad_norm": 0.7547842860221863, "learning_rate": 3.539283511049985e-05, "loss": 0.0364, "step": 36880 }, { "grad_norm": 0.3391607701778412, "learning_rate": 3.5366481722725755e-05, "loss": 0.0268, "step": 36890 }, { "grad_norm": 0.4087321162223816, "learning_rate": 3.534013278022816e-05, "loss": 0.0435, "step": 36900 }, { "grad_norm": 0.7300174832344055, "learning_rate": 3.531378829101113e-05, "loss": 0.0347, "step": 36910 }, { "grad_norm": 0.5003631114959717, "learning_rate": 3.528744826307746e-05, "loss": 0.0415, "step": 36920 }, { "grad_norm": 0.34205490350723267, "learning_rate": 3.5261112704428554e-05, "loss": 0.0305, "step": 36930 }, { "grad_norm": 0.4212041199207306, "learning_rate": 3.523478162306443e-05, "loss": 0.0463, "step": 36940 }, { "grad_norm": 1.3100621700286865, "learning_rate": 3.520845502698381e-05, "loss": 0.0487, "step": 36950 }, { "grad_norm": 0.7293629050254822, "learning_rate": 3.5182132924184005e-05, "loss": 0.0313, "step": 36960 }, { "grad_norm": 0.4878681004047394, "learning_rate": 3.5155815322660966e-05, "loss": 0.0297, "step": 36970 }, { "grad_norm": 0.5103172659873962, "learning_rate": 3.512950223040931e-05, "loss": 0.0346, "step": 36980 }, { "grad_norm": 0.3160833418369293, "learning_rate": 3.5103193655422216e-05, "loss": 0.0474, "step": 36990 }, { "grad_norm": 0.5530313849449158, "learning_rate": 3.5076889605691596e-05, "loss": 0.0354, "step": 37000 }, { "grad_norm": 0.5108746290206909, "learning_rate": 3.505059008920787e-05, "loss": 0.0456, "step": 37010 }, { "grad_norm": 0.31294310092926025, "learning_rate": 3.502429511396016e-05, "loss": 0.0329, "step": 37020 }, { "grad_norm": 0.5755030512809753, "learning_rate": 3.4998004687936196e-05, "loss": 0.0322, "step": 37030 }, { "grad_norm": 0.6652011275291443, "learning_rate": 3.497171881912229e-05, "loss": 0.035, "step": 37040 }, { "grad_norm": 0.30149132013320923, "learning_rate": 3.494543751550342e-05, "loss": 0.031, "step": 37050 }, { "grad_norm": 0.38088560104370117, "learning_rate": 3.491916078506313e-05, "loss": 0.0304, "step": 37060 }, { "grad_norm": 0.5027806162834167, "learning_rate": 3.489288863578361e-05, "loss": 0.0283, "step": 37070 }, { "grad_norm": 0.5395493507385254, "learning_rate": 3.4866621075645646e-05, "loss": 0.03, "step": 37080 }, { "grad_norm": 0.6117560267448425, "learning_rate": 3.4840358112628614e-05, "loss": 0.0394, "step": 37090 }, { "grad_norm": 0.42851927876472473, "learning_rate": 3.481409975471053e-05, "loss": 0.0335, "step": 37100 }, { "grad_norm": 0.582604706287384, "learning_rate": 3.4787846009867986e-05, "loss": 0.0367, "step": 37110 }, { "grad_norm": 0.5447967052459717, "learning_rate": 3.476159688607615e-05, "loss": 0.0535, "step": 37120 }, { "grad_norm": 0.36739033460617065, "learning_rate": 3.4735352391308854e-05, "loss": 0.0302, "step": 37130 }, { "grad_norm": 0.7110898494720459, "learning_rate": 3.4709112533538446e-05, "loss": 0.0428, "step": 37140 }, { "grad_norm": 0.3544427752494812, "learning_rate": 3.4682877320735934e-05, "loss": 0.024, "step": 37150 }, { "grad_norm": 0.4248226583003998, "learning_rate": 3.465664676087085e-05, "loss": 0.0358, "step": 37160 }, { "grad_norm": 0.2202548384666443, "learning_rate": 3.463042086191136e-05, "loss": 0.0371, "step": 37170 }, { "grad_norm": 0.3832944333553314, "learning_rate": 3.460419963182423e-05, "loss": 0.0419, "step": 37180 }, { "grad_norm": 0.8520732522010803, "learning_rate": 3.457798307857473e-05, "loss": 0.0491, "step": 37190 }, { "grad_norm": 0.31997665762901306, "learning_rate": 3.455177121012678e-05, "loss": 0.0398, "step": 37200 }, { "grad_norm": 0.6073611378669739, "learning_rate": 3.452556403444285e-05, "loss": 0.0376, "step": 37210 }, { "grad_norm": 0.8089702129364014, "learning_rate": 3.4499361559483975e-05, "loss": 0.0325, "step": 37220 }, { "grad_norm": 0.9997991323471069, "learning_rate": 3.44731637932098e-05, "loss": 0.0357, "step": 37230 }, { "grad_norm": 0.43319305777549744, "learning_rate": 3.44469707435785e-05, "loss": 0.0558, "step": 37240 }, { "grad_norm": 0.8551160097122192, "learning_rate": 3.4420782418546835e-05, "loss": 0.0635, "step": 37250 }, { "grad_norm": 0.4420667886734009, "learning_rate": 3.439459882607012e-05, "loss": 0.0581, "step": 37260 }, { "grad_norm": 0.6244044899940491, "learning_rate": 3.436841997410225e-05, "loss": 0.0316, "step": 37270 }, { "grad_norm": 0.33043286204338074, "learning_rate": 3.434224587059567e-05, "loss": 0.0349, "step": 37280 }, { "grad_norm": 0.6521527171134949, "learning_rate": 3.431607652350136e-05, "loss": 0.0443, "step": 37290 }, { "grad_norm": 0.2647317051887512, "learning_rate": 3.428991194076891e-05, "loss": 0.0264, "step": 37300 }, { "grad_norm": 0.3916364908218384, "learning_rate": 3.4263752130346394e-05, "loss": 0.0326, "step": 37310 }, { "grad_norm": 0.4530986547470093, "learning_rate": 3.4237597100180515e-05, "loss": 0.0418, "step": 37320 }, { "grad_norm": 0.45174965262413025, "learning_rate": 3.4211446858216427e-05, "loss": 0.033, "step": 37330 }, { "grad_norm": 0.6200129389762878, "learning_rate": 3.4185301412397915e-05, "loss": 0.0367, "step": 37340 }, { "grad_norm": 0.46088701486587524, "learning_rate": 3.415916077066729e-05, "loss": 0.0302, "step": 37350 }, { "grad_norm": 0.2838863432407379, "learning_rate": 3.413302494096535e-05, "loss": 0.0311, "step": 37360 }, { "grad_norm": 0.3815024197101593, "learning_rate": 3.410689393123151e-05, "loss": 0.0375, "step": 37370 }, { "grad_norm": 0.399950236082077, "learning_rate": 3.408076774940364e-05, "loss": 0.025, "step": 37380 }, { "grad_norm": 0.35181859135627747, "learning_rate": 3.40546464034182e-05, "loss": 0.0244, "step": 37390 }, { "grad_norm": 0.5009101033210754, "learning_rate": 3.4028529901210185e-05, "loss": 0.0286, "step": 37400 }, { "grad_norm": 0.9857137799263, "learning_rate": 3.4002418250713086e-05, "loss": 0.0492, "step": 37410 }, { "grad_norm": 0.4616108536720276, "learning_rate": 3.3976311459858936e-05, "loss": 0.0305, "step": 37420 }, { "grad_norm": 0.5652284026145935, "learning_rate": 3.395020953657826e-05, "loss": 0.0313, "step": 37430 }, { "grad_norm": 0.259032279253006, "learning_rate": 3.3924112488800165e-05, "loss": 0.0244, "step": 37440 }, { "grad_norm": 0.5562984943389893, "learning_rate": 3.389802032445225e-05, "loss": 0.0737, "step": 37450 }, { "grad_norm": 0.2808259427547455, "learning_rate": 3.38719330514606e-05, "loss": 0.0491, "step": 37460 }, { "grad_norm": 0.4305935502052307, "learning_rate": 3.3845850677749866e-05, "loss": 0.0313, "step": 37470 }, { "grad_norm": 0.47887706756591797, "learning_rate": 3.3819773211243157e-05, "loss": 0.0293, "step": 37480 }, { "grad_norm": 0.3100670874118805, "learning_rate": 3.379370065986213e-05, "loss": 0.0236, "step": 37490 }, { "grad_norm": 0.49032941460609436, "learning_rate": 3.3767633031526955e-05, "loss": 0.0316, "step": 37500 }, { "grad_norm": 0.45546820759773254, "learning_rate": 3.374157033415626e-05, "loss": 0.0317, "step": 37510 }, { "grad_norm": 0.8867506980895996, "learning_rate": 3.371551257566723e-05, "loss": 0.0367, "step": 37520 }, { "grad_norm": 0.5048530697822571, "learning_rate": 3.36894597639755e-05, "loss": 0.0495, "step": 37530 }, { "grad_norm": 0.2255973517894745, "learning_rate": 3.366341190699523e-05, "loss": 0.051, "step": 37540 }, { "grad_norm": 0.4144626259803772, "learning_rate": 3.36373690126391e-05, "loss": 0.0318, "step": 37550 }, { "grad_norm": 0.35834822058677673, "learning_rate": 3.3611331088818234e-05, "loss": 0.0643, "step": 37560 }, { "grad_norm": 0.5099838376045227, "learning_rate": 3.3585298143442265e-05, "loss": 0.0361, "step": 37570 }, { "grad_norm": 0.561277449131012, "learning_rate": 3.35592701844193e-05, "loss": 0.0343, "step": 37580 }, { "grad_norm": 0.533328115940094, "learning_rate": 3.353324721965596e-05, "loss": 0.0349, "step": 37590 }, { "grad_norm": 0.6225327849388123, "learning_rate": 3.350722925705736e-05, "loss": 0.0362, "step": 37600 }, { "grad_norm": 0.4838770627975464, "learning_rate": 3.348121630452703e-05, "loss": 0.0476, "step": 37610 }, { "grad_norm": 0.5988354682922363, "learning_rate": 3.3455208369967044e-05, "loss": 0.0344, "step": 37620 }, { "grad_norm": 0.4371527433395386, "learning_rate": 3.34292054612779e-05, "loss": 0.0377, "step": 37630 }, { "grad_norm": 0.3595876693725586, "learning_rate": 3.340320758635861e-05, "loss": 0.0315, "step": 37640 }, { "grad_norm": 0.7082940936088562, "learning_rate": 3.337721475310666e-05, "loss": 0.0421, "step": 37650 }, { "grad_norm": 0.5505525469779968, "learning_rate": 3.335122696941795e-05, "loss": 0.0255, "step": 37660 }, { "grad_norm": 0.37674933671951294, "learning_rate": 3.332524424318692e-05, "loss": 0.037, "step": 37670 }, { "grad_norm": 0.6391472816467285, "learning_rate": 3.32992665823064e-05, "loss": 0.0288, "step": 37680 }, { "grad_norm": 0.3610638380050659, "learning_rate": 3.327329399466774e-05, "loss": 0.0357, "step": 37690 }, { "grad_norm": 0.5257157683372498, "learning_rate": 3.324732648816072e-05, "loss": 0.0312, "step": 37700 }, { "grad_norm": 0.39583510160446167, "learning_rate": 3.322136407067358e-05, "loss": 0.0307, "step": 37710 }, { "grad_norm": 0.8943191170692444, "learning_rate": 3.3195406750093036e-05, "loss": 0.0465, "step": 37720 }, { "grad_norm": 0.5195011496543884, "learning_rate": 3.3169454534304205e-05, "loss": 0.0354, "step": 37730 }, { "grad_norm": 0.449168860912323, "learning_rate": 3.3143507431190725e-05, "loss": 0.0346, "step": 37740 }, { "grad_norm": 0.41348791122436523, "learning_rate": 3.311756544863459e-05, "loss": 0.0343, "step": 37750 }, { "grad_norm": 0.5845832824707031, "learning_rate": 3.309162859451633e-05, "loss": 0.0282, "step": 37760 }, { "grad_norm": 0.40188729763031006, "learning_rate": 3.306569687671487e-05, "loss": 0.0336, "step": 37770 }, { "grad_norm": 0.39686980843544006, "learning_rate": 3.303977030310756e-05, "loss": 0.0376, "step": 37780 }, { "grad_norm": 0.3731520473957062, "learning_rate": 3.3013848881570245e-05, "loss": 0.0359, "step": 37790 }, { "grad_norm": 0.5489621162414551, "learning_rate": 3.298793261997712e-05, "loss": 0.0542, "step": 37800 }, { "grad_norm": 0.5016993880271912, "learning_rate": 3.2962021526200893e-05, "loss": 0.0345, "step": 37810 }, { "grad_norm": 0.5968080759048462, "learning_rate": 3.293611560811268e-05, "loss": 0.0386, "step": 37820 }, { "grad_norm": 0.3624768853187561, "learning_rate": 3.291021487358199e-05, "loss": 0.0302, "step": 37830 }, { "grad_norm": 0.46570637822151184, "learning_rate": 3.28843193304768e-05, "loss": 0.032, "step": 37840 }, { "grad_norm": 0.4716837704181671, "learning_rate": 3.2858428986663456e-05, "loss": 0.0304, "step": 37850 }, { "grad_norm": 0.5174664855003357, "learning_rate": 3.283254385000681e-05, "loss": 0.0427, "step": 37860 }, { "grad_norm": 0.735996425151825, "learning_rate": 3.2806663928370076e-05, "loss": 0.0331, "step": 37870 }, { "grad_norm": 0.48939451575279236, "learning_rate": 3.278078922961485e-05, "loss": 0.0334, "step": 37880 }, { "grad_norm": 0.4090332090854645, "learning_rate": 3.275491976160123e-05, "loss": 0.0288, "step": 37890 }, { "grad_norm": 0.31808313727378845, "learning_rate": 3.2729055532187645e-05, "loss": 0.0312, "step": 37900 }, { "grad_norm": 0.5931683778762817, "learning_rate": 3.270319654923097e-05, "loss": 0.0347, "step": 37910 }, { "grad_norm": 0.419586718082428, "learning_rate": 3.2677342820586506e-05, "loss": 0.0422, "step": 37920 }, { "grad_norm": 0.44274938106536865, "learning_rate": 3.2651494354107905e-05, "loss": 0.0288, "step": 37930 }, { "grad_norm": 0.4463084042072296, "learning_rate": 3.2625651157647266e-05, "loss": 0.0344, "step": 37940 }, { "grad_norm": 0.9393623471260071, "learning_rate": 3.259981323905505e-05, "loss": 0.0395, "step": 37950 }, { "grad_norm": 0.8019347190856934, "learning_rate": 3.257398060618014e-05, "loss": 0.0318, "step": 37960 }, { "grad_norm": 0.5914812684059143, "learning_rate": 3.254815326686983e-05, "loss": 0.027, "step": 37970 }, { "grad_norm": 0.4608764946460724, "learning_rate": 3.2522331228969774e-05, "loss": 0.0369, "step": 37980 }, { "grad_norm": 0.4386102259159088, "learning_rate": 3.2496514500324006e-05, "loss": 0.0237, "step": 37990 }, { "grad_norm": 0.2869147062301636, "learning_rate": 3.247070308877498e-05, "loss": 0.0457, "step": 38000 }, { "grad_norm": 0.40716004371643066, "learning_rate": 3.2444897002163515e-05, "loss": 0.0345, "step": 38010 }, { "grad_norm": 0.6812382340431213, "learning_rate": 3.241909624832885e-05, "loss": 0.0436, "step": 38020 }, { "grad_norm": 0.3926805555820465, "learning_rate": 3.239330083510852e-05, "loss": 0.0313, "step": 38030 }, { "grad_norm": 0.6771249771118164, "learning_rate": 3.236751077033855e-05, "loss": 0.0302, "step": 38040 }, { "grad_norm": 0.2995101511478424, "learning_rate": 3.234172606185322e-05, "loss": 0.0236, "step": 38050 }, { "grad_norm": 0.6159873604774475, "learning_rate": 3.231594671748528e-05, "loss": 0.0332, "step": 38060 }, { "grad_norm": 0.2311103790998459, "learning_rate": 3.2290172745065815e-05, "loss": 0.0465, "step": 38070 }, { "grad_norm": 0.34507185220718384, "learning_rate": 3.226440415242426e-05, "loss": 0.0219, "step": 38080 }, { "grad_norm": 0.3615958094596863, "learning_rate": 3.223864094738846e-05, "loss": 0.0348, "step": 38090 }, { "grad_norm": 0.7111542820930481, "learning_rate": 3.221288313778456e-05, "loss": 0.0305, "step": 38100 }, { "grad_norm": 0.35866376757621765, "learning_rate": 3.2187130731437125e-05, "loss": 0.032, "step": 38110 }, { "grad_norm": 0.47464466094970703, "learning_rate": 3.216138373616905e-05, "loss": 0.0365, "step": 38120 }, { "grad_norm": 0.5102115273475647, "learning_rate": 3.21356421598016e-05, "loss": 0.0411, "step": 38130 }, { "grad_norm": 0.39144569635391235, "learning_rate": 3.210990601015438e-05, "loss": 0.0367, "step": 38140 }, { "grad_norm": 0.30635374784469604, "learning_rate": 3.208417529504535e-05, "loss": 0.0307, "step": 38150 }, { "grad_norm": 0.38205769658088684, "learning_rate": 3.205845002229084e-05, "loss": 0.0353, "step": 38160 }, { "grad_norm": 0.28143760561943054, "learning_rate": 3.203273019970547e-05, "loss": 0.041, "step": 38170 }, { "grad_norm": 0.3953935205936432, "learning_rate": 3.200701583510227e-05, "loss": 0.0297, "step": 38180 }, { "grad_norm": 0.45913153886795044, "learning_rate": 3.198130693629261e-05, "loss": 0.0486, "step": 38190 }, { "grad_norm": 0.36083537340164185, "learning_rate": 3.195560351108612e-05, "loss": 0.0299, "step": 38200 }, { "grad_norm": 0.4912375509738922, "learning_rate": 3.1929905567290865e-05, "loss": 0.0317, "step": 38210 }, { "grad_norm": 0.3857437074184418, "learning_rate": 3.1904213112713164e-05, "loss": 0.0382, "step": 38220 }, { "grad_norm": 0.44772523641586304, "learning_rate": 3.187852615515774e-05, "loss": 0.0304, "step": 38230 }, { "grad_norm": 0.25723081827163696, "learning_rate": 3.1852844702427606e-05, "loss": 0.0373, "step": 38240 }, { "grad_norm": 0.2924826741218567, "learning_rate": 3.18271687623241e-05, "loss": 0.0393, "step": 38250 }, { "grad_norm": 0.32993704080581665, "learning_rate": 3.1801498342646896e-05, "loss": 0.0529, "step": 38260 }, { "grad_norm": 0.40826675295829773, "learning_rate": 3.177583345119398e-05, "loss": 0.0229, "step": 38270 }, { "grad_norm": 0.46396294236183167, "learning_rate": 3.17501740957617e-05, "loss": 0.0295, "step": 38280 }, { "grad_norm": 0.40493908524513245, "learning_rate": 3.172452028414467e-05, "loss": 0.0308, "step": 38290 }, { "grad_norm": 0.4820024371147156, "learning_rate": 3.169887202413583e-05, "loss": 0.0272, "step": 38300 }, { "grad_norm": 0.4975849390029907, "learning_rate": 3.167322932352646e-05, "loss": 0.0333, "step": 38310 }, { "grad_norm": 0.5890365839004517, "learning_rate": 3.164759219010613e-05, "loss": 0.0365, "step": 38320 }, { "grad_norm": 0.20895634591579437, "learning_rate": 3.1621960631662725e-05, "loss": 0.0263, "step": 38330 }, { "grad_norm": 0.4233465790748596, "learning_rate": 3.159633465598245e-05, "loss": 0.032, "step": 38340 }, { "grad_norm": 0.48660606145858765, "learning_rate": 3.1570714270849767e-05, "loss": 0.0398, "step": 38350 }, { "grad_norm": 0.4420309066772461, "learning_rate": 3.1545099484047516e-05, "loss": 0.0292, "step": 38360 }, { "grad_norm": 0.7460551857948303, "learning_rate": 3.151949030335674e-05, "loss": 0.0667, "step": 38370 }, { "grad_norm": 0.6160295009613037, "learning_rate": 3.149388673655687e-05, "loss": 0.0351, "step": 38380 }, { "grad_norm": 0.4043038487434387, "learning_rate": 3.146828879142559e-05, "loss": 0.0285, "step": 38390 }, { "grad_norm": 0.9777981042861938, "learning_rate": 3.1442696475738866e-05, "loss": 0.053, "step": 38400 }, { "grad_norm": 0.4225418269634247, "learning_rate": 3.141710979727098e-05, "loss": 0.0334, "step": 38410 }, { "grad_norm": 0.5193871855735779, "learning_rate": 3.139152876379447e-05, "loss": 0.0228, "step": 38420 }, { "grad_norm": 0.4479409456253052, "learning_rate": 3.1365953383080214e-05, "loss": 0.031, "step": 38430 }, { "grad_norm": 0.5730171799659729, "learning_rate": 3.134038366289731e-05, "loss": 0.0331, "step": 38440 }, { "grad_norm": 0.9764124155044556, "learning_rate": 3.131481961101317e-05, "loss": 0.0391, "step": 38450 }, { "grad_norm": 0.6680290102958679, "learning_rate": 3.128926123519349e-05, "loss": 0.0389, "step": 38460 }, { "grad_norm": 0.43972548842430115, "learning_rate": 3.1263708543202194e-05, "loss": 0.0287, "step": 38470 }, { "grad_norm": 0.3727506399154663, "learning_rate": 3.123816154280155e-05, "loss": 0.0339, "step": 38480 }, { "grad_norm": 0.4604775011539459, "learning_rate": 3.121262024175207e-05, "loss": 0.0337, "step": 38490 }, { "grad_norm": 0.30301937460899353, "learning_rate": 3.118708464781248e-05, "loss": 0.0274, "step": 38500 }, { "grad_norm": 0.38655591011047363, "learning_rate": 3.116155476873987e-05, "loss": 0.0299, "step": 38510 }, { "grad_norm": 0.5288612246513367, "learning_rate": 3.11360306122895e-05, "loss": 0.0322, "step": 38520 }, { "grad_norm": 0.4136812388896942, "learning_rate": 3.1110512186214975e-05, "loss": 0.0291, "step": 38530 }, { "grad_norm": 0.40480664372444153, "learning_rate": 3.1084999498268095e-05, "loss": 0.0248, "step": 38540 }, { "grad_norm": 0.32259654998779297, "learning_rate": 3.1059492556198934e-05, "loss": 0.0239, "step": 38550 }, { "grad_norm": 0.44128644466400146, "learning_rate": 3.103399136775586e-05, "loss": 0.0259, "step": 38560 }, { "grad_norm": 0.2412479668855667, "learning_rate": 3.100849594068541e-05, "loss": 0.0335, "step": 38570 }, { "grad_norm": 0.5875385403633118, "learning_rate": 3.0983006282732484e-05, "loss": 0.0483, "step": 38580 }, { "grad_norm": 0.4818826913833618, "learning_rate": 3.0957522401640116e-05, "loss": 0.0236, "step": 38590 }, { "grad_norm": 0.2524304687976837, "learning_rate": 3.0932044305149645e-05, "loss": 0.0311, "step": 38600 }, { "grad_norm": 0.4018889367580414, "learning_rate": 3.090657200100068e-05, "loss": 0.0322, "step": 38610 }, { "grad_norm": 0.38896629214286804, "learning_rate": 3.088110549693099e-05, "loss": 0.0369, "step": 38620 }, { "grad_norm": 0.24721843004226685, "learning_rate": 3.085564480067667e-05, "loss": 0.0333, "step": 38630 }, { "grad_norm": 0.46981337666511536, "learning_rate": 3.0830189919971955e-05, "loss": 0.046, "step": 38640 }, { "grad_norm": 0.4433526396751404, "learning_rate": 3.080474086254939e-05, "loss": 0.0362, "step": 38650 }, { "grad_norm": 0.6855091452598572, "learning_rate": 3.077929763613975e-05, "loss": 0.0625, "step": 38660 }, { "grad_norm": 0.5169939994812012, "learning_rate": 3.075386024847198e-05, "loss": 0.0352, "step": 38670 }, { "grad_norm": 0.9090452194213867, "learning_rate": 3.072842870727331e-05, "loss": 0.0636, "step": 38680 }, { "grad_norm": 0.4424038529396057, "learning_rate": 3.070300302026916e-05, "loss": 0.0397, "step": 38690 }, { "grad_norm": 0.45325079560279846, "learning_rate": 3.067758319518318e-05, "loss": 0.0382, "step": 38700 }, { "grad_norm": 0.4204695522785187, "learning_rate": 3.065216923973725e-05, "loss": 0.0265, "step": 38710 }, { "grad_norm": 0.35786503553390503, "learning_rate": 3.062676116165145e-05, "loss": 0.0291, "step": 38720 }, { "grad_norm": 0.7161620259284973, "learning_rate": 3.06013589686441e-05, "loss": 0.0308, "step": 38730 }, { "grad_norm": 0.339714914560318, "learning_rate": 3.05759626684317e-05, "loss": 0.024, "step": 38740 }, { "grad_norm": 0.5893936157226562, "learning_rate": 3.055057226872896e-05, "loss": 0.0334, "step": 38750 }, { "grad_norm": 0.5102516412734985, "learning_rate": 3.052518777724887e-05, "loss": 0.0567, "step": 38760 }, { "grad_norm": 0.7086442112922668, "learning_rate": 3.04998092017025e-05, "loss": 0.0365, "step": 38770 }, { "grad_norm": 0.39462369680404663, "learning_rate": 3.0474436549799246e-05, "loss": 0.0334, "step": 38780 }, { "grad_norm": 0.5004608631134033, "learning_rate": 3.044906982924661e-05, "loss": 0.0333, "step": 38790 }, { "grad_norm": 0.37212324142456055, "learning_rate": 3.0423709047750337e-05, "loss": 0.0334, "step": 38800 }, { "grad_norm": 0.5769866108894348, "learning_rate": 3.03983542130144e-05, "loss": 0.0273, "step": 38810 }, { "grad_norm": 0.48830145597457886, "learning_rate": 3.0373005332740877e-05, "loss": 0.0241, "step": 38820 }, { "grad_norm": 0.37337154150009155, "learning_rate": 3.034766241463013e-05, "loss": 0.0398, "step": 38830 }, { "grad_norm": 0.41192924976348877, "learning_rate": 3.032232546638064e-05, "loss": 0.0231, "step": 38840 }, { "grad_norm": 0.17823587357997894, "learning_rate": 3.0296994495689114e-05, "loss": 0.0306, "step": 38850 }, { "grad_norm": 0.7528064250946045, "learning_rate": 3.0271669510250444e-05, "loss": 0.0704, "step": 38860 }, { "grad_norm": 0.4830862283706665, "learning_rate": 3.024635051775766e-05, "loss": 0.0325, "step": 38870 }, { "grad_norm": 0.3205649256706238, "learning_rate": 3.022103752590205e-05, "loss": 0.0315, "step": 38880 }, { "grad_norm": 0.6112725138664246, "learning_rate": 3.0195730542372992e-05, "loss": 0.0282, "step": 38890 }, { "grad_norm": 0.4727705419063568, "learning_rate": 3.0170429574858084e-05, "loss": 0.0392, "step": 38900 }, { "grad_norm": 0.5827333331108093, "learning_rate": 3.0145134631043127e-05, "loss": 0.0328, "step": 38910 }, { "grad_norm": 0.4032798409461975, "learning_rate": 3.0119845718612018e-05, "loss": 0.0262, "step": 38920 }, { "grad_norm": 0.5613429546356201, "learning_rate": 3.009456284524688e-05, "loss": 0.0429, "step": 38930 }, { "grad_norm": 0.41994303464889526, "learning_rate": 3.0069286018627967e-05, "loss": 0.042, "step": 38940 }, { "grad_norm": 0.5024409890174866, "learning_rate": 3.0044015246433743e-05, "loss": 0.0271, "step": 38950 }, { "grad_norm": 0.5469229221343994, "learning_rate": 3.0018750536340755e-05, "loss": 0.0322, "step": 38960 }, { "grad_norm": 0.5137537121772766, "learning_rate": 2.999349189602378e-05, "loss": 0.0364, "step": 38970 }, { "grad_norm": 0.5075299143791199, "learning_rate": 2.9968239333155733e-05, "loss": 0.0504, "step": 38980 }, { "grad_norm": 0.4882086515426636, "learning_rate": 2.994299285540767e-05, "loss": 0.0364, "step": 38990 }, { "grad_norm": 0.5671966671943665, "learning_rate": 2.9917752470448813e-05, "loss": 0.0287, "step": 39000 }, { "grad_norm": 0.4908280372619629, "learning_rate": 2.9892518185946495e-05, "loss": 0.0382, "step": 39010 }, { "grad_norm": 0.4588521420955658, "learning_rate": 2.986729000956624e-05, "loss": 0.0363, "step": 39020 }, { "grad_norm": 0.3260127007961273, "learning_rate": 2.9842067948971736e-05, "loss": 0.0273, "step": 39030 }, { "grad_norm": 0.33076462149620056, "learning_rate": 2.9816852011824727e-05, "loss": 0.0267, "step": 39040 }, { "grad_norm": 0.7231981158256531, "learning_rate": 2.979164220578519e-05, "loss": 0.0281, "step": 39050 }, { "grad_norm": 0.5036042928695679, "learning_rate": 2.9766438538511165e-05, "loss": 0.0295, "step": 39060 }, { "grad_norm": 0.4144940674304962, "learning_rate": 2.9741241017658873e-05, "loss": 0.0287, "step": 39070 }, { "grad_norm": 0.39225122332572937, "learning_rate": 2.971604965088267e-05, "loss": 0.0309, "step": 39080 }, { "grad_norm": 0.5917347073554993, "learning_rate": 2.9690864445835008e-05, "loss": 0.0366, "step": 39090 }, { "grad_norm": 0.34269291162490845, "learning_rate": 2.966568541016651e-05, "loss": 0.0323, "step": 39100 }, { "grad_norm": 0.4727505147457123, "learning_rate": 2.9640512551525867e-05, "loss": 0.0341, "step": 39110 }, { "grad_norm": 0.34655797481536865, "learning_rate": 2.961534587755995e-05, "loss": 0.0385, "step": 39120 }, { "grad_norm": 0.51029372215271, "learning_rate": 2.959018539591375e-05, "loss": 0.0408, "step": 39130 }, { "grad_norm": 0.2925247251987457, "learning_rate": 2.9565031114230325e-05, "loss": 0.0272, "step": 39140 }, { "grad_norm": 0.39612746238708496, "learning_rate": 2.9539883040150895e-05, "loss": 0.0381, "step": 39150 }, { "grad_norm": 0.39609381556510925, "learning_rate": 2.9514741181314774e-05, "loss": 0.0257, "step": 39160 }, { "grad_norm": 0.2624293863773346, "learning_rate": 2.94896055453594e-05, "loss": 0.0261, "step": 39170 }, { "grad_norm": 0.4246884286403656, "learning_rate": 2.9464476139920332e-05, "loss": 0.0269, "step": 39180 }, { "grad_norm": 0.45838111639022827, "learning_rate": 2.9439352972631186e-05, "loss": 0.0329, "step": 39190 }, { "grad_norm": 0.6155233383178711, "learning_rate": 2.9414236051123757e-05, "loss": 0.0305, "step": 39200 }, { "grad_norm": 0.4051811099052429, "learning_rate": 2.938912538302785e-05, "loss": 0.036, "step": 39210 }, { "grad_norm": 0.25751185417175293, "learning_rate": 2.9364020975971464e-05, "loss": 0.0262, "step": 39220 }, { "grad_norm": 0.38049259781837463, "learning_rate": 2.9338922837580657e-05, "loss": 0.0384, "step": 39230 }, { "grad_norm": 0.2971763014793396, "learning_rate": 2.931383097547955e-05, "loss": 0.0277, "step": 39240 }, { "grad_norm": 0.3340304493904114, "learning_rate": 2.928874539729043e-05, "loss": 0.0358, "step": 39250 }, { "grad_norm": 0.3324222266674042, "learning_rate": 2.926366611063358e-05, "loss": 0.025, "step": 39260 }, { "grad_norm": 0.27541598677635193, "learning_rate": 2.9238593123127463e-05, "loss": 0.0247, "step": 39270 }, { "grad_norm": 0.2839916944503784, "learning_rate": 2.9213526442388583e-05, "loss": 0.0258, "step": 39280 }, { "grad_norm": 0.289482057094574, "learning_rate": 2.9188466076031545e-05, "loss": 0.0316, "step": 39290 }, { "grad_norm": 0.2909396290779114, "learning_rate": 2.9163412031669012e-05, "loss": 0.0293, "step": 39300 }, { "grad_norm": 0.4180492162704468, "learning_rate": 2.913836431691175e-05, "loss": 0.0278, "step": 39310 }, { "grad_norm": 0.4315732419490814, "learning_rate": 2.9113322939368583e-05, "loss": 0.0342, "step": 39320 }, { "grad_norm": 0.6872775554656982, "learning_rate": 2.9088287906646427e-05, "loss": 0.0402, "step": 39330 }, { "grad_norm": 0.36146894097328186, "learning_rate": 2.906325922635024e-05, "loss": 0.0332, "step": 39340 }, { "grad_norm": 0.6014456748962402, "learning_rate": 2.903823690608313e-05, "loss": 0.036, "step": 39350 }, { "grad_norm": 0.47016677260398865, "learning_rate": 2.9013220953446174e-05, "loss": 0.0295, "step": 39360 }, { "grad_norm": 0.49231988191604614, "learning_rate": 2.8988211376038564e-05, "loss": 0.0417, "step": 39370 }, { "grad_norm": 0.47450971603393555, "learning_rate": 2.8963208181457564e-05, "loss": 0.0278, "step": 39380 }, { "grad_norm": 0.48886725306510925, "learning_rate": 2.8938211377298453e-05, "loss": 0.0303, "step": 39390 }, { "grad_norm": 0.7817880511283875, "learning_rate": 2.8913220971154652e-05, "loss": 0.0395, "step": 39400 }, { "grad_norm": 0.6683650612831116, "learning_rate": 2.888823697061753e-05, "loss": 0.0396, "step": 39410 }, { "grad_norm": 0.3259637653827667, "learning_rate": 2.8863259383276618e-05, "loss": 0.0385, "step": 39420 }, { "grad_norm": 0.5483993291854858, "learning_rate": 2.8838288216719395e-05, "loss": 0.0288, "step": 39430 }, { "grad_norm": 0.9991803765296936, "learning_rate": 2.8813323478531484e-05, "loss": 0.0377, "step": 39440 }, { "grad_norm": 0.40366998314857483, "learning_rate": 2.8788365176296496e-05, "loss": 0.0357, "step": 39450 }, { "grad_norm": 0.4658909738063812, "learning_rate": 2.876341331759611e-05, "loss": 0.0353, "step": 39460 }, { "grad_norm": 0.4535815119743347, "learning_rate": 2.8738467910010036e-05, "loss": 0.0391, "step": 39470 }, { "grad_norm": 0.5803951025009155, "learning_rate": 2.8713528961116032e-05, "loss": 0.0339, "step": 39480 }, { "grad_norm": 0.8079209327697754, "learning_rate": 2.8688596478489875e-05, "loss": 0.0236, "step": 39490 }, { "grad_norm": 0.23073187470436096, "learning_rate": 2.8663670469705434e-05, "loss": 0.0206, "step": 39500 }, { "grad_norm": 0.5087918639183044, "learning_rate": 2.8638750942334546e-05, "loss": 0.036, "step": 39510 }, { "grad_norm": 0.32685157656669617, "learning_rate": 2.8613837903947115e-05, "loss": 0.0414, "step": 39520 }, { "grad_norm": 0.5833987593650818, "learning_rate": 2.858893136211106e-05, "loss": 0.0399, "step": 39530 }, { "grad_norm": 0.32845860719680786, "learning_rate": 2.8564031324392315e-05, "loss": 0.0295, "step": 39540 }, { "grad_norm": 0.9603100419044495, "learning_rate": 2.85391377983549e-05, "loss": 0.0475, "step": 39550 }, { "grad_norm": 0.4054658114910126, "learning_rate": 2.851425079156075e-05, "loss": 0.0268, "step": 39560 }, { "grad_norm": 0.4009225368499756, "learning_rate": 2.848937031156994e-05, "loss": 0.0376, "step": 39570 }, { "grad_norm": 0.5321239829063416, "learning_rate": 2.846449636594044e-05, "loss": 0.0427, "step": 39580 }, { "grad_norm": 0.3014086186885834, "learning_rate": 2.843962896222836e-05, "loss": 0.039, "step": 39590 }, { "grad_norm": 0.4311501383781433, "learning_rate": 2.8414768107987722e-05, "loss": 0.0359, "step": 39600 }, { "grad_norm": 0.5395899415016174, "learning_rate": 2.838991381077061e-05, "loss": 0.0367, "step": 39610 }, { "grad_norm": 0.41440004110336304, "learning_rate": 2.83650660781271e-05, "loss": 0.0314, "step": 39620 }, { "grad_norm": 0.48144322633743286, "learning_rate": 2.8340224917605285e-05, "loss": 0.037, "step": 39630 }, { "grad_norm": 0.3245808482170105, "learning_rate": 2.831539033675122e-05, "loss": 0.0333, "step": 39640 }, { "grad_norm": 0.7450011372566223, "learning_rate": 2.8290562343109038e-05, "loss": 0.0244, "step": 39650 }, { "grad_norm": 0.39860251545906067, "learning_rate": 2.826574094422082e-05, "loss": 0.0182, "step": 39660 }, { "grad_norm": 0.48639553785324097, "learning_rate": 2.8240926147626645e-05, "loss": 0.0317, "step": 39670 }, { "grad_norm": 0.36009976267814636, "learning_rate": 2.8216117960864586e-05, "loss": 0.0242, "step": 39680 }, { "grad_norm": 0.5258987545967102, "learning_rate": 2.8191316391470703e-05, "loss": 0.0311, "step": 39690 }, { "grad_norm": 0.644208550453186, "learning_rate": 2.816652144697911e-05, "loss": 0.0627, "step": 39700 }, { "grad_norm": 0.34079813957214355, "learning_rate": 2.8141733134921783e-05, "loss": 0.0246, "step": 39710 }, { "grad_norm": 0.47887948155403137, "learning_rate": 2.811695146282884e-05, "loss": 0.0559, "step": 39720 }, { "grad_norm": 0.30618149042129517, "learning_rate": 2.8092176438228212e-05, "loss": 0.0262, "step": 39730 }, { "grad_norm": 0.36918318271636963, "learning_rate": 2.806740806864598e-05, "loss": 0.032, "step": 39740 }, { "grad_norm": 0.46076059341430664, "learning_rate": 2.804264636160604e-05, "loss": 0.0463, "step": 39750 }, { "grad_norm": 0.44124919176101685, "learning_rate": 2.8017891324630402e-05, "loss": 0.0462, "step": 39760 }, { "grad_norm": 0.5597279071807861, "learning_rate": 2.7993142965238976e-05, "loss": 0.0388, "step": 39770 }, { "grad_norm": 0.7811678051948547, "learning_rate": 2.7968401290949665e-05, "loss": 0.0287, "step": 39780 }, { "grad_norm": 0.36800962686538696, "learning_rate": 2.7943666309278328e-05, "loss": 0.0402, "step": 39790 }, { "grad_norm": 0.5773888826370239, "learning_rate": 2.7918938027738783e-05, "loss": 0.0621, "step": 39800 }, { "grad_norm": 0.5550701022148132, "learning_rate": 2.789421645384287e-05, "loss": 0.0323, "step": 39810 }, { "grad_norm": 0.42804110050201416, "learning_rate": 2.786950159510032e-05, "loss": 0.0263, "step": 39820 }, { "grad_norm": 0.5930785536766052, "learning_rate": 2.7844793459018876e-05, "loss": 0.0345, "step": 39830 }, { "grad_norm": 0.6064878106117249, "learning_rate": 2.7820092053104195e-05, "loss": 0.0392, "step": 39840 }, { "grad_norm": 0.5680369734764099, "learning_rate": 2.7795397384859933e-05, "loss": 0.0308, "step": 39850 }, { "grad_norm": 0.4781127870082855, "learning_rate": 2.7770709461787638e-05, "loss": 0.0427, "step": 39860 }, { "grad_norm": 0.5540755987167358, "learning_rate": 2.7746028291386915e-05, "loss": 0.0318, "step": 39870 }, { "grad_norm": 0.5486699342727661, "learning_rate": 2.772135388115519e-05, "loss": 0.0386, "step": 39880 }, { "grad_norm": 0.5286202430725098, "learning_rate": 2.7696686238587945e-05, "loss": 0.0228, "step": 39890 }, { "grad_norm": 0.6199668049812317, "learning_rate": 2.7672025371178505e-05, "loss": 0.0361, "step": 39900 }, { "grad_norm": 0.43036583065986633, "learning_rate": 2.7647371286418238e-05, "loss": 0.0374, "step": 39910 }, { "grad_norm": 0.46909135580062866, "learning_rate": 2.762272399179639e-05, "loss": 0.0269, "step": 39920 }, { "grad_norm": 0.20229357481002808, "learning_rate": 2.7598083494800154e-05, "loss": 0.0366, "step": 39930 }, { "grad_norm": 0.32099127769470215, "learning_rate": 2.7573449802914664e-05, "loss": 0.0239, "step": 39940 }, { "grad_norm": 0.257966011762619, "learning_rate": 2.7548822923622964e-05, "loss": 0.0301, "step": 39950 }, { "grad_norm": 0.7000918984413147, "learning_rate": 2.752420286440609e-05, "loss": 0.0459, "step": 39960 }, { "grad_norm": 0.45202839374542236, "learning_rate": 2.749958963274295e-05, "loss": 0.0308, "step": 39970 }, { "grad_norm": 0.31417179107666016, "learning_rate": 2.747498323611039e-05, "loss": 0.0255, "step": 39980 }, { "grad_norm": 0.41380977630615234, "learning_rate": 2.7450383681983184e-05, "loss": 0.0253, "step": 39990 }, { "grad_norm": 0.45092126727104187, "learning_rate": 2.742579097783403e-05, "loss": 0.0348, "step": 40000 }, { "grad_norm": 0.3504027724266052, "learning_rate": 2.7401205131133512e-05, "loss": 0.0348, "step": 40010 }, { "grad_norm": 0.31726592779159546, "learning_rate": 2.7376626149350238e-05, "loss": 0.0249, "step": 40020 }, { "grad_norm": 0.4211348593235016, "learning_rate": 2.735205403995056e-05, "loss": 0.0282, "step": 40030 }, { "grad_norm": 0.30263230204582214, "learning_rate": 2.7327488810398917e-05, "loss": 0.0337, "step": 40040 }, { "grad_norm": 0.5127614140510559, "learning_rate": 2.7302930468157507e-05, "loss": 0.0286, "step": 40050 }, { "grad_norm": 0.448442667722702, "learning_rate": 2.727837902068655e-05, "loss": 0.0193, "step": 40060 }, { "grad_norm": 0.355538547039032, "learning_rate": 2.7253834475444123e-05, "loss": 0.0598, "step": 40070 }, { "grad_norm": 0.44202837347984314, "learning_rate": 2.7229296839886204e-05, "loss": 0.0289, "step": 40080 }, { "grad_norm": 0.5610182285308838, "learning_rate": 2.720476612146668e-05, "loss": 0.0289, "step": 40090 }, { "grad_norm": 0.3312712013721466, "learning_rate": 2.7180242327637317e-05, "loss": 0.0332, "step": 40100 }, { "grad_norm": 0.4464181065559387, "learning_rate": 2.7155725465847826e-05, "loss": 0.0237, "step": 40110 }, { "grad_norm": 0.6672564744949341, "learning_rate": 2.713121554354578e-05, "loss": 0.0289, "step": 40120 }, { "grad_norm": 0.25075411796569824, "learning_rate": 2.7106712568176628e-05, "loss": 0.0287, "step": 40130 }, { "grad_norm": 0.5937716364860535, "learning_rate": 2.708221654718374e-05, "loss": 0.0458, "step": 40140 }, { "grad_norm": 0.3990294337272644, "learning_rate": 2.7057727488008357e-05, "loss": 0.0323, "step": 40150 }, { "grad_norm": 0.357526570558548, "learning_rate": 2.703324539808961e-05, "loss": 0.0267, "step": 40160 }, { "grad_norm": 0.4531690180301666, "learning_rate": 2.7008770284864505e-05, "loss": 0.0377, "step": 40170 }, { "grad_norm": 0.24365678429603577, "learning_rate": 2.6984302155767916e-05, "loss": 0.0277, "step": 40180 }, { "grad_norm": 0.31678470969200134, "learning_rate": 2.6959841018232683e-05, "loss": 0.0229, "step": 40190 }, { "grad_norm": 0.42569097876548767, "learning_rate": 2.693538687968937e-05, "loss": 0.0338, "step": 40200 }, { "grad_norm": 0.434817373752594, "learning_rate": 2.6910939747566556e-05, "loss": 0.0342, "step": 40210 }, { "grad_norm": 0.4970466196537018, "learning_rate": 2.6886499629290607e-05, "loss": 0.0272, "step": 40220 }, { "grad_norm": 0.2680359184741974, "learning_rate": 2.6862066532285802e-05, "loss": 0.0315, "step": 40230 }, { "grad_norm": 0.6569496989250183, "learning_rate": 2.6837640463974262e-05, "loss": 0.0309, "step": 40240 }, { "grad_norm": 0.48308268189430237, "learning_rate": 2.681322143177596e-05, "loss": 0.0385, "step": 40250 }, { "grad_norm": 0.391179621219635, "learning_rate": 2.678880944310882e-05, "loss": 0.0278, "step": 40260 }, { "grad_norm": 0.5810186862945557, "learning_rate": 2.6764404505388474e-05, "loss": 0.0329, "step": 40270 }, { "grad_norm": 0.42408323287963867, "learning_rate": 2.6740006626028558e-05, "loss": 0.0259, "step": 40280 }, { "grad_norm": 0.4618145525455475, "learning_rate": 2.671561581244048e-05, "loss": 0.0397, "step": 40290 }, { "grad_norm": 0.22961969673633575, "learning_rate": 2.6691232072033536e-05, "loss": 0.0257, "step": 40300 }, { "grad_norm": 0.5480813384056091, "learning_rate": 2.6666855412214852e-05, "loss": 0.0383, "step": 40310 }, { "grad_norm": 0.38573145866394043, "learning_rate": 2.664248584038942e-05, "loss": 0.0374, "step": 40320 }, { "grad_norm": 0.43229955434799194, "learning_rate": 2.6618123363960047e-05, "loss": 0.046, "step": 40330 }, { "grad_norm": 0.49568605422973633, "learning_rate": 2.659376799032748e-05, "loss": 0.0311, "step": 40340 }, { "grad_norm": 0.26244252920150757, "learning_rate": 2.6569419726890145e-05, "loss": 0.0353, "step": 40350 }, { "grad_norm": 0.19568248093128204, "learning_rate": 2.654507858104447e-05, "loss": 0.0305, "step": 40360 }, { "grad_norm": 0.37246423959732056, "learning_rate": 2.652074456018463e-05, "loss": 0.0253, "step": 40370 }, { "grad_norm": 0.4475647509098053, "learning_rate": 2.6496417671702646e-05, "loss": 0.0259, "step": 40380 }, { "grad_norm": 0.5355042815208435, "learning_rate": 2.6472097922988427e-05, "loss": 0.0288, "step": 40390 }, { "grad_norm": 0.495238721370697, "learning_rate": 2.6447785321429607e-05, "loss": 0.0326, "step": 40400 }, { "grad_norm": 1.0583171844482422, "learning_rate": 2.6423479874411784e-05, "loss": 0.0383, "step": 40410 }, { "grad_norm": 0.37857556343078613, "learning_rate": 2.6399181589318234e-05, "loss": 0.0297, "step": 40420 }, { "grad_norm": 0.4418305456638336, "learning_rate": 2.6374890473530188e-05, "loss": 0.0346, "step": 40430 }, { "grad_norm": 0.46217232942581177, "learning_rate": 2.635060653442664e-05, "loss": 0.0236, "step": 40440 }, { "grad_norm": 0.37247464060783386, "learning_rate": 2.6326329779384395e-05, "loss": 0.0276, "step": 40450 }, { "grad_norm": 0.57157963514328, "learning_rate": 2.63020602157781e-05, "loss": 0.0549, "step": 40460 }, { "grad_norm": 0.39857614040374756, "learning_rate": 2.62777978509802e-05, "loss": 0.0348, "step": 40470 }, { "grad_norm": 0.7436456680297852, "learning_rate": 2.6253542692360954e-05, "loss": 0.0347, "step": 40480 }, { "grad_norm": 0.33811154961586, "learning_rate": 2.6229294747288458e-05, "loss": 0.0321, "step": 40490 }, { "grad_norm": 0.4889834523200989, "learning_rate": 2.6205054023128596e-05, "loss": 0.0258, "step": 40500 }, { "grad_norm": 0.34370601177215576, "learning_rate": 2.6180820527245043e-05, "loss": 0.0285, "step": 40510 }, { "grad_norm": 0.7068136930465698, "learning_rate": 2.6156594266999313e-05, "loss": 0.0251, "step": 40520 }, { "grad_norm": 0.6649960279464722, "learning_rate": 2.6132375249750672e-05, "loss": 0.0205, "step": 40530 }, { "grad_norm": 0.3639022409915924, "learning_rate": 2.6108163482856286e-05, "loss": 0.0207, "step": 40540 }, { "grad_norm": 0.3706546425819397, "learning_rate": 2.6083958973670964e-05, "loss": 0.0304, "step": 40550 }, { "grad_norm": 0.5694289207458496, "learning_rate": 2.6059761729547483e-05, "loss": 0.0362, "step": 40560 }, { "grad_norm": 0.5461279153823853, "learning_rate": 2.603557175783624e-05, "loss": 0.0443, "step": 40570 }, { "grad_norm": 0.4856213331222534, "learning_rate": 2.601138906588559e-05, "loss": 0.032, "step": 40580 }, { "grad_norm": 0.5292989611625671, "learning_rate": 2.598721366104152e-05, "loss": 0.0255, "step": 40590 }, { "grad_norm": 0.37727704644203186, "learning_rate": 2.5963045550647945e-05, "loss": 0.0287, "step": 40600 }, { "grad_norm": 0.4319492280483246, "learning_rate": 2.5938884742046466e-05, "loss": 0.0396, "step": 40610 }, { "grad_norm": 0.733405590057373, "learning_rate": 2.5914731242576507e-05, "loss": 0.0341, "step": 40620 }, { "grad_norm": 0.7581285834312439, "learning_rate": 2.5890585059575268e-05, "loss": 0.0346, "step": 40630 }, { "grad_norm": 0.4786682724952698, "learning_rate": 2.5866446200377688e-05, "loss": 0.0373, "step": 40640 }, { "grad_norm": 0.7744359374046326, "learning_rate": 2.5842314672316566e-05, "loss": 0.0345, "step": 40650 }, { "grad_norm": 0.4141761362552643, "learning_rate": 2.581819048272239e-05, "loss": 0.0233, "step": 40660 }, { "grad_norm": 0.589647650718689, "learning_rate": 2.5794073638923478e-05, "loss": 0.0297, "step": 40670 }, { "grad_norm": 0.2743240296840668, "learning_rate": 2.576996414824586e-05, "loss": 0.0315, "step": 40680 }, { "grad_norm": 0.6497073769569397, "learning_rate": 2.574586201801339e-05, "loss": 0.0366, "step": 40690 }, { "grad_norm": 0.598448634147644, "learning_rate": 2.572176725554762e-05, "loss": 0.0401, "step": 40700 }, { "grad_norm": 0.4652646780014038, "learning_rate": 2.5697679868167966e-05, "loss": 0.0387, "step": 40710 }, { "grad_norm": 0.8110827803611755, "learning_rate": 2.5673599863191468e-05, "loss": 0.0355, "step": 40720 }, { "grad_norm": 0.43400925397872925, "learning_rate": 2.564952724793306e-05, "loss": 0.031, "step": 40730 }, { "grad_norm": 0.5320695638656616, "learning_rate": 2.5625462029705306e-05, "loss": 0.0341, "step": 40740 }, { "grad_norm": 0.4359763264656067, "learning_rate": 2.5601404215818624e-05, "loss": 0.0333, "step": 40750 }, { "grad_norm": 0.5068012475967407, "learning_rate": 2.5577353813581144e-05, "loss": 0.0548, "step": 40760 }, { "grad_norm": 0.5357072949409485, "learning_rate": 2.5553310830298733e-05, "loss": 0.0309, "step": 40770 }, { "grad_norm": 0.40797412395477295, "learning_rate": 2.5529275273275012e-05, "loss": 0.0296, "step": 40780 }, { "grad_norm": 0.38383957743644714, "learning_rate": 2.550524714981133e-05, "loss": 0.0293, "step": 40790 }, { "grad_norm": 0.30818912386894226, "learning_rate": 2.5481226467206837e-05, "loss": 0.0284, "step": 40800 }, { "grad_norm": 0.32826486229896545, "learning_rate": 2.5457213232758365e-05, "loss": 0.0268, "step": 40810 }, { "grad_norm": 0.5217797756195068, "learning_rate": 2.5433207453760498e-05, "loss": 0.0319, "step": 40820 }, { "grad_norm": 0.419963002204895, "learning_rate": 2.5409209137505552e-05, "loss": 0.0289, "step": 40830 }, { "grad_norm": 0.3460156321525574, "learning_rate": 2.5385218291283597e-05, "loss": 0.0303, "step": 40840 }, { "grad_norm": 0.3268379867076874, "learning_rate": 2.5361234922382383e-05, "loss": 0.0257, "step": 40850 }, { "grad_norm": 0.3166348934173584, "learning_rate": 2.533725903808749e-05, "loss": 0.0276, "step": 40860 }, { "grad_norm": 0.2744201719760895, "learning_rate": 2.5313290645682085e-05, "loss": 0.0338, "step": 40870 }, { "grad_norm": 0.4498538374900818, "learning_rate": 2.52893297524472e-05, "loss": 0.0317, "step": 40880 }, { "grad_norm": 0.30646124482154846, "learning_rate": 2.526537636566145e-05, "loss": 0.027, "step": 40890 }, { "grad_norm": 0.39604467153549194, "learning_rate": 2.5241430492601305e-05, "loss": 0.0262, "step": 40900 }, { "grad_norm": 0.6957789659500122, "learning_rate": 2.5217492140540867e-05, "loss": 0.0606, "step": 40910 }, { "grad_norm": 0.40253058075904846, "learning_rate": 2.5193561316751967e-05, "loss": 0.0326, "step": 40920 }, { "grad_norm": 0.700057327747345, "learning_rate": 2.516963802850416e-05, "loss": 0.0334, "step": 40930 }, { "grad_norm": 0.4521433711051941, "learning_rate": 2.5145722283064698e-05, "loss": 0.0466, "step": 40940 }, { "grad_norm": 0.5514474511146545, "learning_rate": 2.5121814087698602e-05, "loss": 0.035, "step": 40950 }, { "grad_norm": 0.4850444495677948, "learning_rate": 2.509791344966848e-05, "loss": 0.0343, "step": 40960 }, { "grad_norm": 0.706505298614502, "learning_rate": 2.5074020376234768e-05, "loss": 0.0247, "step": 40970 }, { "grad_norm": 0.43413230776786804, "learning_rate": 2.5050134874655534e-05, "loss": 0.0228, "step": 40980 }, { "grad_norm": 0.4226599335670471, "learning_rate": 2.5026256952186566e-05, "loss": 0.0232, "step": 40990 }, { "grad_norm": 0.3565478026866913, "learning_rate": 2.5002386616081335e-05, "loss": 0.0365, "step": 41000 }, { "grad_norm": 0.4392145872116089, "learning_rate": 2.497852387359103e-05, "loss": 0.0283, "step": 41010 }, { "grad_norm": 0.5917642116546631, "learning_rate": 2.4954668731964496e-05, "loss": 0.0305, "step": 41020 }, { "grad_norm": 0.35183361172676086, "learning_rate": 2.4930821198448364e-05, "loss": 0.0239, "step": 41030 }, { "grad_norm": 0.2542153298854828, "learning_rate": 2.4906981280286796e-05, "loss": 0.0306, "step": 41040 }, { "grad_norm": 0.31075093150138855, "learning_rate": 2.488314898472179e-05, "loss": 0.0284, "step": 41050 }, { "grad_norm": 0.5038303732872009, "learning_rate": 2.485932431899295e-05, "loss": 0.0403, "step": 41060 }, { "grad_norm": 0.41684266924858093, "learning_rate": 2.4835507290337584e-05, "loss": 0.0278, "step": 41070 }, { "grad_norm": 0.4917982220649719, "learning_rate": 2.4811697905990672e-05, "loss": 0.0318, "step": 41080 }, { "grad_norm": 0.688408374786377, "learning_rate": 2.4787896173184854e-05, "loss": 0.0335, "step": 41090 }, { "grad_norm": 0.5492655634880066, "learning_rate": 2.4764102099150534e-05, "loss": 0.0291, "step": 41100 }, { "grad_norm": 0.5204286575317383, "learning_rate": 2.4740315691115644e-05, "loss": 0.0289, "step": 41110 }, { "grad_norm": 0.5499295592308044, "learning_rate": 2.4716536956305918e-05, "loss": 0.0285, "step": 41120 }, { "grad_norm": 0.4012340009212494, "learning_rate": 2.4692765901944697e-05, "loss": 0.0319, "step": 41130 }, { "grad_norm": 0.3496284782886505, "learning_rate": 2.4669002535253e-05, "loss": 0.0235, "step": 41140 }, { "grad_norm": 0.8036830425262451, "learning_rate": 2.46452468634495e-05, "loss": 0.0325, "step": 41150 }, { "grad_norm": 0.5896130800247192, "learning_rate": 2.462149889375055e-05, "loss": 0.0237, "step": 41160 }, { "grad_norm": 0.7073529362678528, "learning_rate": 2.459775863337014e-05, "loss": 0.0333, "step": 41170 }, { "grad_norm": 0.5172039270401001, "learning_rate": 2.4574026089519985e-05, "loss": 0.0264, "step": 41180 }, { "grad_norm": 0.8598776459693909, "learning_rate": 2.4550301269409333e-05, "loss": 0.0298, "step": 41190 }, { "grad_norm": 0.5340048670768738, "learning_rate": 2.4526584180245216e-05, "loss": 0.0307, "step": 41200 }, { "grad_norm": 0.6563267111778259, "learning_rate": 2.4502874829232236e-05, "loss": 0.032, "step": 41210 }, { "grad_norm": 0.38719695806503296, "learning_rate": 2.447917322357267e-05, "loss": 0.0482, "step": 41220 }, { "grad_norm": 0.22340957820415497, "learning_rate": 2.4455479370466443e-05, "loss": 0.0247, "step": 41230 }, { "grad_norm": 0.5011997222900391, "learning_rate": 2.4431793277111097e-05, "loss": 0.0283, "step": 41240 }, { "grad_norm": 0.44381943345069885, "learning_rate": 2.4408114950701905e-05, "loss": 0.0323, "step": 41250 }, { "grad_norm": 0.5076360702514648, "learning_rate": 2.4384444398431634e-05, "loss": 0.0346, "step": 41260 }, { "grad_norm": 0.5643966197967529, "learning_rate": 2.4360781627490837e-05, "loss": 0.0291, "step": 41270 }, { "grad_norm": 0.3454962372779846, "learning_rate": 2.433712664506762e-05, "loss": 0.0272, "step": 41280 }, { "grad_norm": 0.27333641052246094, "learning_rate": 2.431347945834774e-05, "loss": 0.0234, "step": 41290 }, { "grad_norm": 0.3477621376514435, "learning_rate": 2.428984007451458e-05, "loss": 0.0301, "step": 41300 }, { "grad_norm": 0.495482861995697, "learning_rate": 2.426620850074917e-05, "loss": 0.0241, "step": 41310 }, { "grad_norm": 0.55810546875, "learning_rate": 2.424258474423014e-05, "loss": 0.04, "step": 41320 }, { "grad_norm": 0.36853542923927307, "learning_rate": 2.421896881213382e-05, "loss": 0.0236, "step": 41330 }, { "grad_norm": 0.3838249742984772, "learning_rate": 2.419536071163402e-05, "loss": 0.028, "step": 41340 }, { "grad_norm": 0.39233070611953735, "learning_rate": 2.417176044990233e-05, "loss": 0.0286, "step": 41350 }, { "grad_norm": 0.3881334662437439, "learning_rate": 2.4148168034107855e-05, "loss": 0.0328, "step": 41360 }, { "grad_norm": 0.4358205795288086, "learning_rate": 2.4124583471417355e-05, "loss": 0.0216, "step": 41370 }, { "grad_norm": 0.44988152384757996, "learning_rate": 2.41010067689952e-05, "loss": 0.0293, "step": 41380 }, { "grad_norm": 0.5669235587120056, "learning_rate": 2.4077437934003338e-05, "loss": 0.0355, "step": 41390 }, { "grad_norm": 0.37843793630599976, "learning_rate": 2.405387697360143e-05, "loss": 0.0379, "step": 41400 }, { "grad_norm": 0.5046517848968506, "learning_rate": 2.4030323894946595e-05, "loss": 0.0295, "step": 41410 }, { "grad_norm": 0.5244222283363342, "learning_rate": 2.40067787051937e-05, "loss": 0.0286, "step": 41420 }, { "grad_norm": 0.3602273166179657, "learning_rate": 2.3983241411495087e-05, "loss": 0.0366, "step": 41430 }, { "grad_norm": 0.5587309002876282, "learning_rate": 2.3959712021000823e-05, "loss": 0.0243, "step": 41440 }, { "grad_norm": 0.384525865316391, "learning_rate": 2.3936190540858495e-05, "loss": 0.024, "step": 41450 }, { "grad_norm": 0.505947470664978, "learning_rate": 2.39126769782133e-05, "loss": 0.0446, "step": 41460 }, { "grad_norm": 0.29733550548553467, "learning_rate": 2.388917134020805e-05, "loss": 0.0244, "step": 41470 }, { "grad_norm": 0.22911261022090912, "learning_rate": 2.3865673633983128e-05, "loss": 0.0183, "step": 41480 }, { "grad_norm": 0.2613844871520996, "learning_rate": 2.3842183866676492e-05, "loss": 0.0354, "step": 41490 }, { "grad_norm": 0.5796260833740234, "learning_rate": 2.381870204542377e-05, "loss": 0.0338, "step": 41500 }, { "grad_norm": 0.44690585136413574, "learning_rate": 2.379522817735808e-05, "loss": 0.0289, "step": 41510 }, { "grad_norm": 0.6093001961708069, "learning_rate": 2.377176226961018e-05, "loss": 0.0332, "step": 41520 }, { "grad_norm": 0.5161908864974976, "learning_rate": 2.3748304329308384e-05, "loss": 0.0339, "step": 41530 }, { "grad_norm": 0.3519429564476013, "learning_rate": 2.372485436357858e-05, "loss": 0.0433, "step": 41540 }, { "grad_norm": 0.4182250499725342, "learning_rate": 2.3701412379544296e-05, "loss": 0.0286, "step": 41550 }, { "grad_norm": 0.5622655153274536, "learning_rate": 2.367797838432653e-05, "loss": 0.0278, "step": 41560 }, { "grad_norm": 0.47444358468055725, "learning_rate": 2.3654552385043967e-05, "loss": 0.0295, "step": 41570 }, { "grad_norm": 0.33138802647590637, "learning_rate": 2.3631134388812742e-05, "loss": 0.0347, "step": 41580 }, { "grad_norm": 0.40196987986564636, "learning_rate": 2.3607724402746684e-05, "loss": 0.0414, "step": 41590 }, { "grad_norm": 0.24990300834178925, "learning_rate": 2.35843224339571e-05, "loss": 0.0285, "step": 41600 }, { "grad_norm": 0.2721821069717407, "learning_rate": 2.3560928489552897e-05, "loss": 0.026, "step": 41610 }, { "grad_norm": 0.8354922533035278, "learning_rate": 2.353754257664053e-05, "loss": 0.0356, "step": 41620 }, { "grad_norm": 0.29225587844848633, "learning_rate": 2.3514164702324037e-05, "loss": 0.0357, "step": 41630 }, { "grad_norm": 0.43484625220298767, "learning_rate": 2.3490794873704963e-05, "loss": 0.0369, "step": 41640 }, { "grad_norm": 0.4848105311393738, "learning_rate": 2.3467433097882496e-05, "loss": 0.0309, "step": 41650 }, { "grad_norm": 0.47897103428840637, "learning_rate": 2.34440793819533e-05, "loss": 0.0262, "step": 41660 }, { "grad_norm": 0.47454801201820374, "learning_rate": 2.3420733733011617e-05, "loss": 0.04, "step": 41670 }, { "grad_norm": 0.7432457208633423, "learning_rate": 2.3397396158149243e-05, "loss": 0.0311, "step": 41680 }, { "grad_norm": 0.45965737104415894, "learning_rate": 2.3374066664455498e-05, "loss": 0.037, "step": 41690 }, { "grad_norm": 0.2514582574367523, "learning_rate": 2.3350745259017315e-05, "loss": 0.0238, "step": 41700 }, { "grad_norm": 0.2822817862033844, "learning_rate": 2.332743194891906e-05, "loss": 0.0361, "step": 41710 }, { "grad_norm": 0.3980847895145416, "learning_rate": 2.330412674124276e-05, "loss": 0.0283, "step": 41720 }, { "grad_norm": 0.34189286828041077, "learning_rate": 2.328082964306786e-05, "loss": 0.032, "step": 41730 }, { "grad_norm": 0.48139336705207825, "learning_rate": 2.325754066147145e-05, "loss": 0.0312, "step": 41740 }, { "grad_norm": 0.2967931628227234, "learning_rate": 2.32342598035281e-05, "loss": 0.0357, "step": 41750 }, { "grad_norm": 0.6118594408035278, "learning_rate": 2.321098707630991e-05, "loss": 0.032, "step": 41760 }, { "grad_norm": 0.4211256206035614, "learning_rate": 2.318772248688652e-05, "loss": 0.0288, "step": 41770 }, { "grad_norm": 0.4800620675086975, "learning_rate": 2.3164466042325107e-05, "loss": 0.0368, "step": 41780 }, { "grad_norm": 0.6172856092453003, "learning_rate": 2.3141217749690353e-05, "loss": 0.0348, "step": 41790 }, { "grad_norm": 0.6024065017700195, "learning_rate": 2.3117977616044466e-05, "loss": 0.0321, "step": 41800 }, { "grad_norm": 0.5898048281669617, "learning_rate": 2.309474564844722e-05, "loss": 0.0304, "step": 41810 }, { "grad_norm": 0.5402352213859558, "learning_rate": 2.307152185395585e-05, "loss": 0.0336, "step": 41820 }, { "grad_norm": 0.37055718898773193, "learning_rate": 2.3048306239625144e-05, "loss": 0.0333, "step": 41830 }, { "grad_norm": 0.3358520567417145, "learning_rate": 2.3025098812507378e-05, "loss": 0.0277, "step": 41840 }, { "grad_norm": 0.3819088339805603, "learning_rate": 2.3001899579652366e-05, "loss": 0.0269, "step": 41850 }, { "grad_norm": 0.25146645307540894, "learning_rate": 2.2978708548107393e-05, "loss": 0.0237, "step": 41860 }, { "grad_norm": 0.8592247366905212, "learning_rate": 2.2955525724917348e-05, "loss": 0.0374, "step": 41870 }, { "grad_norm": 0.3330078125, "learning_rate": 2.2932351117124477e-05, "loss": 0.0295, "step": 41880 }, { "grad_norm": 0.5202153921127319, "learning_rate": 2.29091847317687e-05, "loss": 0.0228, "step": 41890 }, { "grad_norm": 0.39093029499053955, "learning_rate": 2.2886026575887277e-05, "loss": 0.0289, "step": 41900 }, { "grad_norm": 0.39846110343933105, "learning_rate": 2.2862876656515094e-05, "loss": 0.0298, "step": 41910 }, { "grad_norm": 0.2565005421638489, "learning_rate": 2.2839734980684464e-05, "loss": 0.0219, "step": 41920 }, { "grad_norm": 0.4092845320701599, "learning_rate": 2.281660155542522e-05, "loss": 0.0292, "step": 41930 }, { "grad_norm": 0.48725321888923645, "learning_rate": 2.279347638776469e-05, "loss": 0.0269, "step": 41940 }, { "grad_norm": 0.4475294053554535, "learning_rate": 2.2770359484727665e-05, "loss": 0.0264, "step": 41950 }, { "grad_norm": 0.3738207221031189, "learning_rate": 2.27472508533365e-05, "loss": 0.0295, "step": 41960 }, { "grad_norm": 0.3469092547893524, "learning_rate": 2.2724150500610948e-05, "loss": 0.0281, "step": 41970 }, { "grad_norm": 0.35027456283569336, "learning_rate": 2.2701058433568302e-05, "loss": 0.0293, "step": 41980 }, { "grad_norm": 0.15371884405612946, "learning_rate": 2.2677974659223318e-05, "loss": 0.0219, "step": 41990 }, { "grad_norm": 0.27443599700927734, "learning_rate": 2.2654899184588235e-05, "loss": 0.033, "step": 42000 }, { "grad_norm": 0.3741719424724579, "learning_rate": 2.2631832016672756e-05, "loss": 0.0388, "step": 42010 }, { "grad_norm": 0.37362411618232727, "learning_rate": 2.2608773162484127e-05, "loss": 0.028, "step": 42020 }, { "grad_norm": 0.5667247176170349, "learning_rate": 2.2585722629026958e-05, "loss": 0.0369, "step": 42030 }, { "grad_norm": 0.2848275601863861, "learning_rate": 2.2562680423303457e-05, "loss": 0.0217, "step": 42040 }, { "grad_norm": 0.448518842458725, "learning_rate": 2.2539646552313165e-05, "loss": 0.0278, "step": 42050 }, { "grad_norm": 0.5054379105567932, "learning_rate": 2.251662102305322e-05, "loss": 0.0411, "step": 42060 }, { "grad_norm": 0.4794843792915344, "learning_rate": 2.2493603842518152e-05, "loss": 0.0248, "step": 42070 }, { "grad_norm": 0.29833734035491943, "learning_rate": 2.2470595017699974e-05, "loss": 0.0244, "step": 42080 }, { "grad_norm": 0.36550506949424744, "learning_rate": 2.244759455558816e-05, "loss": 0.0356, "step": 42090 }, { "grad_norm": 0.3635661005973816, "learning_rate": 2.2424602463169614e-05, "loss": 0.032, "step": 42100 }, { "grad_norm": 0.34538546204566956, "learning_rate": 2.2401618747428776e-05, "loss": 0.0368, "step": 42110 }, { "grad_norm": 0.416928768157959, "learning_rate": 2.237864341534747e-05, "loss": 0.0442, "step": 42120 }, { "grad_norm": 0.5478176474571228, "learning_rate": 2.2355676473904998e-05, "loss": 0.0401, "step": 42130 }, { "grad_norm": 0.44853484630584717, "learning_rate": 2.2332717930078108e-05, "loss": 0.0282, "step": 42140 }, { "grad_norm": 0.5910545587539673, "learning_rate": 2.2309767790840992e-05, "loss": 0.0299, "step": 42150 }, { "grad_norm": 0.47930774092674255, "learning_rate": 2.228682606316529e-05, "loss": 0.0323, "step": 42160 }, { "grad_norm": 0.30542701482772827, "learning_rate": 2.2263892754020138e-05, "loss": 0.0343, "step": 42170 }, { "grad_norm": 0.34353581070899963, "learning_rate": 2.2240967870372004e-05, "loss": 0.0325, "step": 42180 }, { "grad_norm": 0.4611995220184326, "learning_rate": 2.2218051419184933e-05, "loss": 0.0338, "step": 42190 }, { "grad_norm": 0.4084872901439667, "learning_rate": 2.219514340742026e-05, "loss": 0.031, "step": 42200 }, { "grad_norm": 0.4590965807437897, "learning_rate": 2.2172243842036898e-05, "loss": 0.0287, "step": 42210 }, { "grad_norm": 0.46561309695243835, "learning_rate": 2.2149352729991107e-05, "loss": 0.0274, "step": 42220 }, { "grad_norm": 0.6008407473564148, "learning_rate": 2.2126470078236605e-05, "loss": 0.0238, "step": 42230 }, { "grad_norm": 0.3572075068950653, "learning_rate": 2.2103595893724533e-05, "loss": 0.0352, "step": 42240 }, { "grad_norm": 0.28550511598587036, "learning_rate": 2.208073018340345e-05, "loss": 0.0412, "step": 42250 }, { "grad_norm": 0.5964122414588928, "learning_rate": 2.2057872954219405e-05, "loss": 0.0299, "step": 42260 }, { "grad_norm": 0.6434292197227478, "learning_rate": 2.203502421311575e-05, "loss": 0.0331, "step": 42270 }, { "grad_norm": 0.5876860618591309, "learning_rate": 2.2012183967033388e-05, "loss": 0.029, "step": 42280 }, { "grad_norm": 0.32981663942337036, "learning_rate": 2.198935222291056e-05, "loss": 0.0256, "step": 42290 }, { "grad_norm": 0.4593054950237274, "learning_rate": 2.1966528987682948e-05, "loss": 0.0254, "step": 42300 }, { "grad_norm": 0.27747514843940735, "learning_rate": 2.194371426828365e-05, "loss": 0.0262, "step": 42310 }, { "grad_norm": 0.5715326070785522, "learning_rate": 2.192090807164317e-05, "loss": 0.0245, "step": 42320 }, { "grad_norm": 0.5591733455657959, "learning_rate": 2.1898110404689422e-05, "loss": 0.0345, "step": 42330 }, { "grad_norm": 0.41821345686912537, "learning_rate": 2.1875321274347776e-05, "loss": 0.0305, "step": 42340 }, { "grad_norm": 0.3362422287464142, "learning_rate": 2.18525406875409e-05, "loss": 0.0315, "step": 42350 }, { "grad_norm": 0.35256731510162354, "learning_rate": 2.1829768651188997e-05, "loss": 0.0241, "step": 42360 }, { "grad_norm": 0.4317275285720825, "learning_rate": 2.180700517220958e-05, "loss": 0.0331, "step": 42370 }, { "grad_norm": 0.6081284284591675, "learning_rate": 2.1784250257517603e-05, "loss": 0.0323, "step": 42380 }, { "grad_norm": 0.6107482314109802, "learning_rate": 2.1761503914025406e-05, "loss": 0.0285, "step": 42390 }, { "grad_norm": 0.3705753684043884, "learning_rate": 2.1738766148642705e-05, "loss": 0.0326, "step": 42400 }, { "grad_norm": 0.3673882782459259, "learning_rate": 2.1716036968276683e-05, "loss": 0.0298, "step": 42410 }, { "grad_norm": 0.4214837849140167, "learning_rate": 2.1693316379831808e-05, "loss": 0.0291, "step": 42420 }, { "grad_norm": 0.4699249565601349, "learning_rate": 2.1670604390210037e-05, "loss": 0.0329, "step": 42430 }, { "grad_norm": 0.2713616192340851, "learning_rate": 2.1647901006310656e-05, "loss": 0.0233, "step": 42440 }, { "grad_norm": 0.2741526663303375, "learning_rate": 2.1625206235030353e-05, "loss": 0.042, "step": 42450 }, { "grad_norm": 0.5503622889518738, "learning_rate": 2.160252008326321e-05, "loss": 0.0255, "step": 42460 }, { "grad_norm": 0.22813837230205536, "learning_rate": 2.157984255790067e-05, "loss": 0.0221, "step": 42470 }, { "grad_norm": 0.5248556137084961, "learning_rate": 2.1557173665831553e-05, "loss": 0.0254, "step": 42480 }, { "grad_norm": 0.3237801194190979, "learning_rate": 2.153451341394212e-05, "loss": 0.0272, "step": 42490 }, { "grad_norm": 0.27054736018180847, "learning_rate": 2.151186180911589e-05, "loss": 0.0242, "step": 42500 }, { "grad_norm": 0.39389434456825256, "learning_rate": 2.1489218858233877e-05, "loss": 0.0318, "step": 42510 }, { "grad_norm": 0.23893338441848755, "learning_rate": 2.1466584568174392e-05, "loss": 0.0288, "step": 42520 }, { "grad_norm": 0.36535927653312683, "learning_rate": 2.1443958945813132e-05, "loss": 0.031, "step": 42530 }, { "grad_norm": 0.20186494290828705, "learning_rate": 2.1421341998023163e-05, "loss": 0.0304, "step": 42540 }, { "grad_norm": 0.40499258041381836, "learning_rate": 2.139873373167491e-05, "loss": 0.0213, "step": 42550 }, { "grad_norm": 0.289829283952713, "learning_rate": 2.13761341536362e-05, "loss": 0.0272, "step": 42560 }, { "grad_norm": 0.3415164649486542, "learning_rate": 2.1353543270772136e-05, "loss": 0.0223, "step": 42570 }, { "grad_norm": 0.5335811972618103, "learning_rate": 2.1330961089945297e-05, "loss": 0.0252, "step": 42580 }, { "grad_norm": 0.3387433588504791, "learning_rate": 2.130838761801548e-05, "loss": 0.0232, "step": 42590 }, { "grad_norm": 0.4250245988368988, "learning_rate": 2.1285822861839966e-05, "loss": 0.0187, "step": 42600 }, { "grad_norm": 0.32798266410827637, "learning_rate": 2.126326682827331e-05, "loss": 0.026, "step": 42610 }, { "grad_norm": 0.5078892111778259, "learning_rate": 2.124071952416744e-05, "loss": 0.0285, "step": 42620 }, { "grad_norm": 0.3400942087173462, "learning_rate": 2.1218180956371634e-05, "loss": 0.0283, "step": 42630 }, { "grad_norm": 0.3812902569770813, "learning_rate": 2.119565113173252e-05, "loss": 0.0253, "step": 42640 }, { "grad_norm": 0.4941513240337372, "learning_rate": 2.1173130057094033e-05, "loss": 0.0316, "step": 42650 }, { "grad_norm": 0.55985027551651, "learning_rate": 2.115061773929753e-05, "loss": 0.0554, "step": 42660 }, { "grad_norm": 0.5273133516311646, "learning_rate": 2.1128114185181623e-05, "loss": 0.025, "step": 42670 }, { "grad_norm": 0.40174561738967896, "learning_rate": 2.1105619401582317e-05, "loss": 0.0206, "step": 42680 }, { "grad_norm": 0.2908004820346832, "learning_rate": 2.1083133395332928e-05, "loss": 0.0322, "step": 42690 }, { "grad_norm": 0.29460838437080383, "learning_rate": 2.1060656173264082e-05, "loss": 0.026, "step": 42700 }, { "grad_norm": 0.43386542797088623, "learning_rate": 2.103818774220383e-05, "loss": 0.034, "step": 42710 }, { "grad_norm": 0.4002944231033325, "learning_rate": 2.1015728108977412e-05, "loss": 0.0265, "step": 42720 }, { "grad_norm": 0.340487539768219, "learning_rate": 2.0993277280407548e-05, "loss": 0.0279, "step": 42730 }, { "grad_norm": 0.4175109267234802, "learning_rate": 2.0970835263314132e-05, "loss": 0.0212, "step": 42740 }, { "grad_norm": 0.3531644642353058, "learning_rate": 2.094840206451451e-05, "loss": 0.0242, "step": 42750 }, { "grad_norm": 0.6378037929534912, "learning_rate": 2.0925977690823273e-05, "loss": 0.0304, "step": 42760 }, { "grad_norm": 0.4439701437950134, "learning_rate": 2.0903562149052364e-05, "loss": 0.0253, "step": 42770 }, { "grad_norm": 0.37880367040634155, "learning_rate": 2.0881155446011025e-05, "loss": 0.0292, "step": 42780 }, { "grad_norm": 0.4172859787940979, "learning_rate": 2.0858757588505823e-05, "loss": 0.0355, "step": 42790 }, { "grad_norm": 0.381226122379303, "learning_rate": 2.0836368583340622e-05, "loss": 0.0332, "step": 42800 }, { "grad_norm": 0.3156057596206665, "learning_rate": 2.081398843731664e-05, "loss": 0.0321, "step": 42810 }, { "grad_norm": 0.5833593010902405, "learning_rate": 2.0791617157232357e-05, "loss": 0.0453, "step": 42820 }, { "grad_norm": 0.4011850357055664, "learning_rate": 2.0769254749883576e-05, "loss": 0.0334, "step": 42830 }, { "grad_norm": 0.38886767625808716, "learning_rate": 2.0746901222063415e-05, "loss": 0.0436, "step": 42840 }, { "grad_norm": 0.46650782227516174, "learning_rate": 2.072455658056226e-05, "loss": 0.029, "step": 42850 }, { "grad_norm": 0.2997060716152191, "learning_rate": 2.0702220832167873e-05, "loss": 0.0252, "step": 42860 }, { "grad_norm": 0.6057910323143005, "learning_rate": 2.0679893983665205e-05, "loss": 0.0289, "step": 42870 }, { "grad_norm": 0.5310240387916565, "learning_rate": 2.0657576041836622e-05, "loss": 0.0313, "step": 42880 }, { "grad_norm": 0.6215100884437561, "learning_rate": 2.0635267013461666e-05, "loss": 0.0312, "step": 42890 }, { "grad_norm": 0.3697080910205841, "learning_rate": 2.061296690531728e-05, "loss": 0.038, "step": 42900 }, { "grad_norm": 0.2792647182941437, "learning_rate": 2.0590675724177622e-05, "loss": 0.037, "step": 42910 }, { "grad_norm": 0.3436148166656494, "learning_rate": 2.0568393476814167e-05, "loss": 0.0277, "step": 42920 }, { "grad_norm": 0.6544877886772156, "learning_rate": 2.0546120169995685e-05, "loss": 0.0422, "step": 42930 }, { "grad_norm": 0.49456697702407837, "learning_rate": 2.0523855810488214e-05, "loss": 0.034, "step": 42940 }, { "grad_norm": 0.3682911694049835, "learning_rate": 2.050160040505505e-05, "loss": 0.0206, "step": 42950 }, { "grad_norm": 0.38415029644966125, "learning_rate": 2.0479353960456843e-05, "loss": 0.0317, "step": 42960 }, { "grad_norm": 0.541388213634491, "learning_rate": 2.0457116483451456e-05, "loss": 0.0239, "step": 42970 }, { "grad_norm": 0.5715724229812622, "learning_rate": 2.0434887980794043e-05, "loss": 0.0281, "step": 42980 }, { "grad_norm": 0.44666168093681335, "learning_rate": 2.0412668459237043e-05, "loss": 0.027, "step": 42990 }, { "grad_norm": 0.3147922456264496, "learning_rate": 2.039045792553016e-05, "loss": 0.0267, "step": 43000 }, { "grad_norm": 0.6510388255119324, "learning_rate": 2.036825638642036e-05, "loss": 0.0369, "step": 43010 }, { "grad_norm": 0.3942941725254059, "learning_rate": 2.0346063848651868e-05, "loss": 0.0184, "step": 43020 }, { "grad_norm": 0.3494260907173157, "learning_rate": 2.0323880318966254e-05, "loss": 0.0217, "step": 43030 }, { "grad_norm": 0.3629794418811798, "learning_rate": 2.030170580410221e-05, "loss": 0.0316, "step": 43040 }, { "grad_norm": 0.5463144183158875, "learning_rate": 2.0279540310795837e-05, "loss": 0.0325, "step": 43050 }, { "grad_norm": 0.37832388281822205, "learning_rate": 2.0257383845780365e-05, "loss": 0.0187, "step": 43060 }, { "grad_norm": 0.23101229965686798, "learning_rate": 2.0235236415786384e-05, "loss": 0.0196, "step": 43070 }, { "grad_norm": 0.5105869174003601, "learning_rate": 2.021309802754169e-05, "loss": 0.0354, "step": 43080 }, { "grad_norm": 0.7967942953109741, "learning_rate": 2.0190968687771332e-05, "loss": 0.0227, "step": 43090 }, { "grad_norm": 0.7998312711715698, "learning_rate": 2.016884840319763e-05, "loss": 0.0366, "step": 43100 }, { "grad_norm": 0.3595915138721466, "learning_rate": 2.0146737180540122e-05, "loss": 0.0292, "step": 43110 }, { "grad_norm": 0.4763645529747009, "learning_rate": 2.012463502651564e-05, "loss": 0.0317, "step": 43120 }, { "grad_norm": 0.39488187432289124, "learning_rate": 2.0102541947838228e-05, "loss": 0.0227, "step": 43130 }, { "grad_norm": 0.35619544982910156, "learning_rate": 2.0080457951219173e-05, "loss": 0.0223, "step": 43140 }, { "grad_norm": 0.3981454074382782, "learning_rate": 2.0058383043367017e-05, "loss": 0.0249, "step": 43150 }, { "grad_norm": 0.35421910881996155, "learning_rate": 2.0036317230987528e-05, "loss": 0.0283, "step": 43160 }, { "grad_norm": 0.45575055480003357, "learning_rate": 2.0014260520783696e-05, "loss": 0.0297, "step": 43170 }, { "grad_norm": 0.4617892801761627, "learning_rate": 1.9992212919455834e-05, "loss": 0.0369, "step": 43180 }, { "grad_norm": 0.2893750071525574, "learning_rate": 1.9970174433701333e-05, "loss": 0.0259, "step": 43190 }, { "grad_norm": 0.29692015051841736, "learning_rate": 1.9948145070214992e-05, "loss": 0.0286, "step": 43200 }, { "grad_norm": 0.3609246611595154, "learning_rate": 1.9926124835688663e-05, "loss": 0.0297, "step": 43210 }, { "grad_norm": 0.7035847902297974, "learning_rate": 1.9904113736811576e-05, "loss": 0.0513, "step": 43220 }, { "grad_norm": 0.30916517972946167, "learning_rate": 1.9882111780270096e-05, "loss": 0.0246, "step": 43230 }, { "grad_norm": 0.36521464586257935, "learning_rate": 1.986011897274784e-05, "loss": 0.0223, "step": 43240 }, { "grad_norm": 0.4179956316947937, "learning_rate": 1.983813532092565e-05, "loss": 0.0289, "step": 43250 }, { "grad_norm": 0.39889267086982727, "learning_rate": 1.981616083148155e-05, "loss": 0.0242, "step": 43260 }, { "grad_norm": 0.4360087215900421, "learning_rate": 1.9794195511090845e-05, "loss": 0.032, "step": 43270 }, { "grad_norm": 0.4683392345905304, "learning_rate": 1.977223936642601e-05, "loss": 0.0277, "step": 43280 }, { "grad_norm": 0.4198536276817322, "learning_rate": 1.975029240415674e-05, "loss": 0.0346, "step": 43290 }, { "grad_norm": 0.42236393690109253, "learning_rate": 1.9728354630949936e-05, "loss": 0.0194, "step": 43300 }, { "grad_norm": 0.32042238116264343, "learning_rate": 1.9706426053469716e-05, "loss": 0.0252, "step": 43310 }, { "grad_norm": 0.31268414855003357, "learning_rate": 1.9684506678377396e-05, "loss": 0.0255, "step": 43320 }, { "grad_norm": 0.36482682824134827, "learning_rate": 1.9662596512331544e-05, "loss": 0.0263, "step": 43330 }, { "grad_norm": 0.9310865998268127, "learning_rate": 1.964069556198782e-05, "loss": 0.0244, "step": 43340 }, { "grad_norm": 0.44409501552581787, "learning_rate": 1.9618803833999232e-05, "loss": 0.0261, "step": 43350 }, { "grad_norm": 0.5899567604064941, "learning_rate": 1.9596921335015838e-05, "loss": 0.0299, "step": 43360 }, { "grad_norm": 0.47261977195739746, "learning_rate": 1.957504807168501e-05, "loss": 0.0196, "step": 43370 }, { "grad_norm": 0.5580005049705505, "learning_rate": 1.9553184050651253e-05, "loss": 0.0297, "step": 43380 }, { "grad_norm": 0.2286643385887146, "learning_rate": 1.953132927855628e-05, "loss": 0.03, "step": 43390 }, { "grad_norm": 0.5719003677368164, "learning_rate": 1.9509483762038995e-05, "loss": 0.0261, "step": 43400 }, { "grad_norm": 0.4411337375640869, "learning_rate": 1.9487647507735467e-05, "loss": 0.0233, "step": 43410 }, { "grad_norm": 0.4349382221698761, "learning_rate": 1.9465820522279032e-05, "loss": 0.0296, "step": 43420 }, { "grad_norm": 0.6857959628105164, "learning_rate": 1.9444002812300078e-05, "loss": 0.0336, "step": 43430 }, { "grad_norm": 0.4443163275718689, "learning_rate": 1.94221943844263e-05, "loss": 0.0329, "step": 43440 }, { "grad_norm": 0.34482306241989136, "learning_rate": 1.9400395245282515e-05, "loss": 0.0213, "step": 43450 }, { "grad_norm": 0.5887449979782104, "learning_rate": 1.937860540149071e-05, "loss": 0.0245, "step": 43460 }, { "grad_norm": 0.5773076415061951, "learning_rate": 1.9356824859670082e-05, "loss": 0.0267, "step": 43470 }, { "grad_norm": 0.9981679916381836, "learning_rate": 1.9335053626436967e-05, "loss": 0.0294, "step": 43480 }, { "grad_norm": 0.28961747884750366, "learning_rate": 1.9313291708404885e-05, "loss": 0.0239, "step": 43490 }, { "grad_norm": 0.3800503611564636, "learning_rate": 1.9291539112184587e-05, "loss": 0.0274, "step": 43500 }, { "grad_norm": 0.4671802818775177, "learning_rate": 1.9269795844383854e-05, "loss": 0.0226, "step": 43510 }, { "grad_norm": 0.6107974648475647, "learning_rate": 1.9248061911607777e-05, "loss": 0.0484, "step": 43520 }, { "grad_norm": 0.5157607197761536, "learning_rate": 1.9226337320458538e-05, "loss": 0.0358, "step": 43530 }, { "grad_norm": 0.3047747015953064, "learning_rate": 1.9204622077535488e-05, "loss": 0.0266, "step": 43540 }, { "grad_norm": 0.34872016310691833, "learning_rate": 1.9182916189435147e-05, "loss": 0.0311, "step": 43550 }, { "grad_norm": 0.5913704633712769, "learning_rate": 1.916121966275117e-05, "loss": 0.0558, "step": 43560 }, { "grad_norm": 0.22189266979694366, "learning_rate": 1.9139532504074443e-05, "loss": 0.0199, "step": 43570 }, { "grad_norm": 0.31784456968307495, "learning_rate": 1.9117854719992885e-05, "loss": 0.0291, "step": 43580 }, { "grad_norm": 0.29723048210144043, "learning_rate": 1.9096186317091687e-05, "loss": 0.0241, "step": 43590 }, { "grad_norm": 0.6988416314125061, "learning_rate": 1.9074527301953116e-05, "loss": 0.0536, "step": 43600 }, { "grad_norm": 0.3944399356842041, "learning_rate": 1.9052877681156607e-05, "loss": 0.0256, "step": 43610 }, { "grad_norm": 0.33445656299591064, "learning_rate": 1.903123746127875e-05, "loss": 0.0217, "step": 43620 }, { "grad_norm": 0.5434167981147766, "learning_rate": 1.900960664889327e-05, "loss": 0.0256, "step": 43630 }, { "grad_norm": 0.45461374521255493, "learning_rate": 1.8987985250571015e-05, "loss": 0.0332, "step": 43640 }, { "grad_norm": 0.4245731234550476, "learning_rate": 1.8966373272880054e-05, "loss": 0.0341, "step": 43650 }, { "grad_norm": 1.044776439666748, "learning_rate": 1.8944770722385462e-05, "loss": 0.0386, "step": 43660 }, { "grad_norm": 0.4956419765949249, "learning_rate": 1.8923177605649576e-05, "loss": 0.0333, "step": 43670 }, { "grad_norm": 0.2735886573791504, "learning_rate": 1.8901593929231802e-05, "loss": 0.0266, "step": 43680 }, { "grad_norm": 0.41941577196121216, "learning_rate": 1.8880019699688684e-05, "loss": 0.0323, "step": 43690 }, { "grad_norm": 0.43813347816467285, "learning_rate": 1.8858454923573904e-05, "loss": 0.0276, "step": 43700 }, { "grad_norm": 0.3785078525543213, "learning_rate": 1.8836899607438253e-05, "loss": 0.0309, "step": 43710 }, { "grad_norm": 0.4968891739845276, "learning_rate": 1.8815353757829723e-05, "loss": 0.0293, "step": 43720 }, { "grad_norm": 0.21113364398479462, "learning_rate": 1.879381738129331e-05, "loss": 0.0293, "step": 43730 }, { "grad_norm": 0.5396242737770081, "learning_rate": 1.8772290484371236e-05, "loss": 0.0273, "step": 43740 }, { "grad_norm": 0.3588218688964844, "learning_rate": 1.8750773073602795e-05, "loss": 0.0226, "step": 43750 }, { "grad_norm": 0.44449514150619507, "learning_rate": 1.8729265155524405e-05, "loss": 0.0224, "step": 43760 }, { "grad_norm": 0.3407878279685974, "learning_rate": 1.8707766736669607e-05, "loss": 0.0255, "step": 43770 }, { "grad_norm": 0.4012695848941803, "learning_rate": 1.8686277823569055e-05, "loss": 0.0245, "step": 43780 }, { "grad_norm": 0.44178658723831177, "learning_rate": 1.8664798422750484e-05, "loss": 0.0252, "step": 43790 }, { "grad_norm": 0.5680636763572693, "learning_rate": 1.8643328540738832e-05, "loss": 0.0348, "step": 43800 }, { "grad_norm": 0.5721374154090881, "learning_rate": 1.862186818405601e-05, "loss": 0.0239, "step": 43810 }, { "grad_norm": 0.27611759305000305, "learning_rate": 1.8600417359221156e-05, "loss": 0.0326, "step": 43820 }, { "grad_norm": 0.4487643539905548, "learning_rate": 1.8578976072750454e-05, "loss": 0.0323, "step": 43830 }, { "grad_norm": 0.48579803109169006, "learning_rate": 1.8557544331157194e-05, "loss": 0.029, "step": 43840 }, { "grad_norm": 0.5523331761360168, "learning_rate": 1.8536122140951785e-05, "loss": 0.027, "step": 43850 }, { "grad_norm": 0.5687160491943359, "learning_rate": 1.8514709508641688e-05, "loss": 0.021, "step": 43860 }, { "grad_norm": 0.4877760410308838, "learning_rate": 1.8493306440731555e-05, "loss": 0.0247, "step": 43870 }, { "grad_norm": 0.6325808763504028, "learning_rate": 1.8471912943723013e-05, "loss": 0.0341, "step": 43880 }, { "grad_norm": 0.3568752110004425, "learning_rate": 1.8450529024114894e-05, "loss": 0.018, "step": 43890 }, { "grad_norm": 0.3544720709323883, "learning_rate": 1.842915468840301e-05, "loss": 0.0256, "step": 43900 }, { "grad_norm": 0.3450658321380615, "learning_rate": 1.840778994308037e-05, "loss": 0.0237, "step": 43910 }, { "grad_norm": 0.3285292983055115, "learning_rate": 1.8386434794637004e-05, "loss": 0.02, "step": 43920 }, { "grad_norm": 0.42215847969055176, "learning_rate": 1.8365089249560034e-05, "loss": 0.0285, "step": 43930 }, { "grad_norm": 0.4783863127231598, "learning_rate": 1.8343753314333683e-05, "loss": 0.0256, "step": 43940 }, { "grad_norm": 0.18880753219127655, "learning_rate": 1.8322426995439236e-05, "loss": 0.0179, "step": 43950 }, { "grad_norm": 0.4901190400123596, "learning_rate": 1.8301110299355058e-05, "loss": 0.0386, "step": 43960 }, { "grad_norm": 0.8512096405029297, "learning_rate": 1.8279803232556625e-05, "loss": 0.0225, "step": 43970 }, { "grad_norm": 0.48660504817962646, "learning_rate": 1.8258505801516444e-05, "loss": 0.0313, "step": 43980 }, { "grad_norm": 0.41736745834350586, "learning_rate": 1.8237218012704117e-05, "loss": 0.0296, "step": 43990 }, { "grad_norm": 0.46722957491874695, "learning_rate": 1.821593987258631e-05, "loss": 0.0315, "step": 44000 }, { "grad_norm": 0.5302354693412781, "learning_rate": 1.8194671387626744e-05, "loss": 0.0273, "step": 44010 }, { "grad_norm": 0.4479953646659851, "learning_rate": 1.8173412564286276e-05, "loss": 0.0245, "step": 44020 }, { "grad_norm": 0.40508347749710083, "learning_rate": 1.8152163409022697e-05, "loss": 0.0343, "step": 44030 }, { "grad_norm": 0.5434513092041016, "learning_rate": 1.8130923928291023e-05, "loss": 0.0338, "step": 44040 }, { "grad_norm": 0.4892917275428772, "learning_rate": 1.8109694128543163e-05, "loss": 0.0478, "step": 44050 }, { "grad_norm": 0.32035794854164124, "learning_rate": 1.8088474016228237e-05, "loss": 0.0228, "step": 44060 }, { "grad_norm": 0.40878599882125854, "learning_rate": 1.8067263597792328e-05, "loss": 0.028, "step": 44070 }, { "grad_norm": 0.3966746926307678, "learning_rate": 1.80460628796786e-05, "loss": 0.037, "step": 44080 }, { "grad_norm": 0.3196769952774048, "learning_rate": 1.8024871868327276e-05, "loss": 0.0312, "step": 44090 }, { "grad_norm": 0.18659166991710663, "learning_rate": 1.8003690570175608e-05, "loss": 0.0309, "step": 44100 }, { "grad_norm": 0.40050795674324036, "learning_rate": 1.7982518991657943e-05, "loss": 0.0222, "step": 44110 }, { "grad_norm": 0.7209390997886658, "learning_rate": 1.7961357139205643e-05, "loss": 0.0253, "step": 44120 }, { "grad_norm": 0.2802256643772125, "learning_rate": 1.7940205019247108e-05, "loss": 0.0325, "step": 44130 }, { "grad_norm": 0.3601173460483551, "learning_rate": 1.79190626382078e-05, "loss": 0.0236, "step": 44140 }, { "grad_norm": 0.4338151514530182, "learning_rate": 1.7897930002510215e-05, "loss": 0.0237, "step": 44150 }, { "grad_norm": 0.38062748312950134, "learning_rate": 1.787680711857387e-05, "loss": 0.0222, "step": 44160 }, { "grad_norm": 0.5204013586044312, "learning_rate": 1.7855693992815398e-05, "loss": 0.0371, "step": 44170 }, { "grad_norm": 0.3129327595233917, "learning_rate": 1.7834590631648328e-05, "loss": 0.0322, "step": 44180 }, { "grad_norm": 0.29488468170166016, "learning_rate": 1.7813497041483384e-05, "loss": 0.0209, "step": 44190 }, { "grad_norm": 0.3239944279193878, "learning_rate": 1.779241322872817e-05, "loss": 0.0214, "step": 44200 }, { "grad_norm": 0.8770729899406433, "learning_rate": 1.777133919978744e-05, "loss": 0.0311, "step": 44210 }, { "grad_norm": 0.5927838087081909, "learning_rate": 1.7750274961062912e-05, "loss": 0.052, "step": 44220 }, { "grad_norm": 0.524592399597168, "learning_rate": 1.772922051895335e-05, "loss": 0.0389, "step": 44230 }, { "grad_norm": 0.654060423374176, "learning_rate": 1.770817587985453e-05, "loss": 0.0295, "step": 44240 }, { "grad_norm": 0.5514875650405884, "learning_rate": 1.7687141050159246e-05, "loss": 0.0286, "step": 44250 }, { "grad_norm": 0.5014081001281738, "learning_rate": 1.7666116036257375e-05, "loss": 0.0285, "step": 44260 }, { "grad_norm": 0.3554975390434265, "learning_rate": 1.764510084453569e-05, "loss": 0.0182, "step": 44270 }, { "grad_norm": 0.288996160030365, "learning_rate": 1.76240954813781e-05, "loss": 0.0367, "step": 44280 }, { "grad_norm": 0.4960533082485199, "learning_rate": 1.7603099953165476e-05, "loss": 0.0274, "step": 44290 }, { "grad_norm": 0.29057934880256653, "learning_rate": 1.7582114266275683e-05, "loss": 0.0283, "step": 44300 }, { "grad_norm": 0.30147457122802734, "learning_rate": 1.756113842708364e-05, "loss": 0.0204, "step": 44310 }, { "grad_norm": 0.4075626730918884, "learning_rate": 1.7540172441961245e-05, "loss": 0.023, "step": 44320 }, { "grad_norm": 0.427538126707077, "learning_rate": 1.7519216317277387e-05, "loss": 0.0224, "step": 44330 }, { "grad_norm": 0.23946399986743927, "learning_rate": 1.7498270059398046e-05, "loss": 0.0209, "step": 44340 }, { "grad_norm": 0.28052693605422974, "learning_rate": 1.7477333674686062e-05, "loss": 0.0212, "step": 44350 }, { "grad_norm": 0.540862500667572, "learning_rate": 1.745640716950142e-05, "loss": 0.0219, "step": 44360 }, { "grad_norm": 0.34867075085639954, "learning_rate": 1.7435490550201017e-05, "loss": 0.0246, "step": 44370 }, { "grad_norm": 0.34085312485694885, "learning_rate": 1.7414583823138762e-05, "loss": 0.0275, "step": 44380 }, { "grad_norm": 0.39617884159088135, "learning_rate": 1.739368699466558e-05, "loss": 0.0151, "step": 44390 }, { "grad_norm": 0.5596469044685364, "learning_rate": 1.737280007112935e-05, "loss": 0.0289, "step": 44400 }, { "grad_norm": 0.35948485136032104, "learning_rate": 1.735192305887502e-05, "loss": 0.0299, "step": 44410 }, { "grad_norm": 0.47185587882995605, "learning_rate": 1.733105596424441e-05, "loss": 0.0292, "step": 44420 }, { "grad_norm": 0.2856932580471039, "learning_rate": 1.7310198793576437e-05, "loss": 0.025, "step": 44430 }, { "grad_norm": 0.2728714942932129, "learning_rate": 1.7289351553206952e-05, "loss": 0.0232, "step": 44440 }, { "grad_norm": 0.3290155827999115, "learning_rate": 1.7268514249468788e-05, "loss": 0.0186, "step": 44450 }, { "grad_norm": 0.2886522710323334, "learning_rate": 1.7247686888691765e-05, "loss": 0.0249, "step": 44460 }, { "grad_norm": 0.3212850093841553, "learning_rate": 1.7226869477202694e-05, "loss": 0.0321, "step": 44470 }, { "grad_norm": 0.7106180787086487, "learning_rate": 1.7206062021325336e-05, "loss": 0.0316, "step": 44480 }, { "grad_norm": 0.23655292391777039, "learning_rate": 1.7185264527380502e-05, "loss": 0.0141, "step": 44490 }, { "grad_norm": 0.42447587847709656, "learning_rate": 1.716447700168584e-05, "loss": 0.0284, "step": 44500 }, { "grad_norm": 0.48578813672065735, "learning_rate": 1.714369945055611e-05, "loss": 0.0233, "step": 44510 }, { "grad_norm": 0.5197054147720337, "learning_rate": 1.7122931880302968e-05, "loss": 0.035, "step": 44520 }, { "grad_norm": 0.53948575258255, "learning_rate": 1.710217429723505e-05, "loss": 0.0279, "step": 44530 }, { "grad_norm": 0.29819217324256897, "learning_rate": 1.7081426707657972e-05, "loss": 0.0236, "step": 44540 }, { "grad_norm": 0.43046751618385315, "learning_rate": 1.7060689117874275e-05, "loss": 0.0253, "step": 44550 }, { "grad_norm": 0.5587775111198425, "learning_rate": 1.703996153418354e-05, "loss": 0.0377, "step": 44560 }, { "grad_norm": 0.442739337682724, "learning_rate": 1.7019243962882205e-05, "loss": 0.0275, "step": 44570 }, { "grad_norm": 0.27922523021698, "learning_rate": 1.6998536410263754e-05, "loss": 0.0228, "step": 44580 }, { "grad_norm": 0.20990313589572906, "learning_rate": 1.6977838882618596e-05, "loss": 0.029, "step": 44590 }, { "grad_norm": 0.3231562674045563, "learning_rate": 1.6957151386234088e-05, "loss": 0.0203, "step": 44600 }, { "grad_norm": 0.2335236519575119, "learning_rate": 1.6936473927394536e-05, "loss": 0.0267, "step": 44610 }, { "grad_norm": 0.35477331280708313, "learning_rate": 1.6915806512381222e-05, "loss": 0.0246, "step": 44620 }, { "grad_norm": 0.3939056992530823, "learning_rate": 1.6895149147472344e-05, "loss": 0.0305, "step": 44630 }, { "grad_norm": 0.8793933987617493, "learning_rate": 1.6874501838943073e-05, "loss": 0.0353, "step": 44640 }, { "grad_norm": 0.516894519329071, "learning_rate": 1.6853864593065506e-05, "loss": 0.0181, "step": 44650 }, { "grad_norm": 0.4208962321281433, "learning_rate": 1.683323741610871e-05, "loss": 0.026, "step": 44660 }, { "grad_norm": 0.3478683531284332, "learning_rate": 1.6812620314338674e-05, "loss": 0.0302, "step": 44670 }, { "grad_norm": 0.15754587948322296, "learning_rate": 1.6792013294018326e-05, "loss": 0.025, "step": 44680 }, { "grad_norm": 0.6578392386436462, "learning_rate": 1.6771416361407526e-05, "loss": 0.0298, "step": 44690 }, { "grad_norm": 0.5753393769264221, "learning_rate": 1.675082952276308e-05, "loss": 0.0354, "step": 44700 }, { "grad_norm": 0.6125508546829224, "learning_rate": 1.6730252784338757e-05, "loss": 0.0226, "step": 44710 }, { "grad_norm": 0.4589073657989502, "learning_rate": 1.6709686152385166e-05, "loss": 0.0257, "step": 44720 }, { "grad_norm": 0.4987391233444214, "learning_rate": 1.668912963314998e-05, "loss": 0.0316, "step": 44730 }, { "grad_norm": 0.33320164680480957, "learning_rate": 1.6668583232877653e-05, "loss": 0.042, "step": 44740 }, { "grad_norm": 0.3326053023338318, "learning_rate": 1.6648046957809698e-05, "loss": 0.0215, "step": 44750 }, { "grad_norm": 0.5422163009643555, "learning_rate": 1.6627520814184462e-05, "loss": 0.0284, "step": 44760 }, { "grad_norm": 0.5186840295791626, "learning_rate": 1.660700480823726e-05, "loss": 0.0206, "step": 44770 }, { "grad_norm": 0.25698384642601013, "learning_rate": 1.65864989462003e-05, "loss": 0.0222, "step": 44780 }, { "grad_norm": 0.3784202039241791, "learning_rate": 1.656600323430273e-05, "loss": 0.0222, "step": 44790 }, { "grad_norm": 0.5595548152923584, "learning_rate": 1.654551767877059e-05, "loss": 0.0307, "step": 44800 }, { "grad_norm": 0.9526786804199219, "learning_rate": 1.6525042285826874e-05, "loss": 0.0466, "step": 44810 }, { "grad_norm": 0.2788141667842865, "learning_rate": 1.6504577061691468e-05, "loss": 0.0328, "step": 44820 }, { "grad_norm": 0.5567314624786377, "learning_rate": 1.6484122012581143e-05, "loss": 0.026, "step": 44830 }, { "grad_norm": 0.9897366166114807, "learning_rate": 1.6463677144709623e-05, "loss": 0.0314, "step": 44840 }, { "grad_norm": 0.3914376199245453, "learning_rate": 1.6443242464287493e-05, "loss": 0.0288, "step": 44850 }, { "grad_norm": 0.2820419371128082, "learning_rate": 1.642281797752232e-05, "loss": 0.0253, "step": 44860 }, { "grad_norm": 0.22578145563602448, "learning_rate": 1.6402403690618456e-05, "loss": 0.0264, "step": 44870 }, { "grad_norm": 0.4814079999923706, "learning_rate": 1.6381999609777295e-05, "loss": 0.0271, "step": 44880 }, { "grad_norm": 0.4091816842556, "learning_rate": 1.6361605741196983e-05, "loss": 0.027, "step": 44890 }, { "grad_norm": 0.4637095034122467, "learning_rate": 1.63412220910727e-05, "loss": 0.0274, "step": 44900 }, { "grad_norm": 0.316240519285202, "learning_rate": 1.6320848665596433e-05, "loss": 0.0273, "step": 44910 }, { "grad_norm": 0.35535329580307007, "learning_rate": 1.6300485470957095e-05, "loss": 0.0256, "step": 44920 }, { "grad_norm": 0.4233652949333191, "learning_rate": 1.6280132513340483e-05, "loss": 0.024, "step": 44930 }, { "grad_norm": 0.3469667434692383, "learning_rate": 1.62597897989293e-05, "loss": 0.0389, "step": 44940 }, { "grad_norm": 0.31276845932006836, "learning_rate": 1.623945733390309e-05, "loss": 0.0231, "step": 44950 }, { "grad_norm": 0.4802025258541107, "learning_rate": 1.6219135124438374e-05, "loss": 0.034, "step": 44960 }, { "grad_norm": 0.3397437036037445, "learning_rate": 1.6198823176708465e-05, "loss": 0.0351, "step": 44970 }, { "grad_norm": 0.6037551760673523, "learning_rate": 1.6178521496883613e-05, "loss": 0.0227, "step": 44980 }, { "grad_norm": 0.33746010065078735, "learning_rate": 1.6158230091130926e-05, "loss": 0.0275, "step": 44990 }, { "grad_norm": 0.5324826240539551, "learning_rate": 1.613794896561438e-05, "loss": 0.0232, "step": 45000 }, { "grad_norm": 0.4292178153991699, "learning_rate": 1.6117678126494894e-05, "loss": 0.022, "step": 45010 }, { "grad_norm": 0.37590140104293823, "learning_rate": 1.6097417579930153e-05, "loss": 0.0224, "step": 45020 }, { "grad_norm": 0.23558984696865082, "learning_rate": 1.6077167332074834e-05, "loss": 0.0184, "step": 45030 }, { "grad_norm": 0.42956212162971497, "learning_rate": 1.605692738908037e-05, "loss": 0.0371, "step": 45040 }, { "grad_norm": 0.5963520407676697, "learning_rate": 1.6036697757095176e-05, "loss": 0.0529, "step": 45050 }, { "grad_norm": 0.6833932399749756, "learning_rate": 1.6016478442264428e-05, "loss": 0.0313, "step": 45060 }, { "grad_norm": 0.3805127739906311, "learning_rate": 1.599626945073026e-05, "loss": 0.0238, "step": 45070 }, { "grad_norm": 0.3326156437397003, "learning_rate": 1.597607078863162e-05, "loss": 0.0162, "step": 45080 }, { "grad_norm": 0.460641473531723, "learning_rate": 1.595588246210432e-05, "loss": 0.0296, "step": 45090 }, { "grad_norm": 1.0471223592758179, "learning_rate": 1.5935704477281048e-05, "loss": 0.0355, "step": 45100 }, { "grad_norm": 0.3302094042301178, "learning_rate": 1.5915536840291323e-05, "loss": 0.0198, "step": 45110 }, { "grad_norm": 0.7393028736114502, "learning_rate": 1.5895379557261576e-05, "loss": 0.0421, "step": 45120 }, { "grad_norm": 0.4432527422904968, "learning_rate": 1.5875232634315033e-05, "loss": 0.0237, "step": 45130 }, { "grad_norm": 0.9017910361289978, "learning_rate": 1.5855096077571812e-05, "loss": 0.025, "step": 45140 }, { "grad_norm": 0.2643239200115204, "learning_rate": 1.5834969893148855e-05, "loss": 0.0233, "step": 45150 }, { "grad_norm": 0.2581791579723358, "learning_rate": 1.581485408715997e-05, "loss": 0.0202, "step": 45160 }, { "grad_norm": 0.6393464803695679, "learning_rate": 1.5794748665715785e-05, "loss": 0.0264, "step": 45170 }, { "grad_norm": 0.22347013652324677, "learning_rate": 1.5774653634923857e-05, "loss": 0.0171, "step": 45180 }, { "grad_norm": 0.277820348739624, "learning_rate": 1.575456900088845e-05, "loss": 0.019, "step": 45190 }, { "grad_norm": 0.584441602230072, "learning_rate": 1.5734494769710816e-05, "loss": 0.0268, "step": 45200 }, { "grad_norm": 0.3961442708969116, "learning_rate": 1.5714430947488912e-05, "loss": 0.02, "step": 45210 }, { "grad_norm": 0.4648391604423523, "learning_rate": 1.5694377540317645e-05, "loss": 0.0307, "step": 45220 }, { "grad_norm": 0.6032213568687439, "learning_rate": 1.5674334554288694e-05, "loss": 0.0285, "step": 45230 }, { "grad_norm": 0.5159558653831482, "learning_rate": 1.5654301995490582e-05, "loss": 0.0245, "step": 45240 }, { "grad_norm": 0.3272252082824707, "learning_rate": 1.5634279870008685e-05, "loss": 0.0194, "step": 45250 }, { "grad_norm": 0.5961705446243286, "learning_rate": 1.5614268183925174e-05, "loss": 0.0278, "step": 45260 }, { "grad_norm": 0.3942110240459442, "learning_rate": 1.5594266943319097e-05, "loss": 0.0347, "step": 45270 }, { "grad_norm": 0.37445706129074097, "learning_rate": 1.5574276154266294e-05, "loss": 0.0292, "step": 45280 }, { "grad_norm": 0.2140222042798996, "learning_rate": 1.5554295822839437e-05, "loss": 0.0205, "step": 45290 }, { "grad_norm": 0.4331212341785431, "learning_rate": 1.5534325955108025e-05, "loss": 0.0293, "step": 45300 }, { "grad_norm": 0.823716938495636, "learning_rate": 1.5514366557138373e-05, "loss": 0.0235, "step": 45310 }, { "grad_norm": 0.6178929805755615, "learning_rate": 1.5494417634993602e-05, "loss": 0.0298, "step": 45320 }, { "grad_norm": 0.3540644347667694, "learning_rate": 1.547447919473372e-05, "loss": 0.0227, "step": 45330 }, { "grad_norm": 0.5099239349365234, "learning_rate": 1.5454551242415434e-05, "loss": 0.0242, "step": 45340 }, { "grad_norm": 0.44753003120422363, "learning_rate": 1.543463378409239e-05, "loss": 0.0271, "step": 45350 }, { "grad_norm": 0.5880846977233887, "learning_rate": 1.541472682581493e-05, "loss": 0.0357, "step": 45360 }, { "grad_norm": 0.6308807730674744, "learning_rate": 1.5394830373630298e-05, "loss": 0.0304, "step": 45370 }, { "grad_norm": 0.4834938943386078, "learning_rate": 1.5374944433582506e-05, "loss": 0.0182, "step": 45380 }, { "grad_norm": 0.47779855132102966, "learning_rate": 1.5355069011712375e-05, "loss": 0.0209, "step": 45390 }, { "grad_norm": 0.4977414309978485, "learning_rate": 1.5335204114057526e-05, "loss": 0.0297, "step": 45400 }, { "grad_norm": 0.42828795313835144, "learning_rate": 1.5315349746652387e-05, "loss": 0.0265, "step": 45410 }, { "grad_norm": 0.32052260637283325, "learning_rate": 1.5295505915528212e-05, "loss": 0.0441, "step": 45420 }, { "grad_norm": 0.33919528126716614, "learning_rate": 1.5275672626713024e-05, "loss": 0.0186, "step": 45430 }, { "grad_norm": 0.41506025195121765, "learning_rate": 1.5255849886231643e-05, "loss": 0.0251, "step": 45440 }, { "grad_norm": 0.32488134503364563, "learning_rate": 1.523603770010571e-05, "loss": 0.0243, "step": 45450 }, { "grad_norm": 0.35362544655799866, "learning_rate": 1.521623607435363e-05, "loss": 0.0215, "step": 45460 }, { "grad_norm": 0.5141936540603638, "learning_rate": 1.5196445014990612e-05, "loss": 0.0275, "step": 45470 }, { "grad_norm": 0.46649980545043945, "learning_rate": 1.5176664528028672e-05, "loss": 0.0269, "step": 45480 }, { "grad_norm": 0.4346105754375458, "learning_rate": 1.5156894619476574e-05, "loss": 0.0317, "step": 45490 }, { "grad_norm": 0.450131893157959, "learning_rate": 1.5137135295339938e-05, "loss": 0.0353, "step": 45500 }, { "grad_norm": 0.4236833453178406, "learning_rate": 1.5117386561621073e-05, "loss": 0.0324, "step": 45510 }, { "grad_norm": 0.4680764675140381, "learning_rate": 1.5097648424319167e-05, "loss": 0.0279, "step": 45520 }, { "grad_norm": 0.3588695228099823, "learning_rate": 1.5077920889430119e-05, "loss": 0.0272, "step": 45530 }, { "grad_norm": 0.25334712862968445, "learning_rate": 1.5058203962946644e-05, "loss": 0.0231, "step": 45540 }, { "grad_norm": 0.512424111366272, "learning_rate": 1.503849765085822e-05, "loss": 0.0216, "step": 45550 }, { "grad_norm": 0.787659227848053, "learning_rate": 1.501880195915109e-05, "loss": 0.0307, "step": 45560 }, { "grad_norm": 0.4614821970462799, "learning_rate": 1.499911689380833e-05, "loss": 0.0191, "step": 45570 }, { "grad_norm": 0.49711236357688904, "learning_rate": 1.4979442460809683e-05, "loss": 0.02, "step": 45580 }, { "grad_norm": 0.5463783740997314, "learning_rate": 1.4959778666131763e-05, "loss": 0.0273, "step": 45590 }, { "grad_norm": 0.21954113245010376, "learning_rate": 1.4940125515747905e-05, "loss": 0.0213, "step": 45600 }, { "grad_norm": 0.5082725882530212, "learning_rate": 1.4920483015628211e-05, "loss": 0.0288, "step": 45610 }, { "grad_norm": 0.5767166614532471, "learning_rate": 1.490085117173956e-05, "loss": 0.0363, "step": 45620 }, { "grad_norm": 0.5724625587463379, "learning_rate": 1.488122999004558e-05, "loss": 0.0338, "step": 45630 }, { "grad_norm": 0.29836273193359375, "learning_rate": 1.486161947650666e-05, "loss": 0.0246, "step": 45640 }, { "grad_norm": 0.26398396492004395, "learning_rate": 1.4842019637079995e-05, "loss": 0.0181, "step": 45650 }, { "grad_norm": 0.30397140979766846, "learning_rate": 1.482243047771944e-05, "loss": 0.022, "step": 45660 }, { "grad_norm": 0.29539260268211365, "learning_rate": 1.4802852004375712e-05, "loss": 0.0284, "step": 45670 }, { "grad_norm": 1.1826242208480835, "learning_rate": 1.4783284222996218e-05, "loss": 0.0293, "step": 45680 }, { "grad_norm": 0.2573571801185608, "learning_rate": 1.4763727139525135e-05, "loss": 0.038, "step": 45690 }, { "grad_norm": 0.6173754334449768, "learning_rate": 1.4744180759903392e-05, "loss": 0.0274, "step": 45700 }, { "grad_norm": 0.5141643285751343, "learning_rate": 1.4724645090068635e-05, "loss": 0.0316, "step": 45710 }, { "grad_norm": 0.3604147732257843, "learning_rate": 1.4705120135955341e-05, "loss": 0.0188, "step": 45720 }, { "grad_norm": 0.24480363726615906, "learning_rate": 1.4685605903494614e-05, "loss": 0.021, "step": 45730 }, { "grad_norm": 0.6011723875999451, "learning_rate": 1.46661023986144e-05, "loss": 0.0248, "step": 45740 }, { "grad_norm": 0.3372799754142761, "learning_rate": 1.4646609627239344e-05, "loss": 0.0304, "step": 45750 }, { "grad_norm": 0.3211826682090759, "learning_rate": 1.4627127595290835e-05, "loss": 0.0386, "step": 45760 }, { "grad_norm": 0.4133773148059845, "learning_rate": 1.460765630868699e-05, "loss": 0.0244, "step": 45770 }, { "grad_norm": 0.18704381585121155, "learning_rate": 1.4588195773342678e-05, "loss": 0.0238, "step": 45780 }, { "grad_norm": 0.6504974365234375, "learning_rate": 1.4568745995169485e-05, "loss": 0.0358, "step": 45790 }, { "grad_norm": 0.6375557780265808, "learning_rate": 1.4549306980075778e-05, "loss": 0.0269, "step": 45800 }, { "grad_norm": 0.41731154918670654, "learning_rate": 1.4529878733966557e-05, "loss": 0.0343, "step": 45810 }, { "grad_norm": 0.35580357909202576, "learning_rate": 1.4510461262743658e-05, "loss": 0.0264, "step": 45820 }, { "grad_norm": 0.2836168110370636, "learning_rate": 1.4491054572305585e-05, "loss": 0.0387, "step": 45830 }, { "grad_norm": 0.4104689657688141, "learning_rate": 1.4471658668547566e-05, "loss": 0.0309, "step": 45840 }, { "grad_norm": 0.22525297105312347, "learning_rate": 1.4452273557361579e-05, "loss": 0.0251, "step": 45850 }, { "grad_norm": 0.42029303312301636, "learning_rate": 1.4432899244636282e-05, "loss": 0.0275, "step": 45860 }, { "grad_norm": 0.44621339440345764, "learning_rate": 1.4413535736257134e-05, "loss": 0.0275, "step": 45870 }, { "grad_norm": 0.4780449867248535, "learning_rate": 1.439418303810619e-05, "loss": 0.0224, "step": 45880 }, { "grad_norm": 0.30889132618904114, "learning_rate": 1.4374841156062352e-05, "loss": 0.0267, "step": 45890 }, { "grad_norm": 0.40472978353500366, "learning_rate": 1.4355510096001112e-05, "loss": 0.0446, "step": 45900 }, { "grad_norm": 0.2493123710155487, "learning_rate": 1.4336189863794786e-05, "loss": 0.0263, "step": 45910 }, { "grad_norm": 0.41507139801979065, "learning_rate": 1.4316880465312327e-05, "loss": 0.0246, "step": 45920 }, { "grad_norm": 0.34163081645965576, "learning_rate": 1.4297581906419426e-05, "loss": 0.0237, "step": 45930 }, { "grad_norm": 0.4475589692592621, "learning_rate": 1.4278294192978475e-05, "loss": 0.0236, "step": 45940 }, { "grad_norm": 0.29909762740135193, "learning_rate": 1.4259017330848574e-05, "loss": 0.0188, "step": 45950 }, { "grad_norm": 0.5933652520179749, "learning_rate": 1.4239751325885498e-05, "loss": 0.0266, "step": 45960 }, { "grad_norm": 0.4105842411518097, "learning_rate": 1.4220496183941795e-05, "loss": 0.023, "step": 45970 }, { "grad_norm": 0.3781134784221649, "learning_rate": 1.4201251910866648e-05, "loss": 0.0268, "step": 45980 }, { "grad_norm": 0.40720510482788086, "learning_rate": 1.4182018512505957e-05, "loss": 0.0178, "step": 45990 }, { "grad_norm": 0.5272995829582214, "learning_rate": 1.4162795994702327e-05, "loss": 0.0262, "step": 46000 }, { "grad_norm": 0.39064040780067444, "learning_rate": 1.4143584363295032e-05, "loss": 0.0223, "step": 46010 }, { "grad_norm": 0.39085084199905396, "learning_rate": 1.4124383624120101e-05, "loss": 0.0265, "step": 46020 }, { "grad_norm": 0.405878484249115, "learning_rate": 1.4105193783010151e-05, "loss": 0.0279, "step": 46030 }, { "grad_norm": 0.3298300504684448, "learning_rate": 1.4086014845794621e-05, "loss": 0.0207, "step": 46040 }, { "grad_norm": 0.5639722943305969, "learning_rate": 1.4066846818299489e-05, "loss": 0.0265, "step": 46050 }, { "grad_norm": 0.2782714366912842, "learning_rate": 1.4047689706347555e-05, "loss": 0.024, "step": 46060 }, { "grad_norm": 1.00850248336792, "learning_rate": 1.402854351575822e-05, "loss": 0.0405, "step": 46070 }, { "grad_norm": 0.500239372253418, "learning_rate": 1.4009408252347588e-05, "loss": 0.0339, "step": 46080 }, { "grad_norm": 0.2771191895008087, "learning_rate": 1.399028392192846e-05, "loss": 0.0372, "step": 46090 }, { "grad_norm": 0.6796746850013733, "learning_rate": 1.397117053031029e-05, "loss": 0.0247, "step": 46100 }, { "grad_norm": 0.7753120064735413, "learning_rate": 1.3952068083299213e-05, "loss": 0.0283, "step": 46110 }, { "grad_norm": 0.4098176062107086, "learning_rate": 1.3932976586698082e-05, "loss": 0.0244, "step": 46120 }, { "grad_norm": 0.47489020228385925, "learning_rate": 1.3913896046306363e-05, "loss": 0.0216, "step": 46130 }, { "grad_norm": 0.385797381401062, "learning_rate": 1.389482646792023e-05, "loss": 0.0224, "step": 46140 }, { "grad_norm": 0.5540374517440796, "learning_rate": 1.387576785733251e-05, "loss": 0.0309, "step": 46150 }, { "grad_norm": 0.40277594327926636, "learning_rate": 1.3856720220332703e-05, "loss": 0.0194, "step": 46160 }, { "grad_norm": 0.5651462078094482, "learning_rate": 1.383768356270701e-05, "loss": 0.0269, "step": 46170 }, { "grad_norm": 0.4019812345504761, "learning_rate": 1.3818657890238207e-05, "loss": 0.0191, "step": 46180 }, { "grad_norm": 0.4106164276599884, "learning_rate": 1.3799643208705859e-05, "loss": 0.0341, "step": 46190 }, { "grad_norm": 0.3236028552055359, "learning_rate": 1.3780639523886058e-05, "loss": 0.0206, "step": 46200 }, { "grad_norm": 0.5532834529876709, "learning_rate": 1.3761646841551668e-05, "loss": 0.0392, "step": 46210 }, { "grad_norm": 0.6395262479782104, "learning_rate": 1.3742665167472146e-05, "loss": 0.0204, "step": 46220 }, { "grad_norm": 0.24216386675834656, "learning_rate": 1.372369450741363e-05, "loss": 0.0244, "step": 46230 }, { "grad_norm": 0.2553656995296478, "learning_rate": 1.3704734867138901e-05, "loss": 0.0244, "step": 46240 }, { "grad_norm": 0.6827428340911865, "learning_rate": 1.36857862524074e-05, "loss": 0.022, "step": 46250 }, { "grad_norm": 0.29751452803611755, "learning_rate": 1.3666848668975213e-05, "loss": 0.025, "step": 46260 }, { "grad_norm": 0.40606528520584106, "learning_rate": 1.3647922122595063e-05, "loss": 0.0165, "step": 46270 }, { "grad_norm": 0.3443761467933655, "learning_rate": 1.3629006619016366e-05, "loss": 0.0194, "step": 46280 }, { "grad_norm": 0.6572445631027222, "learning_rate": 1.3610102163985139e-05, "loss": 0.0472, "step": 46290 }, { "grad_norm": 0.4171977937221527, "learning_rate": 1.3591208763244057e-05, "loss": 0.0226, "step": 46300 }, { "grad_norm": 0.28081920742988586, "learning_rate": 1.3572326422532428e-05, "loss": 0.0208, "step": 46310 }, { "grad_norm": 0.3678078353404999, "learning_rate": 1.355345514758622e-05, "loss": 0.0223, "step": 46320 }, { "grad_norm": 0.2751639783382416, "learning_rate": 1.3534594944138007e-05, "loss": 0.017, "step": 46330 }, { "grad_norm": 0.28048422932624817, "learning_rate": 1.3515745817917069e-05, "loss": 0.0274, "step": 46340 }, { "grad_norm": 0.3203229308128357, "learning_rate": 1.3496907774649208e-05, "loss": 0.0164, "step": 46350 }, { "grad_norm": 0.40013131499290466, "learning_rate": 1.3478080820056987e-05, "loss": 0.0201, "step": 46360 }, { "grad_norm": 0.2735268473625183, "learning_rate": 1.3459264959859474e-05, "loss": 0.0305, "step": 46370 }, { "grad_norm": 0.8058281540870667, "learning_rate": 1.3440460199772487e-05, "loss": 0.0228, "step": 46380 }, { "grad_norm": 0.2319640815258026, "learning_rate": 1.3421666545508382e-05, "loss": 0.0209, "step": 46390 }, { "grad_norm": 0.2503618597984314, "learning_rate": 1.3402884002776194e-05, "loss": 0.0283, "step": 46400 }, { "grad_norm": 0.36544570326805115, "learning_rate": 1.3384112577281555e-05, "loss": 0.023, "step": 46410 }, { "grad_norm": 0.27707991003990173, "learning_rate": 1.3365352274726711e-05, "loss": 0.0237, "step": 46420 }, { "grad_norm": 0.3389171063899994, "learning_rate": 1.3346603100810578e-05, "loss": 0.0238, "step": 46430 }, { "grad_norm": 0.5904521942138672, "learning_rate": 1.3327865061228645e-05, "loss": 0.0265, "step": 46440 }, { "grad_norm": 0.448236346244812, "learning_rate": 1.330913816167304e-05, "loss": 0.0226, "step": 46450 }, { "grad_norm": 0.25829339027404785, "learning_rate": 1.3290422407832492e-05, "loss": 0.0263, "step": 46460 }, { "grad_norm": 0.5706892013549805, "learning_rate": 1.3271717805392354e-05, "loss": 0.0298, "step": 46470 }, { "grad_norm": 0.5484504699707031, "learning_rate": 1.3253024360034582e-05, "loss": 0.0297, "step": 46480 }, { "grad_norm": 0.28201574087142944, "learning_rate": 1.323434207743779e-05, "loss": 0.0356, "step": 46490 }, { "grad_norm": 0.6302004456520081, "learning_rate": 1.3215670963277105e-05, "loss": 0.0222, "step": 46500 }, { "grad_norm": 0.19456009566783905, "learning_rate": 1.3197011023224376e-05, "loss": 0.0226, "step": 46510 }, { "grad_norm": 0.35673269629478455, "learning_rate": 1.3178362262947941e-05, "loss": 0.0214, "step": 46520 }, { "grad_norm": 0.45459362864494324, "learning_rate": 1.3159724688112845e-05, "loss": 0.0177, "step": 46530 }, { "grad_norm": 0.8728671669960022, "learning_rate": 1.3141098304380683e-05, "loss": 0.0325, "step": 46540 }, { "grad_norm": 0.4123145341873169, "learning_rate": 1.3122483117409651e-05, "loss": 0.0232, "step": 46550 }, { "grad_norm": 0.3972259759902954, "learning_rate": 1.3103879132854552e-05, "loss": 0.0215, "step": 46560 }, { "grad_norm": 0.9345623850822449, "learning_rate": 1.3085286356366771e-05, "loss": 0.0361, "step": 46570 }, { "grad_norm": 0.823115348815918, "learning_rate": 1.3066704793594337e-05, "loss": 0.0239, "step": 46580 }, { "grad_norm": 0.2762221395969391, "learning_rate": 1.3048134450181816e-05, "loss": 0.0209, "step": 46590 }, { "grad_norm": 0.4811594784259796, "learning_rate": 1.3029575331770394e-05, "loss": 0.0307, "step": 46600 }, { "grad_norm": 0.40473175048828125, "learning_rate": 1.3011027443997837e-05, "loss": 0.0272, "step": 46610 }, { "grad_norm": 0.5341181755065918, "learning_rate": 1.2992490792498507e-05, "loss": 0.0301, "step": 46620 }, { "grad_norm": 0.3794604539871216, "learning_rate": 1.297396538290333e-05, "loss": 0.0251, "step": 46630 }, { "grad_norm": 0.5507363677024841, "learning_rate": 1.2955451220839888e-05, "loss": 0.0246, "step": 46640 }, { "grad_norm": 0.47710490226745605, "learning_rate": 1.2936948311932223e-05, "loss": 0.0205, "step": 46650 }, { "grad_norm": 0.25288301706314087, "learning_rate": 1.2918456661801104e-05, "loss": 0.0283, "step": 46660 }, { "grad_norm": 0.4782372713088989, "learning_rate": 1.2899976276063736e-05, "loss": 0.0281, "step": 46670 }, { "grad_norm": 0.2579995095729828, "learning_rate": 1.2881507160334022e-05, "loss": 0.0252, "step": 46680 }, { "grad_norm": 0.5383922457695007, "learning_rate": 1.286304932022238e-05, "loss": 0.0217, "step": 46690 }, { "grad_norm": 0.22760678827762604, "learning_rate": 1.2844602761335806e-05, "loss": 0.0317, "step": 46700 }, { "grad_norm": 0.4671749770641327, "learning_rate": 1.2826167489277885e-05, "loss": 0.0173, "step": 46710 }, { "grad_norm": 0.4783598482608795, "learning_rate": 1.2807743509648745e-05, "loss": 0.0278, "step": 46720 }, { "grad_norm": 0.4797961413860321, "learning_rate": 1.2789330828045149e-05, "loss": 0.0294, "step": 46730 }, { "grad_norm": 0.2602786719799042, "learning_rate": 1.2770929450060332e-05, "loss": 0.0224, "step": 46740 }, { "grad_norm": 0.19212403893470764, "learning_rate": 1.2752539381284184e-05, "loss": 0.0245, "step": 46750 }, { "grad_norm": 0.6222054362297058, "learning_rate": 1.273416062730311e-05, "loss": 0.036, "step": 46760 }, { "grad_norm": 0.5932255387306213, "learning_rate": 1.2715793193700088e-05, "loss": 0.0214, "step": 46770 }, { "grad_norm": 0.4720034599304199, "learning_rate": 1.2697437086054664e-05, "loss": 0.0463, "step": 46780 }, { "grad_norm": 0.675833523273468, "learning_rate": 1.2679092309942937e-05, "loss": 0.0239, "step": 46790 }, { "grad_norm": 0.18429192900657654, "learning_rate": 1.266075887093755e-05, "loss": 0.0425, "step": 46800 }, { "grad_norm": 0.40674448013305664, "learning_rate": 1.2642436774607757e-05, "loss": 0.0332, "step": 46810 }, { "grad_norm": 0.3897237777709961, "learning_rate": 1.2624126026519278e-05, "loss": 0.0198, "step": 46820 }, { "grad_norm": 0.39982008934020996, "learning_rate": 1.2605826632234474e-05, "loss": 0.0205, "step": 46830 }, { "grad_norm": 0.1528707593679428, "learning_rate": 1.2587538597312198e-05, "loss": 0.0148, "step": 46840 }, { "grad_norm": 0.5097653269767761, "learning_rate": 1.2569261927307884e-05, "loss": 0.0313, "step": 46850 }, { "grad_norm": 0.3907802402973175, "learning_rate": 1.2550996627773493e-05, "loss": 0.0256, "step": 46860 }, { "grad_norm": 0.16326305270195007, "learning_rate": 1.2532742704257527e-05, "loss": 0.0195, "step": 46870 }, { "grad_norm": 0.40741845965385437, "learning_rate": 1.2514500162305087e-05, "loss": 0.0219, "step": 46880 }, { "grad_norm": 0.3683767318725586, "learning_rate": 1.2496269007457728e-05, "loss": 0.0219, "step": 46890 }, { "grad_norm": 0.4012030363082886, "learning_rate": 1.2478049245253625e-05, "loss": 0.0323, "step": 46900 }, { "grad_norm": 0.5921972990036011, "learning_rate": 1.2459840881227459e-05, "loss": 0.0218, "step": 46910 }, { "grad_norm": 0.6722419857978821, "learning_rate": 1.2441643920910435e-05, "loss": 0.0253, "step": 46920 }, { "grad_norm": 0.8743788599967957, "learning_rate": 1.2423458369830322e-05, "loss": 0.0319, "step": 46930 }, { "grad_norm": 0.3567187786102295, "learning_rate": 1.2405284233511406e-05, "loss": 0.0233, "step": 46940 }, { "grad_norm": 0.4091665744781494, "learning_rate": 1.2387121517474487e-05, "loss": 0.0313, "step": 46950 }, { "grad_norm": 0.2050471007823944, "learning_rate": 1.2368970227236975e-05, "loss": 0.0187, "step": 46960 }, { "grad_norm": 0.33135688304901123, "learning_rate": 1.2350830368312688e-05, "loss": 0.0241, "step": 46970 }, { "grad_norm": 0.44922903180122375, "learning_rate": 1.2332701946212083e-05, "loss": 0.031, "step": 46980 }, { "grad_norm": 0.437488317489624, "learning_rate": 1.2314584966442077e-05, "loss": 0.0232, "step": 46990 }, { "grad_norm": 0.40283316373825073, "learning_rate": 1.2296479434506136e-05, "loss": 0.0427, "step": 47000 }, { "grad_norm": 0.2817712426185608, "learning_rate": 1.2278385355904232e-05, "loss": 0.0263, "step": 47010 }, { "grad_norm": 0.29767292737960815, "learning_rate": 1.2260302736132867e-05, "loss": 0.0243, "step": 47020 }, { "grad_norm": 0.597955048084259, "learning_rate": 1.2242231580685098e-05, "loss": 0.0483, "step": 47030 }, { "grad_norm": 0.7449266314506531, "learning_rate": 1.2224171895050413e-05, "loss": 0.0495, "step": 47040 }, { "grad_norm": 0.7584511041641235, "learning_rate": 1.2206123684714903e-05, "loss": 0.0278, "step": 47050 }, { "grad_norm": 0.8971900939941406, "learning_rate": 1.2188086955161132e-05, "loss": 0.0184, "step": 47060 }, { "grad_norm": 0.39480140805244446, "learning_rate": 1.2170061711868175e-05, "loss": 0.0276, "step": 47070 }, { "grad_norm": 0.26126861572265625, "learning_rate": 1.215204796031163e-05, "loss": 0.0241, "step": 47080 }, { "grad_norm": 0.41973766684532166, "learning_rate": 1.2134045705963599e-05, "loss": 0.0205, "step": 47090 }, { "grad_norm": 0.44049015641212463, "learning_rate": 1.2116054954292689e-05, "loss": 0.024, "step": 47100 }, { "grad_norm": 0.3039969205856323, "learning_rate": 1.2098075710764011e-05, "loss": 0.0218, "step": 47110 }, { "grad_norm": 0.44868314266204834, "learning_rate": 1.2080107980839183e-05, "loss": 0.0242, "step": 47120 }, { "grad_norm": 0.38995516300201416, "learning_rate": 1.2062151769976343e-05, "loss": 0.0297, "step": 47130 }, { "grad_norm": 0.30208826065063477, "learning_rate": 1.204420708363011e-05, "loss": 0.0191, "step": 47140 }, { "grad_norm": 0.39118051528930664, "learning_rate": 1.2026273927251597e-05, "loss": 0.0205, "step": 47150 }, { "grad_norm": 0.5645486116409302, "learning_rate": 1.2008352306288424e-05, "loss": 0.0237, "step": 47160 }, { "grad_norm": 0.32941314578056335, "learning_rate": 1.1990442226184695e-05, "loss": 0.0224, "step": 47170 }, { "grad_norm": 0.4039554297924042, "learning_rate": 1.1972543692381066e-05, "loss": 0.0349, "step": 47180 }, { "grad_norm": 0.2681765556335449, "learning_rate": 1.1954656710314576e-05, "loss": 0.028, "step": 47190 }, { "grad_norm": 0.2865937352180481, "learning_rate": 1.1936781285418875e-05, "loss": 0.0212, "step": 47200 }, { "grad_norm": 0.5254170298576355, "learning_rate": 1.1918917423123993e-05, "loss": 0.0305, "step": 47210 }, { "grad_norm": 0.22118496894836426, "learning_rate": 1.1901065128856537e-05, "loss": 0.0167, "step": 47220 }, { "grad_norm": 0.25344881415367126, "learning_rate": 1.1883224408039551e-05, "loss": 0.0196, "step": 47230 }, { "grad_norm": 0.35610613226890564, "learning_rate": 1.1865395266092578e-05, "loss": 0.0232, "step": 47240 }, { "grad_norm": 0.44594261050224304, "learning_rate": 1.1847577708431633e-05, "loss": 0.0163, "step": 47250 }, { "grad_norm": 0.28394758701324463, "learning_rate": 1.1829771740469225e-05, "loss": 0.026, "step": 47260 }, { "grad_norm": 0.27687868475914, "learning_rate": 1.1811977367614324e-05, "loss": 0.0244, "step": 47270 }, { "grad_norm": 0.7191239595413208, "learning_rate": 1.1794194595272412e-05, "loss": 0.0393, "step": 47280 }, { "grad_norm": 0.5031290054321289, "learning_rate": 1.1776423428845423e-05, "loss": 0.0214, "step": 47290 }, { "grad_norm": 0.4521493911743164, "learning_rate": 1.1758663873731756e-05, "loss": 0.0291, "step": 47300 }, { "grad_norm": 0.7474930882453918, "learning_rate": 1.1740915935326302e-05, "loss": 0.0245, "step": 47310 }, { "grad_norm": 0.2594698667526245, "learning_rate": 1.1723179619020396e-05, "loss": 0.0238, "step": 47320 }, { "grad_norm": 0.5568456053733826, "learning_rate": 1.1705454930201914e-05, "loss": 0.0179, "step": 47330 }, { "grad_norm": 0.24883778393268585, "learning_rate": 1.1687741874255087e-05, "loss": 0.0282, "step": 47340 }, { "grad_norm": 0.492987722158432, "learning_rate": 1.1670040456560728e-05, "loss": 0.032, "step": 47350 }, { "grad_norm": 0.5796894431114197, "learning_rate": 1.1652350682496005e-05, "loss": 0.0297, "step": 47360 }, { "grad_norm": 0.26321056485176086, "learning_rate": 1.163467255743465e-05, "loss": 0.0232, "step": 47370 }, { "grad_norm": 0.27273377776145935, "learning_rate": 1.1617006086746796e-05, "loss": 0.0158, "step": 47380 }, { "grad_norm": 0.23122048377990723, "learning_rate": 1.1599351275799047e-05, "loss": 0.0297, "step": 47390 }, { "grad_norm": 0.4275909960269928, "learning_rate": 1.1581708129954466e-05, "loss": 0.026, "step": 47400 }, { "grad_norm": 0.28962308168411255, "learning_rate": 1.1564076654572587e-05, "loss": 0.0208, "step": 47410 }, { "grad_norm": 0.3737964332103729, "learning_rate": 1.1546456855009358e-05, "loss": 0.0261, "step": 47420 }, { "grad_norm": 0.5127573609352112, "learning_rate": 1.1528848736617248e-05, "loss": 0.0182, "step": 47430 }, { "grad_norm": 0.3739262819290161, "learning_rate": 1.1511252304745112e-05, "loss": 0.018, "step": 47440 }, { "grad_norm": 0.2838037312030792, "learning_rate": 1.1493667564738297e-05, "loss": 0.0237, "step": 47450 }, { "grad_norm": 0.6592850089073181, "learning_rate": 1.1476094521938574e-05, "loss": 0.0379, "step": 47460 }, { "grad_norm": 0.5878570079803467, "learning_rate": 1.1458533181684167e-05, "loss": 0.0216, "step": 47470 }, { "grad_norm": 0.41072776913642883, "learning_rate": 1.1440983549309753e-05, "loss": 0.0194, "step": 47480 }, { "grad_norm": 0.3789444863796234, "learning_rate": 1.1423445630146434e-05, "loss": 0.0282, "step": 47490 }, { "grad_norm": 0.27976110577583313, "learning_rate": 1.1405919429521799e-05, "loss": 0.0228, "step": 47500 }, { "grad_norm": 0.6899651885032654, "learning_rate": 1.1388404952759802e-05, "loss": 0.0221, "step": 47510 }, { "grad_norm": 0.3012693524360657, "learning_rate": 1.1370902205180923e-05, "loss": 0.0323, "step": 47520 }, { "grad_norm": 1.448905110359192, "learning_rate": 1.1353411192101987e-05, "loss": 0.0247, "step": 47530 }, { "grad_norm": 0.5099520683288574, "learning_rate": 1.133593191883634e-05, "loss": 0.0209, "step": 47540 }, { "grad_norm": 0.49217915534973145, "learning_rate": 1.1318464390693711e-05, "loss": 0.0213, "step": 47550 }, { "grad_norm": 0.37034040689468384, "learning_rate": 1.1301008612980257e-05, "loss": 0.0249, "step": 47560 }, { "grad_norm": 0.36265861988067627, "learning_rate": 1.128356459099863e-05, "loss": 0.02, "step": 47570 }, { "grad_norm": 0.6995985507965088, "learning_rate": 1.1266132330047802e-05, "loss": 0.0222, "step": 47580 }, { "grad_norm": 0.39901018142700195, "learning_rate": 1.1248711835423281e-05, "loss": 0.0248, "step": 47590 }, { "grad_norm": 0.24459445476531982, "learning_rate": 1.123130311241693e-05, "loss": 0.0246, "step": 47600 }, { "grad_norm": 0.41861000657081604, "learning_rate": 1.1213906166317068e-05, "loss": 0.0217, "step": 47610 }, { "grad_norm": 0.3936530649662018, "learning_rate": 1.1196521002408427e-05, "loss": 0.024, "step": 47620 }, { "grad_norm": 0.23118755221366882, "learning_rate": 1.1179147625972159e-05, "loss": 0.0145, "step": 47630 }, { "grad_norm": 0.3296017646789551, "learning_rate": 1.1161786042285822e-05, "loss": 0.0216, "step": 47640 }, { "grad_norm": 0.20905429124832153, "learning_rate": 1.1144436256623447e-05, "loss": 0.0143, "step": 47650 }, { "grad_norm": 0.5897279977798462, "learning_rate": 1.1127098274255392e-05, "loss": 0.0255, "step": 47660 }, { "grad_norm": 0.3852584660053253, "learning_rate": 1.1109772100448512e-05, "loss": 0.0209, "step": 47670 }, { "grad_norm": 0.40567025542259216, "learning_rate": 1.1092457740466033e-05, "loss": 0.0234, "step": 47680 }, { "grad_norm": 0.4337923526763916, "learning_rate": 1.10751551995676e-05, "loss": 0.0199, "step": 47690 }, { "grad_norm": 0.2558114230632782, "learning_rate": 1.1057864483009262e-05, "loss": 0.0127, "step": 47700 }, { "grad_norm": 0.4756414294242859, "learning_rate": 1.1040585596043473e-05, "loss": 0.0211, "step": 47710 }, { "grad_norm": 0.4864591956138611, "learning_rate": 1.1023318543919148e-05, "loss": 0.0227, "step": 47720 }, { "grad_norm": 0.8827995657920837, "learning_rate": 1.10060633318815e-05, "loss": 0.0332, "step": 47730 }, { "grad_norm": 0.313583105802536, "learning_rate": 1.0988819965172248e-05, "loss": 0.0183, "step": 47740 }, { "grad_norm": 0.21504110097885132, "learning_rate": 1.0971588449029462e-05, "loss": 0.0235, "step": 47750 }, { "grad_norm": 0.4451597332954407, "learning_rate": 1.095436878868762e-05, "loss": 0.0194, "step": 47760 }, { "grad_norm": 0.4081652760505676, "learning_rate": 1.0937160989377598e-05, "loss": 0.0179, "step": 47770 }, { "grad_norm": 0.3893977403640747, "learning_rate": 1.0919965056326676e-05, "loss": 0.044, "step": 47780 }, { "grad_norm": 0.5056771636009216, "learning_rate": 1.0902780994758504e-05, "loss": 0.0242, "step": 47790 }, { "grad_norm": 0.3589947521686554, "learning_rate": 1.0885608809893193e-05, "loss": 0.0221, "step": 47800 }, { "grad_norm": 0.4260898232460022, "learning_rate": 1.0868448506947142e-05, "loss": 0.0218, "step": 47810 }, { "grad_norm": 0.4324277341365814, "learning_rate": 1.0851300091133243e-05, "loss": 0.0214, "step": 47820 }, { "grad_norm": 0.3229798972606659, "learning_rate": 1.083416356766071e-05, "loss": 0.026, "step": 47830 }, { "grad_norm": 0.24577635526657104, "learning_rate": 1.0817038941735175e-05, "loss": 0.0298, "step": 47840 }, { "grad_norm": 0.558200478553772, "learning_rate": 1.0799926218558642e-05, "loss": 0.0213, "step": 47850 }, { "grad_norm": 0.2022993266582489, "learning_rate": 1.0782825403329488e-05, "loss": 0.022, "step": 47860 }, { "grad_norm": 0.15888026356697083, "learning_rate": 1.076573650124254e-05, "loss": 0.0168, "step": 47870 }, { "grad_norm": 0.4077562391757965, "learning_rate": 1.0748659517488891e-05, "loss": 0.0193, "step": 47880 }, { "grad_norm": 0.48927637934684753, "learning_rate": 1.0731594457256138e-05, "loss": 0.0212, "step": 47890 }, { "grad_norm": 0.4710875451564789, "learning_rate": 1.0714541325728139e-05, "loss": 0.0253, "step": 47900 }, { "grad_norm": 0.3689672350883484, "learning_rate": 1.0697500128085231e-05, "loss": 0.0319, "step": 47910 }, { "grad_norm": 0.4280972480773926, "learning_rate": 1.0680470869504055e-05, "loss": 0.0354, "step": 47920 }, { "grad_norm": 0.3613497018814087, "learning_rate": 1.066345355515766e-05, "loss": 0.0289, "step": 47930 }, { "grad_norm": 0.6038423776626587, "learning_rate": 1.0646448190215453e-05, "loss": 0.0351, "step": 47940 }, { "grad_norm": 0.4783444404602051, "learning_rate": 1.0629454779843217e-05, "loss": 0.0259, "step": 47950 }, { "grad_norm": 0.39748844504356384, "learning_rate": 1.0612473329203082e-05, "loss": 0.0282, "step": 47960 }, { "grad_norm": 0.48765498399734497, "learning_rate": 1.0595503843453596e-05, "loss": 0.0308, "step": 47970 }, { "grad_norm": 0.32685330510139465, "learning_rate": 1.0578546327749634e-05, "loss": 0.0291, "step": 47980 }, { "grad_norm": 0.18607935309410095, "learning_rate": 1.0561600787242425e-05, "loss": 0.0166, "step": 47990 }, { "grad_norm": 0.49036601185798645, "learning_rate": 1.0544667227079591e-05, "loss": 0.0224, "step": 48000 }, { "grad_norm": 0.38324078917503357, "learning_rate": 1.0527745652405085e-05, "loss": 0.019, "step": 48010 }, { "grad_norm": 0.30097511410713196, "learning_rate": 1.051083606835927e-05, "loss": 0.0191, "step": 48020 }, { "grad_norm": 0.3730314075946808, "learning_rate": 1.049393848007878e-05, "loss": 0.0314, "step": 48030 }, { "grad_norm": 0.2566574513912201, "learning_rate": 1.0477052892696709e-05, "loss": 0.0183, "step": 48040 }, { "grad_norm": 0.3853583335876465, "learning_rate": 1.0460179311342394e-05, "loss": 0.0221, "step": 48050 }, { "grad_norm": 0.43804535269737244, "learning_rate": 1.0443317741141634e-05, "loss": 0.0312, "step": 48060 }, { "grad_norm": 0.43453097343444824, "learning_rate": 1.0426468187216514e-05, "loss": 0.0163, "step": 48070 }, { "grad_norm": 0.41941341757774353, "learning_rate": 1.0409630654685477e-05, "loss": 0.0255, "step": 48080 }, { "grad_norm": 0.5464519262313843, "learning_rate": 1.039280514866332e-05, "loss": 0.0241, "step": 48090 }, { "grad_norm": 0.3287629187107086, "learning_rate": 1.0375991674261198e-05, "loss": 0.0206, "step": 48100 }, { "grad_norm": 0.5247558951377869, "learning_rate": 1.0359190236586575e-05, "loss": 0.0229, "step": 48110 }, { "grad_norm": 0.6678019165992737, "learning_rate": 1.0342400840743322e-05, "loss": 0.026, "step": 48120 }, { "grad_norm": 0.3393023610115051, "learning_rate": 1.0325623491831593e-05, "loss": 0.0181, "step": 48130 }, { "grad_norm": 0.2146586924791336, "learning_rate": 1.0308858194947906e-05, "loss": 0.0232, "step": 48140 }, { "grad_norm": 0.21430213749408722, "learning_rate": 1.0292104955185111e-05, "loss": 0.027, "step": 48150 }, { "grad_norm": 0.7819600701332092, "learning_rate": 1.0275363777632396e-05, "loss": 0.0225, "step": 48160 }, { "grad_norm": 0.22416622936725616, "learning_rate": 1.0258634667375321e-05, "loss": 0.0228, "step": 48170 }, { "grad_norm": 0.1991235613822937, "learning_rate": 1.02419176294957e-05, "loss": 0.0178, "step": 48180 }, { "grad_norm": 0.4629959166049957, "learning_rate": 1.0225212669071782e-05, "loss": 0.0297, "step": 48190 }, { "grad_norm": 0.44474396109580994, "learning_rate": 1.0208519791178029e-05, "loss": 0.0266, "step": 48200 }, { "grad_norm": 0.6424425840377808, "learning_rate": 1.019183900088535e-05, "loss": 0.0267, "step": 48210 }, { "grad_norm": 0.2619667947292328, "learning_rate": 1.0175170303260906e-05, "loss": 0.0203, "step": 48220 }, { "grad_norm": 0.4644091725349426, "learning_rate": 1.0158513703368206e-05, "loss": 0.0198, "step": 48230 }, { "grad_norm": 0.24787777662277222, "learning_rate": 1.0141869206267095e-05, "loss": 0.0196, "step": 48240 }, { "grad_norm": 0.26808708906173706, "learning_rate": 1.0125236817013723e-05, "loss": 0.038, "step": 48250 }, { "grad_norm": 0.40097811818122864, "learning_rate": 1.010861654066056e-05, "loss": 0.0292, "step": 48260 }, { "grad_norm": 0.3300603926181793, "learning_rate": 1.0092008382256434e-05, "loss": 0.019, "step": 48270 }, { "grad_norm": 0.2851089537143707, "learning_rate": 1.0075412346846458e-05, "loss": 0.0275, "step": 48280 }, { "grad_norm": 0.4718991219997406, "learning_rate": 1.0058828439472056e-05, "loss": 0.0226, "step": 48290 }, { "grad_norm": 0.3697427213191986, "learning_rate": 1.0042256665170996e-05, "loss": 0.0187, "step": 48300 }, { "grad_norm": 0.426682710647583, "learning_rate": 1.0025697028977332e-05, "loss": 0.0218, "step": 48310 }, { "grad_norm": 0.22695772349834442, "learning_rate": 1.0009149535921454e-05, "loss": 0.0206, "step": 48320 }, { "grad_norm": 0.17169040441513062, "learning_rate": 9.992614191030031e-06, "loss": 0.0203, "step": 48330 }, { "grad_norm": 0.4407140016555786, "learning_rate": 9.976090999326115e-06, "loss": 0.0237, "step": 48340 }, { "grad_norm": 0.4817124009132385, "learning_rate": 9.959579965828952e-06, "loss": 0.0246, "step": 48350 }, { "grad_norm": 0.47914737462997437, "learning_rate": 9.943081095554218e-06, "loss": 0.0212, "step": 48360 }, { "grad_norm": 0.3725070655345917, "learning_rate": 9.926594393513783e-06, "loss": 0.017, "step": 48370 }, { "grad_norm": 0.25286319851875305, "learning_rate": 9.910119864715906e-06, "loss": 0.0211, "step": 48380 }, { "grad_norm": 0.25352123379707336, "learning_rate": 9.8936575141651e-06, "loss": 0.0141, "step": 48390 }, { "grad_norm": 0.2521395981311798, "learning_rate": 9.877207346862194e-06, "loss": 0.0215, "step": 48400 }, { "grad_norm": 0.31467270851135254, "learning_rate": 9.860769367804312e-06, "loss": 0.0153, "step": 48410 }, { "grad_norm": 0.5030589699745178, "learning_rate": 9.844343581984877e-06, "loss": 0.0267, "step": 48420 }, { "grad_norm": 0.23464231193065643, "learning_rate": 9.82792999439362e-06, "loss": 0.0204, "step": 48430 }, { "grad_norm": 0.6175536513328552, "learning_rate": 9.811528610016546e-06, "loss": 0.0361, "step": 48440 }, { "grad_norm": 0.22903203964233398, "learning_rate": 9.79513943383597e-06, "loss": 0.0231, "step": 48450 }, { "grad_norm": 0.24525761604309082, "learning_rate": 9.778762470830489e-06, "loss": 0.0243, "step": 48460 }, { "grad_norm": 0.3358255624771118, "learning_rate": 9.762397725974982e-06, "loss": 0.0193, "step": 48470 }, { "grad_norm": 0.26119518280029297, "learning_rate": 9.746045204240622e-06, "loss": 0.0215, "step": 48480 }, { "grad_norm": 0.5820352435112, "learning_rate": 9.729704910594917e-06, "loss": 0.0199, "step": 48490 }, { "grad_norm": 0.4744873344898224, "learning_rate": 9.713376850001554e-06, "loss": 0.0237, "step": 48500 }, { "grad_norm": 0.41095706820487976, "learning_rate": 9.697061027420622e-06, "loss": 0.022, "step": 48510 }, { "grad_norm": 0.36692267656326294, "learning_rate": 9.680757447808385e-06, "loss": 0.0206, "step": 48520 }, { "grad_norm": 0.5617361068725586, "learning_rate": 9.664466116117488e-06, "loss": 0.0257, "step": 48530 }, { "grad_norm": 0.47474196553230286, "learning_rate": 9.64818703729678e-06, "loss": 0.0216, "step": 48540 }, { "grad_norm": 0.5723667144775391, "learning_rate": 9.631920216291423e-06, "loss": 0.0174, "step": 48550 }, { "grad_norm": 0.39474278688430786, "learning_rate": 9.615665658042849e-06, "loss": 0.0204, "step": 48560 }, { "grad_norm": 0.3480013310909271, "learning_rate": 9.599423367488747e-06, "loss": 0.0175, "step": 48570 }, { "grad_norm": 0.4026832580566406, "learning_rate": 9.583193349563124e-06, "loss": 0.028, "step": 48580 }, { "grad_norm": 0.318982869386673, "learning_rate": 9.566975609196216e-06, "loss": 0.0394, "step": 48590 }, { "grad_norm": 0.6102703809738159, "learning_rate": 9.550770151314548e-06, "loss": 0.0206, "step": 48600 }, { "grad_norm": 0.3694676160812378, "learning_rate": 9.53457698084091e-06, "loss": 0.0206, "step": 48610 }, { "grad_norm": 0.498006135225296, "learning_rate": 9.518396102694355e-06, "loss": 0.0419, "step": 48620 }, { "grad_norm": 0.18614554405212402, "learning_rate": 9.502227521790198e-06, "loss": 0.0184, "step": 48630 }, { "grad_norm": 0.4350806474685669, "learning_rate": 9.486071243040063e-06, "loss": 0.0234, "step": 48640 }, { "grad_norm": 0.4897637367248535, "learning_rate": 9.469927271351747e-06, "loss": 0.0201, "step": 48650 }, { "grad_norm": 0.5971179008483887, "learning_rate": 9.453795611629419e-06, "loss": 0.0215, "step": 48660 }, { "grad_norm": 0.5431463122367859, "learning_rate": 9.437676268773399e-06, "loss": 0.0216, "step": 48670 }, { "grad_norm": 0.43293869495391846, "learning_rate": 9.421569247680357e-06, "loss": 0.0253, "step": 48680 }, { "grad_norm": 0.46774381399154663, "learning_rate": 9.40547455324316e-06, "loss": 0.0255, "step": 48690 }, { "grad_norm": 0.296232134103775, "learning_rate": 9.389392190350965e-06, "loss": 0.0207, "step": 48700 }, { "grad_norm": 0.27982890605926514, "learning_rate": 9.373322163889153e-06, "loss": 0.0238, "step": 48710 }, { "grad_norm": 0.5935471057891846, "learning_rate": 9.357264478739375e-06, "loss": 0.0144, "step": 48720 }, { "grad_norm": 0.34454667568206787, "learning_rate": 9.341219139779567e-06, "loss": 0.031, "step": 48730 }, { "grad_norm": 0.3842402398586273, "learning_rate": 9.325186151883824e-06, "loss": 0.0175, "step": 48740 }, { "grad_norm": 0.3596890866756439, "learning_rate": 9.30916551992258e-06, "loss": 0.0199, "step": 48750 }, { "grad_norm": 0.5787613987922668, "learning_rate": 9.293157248762479e-06, "loss": 0.0264, "step": 48760 }, { "grad_norm": 0.4415416121482849, "learning_rate": 9.2771613432664e-06, "loss": 0.0225, "step": 48770 }, { "grad_norm": 0.3172985911369324, "learning_rate": 9.261177808293481e-06, "loss": 0.0225, "step": 48780 }, { "grad_norm": 0.5975133776664734, "learning_rate": 9.245206648699096e-06, "loss": 0.0194, "step": 48790 }, { "grad_norm": 0.47465232014656067, "learning_rate": 9.22924786933485e-06, "loss": 0.0344, "step": 48800 }, { "grad_norm": 0.4414154291152954, "learning_rate": 9.213301475048642e-06, "loss": 0.0168, "step": 48810 }, { "grad_norm": 0.32321542501449585, "learning_rate": 9.197367470684504e-06, "loss": 0.02, "step": 48820 }, { "grad_norm": 0.31794634461402893, "learning_rate": 9.181445861082816e-06, "loss": 0.0163, "step": 48830 }, { "grad_norm": 0.2884072959423065, "learning_rate": 9.16553665108012e-06, "loss": 0.0209, "step": 48840 }, { "grad_norm": 0.6953832507133484, "learning_rate": 9.149639845509223e-06, "loss": 0.0342, "step": 48850 }, { "grad_norm": 0.39967721700668335, "learning_rate": 9.133755449199144e-06, "loss": 0.0269, "step": 48860 }, { "grad_norm": 0.5910430550575256, "learning_rate": 9.117883466975135e-06, "loss": 0.0172, "step": 48870 }, { "grad_norm": 0.30769771337509155, "learning_rate": 9.10202390365873e-06, "loss": 0.0162, "step": 48880 }, { "grad_norm": 0.32123905420303345, "learning_rate": 9.086176764067583e-06, "loss": 0.0221, "step": 48890 }, { "grad_norm": 0.5202832818031311, "learning_rate": 9.070342053015684e-06, "loss": 0.0293, "step": 48900 }, { "grad_norm": 0.28111356496810913, "learning_rate": 9.054519775313187e-06, "loss": 0.0326, "step": 48910 }, { "grad_norm": 0.19943632185459137, "learning_rate": 9.038709935766476e-06, "loss": 0.0265, "step": 48920 }, { "grad_norm": 0.27882829308509827, "learning_rate": 9.02291253917817e-06, "loss": 0.0233, "step": 48930 }, { "grad_norm": 0.42765673995018005, "learning_rate": 9.007127590347091e-06, "loss": 0.0263, "step": 48940 }, { "grad_norm": 0.315432608127594, "learning_rate": 8.991355094068288e-06, "loss": 0.0181, "step": 48950 }, { "grad_norm": 0.42218807339668274, "learning_rate": 8.975595055133062e-06, "loss": 0.0297, "step": 48960 }, { "grad_norm": 0.22936810553073883, "learning_rate": 8.959847478328848e-06, "loss": 0.0233, "step": 48970 }, { "grad_norm": 0.41989386081695557, "learning_rate": 8.944112368439378e-06, "loss": 0.0257, "step": 48980 }, { "grad_norm": 0.5386545658111572, "learning_rate": 8.928389730244552e-06, "loss": 0.0257, "step": 48990 }, { "grad_norm": 0.32969358563423157, "learning_rate": 8.912679568520494e-06, "loss": 0.028, "step": 49000 }, { "grad_norm": 0.3428790271282196, "learning_rate": 8.896981888039534e-06, "loss": 0.0227, "step": 49010 }, { "grad_norm": 0.32110583782196045, "learning_rate": 8.881296693570201e-06, "loss": 0.0204, "step": 49020 }, { "grad_norm": 0.7097495794296265, "learning_rate": 8.865623989877281e-06, "loss": 0.0294, "step": 49030 }, { "grad_norm": 0.44931963086128235, "learning_rate": 8.849963781721681e-06, "loss": 0.0164, "step": 49040 }, { "grad_norm": 1.3194514513015747, "learning_rate": 8.834316073860588e-06, "loss": 0.024, "step": 49050 }, { "grad_norm": 0.47013917565345764, "learning_rate": 8.818680871047357e-06, "loss": 0.0214, "step": 49060 }, { "grad_norm": 0.3266001343727112, "learning_rate": 8.803058178031549e-06, "loss": 0.0202, "step": 49070 }, { "grad_norm": 0.4723084568977356, "learning_rate": 8.787447999558922e-06, "loss": 0.021, "step": 49080 }, { "grad_norm": 0.6043652296066284, "learning_rate": 8.77185034037144e-06, "loss": 0.0227, "step": 49090 }, { "grad_norm": 0.4913334548473358, "learning_rate": 8.756265205207259e-06, "loss": 0.0172, "step": 49100 }, { "grad_norm": 0.5118086934089661, "learning_rate": 8.740692598800732e-06, "loss": 0.0272, "step": 49110 }, { "grad_norm": 0.24622513353824615, "learning_rate": 8.72513252588239e-06, "loss": 0.018, "step": 49120 }, { "grad_norm": 0.3283456563949585, "learning_rate": 8.709584991178998e-06, "loss": 0.0194, "step": 49130 }, { "grad_norm": 0.4925851821899414, "learning_rate": 8.694049999413479e-06, "loss": 0.0314, "step": 49140 }, { "grad_norm": 0.7510756850242615, "learning_rate": 8.678527555304945e-06, "loss": 0.022, "step": 49150 }, { "grad_norm": 0.25011444091796875, "learning_rate": 8.663017663568712e-06, "loss": 0.0259, "step": 49160 }, { "grad_norm": 0.39153850078582764, "learning_rate": 8.647520328916259e-06, "loss": 0.0248, "step": 49170 }, { "grad_norm": 0.5628716945648193, "learning_rate": 8.632035556055307e-06, "loss": 0.0206, "step": 49180 }, { "grad_norm": 0.21842777729034424, "learning_rate": 8.616563349689672e-06, "loss": 0.0139, "step": 49190 }, { "grad_norm": 0.42325150966644287, "learning_rate": 8.601103714519448e-06, "loss": 0.0217, "step": 49200 }, { "grad_norm": 0.25230905413627625, "learning_rate": 8.58565665524082e-06, "loss": 0.027, "step": 49210 }, { "grad_norm": 0.6334234476089478, "learning_rate": 8.570222176546222e-06, "loss": 0.0294, "step": 49220 }, { "grad_norm": 0.47452056407928467, "learning_rate": 8.554800283124242e-06, "loss": 0.0186, "step": 49230 }, { "grad_norm": 0.3783588111400604, "learning_rate": 8.539390979659639e-06, "loss": 0.0176, "step": 49240 }, { "grad_norm": 0.3307105004787445, "learning_rate": 8.523994270833352e-06, "loss": 0.0254, "step": 49250 }, { "grad_norm": 0.7688267230987549, "learning_rate": 8.5086101613225e-06, "loss": 0.021, "step": 49260 }, { "grad_norm": 0.44260933995246887, "learning_rate": 8.493238655800346e-06, "loss": 0.0266, "step": 49270 }, { "grad_norm": 0.29353976249694824, "learning_rate": 8.47787975893638e-06, "loss": 0.0157, "step": 49280 }, { "grad_norm": 0.30627843737602234, "learning_rate": 8.462533475396211e-06, "loss": 0.0193, "step": 49290 }, { "grad_norm": 0.3328759968280792, "learning_rate": 8.447199809841643e-06, "loss": 0.0177, "step": 49300 }, { "grad_norm": 0.39729592204093933, "learning_rate": 8.431878766930635e-06, "loss": 0.0218, "step": 49310 }, { "grad_norm": 0.22969377040863037, "learning_rate": 8.416570351317304e-06, "loss": 0.0267, "step": 49320 }, { "grad_norm": 0.3869680166244507, "learning_rate": 8.401274567651973e-06, "loss": 0.0287, "step": 49330 }, { "grad_norm": 0.4451853930950165, "learning_rate": 8.385991420581058e-06, "loss": 0.029, "step": 49340 }, { "grad_norm": 0.29515352845191956, "learning_rate": 8.370720914747215e-06, "loss": 0.0167, "step": 49350 }, { "grad_norm": 0.34055617451667786, "learning_rate": 8.355463054789181e-06, "loss": 0.0211, "step": 49360 }, { "grad_norm": 0.2968428432941437, "learning_rate": 8.340217845341919e-06, "loss": 0.0239, "step": 49370 }, { "grad_norm": 0.33866721391677856, "learning_rate": 8.324985291036514e-06, "loss": 0.0208, "step": 49380 }, { "grad_norm": 0.4076691269874573, "learning_rate": 8.309765396500213e-06, "loss": 0.0228, "step": 49390 }, { "grad_norm": 0.39092326164245605, "learning_rate": 8.294558166356419e-06, "loss": 0.0313, "step": 49400 }, { "grad_norm": 0.4271622598171234, "learning_rate": 8.279363605224683e-06, "loss": 0.0256, "step": 49410 }, { "grad_norm": 0.547061562538147, "learning_rate": 8.264181717720704e-06, "loss": 0.0209, "step": 49420 }, { "grad_norm": 0.3408813774585724, "learning_rate": 8.249012508456361e-06, "loss": 0.0238, "step": 49430 }, { "grad_norm": 0.44365522265434265, "learning_rate": 8.233855982039646e-06, "loss": 0.0343, "step": 49440 }, { "grad_norm": 0.37922242283821106, "learning_rate": 8.218712143074708e-06, "loss": 0.0199, "step": 49450 }, { "grad_norm": 0.4551420509815216, "learning_rate": 8.203580996161858e-06, "loss": 0.0253, "step": 49460 }, { "grad_norm": 0.5538607239723206, "learning_rate": 8.188462545897512e-06, "loss": 0.0231, "step": 49470 }, { "grad_norm": 0.44518235325813293, "learning_rate": 8.173356796874304e-06, "loss": 0.023, "step": 49480 }, { "grad_norm": 0.32404741644859314, "learning_rate": 8.158263753680906e-06, "loss": 0.0321, "step": 49490 }, { "grad_norm": 0.4039650559425354, "learning_rate": 8.143183420902239e-06, "loss": 0.0191, "step": 49500 }, { "grad_norm": 0.5792829990386963, "learning_rate": 8.128115803119258e-06, "loss": 0.027, "step": 49510 }, { "grad_norm": 0.7349728345870972, "learning_rate": 8.11306090490916e-06, "loss": 0.0364, "step": 49520 }, { "grad_norm": 0.5829764008522034, "learning_rate": 8.098018730845169e-06, "loss": 0.0316, "step": 49530 }, { "grad_norm": 0.24448904395103455, "learning_rate": 8.082989285496745e-06, "loss": 0.0209, "step": 49540 }, { "grad_norm": 0.23854438960552216, "learning_rate": 8.067972573429416e-06, "loss": 0.0224, "step": 49550 }, { "grad_norm": 0.4263959527015686, "learning_rate": 8.052968599204874e-06, "loss": 0.0229, "step": 49560 }, { "grad_norm": 1.018816590309143, "learning_rate": 8.037977367380922e-06, "loss": 0.0312, "step": 49570 }, { "grad_norm": 0.44724389910697937, "learning_rate": 8.022998882511495e-06, "loss": 0.0293, "step": 49580 }, { "grad_norm": 0.29854434728622437, "learning_rate": 8.008033149146677e-06, "loss": 0.0203, "step": 49590 }, { "grad_norm": 0.3472592830657959, "learning_rate": 7.993080171832656e-06, "loss": 0.0249, "step": 49600 }, { "grad_norm": 0.12804441154003143, "learning_rate": 7.978139955111752e-06, "loss": 0.0171, "step": 49610 }, { "grad_norm": 0.3764244318008423, "learning_rate": 7.9632125035224e-06, "loss": 0.0255, "step": 49620 }, { "grad_norm": 0.3006095290184021, "learning_rate": 7.948297821599177e-06, "loss": 0.0212, "step": 49630 }, { "grad_norm": 0.38101834058761597, "learning_rate": 7.933395913872755e-06, "loss": 0.0205, "step": 49640 }, { "grad_norm": 0.5682861804962158, "learning_rate": 7.918506784869972e-06, "loss": 0.0373, "step": 49650 }, { "grad_norm": 0.2328638881444931, "learning_rate": 7.903630439113707e-06, "loss": 0.0184, "step": 49660 }, { "grad_norm": 0.3255901038646698, "learning_rate": 7.888766881123044e-06, "loss": 0.026, "step": 49670 }, { "grad_norm": 0.5155882835388184, "learning_rate": 7.873916115413099e-06, "loss": 0.0214, "step": 49680 }, { "grad_norm": 0.6081421971321106, "learning_rate": 7.85907814649518e-06, "loss": 0.0187, "step": 49690 }, { "grad_norm": 0.32424992322921753, "learning_rate": 7.844252978876649e-06, "loss": 0.0232, "step": 49700 }, { "grad_norm": 0.2891518771648407, "learning_rate": 7.829440617061001e-06, "loss": 0.0245, "step": 49710 }, { "grad_norm": 0.3567965030670166, "learning_rate": 7.814641065547851e-06, "loss": 0.0207, "step": 49720 }, { "grad_norm": 0.40160396695137024, "learning_rate": 7.79985432883289e-06, "loss": 0.0246, "step": 49730 }, { "grad_norm": 0.37439975142478943, "learning_rate": 7.78508041140797e-06, "loss": 0.0364, "step": 49740 }, { "grad_norm": 0.4113416075706482, "learning_rate": 7.770319317760993e-06, "loss": 0.0202, "step": 49750 }, { "grad_norm": 0.26047536730766296, "learning_rate": 7.755571052376004e-06, "loss": 0.0252, "step": 49760 }, { "grad_norm": 0.39984679222106934, "learning_rate": 7.740835619733128e-06, "loss": 0.0245, "step": 49770 }, { "grad_norm": 0.5231815576553345, "learning_rate": 7.726113024308601e-06, "loss": 0.0231, "step": 49780 }, { "grad_norm": 0.4578632414340973, "learning_rate": 7.711403270574746e-06, "loss": 0.0241, "step": 49790 }, { "grad_norm": 0.25508642196655273, "learning_rate": 7.696706363000039e-06, "loss": 0.0236, "step": 49800 }, { "grad_norm": 0.30534815788269043, "learning_rate": 7.682022306048959e-06, "loss": 0.0158, "step": 49810 }, { "grad_norm": 0.5110493302345276, "learning_rate": 7.667351104182186e-06, "loss": 0.0226, "step": 49820 }, { "grad_norm": 0.3649754822254181, "learning_rate": 7.652692761856395e-06, "loss": 0.0212, "step": 49830 }, { "grad_norm": 0.5908485054969788, "learning_rate": 7.63804728352444e-06, "loss": 0.0281, "step": 49840 }, { "grad_norm": 0.4139309227466583, "learning_rate": 7.623414673635215e-06, "loss": 0.0209, "step": 49850 }, { "grad_norm": 0.3380444049835205, "learning_rate": 7.608794936633723e-06, "loss": 0.0241, "step": 49860 }, { "grad_norm": 0.45511332154273987, "learning_rate": 7.594188076961056e-06, "loss": 0.0289, "step": 49870 }, { "grad_norm": 0.39887097477912903, "learning_rate": 7.579594099054382e-06, "loss": 0.0249, "step": 49880 }, { "grad_norm": 0.2044619768857956, "learning_rate": 7.565013007346983e-06, "loss": 0.0204, "step": 49890 }, { "grad_norm": 0.6577175855636597, "learning_rate": 7.5504448062682035e-06, "loss": 0.0346, "step": 49900 }, { "grad_norm": 0.2590559124946594, "learning_rate": 7.53588950024347e-06, "loss": 0.018, "step": 49910 }, { "grad_norm": 0.5889080166816711, "learning_rate": 7.5213470936943145e-06, "loss": 0.0242, "step": 49920 }, { "grad_norm": 0.29757246375083923, "learning_rate": 7.506817591038323e-06, "loss": 0.0194, "step": 49930 }, { "grad_norm": 0.19220633804798126, "learning_rate": 7.492300996689183e-06, "loss": 0.0217, "step": 49940 }, { "grad_norm": 0.6092566251754761, "learning_rate": 7.477797315056645e-06, "loss": 0.0303, "step": 49950 }, { "grad_norm": 0.3956882059574127, "learning_rate": 7.463306550546539e-06, "loss": 0.0199, "step": 49960 }, { "grad_norm": 0.28620871901512146, "learning_rate": 7.448828707560812e-06, "loss": 0.0227, "step": 49970 }, { "grad_norm": 0.7387881278991699, "learning_rate": 7.4343637904974e-06, "loss": 0.0375, "step": 49980 }, { "grad_norm": 0.4592417776584625, "learning_rate": 7.419911803750401e-06, "loss": 0.0247, "step": 49990 }, { "grad_norm": 0.6981443762779236, "learning_rate": 7.405472751709935e-06, "loss": 0.0286, "step": 50000 }, { "grad_norm": 0.5539777278900146, "learning_rate": 7.3910466387622e-06, "loss": 0.0277, "step": 50010 }, { "grad_norm": 0.41442933678627014, "learning_rate": 7.3766334692894735e-06, "loss": 0.0241, "step": 50020 }, { "grad_norm": 0.32749372720718384, "learning_rate": 7.3622332476700865e-06, "loss": 0.0215, "step": 50030 }, { "grad_norm": 0.24420170485973358, "learning_rate": 7.347845978278472e-06, "loss": 0.0315, "step": 50040 }, { "grad_norm": 0.4182957112789154, "learning_rate": 7.333471665485065e-06, "loss": 0.0174, "step": 50050 }, { "grad_norm": 0.8861310482025146, "learning_rate": 7.31911031365643e-06, "loss": 0.0405, "step": 50060 }, { "grad_norm": 0.3685654401779175, "learning_rate": 7.304761927155157e-06, "loss": 0.0209, "step": 50070 }, { "grad_norm": 0.41909295320510864, "learning_rate": 7.29042651033991e-06, "loss": 0.0187, "step": 50080 }, { "grad_norm": 0.4484723210334778, "learning_rate": 7.276104067565409e-06, "loss": 0.0308, "step": 50090 }, { "grad_norm": 0.3239649832248688, "learning_rate": 7.261794603182431e-06, "loss": 0.0182, "step": 50100 }, { "grad_norm": 0.5565941333770752, "learning_rate": 7.24749812153781e-06, "loss": 0.0278, "step": 50110 }, { "grad_norm": 0.5551601052284241, "learning_rate": 7.2332146269744605e-06, "loss": 0.0214, "step": 50120 }, { "grad_norm": 0.5066900253295898, "learning_rate": 7.218944123831295e-06, "loss": 0.0264, "step": 50130 }, { "grad_norm": 0.23750558495521545, "learning_rate": 7.204686616443351e-06, "loss": 0.0171, "step": 50140 }, { "grad_norm": 0.34605661034584045, "learning_rate": 7.190442109141665e-06, "loss": 0.0238, "step": 50150 }, { "grad_norm": 0.540186882019043, "learning_rate": 7.176210606253347e-06, "loss": 0.0348, "step": 50160 }, { "grad_norm": 0.48186081647872925, "learning_rate": 7.161992112101551e-06, "loss": 0.0205, "step": 50170 }, { "grad_norm": 0.7527052164077759, "learning_rate": 7.147786631005465e-06, "loss": 0.0287, "step": 50180 }, { "grad_norm": 0.32781124114990234, "learning_rate": 7.1335941672803775e-06, "loss": 0.0219, "step": 50190 }, { "grad_norm": 0.22489574551582336, "learning_rate": 7.1194147252375384e-06, "loss": 0.0257, "step": 50200 }, { "grad_norm": 0.3225897550582886, "learning_rate": 7.10524830918432e-06, "loss": 0.0255, "step": 50210 }, { "grad_norm": 0.38970986008644104, "learning_rate": 7.091094923424097e-06, "loss": 0.0205, "step": 50220 }, { "grad_norm": 0.5533736944198608, "learning_rate": 7.0769545722562894e-06, "loss": 0.0194, "step": 50230 }, { "grad_norm": 0.5409282445907593, "learning_rate": 7.0628272599763675e-06, "loss": 0.0234, "step": 50240 }, { "grad_norm": 0.23260720074176788, "learning_rate": 7.048712990875828e-06, "loss": 0.0452, "step": 50250 }, { "grad_norm": 0.1346890926361084, "learning_rate": 7.034611769242216e-06, "loss": 0.0224, "step": 50260 }, { "grad_norm": 0.45179951190948486, "learning_rate": 7.02052359935913e-06, "loss": 0.0251, "step": 50270 }, { "grad_norm": 0.4147605895996094, "learning_rate": 7.006448485506145e-06, "loss": 0.0247, "step": 50280 }, { "grad_norm": 0.2898308336734772, "learning_rate": 6.992386431958942e-06, "loss": 0.0168, "step": 50290 }, { "grad_norm": 0.4660116732120514, "learning_rate": 6.978337442989197e-06, "loss": 0.0194, "step": 50300 }, { "grad_norm": 0.36981096863746643, "learning_rate": 6.964301522864608e-06, "loss": 0.0224, "step": 50310 }, { "grad_norm": 0.2396828979253769, "learning_rate": 6.950278675848926e-06, "loss": 0.0176, "step": 50320 }, { "grad_norm": 0.3673923909664154, "learning_rate": 6.9362689062019145e-06, "loss": 0.0218, "step": 50330 }, { "grad_norm": 0.45937231183052063, "learning_rate": 6.922272218179393e-06, "loss": 0.0271, "step": 50340 }, { "grad_norm": 0.8647177219390869, "learning_rate": 6.908288616033148e-06, "loss": 0.0213, "step": 50350 }, { "grad_norm": 0.10950637608766556, "learning_rate": 6.894318104011077e-06, "loss": 0.0205, "step": 50360 }, { "grad_norm": 0.5086265802383423, "learning_rate": 6.880360686357007e-06, "loss": 0.0289, "step": 50370 }, { "grad_norm": 0.2169928103685379, "learning_rate": 6.8664163673108575e-06, "loss": 0.0156, "step": 50380 }, { "grad_norm": 0.3632253110408783, "learning_rate": 6.85248515110854e-06, "loss": 0.0242, "step": 50390 }, { "grad_norm": 0.3706723153591156, "learning_rate": 6.838567041981992e-06, "loss": 0.0204, "step": 50400 }, { "grad_norm": 0.18193016946315765, "learning_rate": 6.8246620441591634e-06, "loss": 0.0194, "step": 50410 }, { "grad_norm": 0.3748451769351959, "learning_rate": 6.8107701618640275e-06, "loss": 0.0294, "step": 50420 }, { "grad_norm": 0.2925474941730499, "learning_rate": 6.796891399316557e-06, "loss": 0.0196, "step": 50430 }, { "grad_norm": 0.40033650398254395, "learning_rate": 6.7830257607327804e-06, "loss": 0.0212, "step": 50440 }, { "grad_norm": 0.6791098117828369, "learning_rate": 6.7691732503247e-06, "loss": 0.03, "step": 50450 }, { "grad_norm": 0.47667473554611206, "learning_rate": 6.755333872300346e-06, "loss": 0.0293, "step": 50460 }, { "grad_norm": 0.4650813937187195, "learning_rate": 6.741507630863747e-06, "loss": 0.0203, "step": 50470 }, { "grad_norm": 0.36038661003112793, "learning_rate": 6.727694530214945e-06, "loss": 0.0277, "step": 50480 }, { "grad_norm": 0.39772099256515503, "learning_rate": 6.713894574550028e-06, "loss": 0.0212, "step": 50490 }, { "grad_norm": 0.4730894863605499, "learning_rate": 6.700107768061015e-06, "loss": 0.0153, "step": 50500 }, { "grad_norm": 0.20102764666080475, "learning_rate": 6.686334114936016e-06, "loss": 0.0202, "step": 50510 }, { "grad_norm": 0.39071181416511536, "learning_rate": 6.672573619359063e-06, "loss": 0.0193, "step": 50520 }, { "grad_norm": 0.6735906004905701, "learning_rate": 6.658826285510256e-06, "loss": 0.0199, "step": 50530 }, { "grad_norm": 0.9327898025512695, "learning_rate": 6.645092117565666e-06, "loss": 0.0311, "step": 50540 }, { "grad_norm": 0.287757933139801, "learning_rate": 6.631371119697371e-06, "loss": 0.0239, "step": 50550 }, { "grad_norm": 0.2399500608444214, "learning_rate": 6.6176632960734505e-06, "loss": 0.0195, "step": 50560 }, { "grad_norm": 0.42561110854148865, "learning_rate": 6.603968650857978e-06, "loss": 0.0175, "step": 50570 }, { "grad_norm": 0.2514631152153015, "learning_rate": 6.5902871882110085e-06, "loss": 0.0191, "step": 50580 }, { "grad_norm": 0.5071794390678406, "learning_rate": 6.576618912288635e-06, "loss": 0.0244, "step": 50590 }, { "grad_norm": 0.4287756681442261, "learning_rate": 6.562963827242913e-06, "loss": 0.0171, "step": 50600 }, { "grad_norm": 0.4140174090862274, "learning_rate": 6.549321937221886e-06, "loss": 0.0137, "step": 50610 }, { "grad_norm": 0.7646976113319397, "learning_rate": 6.5356932463696064e-06, "loss": 0.0209, "step": 50620 }, { "grad_norm": 0.4856005012989044, "learning_rate": 6.522077758826101e-06, "loss": 0.0209, "step": 50630 }, { "grad_norm": 0.34010884165763855, "learning_rate": 6.5084754787274275e-06, "loss": 0.0185, "step": 50640 }, { "grad_norm": 0.2603606879711151, "learning_rate": 6.494886410205553e-06, "loss": 0.039, "step": 50650 }, { "grad_norm": 0.5362648963928223, "learning_rate": 6.481310557388521e-06, "loss": 0.0251, "step": 50660 }, { "grad_norm": 0.36904487013816833, "learning_rate": 6.46774792440028e-06, "loss": 0.0207, "step": 50670 }, { "grad_norm": 0.14088165760040283, "learning_rate": 6.4541985153608206e-06, "loss": 0.017, "step": 50680 }, { "grad_norm": 0.18023477494716644, "learning_rate": 6.4406623343861e-06, "loss": 0.0238, "step": 50690 }, { "grad_norm": 0.2499706894159317, "learning_rate": 6.427139385588038e-06, "loss": 0.0274, "step": 50700 }, { "grad_norm": 0.7080855965614319, "learning_rate": 6.413629673074561e-06, "loss": 0.0228, "step": 50710 }, { "grad_norm": 0.3860850930213928, "learning_rate": 6.400133200949554e-06, "loss": 0.0208, "step": 50720 }, { "grad_norm": 0.489263653755188, "learning_rate": 6.386649973312897e-06, "loss": 0.0281, "step": 50730 }, { "grad_norm": 0.5942742824554443, "learning_rate": 6.37317999426042e-06, "loss": 0.0217, "step": 50740 }, { "grad_norm": 0.5585848689079285, "learning_rate": 6.359723267883977e-06, "loss": 0.0188, "step": 50750 }, { "grad_norm": 0.5327900648117065, "learning_rate": 6.346279798271343e-06, "loss": 0.0173, "step": 50760 }, { "grad_norm": 0.2899172008037567, "learning_rate": 6.332849589506301e-06, "loss": 0.0216, "step": 50770 }, { "grad_norm": 0.24692930281162262, "learning_rate": 6.319432645668588e-06, "loss": 0.0232, "step": 50780 }, { "grad_norm": 0.5089011192321777, "learning_rate": 6.306028970833922e-06, "loss": 0.0213, "step": 50790 }, { "grad_norm": 0.35610300302505493, "learning_rate": 6.2926385690739665e-06, "loss": 0.0178, "step": 50800 }, { "grad_norm": 0.3880899250507355, "learning_rate": 6.279261444456413e-06, "loss": 0.0237, "step": 50810 }, { "grad_norm": 0.724978506565094, "learning_rate": 6.265897601044829e-06, "loss": 0.0192, "step": 50820 }, { "grad_norm": 0.22245009243488312, "learning_rate": 6.2525470428988434e-06, "loss": 0.0177, "step": 50830 }, { "grad_norm": 0.5170431137084961, "learning_rate": 6.239209774073962e-06, "loss": 0.0228, "step": 50840 }, { "grad_norm": 0.42839789390563965, "learning_rate": 6.225885798621728e-06, "loss": 0.0225, "step": 50850 }, { "grad_norm": 0.4298437237739563, "learning_rate": 6.2125751205895925e-06, "loss": 0.0356, "step": 50860 }, { "grad_norm": 0.634165346622467, "learning_rate": 6.199277744020998e-06, "loss": 0.0264, "step": 50870 }, { "grad_norm": 0.3366909623146057, "learning_rate": 6.185993672955337e-06, "loss": 0.0153, "step": 50880 }, { "grad_norm": 0.21581986546516418, "learning_rate": 6.172722911427947e-06, "loss": 0.0371, "step": 50890 }, { "grad_norm": 0.33750417828559875, "learning_rate": 6.159465463470149e-06, "loss": 0.02, "step": 50900 }, { "grad_norm": 0.23596793413162231, "learning_rate": 6.146221333109204e-06, "loss": 0.0193, "step": 50910 }, { "grad_norm": 0.16078773140907288, "learning_rate": 6.132990524368326e-06, "loss": 0.0181, "step": 50920 }, { "grad_norm": 0.2867731750011444, "learning_rate": 6.119773041266685e-06, "loss": 0.0284, "step": 50930 }, { "grad_norm": 0.4954531788825989, "learning_rate": 6.106568887819402e-06, "loss": 0.0203, "step": 50940 }, { "grad_norm": 0.4160810708999634, "learning_rate": 6.093378068037548e-06, "loss": 0.0209, "step": 50950 }, { "grad_norm": 0.44846421480178833, "learning_rate": 6.080200585928164e-06, "loss": 0.024, "step": 50960 }, { "grad_norm": 0.4037817418575287, "learning_rate": 6.06703644549419e-06, "loss": 0.018, "step": 50970 }, { "grad_norm": 0.41177210211753845, "learning_rate": 6.053885650734576e-06, "loss": 0.016, "step": 50980 }, { "grad_norm": 0.473087340593338, "learning_rate": 6.040748205644153e-06, "loss": 0.017, "step": 50990 }, { "grad_norm": 0.3627772927284241, "learning_rate": 6.0276241142137646e-06, "loss": 0.0297, "step": 51000 }, { "grad_norm": 0.41958528757095337, "learning_rate": 6.014513380430142e-06, "loss": 0.0257, "step": 51010 }, { "grad_norm": 0.4494984745979309, "learning_rate": 6.001416008275984e-06, "loss": 0.0268, "step": 51020 }, { "grad_norm": 0.31569281220436096, "learning_rate": 5.988332001729929e-06, "loss": 0.0187, "step": 51030 }, { "grad_norm": 0.1972232311964035, "learning_rate": 5.975261364766543e-06, "loss": 0.0195, "step": 51040 }, { "grad_norm": 0.7836227416992188, "learning_rate": 5.962204101356356e-06, "loss": 0.0325, "step": 51050 }, { "grad_norm": 0.5753371119499207, "learning_rate": 5.94916021546581e-06, "loss": 0.0248, "step": 51060 }, { "grad_norm": 0.4240570068359375, "learning_rate": 5.936129711057298e-06, "loss": 0.0175, "step": 51070 }, { "grad_norm": 0.5657826066017151, "learning_rate": 5.923112592089142e-06, "loss": 0.0293, "step": 51080 }, { "grad_norm": 0.2780112028121948, "learning_rate": 5.9101088625155954e-06, "loss": 0.0204, "step": 51090 }, { "grad_norm": 0.47333136200904846, "learning_rate": 5.897118526286843e-06, "loss": 0.0215, "step": 51100 }, { "grad_norm": 0.3220270276069641, "learning_rate": 5.884141587349035e-06, "loss": 0.0253, "step": 51110 }, { "grad_norm": 0.4040994346141815, "learning_rate": 5.871178049644177e-06, "loss": 0.0308, "step": 51120 }, { "grad_norm": 0.4610227942466736, "learning_rate": 5.858227917110293e-06, "loss": 0.0213, "step": 51130 }, { "grad_norm": 0.5424336791038513, "learning_rate": 5.845291193681252e-06, "loss": 0.0222, "step": 51140 }, { "grad_norm": 0.2705960273742676, "learning_rate": 5.832367883286921e-06, "loss": 0.0183, "step": 51150 }, { "grad_norm": 0.2165890336036682, "learning_rate": 5.81945798985305e-06, "loss": 0.0234, "step": 51160 }, { "grad_norm": 0.49004387855529785, "learning_rate": 5.806561517301306e-06, "loss": 0.0205, "step": 51170 }, { "grad_norm": 0.16892655193805695, "learning_rate": 5.793678469549335e-06, "loss": 0.0201, "step": 51180 }, { "grad_norm": 0.36686983704566956, "learning_rate": 5.780808850510627e-06, "loss": 0.0243, "step": 51190 }, { "grad_norm": 0.5952874422073364, "learning_rate": 5.767952664094673e-06, "loss": 0.0197, "step": 51200 }, { "grad_norm": 0.5738344192504883, "learning_rate": 5.755109914206791e-06, "loss": 0.026, "step": 51210 }, { "grad_norm": 0.3448309898376465, "learning_rate": 5.7422806047483125e-06, "loss": 0.0245, "step": 51220 }, { "grad_norm": 0.4189898669719696, "learning_rate": 5.72946473961643e-06, "loss": 0.0283, "step": 51230 }, { "grad_norm": 0.42563629150390625, "learning_rate": 5.716662322704264e-06, "loss": 0.0183, "step": 51240 }, { "grad_norm": 0.4932747185230255, "learning_rate": 5.703873357900852e-06, "loss": 0.0267, "step": 51250 }, { "grad_norm": 0.6138160228729248, "learning_rate": 5.691097849091143e-06, "loss": 0.0324, "step": 51260 }, { "grad_norm": 0.28697097301483154, "learning_rate": 5.678335800155982e-06, "loss": 0.0179, "step": 51270 }, { "grad_norm": 0.5833638906478882, "learning_rate": 5.665587214972174e-06, "loss": 0.0176, "step": 51280 }, { "grad_norm": 0.3961082696914673, "learning_rate": 5.652852097412386e-06, "loss": 0.0179, "step": 51290 }, { "grad_norm": 0.4183819890022278, "learning_rate": 5.640130451345216e-06, "loss": 0.0294, "step": 51300 }, { "grad_norm": 0.31887656450271606, "learning_rate": 5.627422280635159e-06, "loss": 0.0192, "step": 51310 }, { "grad_norm": 0.48714131116867065, "learning_rate": 5.61472758914261e-06, "loss": 0.0199, "step": 51320 }, { "grad_norm": 0.4422696828842163, "learning_rate": 5.602046380723918e-06, "loss": 0.0297, "step": 51330 }, { "grad_norm": 0.18524107336997986, "learning_rate": 5.5893786592312535e-06, "loss": 0.0229, "step": 51340 }, { "grad_norm": 0.7560898661613464, "learning_rate": 5.576724428512775e-06, "loss": 0.028, "step": 51350 }, { "grad_norm": 0.3213367462158203, "learning_rate": 5.56408369241247e-06, "loss": 0.0192, "step": 51360 }, { "grad_norm": 0.29006150364875793, "learning_rate": 5.5514564547702875e-06, "loss": 0.0167, "step": 51370 }, { "grad_norm": 0.3502066433429718, "learning_rate": 5.538842719422038e-06, "loss": 0.0305, "step": 51380 }, { "grad_norm": 0.5105603933334351, "learning_rate": 5.52624249019944e-06, "loss": 0.022, "step": 51390 }, { "grad_norm": 0.3182680904865265, "learning_rate": 5.51365577093011e-06, "loss": 0.0195, "step": 51400 }, { "grad_norm": 0.2560334801673889, "learning_rate": 5.501082565437565e-06, "loss": 0.0264, "step": 51410 }, { "grad_norm": 0.4974575936794281, "learning_rate": 5.488522877541202e-06, "loss": 0.0232, "step": 51420 }, { "grad_norm": 0.3166728615760803, "learning_rate": 5.475976711056341e-06, "loss": 0.0214, "step": 51430 }, { "grad_norm": 0.430751770734787, "learning_rate": 5.463444069794166e-06, "loss": 0.016, "step": 51440 }, { "grad_norm": 0.39944663643836975, "learning_rate": 5.4509249575617594e-06, "loss": 0.0343, "step": 51450 }, { "grad_norm": 1.057280421257019, "learning_rate": 5.438419378162107e-06, "loss": 0.031, "step": 51460 }, { "grad_norm": 0.4185559153556824, "learning_rate": 5.425927335394054e-06, "loss": 0.0186, "step": 51470 }, { "grad_norm": 0.43266811966896057, "learning_rate": 5.413448833052387e-06, "loss": 0.0192, "step": 51480 }, { "grad_norm": 0.33218809962272644, "learning_rate": 5.400983874927701e-06, "loss": 0.0225, "step": 51490 }, { "grad_norm": 0.2853853404521942, "learning_rate": 5.388532464806567e-06, "loss": 0.0223, "step": 51500 }, { "grad_norm": 0.27413254976272583, "learning_rate": 5.3760946064713546e-06, "loss": 0.0206, "step": 51510 }, { "grad_norm": 0.42233413457870483, "learning_rate": 5.363670303700386e-06, "loss": 0.02, "step": 51520 }, { "grad_norm": 0.40523073077201843, "learning_rate": 5.351259560267824e-06, "loss": 0.0193, "step": 51530 }, { "grad_norm": 0.4625720977783203, "learning_rate": 5.338862379943721e-06, "loss": 0.0176, "step": 51540 }, { "grad_norm": 0.5088134407997131, "learning_rate": 5.326478766494025e-06, "loss": 0.0174, "step": 51550 }, { "grad_norm": 0.7018653154373169, "learning_rate": 5.3141087236805385e-06, "loss": 0.0199, "step": 51560 }, { "grad_norm": 0.5938827395439148, "learning_rate": 5.3017522552609615e-06, "loss": 0.017, "step": 51570 }, { "grad_norm": 0.34313610196113586, "learning_rate": 5.289409364988851e-06, "loss": 0.021, "step": 51580 }, { "grad_norm": 0.30164283514022827, "learning_rate": 5.277080056613671e-06, "loss": 0.017, "step": 51590 }, { "grad_norm": 0.28817716240882874, "learning_rate": 5.264764333880729e-06, "loss": 0.0166, "step": 51600 }, { "grad_norm": 0.25509190559387207, "learning_rate": 5.252462200531216e-06, "loss": 0.0205, "step": 51610 }, { "grad_norm": 0.3935081958770752, "learning_rate": 5.240173660302194e-06, "loss": 0.0203, "step": 51620 }, { "grad_norm": 0.572688102722168, "learning_rate": 5.2278987169266044e-06, "loss": 0.0185, "step": 51630 }, { "grad_norm": 0.29218560457229614, "learning_rate": 5.215637374133231e-06, "loss": 0.0145, "step": 51640 }, { "grad_norm": 0.49866706132888794, "learning_rate": 5.203389635646782e-06, "loss": 0.0267, "step": 51650 }, { "grad_norm": 0.7755498290061951, "learning_rate": 5.191155505187756e-06, "loss": 0.0235, "step": 51660 }, { "grad_norm": 0.4573748707771301, "learning_rate": 5.1789349864726e-06, "loss": 0.0308, "step": 51670 }, { "grad_norm": 0.45654159784317017, "learning_rate": 5.16672808321354e-06, "loss": 0.0141, "step": 51680 }, { "grad_norm": 0.46065422892570496, "learning_rate": 5.154534799118749e-06, "loss": 0.0223, "step": 51690 }, { "grad_norm": 0.2563033699989319, "learning_rate": 5.142355137892207e-06, "loss": 0.0234, "step": 51700 }, { "grad_norm": 0.6521221399307251, "learning_rate": 5.130189103233779e-06, "loss": 0.0236, "step": 51710 }, { "grad_norm": 0.34691140055656433, "learning_rate": 5.118036698839179e-06, "loss": 0.02, "step": 51720 }, { "grad_norm": 0.3980310559272766, "learning_rate": 5.105897928399983e-06, "loss": 0.0259, "step": 51730 }, { "grad_norm": 0.5089674592018127, "learning_rate": 5.0937727956036464e-06, "loss": 0.0306, "step": 51740 }, { "grad_norm": 0.2624090015888214, "learning_rate": 5.081661304133456e-06, "loss": 0.0286, "step": 51750 }, { "grad_norm": 0.5022501349449158, "learning_rate": 5.069563457668558e-06, "loss": 0.0202, "step": 51760 }, { "grad_norm": 1.0428862571716309, "learning_rate": 5.0574792598839624e-06, "loss": 0.0248, "step": 51770 }, { "grad_norm": 0.2449718564748764, "learning_rate": 5.0454087144505276e-06, "loss": 0.0147, "step": 51780 }, { "grad_norm": 0.18085695803165436, "learning_rate": 5.0333518250349655e-06, "loss": 0.0179, "step": 51790 }, { "grad_norm": 0.4218282103538513, "learning_rate": 5.021308595299856e-06, "loss": 0.0228, "step": 51800 }, { "grad_norm": 0.27069520950317383, "learning_rate": 5.009279028903585e-06, "loss": 0.0238, "step": 51810 }, { "grad_norm": 0.41807568073272705, "learning_rate": 4.997263129500452e-06, "loss": 0.0265, "step": 51820 }, { "grad_norm": 0.3656843602657318, "learning_rate": 4.985260900740535e-06, "loss": 0.0142, "step": 51830 }, { "grad_norm": 0.5528975129127502, "learning_rate": 4.973272346269814e-06, "loss": 0.0219, "step": 51840 }, { "grad_norm": 0.2598586678504944, "learning_rate": 4.961297469730097e-06, "loss": 0.0209, "step": 51850 }, { "grad_norm": 0.4575367569923401, "learning_rate": 4.949336274759031e-06, "loss": 0.0242, "step": 51860 }, { "grad_norm": 0.20589683949947357, "learning_rate": 4.9373887649901144e-06, "loss": 0.0171, "step": 51870 }, { "grad_norm": 0.4711463153362274, "learning_rate": 4.925454944052666e-06, "loss": 0.0226, "step": 51880 }, { "grad_norm": 0.6004453897476196, "learning_rate": 4.913534815571891e-06, "loss": 0.0251, "step": 51890 }, { "grad_norm": 0.2668318748474121, "learning_rate": 4.901628383168805e-06, "loss": 0.02, "step": 51900 }, { "grad_norm": 0.3970584571361542, "learning_rate": 4.8897356504602585e-06, "loss": 0.0223, "step": 51910 }, { "grad_norm": 0.20784974098205566, "learning_rate": 4.877856621058957e-06, "loss": 0.0173, "step": 51920 }, { "grad_norm": 0.3188377618789673, "learning_rate": 4.86599129857343e-06, "loss": 0.0264, "step": 51930 }, { "grad_norm": 0.3578183352947235, "learning_rate": 4.85413968660805e-06, "loss": 0.0289, "step": 51940 }, { "grad_norm": 0.32284778356552124, "learning_rate": 4.842301788763031e-06, "loss": 0.0185, "step": 51950 }, { "grad_norm": 0.2080681174993515, "learning_rate": 4.830477608634393e-06, "loss": 0.0193, "step": 51960 }, { "grad_norm": 0.4408480226993561, "learning_rate": 4.818667149814049e-06, "loss": 0.0291, "step": 51970 }, { "grad_norm": 0.3619551658630371, "learning_rate": 4.806870415889664e-06, "loss": 0.0193, "step": 51980 }, { "grad_norm": 0.42424461245536804, "learning_rate": 4.795087410444798e-06, "loss": 0.0249, "step": 51990 }, { "grad_norm": 0.5264539122581482, "learning_rate": 4.783318137058807e-06, "loss": 0.0311, "step": 52000 }, { "grad_norm": 0.39955905079841614, "learning_rate": 4.771562599306895e-06, "loss": 0.0181, "step": 52010 }, { "grad_norm": 0.45687341690063477, "learning_rate": 4.759820800760073e-06, "loss": 0.029, "step": 52020 }, { "grad_norm": 0.4086625576019287, "learning_rate": 4.7480927449851834e-06, "loss": 0.0236, "step": 52030 }, { "grad_norm": 0.3460633158683777, "learning_rate": 4.73637843554493e-06, "loss": 0.0227, "step": 52040 }, { "grad_norm": 0.17321646213531494, "learning_rate": 4.724677875997774e-06, "loss": 0.0156, "step": 52050 }, { "grad_norm": 0.32906147837638855, "learning_rate": 4.712991069898065e-06, "loss": 0.0218, "step": 52060 }, { "grad_norm": 0.60561603307724, "learning_rate": 4.7013180207959305e-06, "loss": 0.0245, "step": 52070 }, { "grad_norm": 0.4657500684261322, "learning_rate": 4.6896587322373395e-06, "loss": 0.0245, "step": 52080 }, { "grad_norm": 0.3976362347602844, "learning_rate": 4.678013207764081e-06, "loss": 0.0227, "step": 52090 }, { "grad_norm": 0.5628819465637207, "learning_rate": 4.666381450913748e-06, "loss": 0.0293, "step": 52100 }, { "grad_norm": 0.3872677981853485, "learning_rate": 4.654763465219752e-06, "loss": 0.0217, "step": 52110 }, { "grad_norm": 0.7309420108795166, "learning_rate": 4.643159254211371e-06, "loss": 0.0199, "step": 52120 }, { "grad_norm": 0.6716875433921814, "learning_rate": 4.631568821413606e-06, "loss": 0.0394, "step": 52130 }, { "grad_norm": 0.5775042176246643, "learning_rate": 4.619992170347359e-06, "loss": 0.0235, "step": 52140 }, { "grad_norm": 0.631436824798584, "learning_rate": 4.608429304529305e-06, "loss": 0.0281, "step": 52150 }, { "grad_norm": 0.2920684218406677, "learning_rate": 4.596880227471928e-06, "loss": 0.0493, "step": 52160 }, { "grad_norm": 0.5256933569908142, "learning_rate": 4.585344942683539e-06, "loss": 0.029, "step": 52170 }, { "grad_norm": 0.17943385243415833, "learning_rate": 4.573823453668241e-06, "loss": 0.0303, "step": 52180 }, { "grad_norm": 0.38370969891548157, "learning_rate": 4.562315763925995e-06, "loss": 0.0155, "step": 52190 }, { "grad_norm": 0.2364010512828827, "learning_rate": 4.5508218769524825e-06, "loss": 0.0289, "step": 52200 }, { "grad_norm": 0.3167262077331543, "learning_rate": 4.539341796239277e-06, "loss": 0.019, "step": 52210 }, { "grad_norm": 0.35588911175727844, "learning_rate": 4.527875525273717e-06, "loss": 0.0194, "step": 52220 }, { "grad_norm": 0.40373384952545166, "learning_rate": 4.51642306753895e-06, "loss": 0.0188, "step": 52230 }, { "grad_norm": 0.44571590423583984, "learning_rate": 4.5049844265139306e-06, "loss": 0.0312, "step": 52240 }, { "grad_norm": 0.18381884694099426, "learning_rate": 4.4935596056734144e-06, "loss": 0.0206, "step": 52250 }, { "grad_norm": 0.5159327983856201, "learning_rate": 4.482148608487957e-06, "loss": 0.0166, "step": 52260 }, { "grad_norm": 0.4071614146232605, "learning_rate": 4.4707514384239365e-06, "loss": 0.0276, "step": 52270 }, { "grad_norm": 0.19356590509414673, "learning_rate": 4.459368098943484e-06, "loss": 0.0234, "step": 52280 }, { "grad_norm": 0.3343292772769928, "learning_rate": 4.447998593504582e-06, "loss": 0.0344, "step": 52290 }, { "grad_norm": 0.4494807720184326, "learning_rate": 4.4366429255609744e-06, "loss": 0.0147, "step": 52300 }, { "grad_norm": 0.4833141565322876, "learning_rate": 4.425301098562212e-06, "loss": 0.0219, "step": 52310 }, { "grad_norm": 0.3057330846786499, "learning_rate": 4.413973115953651e-06, "loss": 0.0209, "step": 52320 }, { "grad_norm": 0.44224900007247925, "learning_rate": 4.402658981176416e-06, "loss": 0.0192, "step": 52330 }, { "grad_norm": 0.5099350810050964, "learning_rate": 4.391358697667475e-06, "loss": 0.0216, "step": 52340 }, { "grad_norm": 0.330837219953537, "learning_rate": 4.3800722688595195e-06, "loss": 0.0179, "step": 52350 }, { "grad_norm": 0.46960198879241943, "learning_rate": 4.368799698181097e-06, "loss": 0.0244, "step": 52360 }, { "grad_norm": 0.46781760454177856, "learning_rate": 4.357540989056486e-06, "loss": 0.0218, "step": 52370 }, { "grad_norm": 0.19945046305656433, "learning_rate": 4.346296144905815e-06, "loss": 0.0206, "step": 52380 }, { "grad_norm": 0.20173761248588562, "learning_rate": 4.335065169144958e-06, "loss": 0.0196, "step": 52390 }, { "grad_norm": 0.20897620916366577, "learning_rate": 4.323848065185593e-06, "loss": 0.0189, "step": 52400 }, { "grad_norm": 0.30237656831741333, "learning_rate": 4.31264483643517e-06, "loss": 0.0139, "step": 52410 }, { "grad_norm": 0.3875107169151306, "learning_rate": 4.301455486296946e-06, "loss": 0.0231, "step": 52420 }, { "grad_norm": 0.3502529263496399, "learning_rate": 4.290280018169935e-06, "loss": 0.0252, "step": 52430 }, { "grad_norm": 0.30320242047309875, "learning_rate": 4.27911843544897e-06, "loss": 0.0237, "step": 52440 }, { "grad_norm": 0.3150494694709778, "learning_rate": 4.2679707415246294e-06, "loss": 0.0249, "step": 52450 }, { "grad_norm": 0.3835901916027069, "learning_rate": 4.256836939783299e-06, "loss": 0.0149, "step": 52460 }, { "grad_norm": 0.7264373302459717, "learning_rate": 4.245717033607127e-06, "loss": 0.0273, "step": 52470 }, { "grad_norm": 0.4091324806213379, "learning_rate": 4.234611026374035e-06, "loss": 0.0226, "step": 52480 }, { "grad_norm": 0.13383159041404724, "learning_rate": 4.2235189214577694e-06, "loss": 0.0105, "step": 52490 }, { "grad_norm": 0.5291451811790466, "learning_rate": 4.212440722227779e-06, "loss": 0.0392, "step": 52500 }, { "grad_norm": 0.4550970196723938, "learning_rate": 4.201376432049364e-06, "loss": 0.0363, "step": 52510 }, { "grad_norm": 0.5577747225761414, "learning_rate": 4.1903260542835275e-06, "loss": 0.0328, "step": 52520 }, { "grad_norm": 0.5719116926193237, "learning_rate": 4.1792895922871114e-06, "loss": 0.0283, "step": 52530 }, { "grad_norm": 0.4751291573047638, "learning_rate": 4.168267049412694e-06, "loss": 0.0189, "step": 52540 }, { "grad_norm": 0.6140816807746887, "learning_rate": 4.157258429008626e-06, "loss": 0.0427, "step": 52550 }, { "grad_norm": 0.43245765566825867, "learning_rate": 4.146263734419043e-06, "loss": 0.0182, "step": 52560 }, { "grad_norm": 0.6272283792495728, "learning_rate": 4.135282968983839e-06, "loss": 0.0366, "step": 52570 }, { "grad_norm": 0.3212090730667114, "learning_rate": 4.124316136038675e-06, "loss": 0.036, "step": 52580 }, { "grad_norm": 0.3753817677497864, "learning_rate": 4.1133632389149965e-06, "loss": 0.0304, "step": 52590 }, { "grad_norm": 0.4271151125431061, "learning_rate": 4.102424280939998e-06, "loss": 0.0301, "step": 52600 }, { "grad_norm": 0.36515167355537415, "learning_rate": 4.091499265436649e-06, "loss": 0.0232, "step": 52610 }, { "grad_norm": 0.5803313851356506, "learning_rate": 4.080588195723684e-06, "loss": 0.034, "step": 52620 }, { "grad_norm": 0.47212064266204834, "learning_rate": 4.069691075115578e-06, "loss": 0.0236, "step": 52630 }, { "grad_norm": 0.38319075107574463, "learning_rate": 4.0588079069226235e-06, "loss": 0.0268, "step": 52640 }, { "grad_norm": 0.7297544479370117, "learning_rate": 4.0479386944508034e-06, "loss": 0.0377, "step": 52650 }, { "grad_norm": 0.669399619102478, "learning_rate": 4.037083441001932e-06, "loss": 0.0209, "step": 52660 }, { "grad_norm": 0.4393808841705322, "learning_rate": 4.026242149873516e-06, "loss": 0.0245, "step": 52670 }, { "grad_norm": 0.46761390566825867, "learning_rate": 4.015414824358871e-06, "loss": 0.0197, "step": 52680 }, { "grad_norm": 0.46646395325660706, "learning_rate": 4.004601467747054e-06, "loss": 0.0211, "step": 52690 }, { "grad_norm": 0.1798756867647171, "learning_rate": 3.993802083322873e-06, "loss": 0.0162, "step": 52700 }, { "grad_norm": 0.22522331774234772, "learning_rate": 3.9830166743668906e-06, "loss": 0.0198, "step": 52710 }, { "grad_norm": 0.2412734180688858, "learning_rate": 3.9722452441554425e-06, "loss": 0.0268, "step": 52720 }, { "grad_norm": 0.3111005425453186, "learning_rate": 3.961487795960584e-06, "loss": 0.0161, "step": 52730 }, { "grad_norm": 0.3579656779766083, "learning_rate": 3.95074433305016e-06, "loss": 0.0208, "step": 52740 }, { "grad_norm": 0.6609001159667969, "learning_rate": 3.940014858687752e-06, "loss": 0.0254, "step": 52750 }, { "grad_norm": 0.45471614599227905, "learning_rate": 3.929299376132689e-06, "loss": 0.0154, "step": 52760 }, { "grad_norm": 0.7600141167640686, "learning_rate": 3.918597888640047e-06, "loss": 0.0308, "step": 52770 }, { "grad_norm": 0.30810070037841797, "learning_rate": 3.907910399460657e-06, "loss": 0.0227, "step": 52780 }, { "grad_norm": 0.326582133769989, "learning_rate": 3.8972369118410956e-06, "loss": 0.0202, "step": 52790 }, { "grad_norm": 0.3514954447746277, "learning_rate": 3.88657742902368e-06, "loss": 0.0184, "step": 52800 }, { "grad_norm": 0.41428449749946594, "learning_rate": 3.875931954246504e-06, "loss": 0.0188, "step": 52810 }, { "grad_norm": 0.28582248091697693, "learning_rate": 3.865300490743351e-06, "loss": 0.0194, "step": 52820 }, { "grad_norm": 0.13146279752254486, "learning_rate": 3.854683041743806e-06, "loss": 0.0117, "step": 52830 }, { "grad_norm": 0.18516068160533905, "learning_rate": 3.844079610473139e-06, "loss": 0.0219, "step": 52840 }, { "grad_norm": 0.3194276988506317, "learning_rate": 3.833490200152423e-06, "loss": 0.022, "step": 52850 }, { "grad_norm": 0.45207422971725464, "learning_rate": 3.822914813998424e-06, "loss": 0.0234, "step": 52860 }, { "grad_norm": 0.24890932440757751, "learning_rate": 3.812353455223666e-06, "loss": 0.0219, "step": 52870 }, { "grad_norm": 0.39565736055374146, "learning_rate": 3.8018061270364225e-06, "loss": 0.0144, "step": 52880 }, { "grad_norm": 0.3402644693851471, "learning_rate": 3.7912728326406688e-06, "loss": 0.0327, "step": 52890 }, { "grad_norm": 0.47958067059516907, "learning_rate": 3.7807535752361732e-06, "loss": 0.0158, "step": 52900 }, { "grad_norm": 0.20881690084934235, "learning_rate": 3.7702483580183855e-06, "loss": 0.0193, "step": 52910 }, { "grad_norm": 0.3563046157360077, "learning_rate": 3.759757184178525e-06, "loss": 0.0297, "step": 52920 }, { "grad_norm": 0.38310500979423523, "learning_rate": 3.7492800569035312e-06, "loss": 0.0237, "step": 52930 }, { "grad_norm": 0.5296953916549683, "learning_rate": 3.7388169793760754e-06, "loss": 0.0231, "step": 52940 }, { "grad_norm": 0.19764484465122223, "learning_rate": 3.728367954774553e-06, "loss": 0.0132, "step": 52950 }, { "grad_norm": 0.32597389817237854, "learning_rate": 3.7179329862731317e-06, "loss": 0.0205, "step": 52960 }, { "grad_norm": 0.3653144836425781, "learning_rate": 3.707512077041647e-06, "loss": 0.0219, "step": 52970 }, { "grad_norm": 0.220437154173851, "learning_rate": 3.6971052302457288e-06, "loss": 0.0197, "step": 52980 }, { "grad_norm": 0.26637938618659973, "learning_rate": 3.6867124490466698e-06, "loss": 0.0229, "step": 52990 }, { "grad_norm": 0.4340769052505493, "learning_rate": 3.6763337366015393e-06, "loss": 0.0277, "step": 53000 }, { "grad_norm": 0.32271260023117065, "learning_rate": 3.665969096063121e-06, "loss": 0.0456, "step": 53010 }, { "grad_norm": 0.3510395586490631, "learning_rate": 3.6556185305799074e-06, "loss": 0.02, "step": 53020 }, { "grad_norm": 0.14790619909763336, "learning_rate": 3.645282043296133e-06, "loss": 0.0147, "step": 53030 }, { "grad_norm": 0.458304762840271, "learning_rate": 3.6349596373517427e-06, "loss": 0.0315, "step": 53040 }, { "grad_norm": 0.24506895244121552, "learning_rate": 3.6246513158824215e-06, "loss": 0.0204, "step": 53050 }, { "grad_norm": 0.30368685722351074, "learning_rate": 3.6143570820195593e-06, "loss": 0.0255, "step": 53060 }, { "grad_norm": 0.45478278398513794, "learning_rate": 3.6040769388902773e-06, "loss": 0.0281, "step": 53070 }, { "grad_norm": 0.47513505816459656, "learning_rate": 3.593810889617405e-06, "loss": 0.0251, "step": 53080 }, { "grad_norm": 0.5121260285377502, "learning_rate": 3.5835589373194978e-06, "loss": 0.023, "step": 53090 }, { "grad_norm": 0.4413342773914337, "learning_rate": 3.5733210851108257e-06, "loss": 0.0216, "step": 53100 }, { "grad_norm": 0.6846964359283447, "learning_rate": 3.5630973361014008e-06, "loss": 0.029, "step": 53110 }, { "grad_norm": 0.42846620082855225, "learning_rate": 3.552887693396889e-06, "loss": 0.0221, "step": 53120 }, { "grad_norm": 0.381909042596817, "learning_rate": 3.542692160098754e-06, "loss": 0.0231, "step": 53130 }, { "grad_norm": 0.3544570207595825, "learning_rate": 3.5325107393040846e-06, "loss": 0.0156, "step": 53140 }, { "grad_norm": 0.2659839391708374, "learning_rate": 3.522343434105757e-06, "loss": 0.0217, "step": 53150 }, { "grad_norm": 0.5041897296905518, "learning_rate": 3.512190247592323e-06, "loss": 0.0185, "step": 53160 }, { "grad_norm": 0.9916229248046875, "learning_rate": 3.502051182848054e-06, "loss": 0.026, "step": 53170 }, { "grad_norm": 0.5005380511283875, "learning_rate": 3.4919262429529308e-06, "loss": 0.0205, "step": 53180 }, { "grad_norm": 0.3385971784591675, "learning_rate": 3.4818154309826325e-06, "loss": 0.022, "step": 53190 }, { "grad_norm": 0.5233507752418518, "learning_rate": 3.4717187500085734e-06, "loss": 0.0212, "step": 53200 }, { "grad_norm": 0.329348087310791, "learning_rate": 3.46163620309784e-06, "loss": 0.0141, "step": 53210 }, { "grad_norm": 0.18370619416236877, "learning_rate": 3.4515677933132595e-06, "loss": 0.0211, "step": 53220 }, { "grad_norm": 0.28658396005630493, "learning_rate": 3.4415135237133466e-06, "loss": 0.0186, "step": 53230 }, { "grad_norm": 0.2024528533220291, "learning_rate": 3.4314733973523196e-06, "loss": 0.0315, "step": 53240 }, { "grad_norm": 0.2698303163051605, "learning_rate": 3.4214474172800993e-06, "loss": 0.0298, "step": 53250 }, { "grad_norm": 0.5537111163139343, "learning_rate": 3.411435586542322e-06, "loss": 0.0172, "step": 53260 }, { "grad_norm": 0.44947826862335205, "learning_rate": 3.4014379081802995e-06, "loss": 0.0206, "step": 53270 }, { "grad_norm": 0.23354561626911163, "learning_rate": 3.391454385231102e-06, "loss": 0.0177, "step": 53280 }, { "grad_norm": 0.30137792229652405, "learning_rate": 3.3814850207274095e-06, "loss": 0.0177, "step": 53290 }, { "grad_norm": 0.2136675864458084, "learning_rate": 3.371529817697694e-06, "loss": 0.0133, "step": 53300 }, { "grad_norm": 0.269797146320343, "learning_rate": 3.3615887791660585e-06, "loss": 0.0222, "step": 53310 }, { "grad_norm": 0.41518649458885193, "learning_rate": 3.3516619081523426e-06, "loss": 0.0265, "step": 53320 }, { "grad_norm": 0.18822221457958221, "learning_rate": 3.3417492076720567e-06, "loss": 0.0141, "step": 53330 }, { "grad_norm": 0.5026010870933533, "learning_rate": 3.3318506807364147e-06, "loss": 0.0162, "step": 53340 }, { "grad_norm": 0.3499375581741333, "learning_rate": 3.3219663303523553e-06, "loss": 0.023, "step": 53350 }, { "grad_norm": 0.386984258890152, "learning_rate": 3.3120961595224374e-06, "loss": 0.0171, "step": 53360 }, { "grad_norm": 0.2934514582157135, "learning_rate": 3.302240171245002e-06, "loss": 0.0217, "step": 53370 }, { "grad_norm": 0.6975519061088562, "learning_rate": 3.29239836851401e-06, "loss": 0.0276, "step": 53380 }, { "grad_norm": 0.2851276397705078, "learning_rate": 3.2825707543191588e-06, "loss": 0.0297, "step": 53390 }, { "grad_norm": 0.6421399712562561, "learning_rate": 3.272757331645804e-06, "loss": 0.034, "step": 53400 }, { "grad_norm": 0.35142794251441956, "learning_rate": 3.2629581034750166e-06, "loss": 0.02, "step": 53410 }, { "grad_norm": 0.3277333676815033, "learning_rate": 3.2531730727835218e-06, "loss": 0.0217, "step": 53420 }, { "grad_norm": 0.1865222454071045, "learning_rate": 3.2434022425437914e-06, "loss": 0.0247, "step": 53430 }, { "grad_norm": 0.3522029221057892, "learning_rate": 3.233645615723907e-06, "loss": 0.0203, "step": 53440 }, { "grad_norm": 0.3441661298274994, "learning_rate": 3.2239031952876918e-06, "loss": 0.014, "step": 53450 }, { "grad_norm": 0.46960899233818054, "learning_rate": 3.21417498419464e-06, "loss": 0.0179, "step": 53460 }, { "grad_norm": 0.4470946192741394, "learning_rate": 3.204460985399921e-06, "loss": 0.0247, "step": 53470 }, { "grad_norm": 0.26098793745040894, "learning_rate": 3.1947612018543903e-06, "loss": 0.021, "step": 53480 }, { "grad_norm": 0.5317250490188599, "learning_rate": 3.1850756365045753e-06, "loss": 0.0175, "step": 53490 }, { "grad_norm": 0.33158043026924133, "learning_rate": 3.175404292292722e-06, "loss": 0.0293, "step": 53500 }, { "grad_norm": 0.5034148693084717, "learning_rate": 3.1657471721566965e-06, "loss": 0.0193, "step": 53510 }, { "grad_norm": 0.2615130841732025, "learning_rate": 3.1561042790300977e-06, "loss": 0.0181, "step": 53520 }, { "grad_norm": 0.2803767919540405, "learning_rate": 3.1464756158421816e-06, "loss": 0.0148, "step": 53530 }, { "grad_norm": 0.30996185541152954, "learning_rate": 3.136861185517875e-06, "loss": 0.0183, "step": 53540 }, { "grad_norm": 0.43602681159973145, "learning_rate": 3.127260990977798e-06, "loss": 0.0246, "step": 53550 }, { "grad_norm": 0.7294643521308899, "learning_rate": 3.1176750351382235e-06, "loss": 0.0218, "step": 53560 }, { "grad_norm": 0.5299769639968872, "learning_rate": 3.1081033209111153e-06, "loss": 0.0331, "step": 53570 }, { "grad_norm": 0.21848346292972565, "learning_rate": 3.0985458512041155e-06, "loss": 0.0147, "step": 53580 }, { "grad_norm": 0.30107352137565613, "learning_rate": 3.089002628920512e-06, "loss": 0.0189, "step": 53590 }, { "grad_norm": 1.0678503513336182, "learning_rate": 3.079473656959303e-06, "loss": 0.0211, "step": 53600 }, { "grad_norm": 0.41251280903816223, "learning_rate": 3.0699589382151393e-06, "loss": 0.0189, "step": 53610 }, { "grad_norm": 0.27400678396224976, "learning_rate": 3.060458475578326e-06, "loss": 0.0221, "step": 53620 }, { "grad_norm": 0.3421782851219177, "learning_rate": 3.0509722719348656e-06, "loss": 0.0276, "step": 53630 }, { "grad_norm": 0.5686379075050354, "learning_rate": 3.041500330166408e-06, "loss": 0.0224, "step": 53640 }, { "grad_norm": 0.6176370978355408, "learning_rate": 3.032042653150291e-06, "loss": 0.0159, "step": 53650 }, { "grad_norm": 0.5613231062889099, "learning_rate": 3.0225992437594887e-06, "loss": 0.0244, "step": 53660 }, { "grad_norm": 0.31526240706443787, "learning_rate": 3.0131701048626783e-06, "loss": 0.0203, "step": 53670 }, { "grad_norm": 0.5557582974433899, "learning_rate": 3.003755239324163e-06, "loss": 0.0289, "step": 53680 }, { "grad_norm": 0.20823794603347778, "learning_rate": 2.9943546500039553e-06, "loss": 0.0215, "step": 53690 }, { "grad_norm": 0.6967326402664185, "learning_rate": 2.9849683397576877e-06, "loss": 0.0268, "step": 53700 }, { "grad_norm": 0.14288124442100525, "learning_rate": 2.975596311436679e-06, "loss": 0.0259, "step": 53710 }, { "grad_norm": 0.21073780953884125, "learning_rate": 2.966238567887902e-06, "loss": 0.0195, "step": 53720 }, { "grad_norm": 0.1729731261730194, "learning_rate": 2.9568951119539943e-06, "loss": 0.02, "step": 53730 }, { "grad_norm": 0.315546452999115, "learning_rate": 2.947565946473241e-06, "loss": 0.0199, "step": 53740 }, { "grad_norm": 0.36233246326446533, "learning_rate": 2.9382510742796188e-06, "loss": 0.0151, "step": 53750 }, { "grad_norm": 0.5280331373214722, "learning_rate": 2.9289504982027204e-06, "loss": 0.026, "step": 53760 }, { "grad_norm": 0.12891896069049835, "learning_rate": 2.9196642210678248e-06, "loss": 0.019, "step": 53770 }, { "grad_norm": 0.17488446831703186, "learning_rate": 2.910392245695853e-06, "loss": 0.0178, "step": 53780 }, { "grad_norm": 0.27717530727386475, "learning_rate": 2.9011345749033804e-06, "loss": 0.019, "step": 53790 }, { "grad_norm": 0.3201059401035309, "learning_rate": 2.8918912115026677e-06, "loss": 0.0207, "step": 53800 }, { "grad_norm": 0.3592357039451599, "learning_rate": 2.8826621583015743e-06, "loss": 0.0318, "step": 53810 }, { "grad_norm": 0.3052090108394623, "learning_rate": 2.8734474181036643e-06, "loss": 0.025, "step": 53820 }, { "grad_norm": 0.29907310009002686, "learning_rate": 2.864246993708114e-06, "loss": 0.0193, "step": 53830 }, { "grad_norm": 0.6612403988838196, "learning_rate": 2.8550608879097884e-06, "loss": 0.0205, "step": 53840 }, { "grad_norm": 0.25185710191726685, "learning_rate": 2.845889103499172e-06, "loss": 0.0213, "step": 53850 }, { "grad_norm": 0.43838587403297424, "learning_rate": 2.8367316432624138e-06, "loss": 0.0178, "step": 53860 }, { "grad_norm": 0.4601554870605469, "learning_rate": 2.8275885099813105e-06, "loss": 0.0316, "step": 53870 }, { "grad_norm": 0.8077422380447388, "learning_rate": 2.8184597064332963e-06, "loss": 0.033, "step": 53880 }, { "grad_norm": 0.41094493865966797, "learning_rate": 2.809345235391464e-06, "loss": 0.0263, "step": 53890 }, { "grad_norm": 0.39647698402404785, "learning_rate": 2.80024509962456e-06, "loss": 0.0183, "step": 53900 }, { "grad_norm": 0.7247254252433777, "learning_rate": 2.7911593018969563e-06, "loss": 0.0221, "step": 53910 }, { "grad_norm": 0.4522287845611572, "learning_rate": 2.7820878449686838e-06, "loss": 0.0226, "step": 53920 }, { "grad_norm": 0.33721721172332764, "learning_rate": 2.7730307315953995e-06, "loss": 0.0349, "step": 53930 }, { "grad_norm": 0.27232882380485535, "learning_rate": 2.763987964528425e-06, "loss": 0.023, "step": 53940 }, { "grad_norm": 0.6632117033004761, "learning_rate": 2.754959546514718e-06, "loss": 0.0188, "step": 53950 }, { "grad_norm": 0.3236786723136902, "learning_rate": 2.7459454802968576e-06, "loss": 0.0153, "step": 53960 }, { "grad_norm": 0.5896658301353455, "learning_rate": 2.7369457686131028e-06, "loss": 0.0234, "step": 53970 }, { "grad_norm": 0.50972980260849, "learning_rate": 2.7279604141973004e-06, "loss": 0.0286, "step": 53980 }, { "grad_norm": 0.5279756784439087, "learning_rate": 2.7189894197789946e-06, "loss": 0.0176, "step": 53990 }, { "grad_norm": 0.5191856622695923, "learning_rate": 2.7100327880833055e-06, "loss": 0.0202, "step": 54000 }, { "grad_norm": 0.28588852286338806, "learning_rate": 2.70109052183104e-06, "loss": 0.0164, "step": 54010 }, { "grad_norm": 0.4270903170108795, "learning_rate": 2.69216262373862e-06, "loss": 0.015, "step": 54020 }, { "grad_norm": 0.4012005031108856, "learning_rate": 2.6832490965181036e-06, "loss": 0.0186, "step": 54030 }, { "grad_norm": 0.28910619020462036, "learning_rate": 2.6743499428771857e-06, "loss": 0.0208, "step": 54040 }, { "grad_norm": 0.39279094338417053, "learning_rate": 2.6654651655191875e-06, "loss": 0.0145, "step": 54050 }, { "grad_norm": 0.366111159324646, "learning_rate": 2.6565947671430836e-06, "loss": 0.0155, "step": 54060 }, { "grad_norm": 0.2503107488155365, "learning_rate": 2.647738750443457e-06, "loss": 0.0175, "step": 54070 }, { "grad_norm": 0.17230461537837982, "learning_rate": 2.6388971181105393e-06, "loss": 0.0289, "step": 54080 }, { "grad_norm": 0.15990637242794037, "learning_rate": 2.630069872830171e-06, "loss": 0.0161, "step": 54090 }, { "grad_norm": 0.4628559947013855, "learning_rate": 2.6212570172838514e-06, "loss": 0.023, "step": 54100 }, { "grad_norm": 0.47900789976119995, "learning_rate": 2.6124585541486778e-06, "loss": 0.0243, "step": 54110 }, { "grad_norm": 0.26285621523857117, "learning_rate": 2.6036744860974127e-06, "loss": 0.0122, "step": 54120 }, { "grad_norm": 0.1929483711719513, "learning_rate": 2.594904815798399e-06, "loss": 0.0192, "step": 54130 }, { "grad_norm": 0.5336416363716125, "learning_rate": 2.586149545915656e-06, "loss": 0.0208, "step": 54140 }, { "grad_norm": 0.5124112367630005, "learning_rate": 2.577408679108778e-06, "loss": 0.0169, "step": 54150 }, { "grad_norm": 0.5862692594528198, "learning_rate": 2.5686822180330306e-06, "loss": 0.018, "step": 54160 }, { "grad_norm": 0.6687697768211365, "learning_rate": 2.5599701653392703e-06, "loss": 0.0209, "step": 54170 }, { "grad_norm": 0.174785777926445, "learning_rate": 2.551272523673992e-06, "loss": 0.0164, "step": 54180 }, { "grad_norm": 0.4655226171016693, "learning_rate": 2.542589295679315e-06, "loss": 0.0253, "step": 54190 }, { "grad_norm": 0.3421829640865326, "learning_rate": 2.5339204839929575e-06, "loss": 0.0143, "step": 54200 }, { "grad_norm": 0.5119094848632812, "learning_rate": 2.525266091248296e-06, "loss": 0.0184, "step": 54210 }, { "grad_norm": 0.4910924732685089, "learning_rate": 2.5166261200743e-06, "loss": 0.0218, "step": 54220 }, { "grad_norm": 0.4822901785373688, "learning_rate": 2.5080005730955646e-06, "loss": 0.023, "step": 54230 }, { "grad_norm": 0.3472847044467926, "learning_rate": 2.499389452932299e-06, "loss": 0.0252, "step": 54240 }, { "grad_norm": 0.38730016350746155, "learning_rate": 2.4907927622003336e-06, "loss": 0.0176, "step": 54250 }, { "grad_norm": 0.4017443060874939, "learning_rate": 2.4822105035111177e-06, "loss": 0.0233, "step": 54260 }, { "grad_norm": 0.6130653023719788, "learning_rate": 2.4736426794717273e-06, "loss": 0.0262, "step": 54270 }, { "grad_norm": 0.39490076899528503, "learning_rate": 2.4650892926848135e-06, "loss": 0.0179, "step": 54280 }, { "grad_norm": 0.27458322048187256, "learning_rate": 2.456550345748704e-06, "loss": 0.0201, "step": 54290 }, { "grad_norm": 0.29230284690856934, "learning_rate": 2.4480258412572733e-06, "loss": 0.0152, "step": 54300 }, { "grad_norm": 0.4604647159576416, "learning_rate": 2.4395157818000612e-06, "loss": 0.0181, "step": 54310 }, { "grad_norm": 0.2120380848646164, "learning_rate": 2.431020169962189e-06, "loss": 0.0183, "step": 54320 }, { "grad_norm": 0.6942935585975647, "learning_rate": 2.422539008324409e-06, "loss": 0.0227, "step": 54330 }, { "grad_norm": 0.3001829981803894, "learning_rate": 2.414072299463066e-06, "loss": 0.0174, "step": 54340 }, { "grad_norm": 0.30804017186164856, "learning_rate": 2.4056200459501186e-06, "loss": 0.0205, "step": 54350 }, { "grad_norm": 0.6527283191680908, "learning_rate": 2.397182250353147e-06, "loss": 0.0359, "step": 54360 }, { "grad_norm": 0.41311728954315186, "learning_rate": 2.388758915235334e-06, "loss": 0.0204, "step": 54370 }, { "grad_norm": 0.3906332850456238, "learning_rate": 2.380350043155455e-06, "loss": 0.0209, "step": 54380 }, { "grad_norm": 0.713746964931488, "learning_rate": 2.371955636667911e-06, "loss": 0.0326, "step": 54390 }, { "grad_norm": 0.3437851369380951, "learning_rate": 2.3635756983227008e-06, "loss": 0.0305, "step": 54400 }, { "grad_norm": 0.36710429191589355, "learning_rate": 2.3552102306654278e-06, "loss": 0.0249, "step": 54410 }, { "grad_norm": 0.6291863322257996, "learning_rate": 2.346859236237292e-06, "loss": 0.0181, "step": 54420 }, { "grad_norm": 0.3520095646381378, "learning_rate": 2.3385227175751145e-06, "loss": 0.0159, "step": 54430 }, { "grad_norm": 0.3114345371723175, "learning_rate": 2.330200677211314e-06, "loss": 0.0206, "step": 54440 }, { "grad_norm": 0.31837746500968933, "learning_rate": 2.3218931176738847e-06, "loss": 0.0186, "step": 54450 }, { "grad_norm": 0.33810344338417053, "learning_rate": 2.313600041486469e-06, "loss": 0.0166, "step": 54460 }, { "grad_norm": 0.1832834780216217, "learning_rate": 2.3053214511682743e-06, "loss": 0.0168, "step": 54470 }, { "grad_norm": 0.25232821702957153, "learning_rate": 2.2970573492341163e-06, "loss": 0.0177, "step": 54480 }, { "grad_norm": 0.5402427911758423, "learning_rate": 2.288807738194415e-06, "loss": 0.045, "step": 54490 }, { "grad_norm": 0.5561582446098328, "learning_rate": 2.2805726205551768e-06, "loss": 0.019, "step": 54500 }, { "grad_norm": 0.5736695528030396, "learning_rate": 2.272351998818023e-06, "loss": 0.0148, "step": 54510 }, { "grad_norm": 0.44704002141952515, "learning_rate": 2.2641458754801505e-06, "loss": 0.0196, "step": 54520 }, { "grad_norm": 0.23948100209236145, "learning_rate": 2.2559542530343756e-06, "loss": 0.0146, "step": 54530 }, { "grad_norm": 0.2058890461921692, "learning_rate": 2.247777133969087e-06, "loss": 0.0144, "step": 54540 }, { "grad_norm": 0.2515376806259155, "learning_rate": 2.2396145207682795e-06, "loss": 0.0205, "step": 54550 }, { "grad_norm": 0.4644915759563446, "learning_rate": 2.231466415911543e-06, "loss": 0.025, "step": 54560 }, { "grad_norm": 0.20127296447753906, "learning_rate": 2.2233328218740524e-06, "loss": 0.0143, "step": 54570 }, { "grad_norm": 0.45329761505126953, "learning_rate": 2.215213741126576e-06, "loss": 0.015, "step": 54580 }, { "grad_norm": 0.16747307777404785, "learning_rate": 2.2071091761354912e-06, "loss": 0.0141, "step": 54590 }, { "grad_norm": 0.5511597990989685, "learning_rate": 2.1990191293627337e-06, "loss": 0.0227, "step": 54600 }, { "grad_norm": 0.39905279874801636, "learning_rate": 2.1909436032658548e-06, "loss": 0.0237, "step": 54610 }, { "grad_norm": 0.6279119253158569, "learning_rate": 2.1828826002979806e-06, "loss": 0.0201, "step": 54620 }, { "grad_norm": 0.1527787446975708, "learning_rate": 2.1748361229078362e-06, "loss": 0.0233, "step": 54630 }, { "grad_norm": 0.3424808979034424, "learning_rate": 2.1668041735397327e-06, "loss": 0.0241, "step": 54640 }, { "grad_norm": 0.38285472989082336, "learning_rate": 2.1587867546335514e-06, "loss": 0.0171, "step": 54650 }, { "grad_norm": 0.432063490152359, "learning_rate": 2.1507838686247894e-06, "loss": 0.0145, "step": 54660 }, { "grad_norm": 0.26445096731185913, "learning_rate": 2.1427955179444957e-06, "loss": 0.0173, "step": 54670 }, { "grad_norm": 0.3516097664833069, "learning_rate": 2.13482170501933e-06, "loss": 0.0195, "step": 54680 }, { "grad_norm": 0.6691017746925354, "learning_rate": 2.1268624322715202e-06, "loss": 0.0245, "step": 54690 }, { "grad_norm": 0.44178494811058044, "learning_rate": 2.118917702118889e-06, "loss": 0.0202, "step": 54700 }, { "grad_norm": 0.2489091455936432, "learning_rate": 2.1109875169748327e-06, "loss": 0.023, "step": 54710 }, { "grad_norm": 0.5087804794311523, "learning_rate": 2.103071879248336e-06, "loss": 0.0178, "step": 54720 }, { "grad_norm": 0.24194970726966858, "learning_rate": 2.0951707913439478e-06, "loss": 0.0178, "step": 54730 }, { "grad_norm": 0.43630820512771606, "learning_rate": 2.0872842556618255e-06, "loss": 0.0187, "step": 54740 }, { "grad_norm": 0.3393300771713257, "learning_rate": 2.079412274597686e-06, "loss": 0.0199, "step": 54750 }, { "grad_norm": 0.4400307238101959, "learning_rate": 2.0715548505428284e-06, "loss": 0.0261, "step": 54760 }, { "grad_norm": 0.35158300399780273, "learning_rate": 2.0637119858841258e-06, "loss": 0.023, "step": 54770 }, { "grad_norm": 0.25513017177581787, "learning_rate": 2.055883683004034e-06, "loss": 0.0151, "step": 54780 }, { "grad_norm": 0.20657554268836975, "learning_rate": 2.0480699442806006e-06, "loss": 0.0189, "step": 54790 }, { "grad_norm": 0.3399648666381836, "learning_rate": 2.0402707720874105e-06, "loss": 0.0171, "step": 54800 }, { "grad_norm": 0.40167438983917236, "learning_rate": 2.032486168793668e-06, "loss": 0.025, "step": 54810 }, { "grad_norm": 0.19625596702098846, "learning_rate": 2.024716136764104e-06, "loss": 0.0221, "step": 54820 }, { "grad_norm": 0.29835209250450134, "learning_rate": 2.0169606783590757e-06, "loss": 0.0163, "step": 54830 }, { "grad_norm": 0.22766557335853577, "learning_rate": 2.0092197959344638e-06, "loss": 0.0204, "step": 54840 }, { "grad_norm": 0.5794190168380737, "learning_rate": 2.0014934918417606e-06, "loss": 0.0197, "step": 54850 }, { "grad_norm": 0.3112379312515259, "learning_rate": 1.99378176842801e-06, "loss": 0.0212, "step": 54860 }, { "grad_norm": 0.34767788648605347, "learning_rate": 1.9860846280358224e-06, "loss": 0.0212, "step": 54870 }, { "grad_norm": 0.6138976812362671, "learning_rate": 1.978402073003394e-06, "loss": 0.0264, "step": 54880 }, { "grad_norm": 0.31302788853645325, "learning_rate": 1.9707341056644736e-06, "loss": 0.0153, "step": 54890 }, { "grad_norm": 0.43608298897743225, "learning_rate": 1.963080728348399e-06, "loss": 0.017, "step": 54900 }, { "grad_norm": 0.24539081752300262, "learning_rate": 1.955441943380054e-06, "loss": 0.0192, "step": 54910 }, { "grad_norm": 0.30813315510749817, "learning_rate": 1.947817753079906e-06, "loss": 0.0227, "step": 54920 }, { "grad_norm": 0.6683168411254883, "learning_rate": 1.9402081597639785e-06, "loss": 0.0242, "step": 54930 }, { "grad_norm": 0.6451540589332581, "learning_rate": 1.9326131657438683e-06, "loss": 0.0361, "step": 54940 }, { "grad_norm": 0.42108458280563354, "learning_rate": 1.925032773326724e-06, "loss": 0.0227, "step": 54950 }, { "grad_norm": 0.634102463722229, "learning_rate": 1.9174669848152916e-06, "loss": 0.0192, "step": 54960 }, { "grad_norm": 0.2837274372577667, "learning_rate": 1.9099158025078336e-06, "loss": 0.0218, "step": 54970 }, { "grad_norm": 0.3705548048019409, "learning_rate": 1.90237922869822e-06, "loss": 0.0252, "step": 54980 }, { "grad_norm": 0.13129167258739471, "learning_rate": 1.8948572656758367e-06, "loss": 0.0198, "step": 54990 }, { "grad_norm": 0.38898220658302307, "learning_rate": 1.8873499157256834e-06, "loss": 0.0306, "step": 55000 }, { "grad_norm": 0.4022713601589203, "learning_rate": 1.879857181128286e-06, "loss": 0.0205, "step": 55010 }, { "grad_norm": 0.6217085719108582, "learning_rate": 1.8723790641597349e-06, "loss": 0.0199, "step": 55020 }, { "grad_norm": 0.29179081320762634, "learning_rate": 1.8649155670916906e-06, "loss": 0.0174, "step": 55030 }, { "grad_norm": 0.282353937625885, "learning_rate": 1.8574666921913565e-06, "loss": 0.0294, "step": 55040 }, { "grad_norm": 0.37511053681373596, "learning_rate": 1.8500324417215166e-06, "loss": 0.0223, "step": 55050 }, { "grad_norm": 0.31890690326690674, "learning_rate": 1.8426128179404977e-06, "loss": 0.0219, "step": 55060 }, { "grad_norm": 0.5336704850196838, "learning_rate": 1.8352078231021807e-06, "loss": 0.0285, "step": 55070 }, { "grad_norm": 0.2564707398414612, "learning_rate": 1.8278174594560049e-06, "loss": 0.0171, "step": 55080 }, { "grad_norm": 0.5440425276756287, "learning_rate": 1.82044172924698e-06, "loss": 0.0246, "step": 55090 }, { "grad_norm": 0.4377407133579254, "learning_rate": 1.813080634715636e-06, "loss": 0.0241, "step": 55100 }, { "grad_norm": 0.36681538820266724, "learning_rate": 1.8057341780981119e-06, "loss": 0.0152, "step": 55110 }, { "grad_norm": 0.2708076536655426, "learning_rate": 1.7984023616260338e-06, "loss": 0.0148, "step": 55120 }, { "grad_norm": 0.24684825539588928, "learning_rate": 1.7910851875266421e-06, "loss": 0.0223, "step": 55130 }, { "grad_norm": 0.467349648475647, "learning_rate": 1.7837826580226757e-06, "loss": 0.022, "step": 55140 }, { "grad_norm": 0.4286964535713196, "learning_rate": 1.7764947753324656e-06, "loss": 0.0173, "step": 55150 }, { "grad_norm": 0.7540074586868286, "learning_rate": 1.7692215416698799e-06, "loss": 0.015, "step": 55160 }, { "grad_norm": 0.589078962802887, "learning_rate": 1.7619629592443233e-06, "loss": 0.0211, "step": 55170 }, { "grad_norm": 1.0579742193222046, "learning_rate": 1.7547190302607709e-06, "loss": 0.0233, "step": 55180 }, { "grad_norm": 0.4706648290157318, "learning_rate": 1.747489756919729e-06, "loss": 0.0251, "step": 55190 }, { "grad_norm": 0.4996583163738251, "learning_rate": 1.7402751414172802e-06, "loss": 0.0233, "step": 55200 }, { "grad_norm": 0.5369991660118103, "learning_rate": 1.7330751859450044e-06, "loss": 0.0244, "step": 55210 }, { "grad_norm": 0.3352694809436798, "learning_rate": 1.7258898926900801e-06, "loss": 0.0232, "step": 55220 }, { "grad_norm": 0.7308063507080078, "learning_rate": 1.7187192638352002e-06, "loss": 0.0245, "step": 55230 }, { "grad_norm": 0.31416064500808716, "learning_rate": 1.7115633015586163e-06, "loss": 0.0198, "step": 55240 }, { "grad_norm": 0.4186398386955261, "learning_rate": 1.7044220080341178e-06, "loss": 0.014, "step": 55250 }, { "grad_norm": 0.6253836750984192, "learning_rate": 1.6972953854310414e-06, "loss": 0.0182, "step": 55260 }, { "grad_norm": 0.29197514057159424, "learning_rate": 1.690183435914261e-06, "loss": 0.0174, "step": 55270 }, { "grad_norm": 0.39536458253860474, "learning_rate": 1.6830861616442206e-06, "loss": 0.0192, "step": 55280 }, { "grad_norm": 0.46862098574638367, "learning_rate": 1.6760035647768568e-06, "loss": 0.0245, "step": 55290 }, { "grad_norm": 0.3066451847553253, "learning_rate": 1.6689356474636985e-06, "loss": 0.0339, "step": 55300 }, { "grad_norm": 0.2357746958732605, "learning_rate": 1.6618824118517784e-06, "loss": 0.0152, "step": 55310 }, { "grad_norm": 0.3270311653614044, "learning_rate": 1.654843860083688e-06, "loss": 0.0353, "step": 55320 }, { "grad_norm": 0.590121328830719, "learning_rate": 1.647819994297556e-06, "loss": 0.0332, "step": 55330 }, { "grad_norm": 0.44966739416122437, "learning_rate": 1.6408108166270363e-06, "loss": 0.0228, "step": 55340 }, { "grad_norm": 0.3695054054260254, "learning_rate": 1.6338163292013486e-06, "loss": 0.019, "step": 55350 }, { "grad_norm": 0.2947315573692322, "learning_rate": 1.6268365341452208e-06, "loss": 0.0228, "step": 55360 }, { "grad_norm": 0.29353734850883484, "learning_rate": 1.6198714335789345e-06, "loss": 0.0178, "step": 55370 }, { "grad_norm": 0.19318526983261108, "learning_rate": 1.612921029618303e-06, "loss": 0.0177, "step": 55380 }, { "grad_norm": 0.53536057472229, "learning_rate": 1.6059853243746815e-06, "loss": 0.0244, "step": 55390 }, { "grad_norm": 0.1716267466545105, "learning_rate": 1.5990643199549404e-06, "loss": 0.0232, "step": 55400 }, { "grad_norm": 0.3211418390274048, "learning_rate": 1.5921580184615147e-06, "loss": 0.0185, "step": 55410 }, { "grad_norm": 0.4132545292377472, "learning_rate": 1.5852664219923374e-06, "loss": 0.0183, "step": 55420 }, { "grad_norm": 0.14260517060756683, "learning_rate": 1.5783895326409172e-06, "loss": 0.0151, "step": 55430 }, { "grad_norm": 0.24795249104499817, "learning_rate": 1.5715273524962438e-06, "loss": 0.0212, "step": 55440 }, { "grad_norm": 0.26648616790771484, "learning_rate": 1.564679883642889e-06, "loss": 0.0227, "step": 55450 }, { "grad_norm": 0.2060139924287796, "learning_rate": 1.5578471281609276e-06, "loss": 0.0154, "step": 55460 }, { "grad_norm": 0.4150611460208893, "learning_rate": 1.551029088125966e-06, "loss": 0.0154, "step": 55470 }, { "grad_norm": 0.19583162665367126, "learning_rate": 1.544225765609142e-06, "loss": 0.0183, "step": 55480 }, { "grad_norm": 0.5858960747718811, "learning_rate": 1.53743716267713e-06, "loss": 0.0205, "step": 55490 }, { "grad_norm": 0.5867142677307129, "learning_rate": 1.5306632813921361e-06, "loss": 0.0149, "step": 55500 }, { "grad_norm": 0.2994583249092102, "learning_rate": 1.52390412381187e-06, "loss": 0.0169, "step": 55510 }, { "grad_norm": 0.30233457684516907, "learning_rate": 1.5171596919895948e-06, "loss": 0.0199, "step": 55520 }, { "grad_norm": 0.7383714914321899, "learning_rate": 1.5104299879740936e-06, "loss": 0.0285, "step": 55530 }, { "grad_norm": 0.5732960104942322, "learning_rate": 1.5037150138096701e-06, "loss": 0.0205, "step": 55540 }, { "grad_norm": 0.23648156225681305, "learning_rate": 1.4970147715361538e-06, "loss": 0.0205, "step": 55550 }, { "grad_norm": 0.7105567455291748, "learning_rate": 1.4903292631889054e-06, "loss": 0.0206, "step": 55560 }, { "grad_norm": 0.7372156381607056, "learning_rate": 1.483658490798795e-06, "loss": 0.0291, "step": 55570 }, { "grad_norm": 0.553195059299469, "learning_rate": 1.4770024563922457e-06, "loss": 0.0276, "step": 55580 }, { "grad_norm": 1.1578043699264526, "learning_rate": 1.4703611619911628e-06, "loss": 0.0231, "step": 55590 }, { "grad_norm": 0.3392793834209442, "learning_rate": 1.4637346096130155e-06, "loss": 0.0203, "step": 55600 }, { "grad_norm": 0.4474485218524933, "learning_rate": 1.4571228012707662e-06, "loss": 0.0215, "step": 55610 }, { "grad_norm": 0.4766024053096771, "learning_rate": 1.4505257389729132e-06, "loss": 0.0417, "step": 55620 }, { "grad_norm": 0.5154149532318115, "learning_rate": 1.4439434247234596e-06, "loss": 0.0176, "step": 55630 }, { "grad_norm": 0.19068722426891327, "learning_rate": 1.4373758605219445e-06, "loss": 0.0214, "step": 55640 }, { "grad_norm": 0.35657066106796265, "learning_rate": 1.4308230483634333e-06, "loss": 0.0169, "step": 55650 }, { "grad_norm": 0.3496154844760895, "learning_rate": 1.4242849902384724e-06, "loss": 0.0187, "step": 55660 }, { "grad_norm": 0.3710026741027832, "learning_rate": 1.4177616881331734e-06, "loss": 0.022, "step": 55670 }, { "grad_norm": 0.3676792085170746, "learning_rate": 1.4112531440291233e-06, "loss": 0.0239, "step": 55680 }, { "grad_norm": 0.45684537291526794, "learning_rate": 1.4047593599034624e-06, "loss": 0.0181, "step": 55690 }, { "grad_norm": 0.2071218341588974, "learning_rate": 1.3982803377288246e-06, "loss": 0.0232, "step": 55700 }, { "grad_norm": 0.25754112005233765, "learning_rate": 1.3918160794733681e-06, "loss": 0.0147, "step": 55710 }, { "grad_norm": 0.4081452190876007, "learning_rate": 1.3853665871007615e-06, "loss": 0.0214, "step": 55720 }, { "grad_norm": 0.33985552191734314, "learning_rate": 1.378931862570193e-06, "loss": 0.0178, "step": 55730 }, { "grad_norm": 0.30836963653564453, "learning_rate": 1.372511907836349e-06, "loss": 0.02, "step": 55740 }, { "grad_norm": 0.4177175760269165, "learning_rate": 1.3661067248494586e-06, "loss": 0.0222, "step": 55750 }, { "grad_norm": 0.7557799816131592, "learning_rate": 1.3597163155552429e-06, "loss": 0.0507, "step": 55760 }, { "grad_norm": 0.6972224116325378, "learning_rate": 1.3533406818949434e-06, "loss": 0.0299, "step": 55770 }, { "grad_norm": 0.2855721414089203, "learning_rate": 1.3469798258053002e-06, "loss": 0.018, "step": 55780 }, { "grad_norm": 0.2724454998970032, "learning_rate": 1.3406337492185672e-06, "loss": 0.0187, "step": 55790 }, { "grad_norm": 0.34800922870635986, "learning_rate": 1.3343024540625414e-06, "loss": 0.0234, "step": 55800 }, { "grad_norm": 0.28418251872062683, "learning_rate": 1.3279859422604735e-06, "loss": 0.0164, "step": 55810 }, { "grad_norm": 0.5306480526924133, "learning_rate": 1.3216842157311781e-06, "loss": 0.0163, "step": 55820 }, { "grad_norm": 0.2929772734642029, "learning_rate": 1.3153972763889355e-06, "loss": 0.0197, "step": 55830 }, { "grad_norm": 0.18333560228347778, "learning_rate": 1.3091251261435566e-06, "loss": 0.0196, "step": 55840 }, { "grad_norm": 0.38587486743927, "learning_rate": 1.3028677669003564e-06, "loss": 0.0183, "step": 55850 }, { "grad_norm": 0.5301830172538757, "learning_rate": 1.2966252005601587e-06, "loss": 0.0189, "step": 55860 }, { "grad_norm": 0.6128166317939758, "learning_rate": 1.2903974290192855e-06, "loss": 0.018, "step": 55870 }, { "grad_norm": 0.30468451976776123, "learning_rate": 1.284184454169568e-06, "loss": 0.0192, "step": 55880 }, { "grad_norm": 0.31847789883613586, "learning_rate": 1.2779862778983464e-06, "loss": 0.0197, "step": 55890 }, { "grad_norm": 0.18282407522201538, "learning_rate": 1.2718029020884647e-06, "loss": 0.0239, "step": 55900 }, { "grad_norm": 0.39725032448768616, "learning_rate": 1.2656343286182703e-06, "loss": 0.0157, "step": 55910 }, { "grad_norm": 0.20007231831550598, "learning_rate": 1.2594805593616088e-06, "loss": 0.0411, "step": 55920 }, { "grad_norm": 0.44569307565689087, "learning_rate": 1.2533415961878404e-06, "loss": 0.0146, "step": 55930 }, { "grad_norm": 0.6311435103416443, "learning_rate": 1.2472174409618009e-06, "loss": 0.0173, "step": 55940 }, { "grad_norm": 0.8931568264961243, "learning_rate": 1.2411080955438747e-06, "loss": 0.0197, "step": 55950 }, { "grad_norm": 0.28867796063423157, "learning_rate": 1.235013561789894e-06, "loss": 0.0178, "step": 55960 }, { "grad_norm": 0.5160608887672424, "learning_rate": 1.2289338415512385e-06, "loss": 0.0204, "step": 55970 }, { "grad_norm": 0.39513057470321655, "learning_rate": 1.222868936674748e-06, "loss": 0.0218, "step": 55980 }, { "grad_norm": 0.5463154315948486, "learning_rate": 1.2168188490027876e-06, "loss": 0.0309, "step": 55990 }, { "grad_norm": 0.24139568209648132, "learning_rate": 1.2107835803732204e-06, "loss": 0.0168, "step": 56000 }, { "grad_norm": 0.2885236442089081, "learning_rate": 1.2047631326193964e-06, "loss": 0.0191, "step": 56010 }, { "grad_norm": 0.2695394456386566, "learning_rate": 1.1987575075701696e-06, "loss": 0.0172, "step": 56020 }, { "grad_norm": 0.5225959420204163, "learning_rate": 1.1927667070498916e-06, "loss": 0.0273, "step": 56030 }, { "grad_norm": 0.37974244356155396, "learning_rate": 1.1867907328784067e-06, "loss": 0.0218, "step": 56040 }, { "grad_norm": 0.3579225242137909, "learning_rate": 1.1808295868710518e-06, "loss": 0.0222, "step": 56050 }, { "grad_norm": 0.4086303114891052, "learning_rate": 1.174883270838678e-06, "loss": 0.0151, "step": 56060 }, { "grad_norm": 0.3234381377696991, "learning_rate": 1.1689517865876187e-06, "loss": 0.0214, "step": 56070 }, { "grad_norm": 0.14092792570590973, "learning_rate": 1.1630351359196933e-06, "loss": 0.0272, "step": 56080 }, { "grad_norm": 0.4106067419052124, "learning_rate": 1.1571333206322255e-06, "loss": 0.0213, "step": 56090 }, { "grad_norm": 0.7108640670776367, "learning_rate": 1.1512463425180365e-06, "loss": 0.023, "step": 56100 }, { "grad_norm": 0.5598741173744202, "learning_rate": 1.145374203365429e-06, "loss": 0.02, "step": 56110 }, { "grad_norm": 0.3726085424423218, "learning_rate": 1.1395169049582155e-06, "loss": 0.0195, "step": 56120 }, { "grad_norm": 0.34337663650512695, "learning_rate": 1.1336744490756722e-06, "loss": 0.015, "step": 56130 }, { "grad_norm": 0.5183547735214233, "learning_rate": 1.1278468374925967e-06, "loss": 0.0256, "step": 56140 }, { "grad_norm": 0.21310612559318542, "learning_rate": 1.12203407197925e-06, "loss": 0.0134, "step": 56150 }, { "grad_norm": 0.27733975648880005, "learning_rate": 1.116236154301409e-06, "loss": 0.0194, "step": 56160 }, { "grad_norm": 0.5687543153762817, "learning_rate": 1.11045308622032e-06, "loss": 0.0236, "step": 56170 }, { "grad_norm": 0.4853838384151459, "learning_rate": 1.1046848694927337e-06, "loss": 0.0242, "step": 56180 }, { "grad_norm": 0.1422659009695053, "learning_rate": 1.098931505870876e-06, "loss": 0.0187, "step": 56190 }, { "grad_norm": 0.5341496467590332, "learning_rate": 1.0931929971024657e-06, "loss": 0.0157, "step": 56200 }, { "grad_norm": 0.37199243903160095, "learning_rate": 1.0874693449307193e-06, "loss": 0.0239, "step": 56210 }, { "grad_norm": 0.3717893064022064, "learning_rate": 1.081760551094324e-06, "loss": 0.0191, "step": 56220 }, { "grad_norm": 0.5454277396202087, "learning_rate": 1.0760666173274592e-06, "loss": 0.0243, "step": 56230 }, { "grad_norm": 0.52334064245224, "learning_rate": 1.0703875453597967e-06, "loss": 0.019, "step": 56240 }, { "grad_norm": 0.3877466917037964, "learning_rate": 1.0647233369164845e-06, "loss": 0.0182, "step": 56250 }, { "grad_norm": 0.2983112335205078, "learning_rate": 1.0590739937181625e-06, "loss": 0.015, "step": 56260 }, { "grad_norm": 0.48448386788368225, "learning_rate": 1.053439517480953e-06, "loss": 0.0211, "step": 56270 }, { "grad_norm": 0.4735480844974518, "learning_rate": 1.047819909916453e-06, "loss": 0.019, "step": 56280 }, { "grad_norm": 0.5027951002120972, "learning_rate": 1.0422151727317697e-06, "loss": 0.021, "step": 56290 }, { "grad_norm": 0.3736591935157776, "learning_rate": 1.0366253076294462e-06, "loss": 0.0335, "step": 56300 }, { "grad_norm": 0.48044171929359436, "learning_rate": 1.031050316307558e-06, "loss": 0.017, "step": 56310 }, { "grad_norm": 0.7697222828865051, "learning_rate": 1.0254902004596333e-06, "loss": 0.0285, "step": 56320 }, { "grad_norm": 0.5762849450111389, "learning_rate": 1.0199449617746882e-06, "loss": 0.0193, "step": 56330 }, { "grad_norm": 0.36674344539642334, "learning_rate": 1.0144146019372247e-06, "loss": 0.021, "step": 56340 }, { "grad_norm": 0.33899980783462524, "learning_rate": 1.0088991226272048e-06, "loss": 0.03, "step": 56350 }, { "grad_norm": 0.4002334177494049, "learning_rate": 1.003398525520105e-06, "loss": 0.0121, "step": 56360 }, { "grad_norm": 0.43506452441215515, "learning_rate": 9.979128122868552e-07, "loss": 0.0236, "step": 56370 }, { "grad_norm": 0.2901763916015625, "learning_rate": 9.924419845938614e-07, "loss": 0.031, "step": 56380 }, { "grad_norm": 0.3590594530105591, "learning_rate": 9.869860441030276e-07, "loss": 0.0217, "step": 56390 }, { "grad_norm": 0.5348643660545349, "learning_rate": 9.815449924717169e-07, "loss": 0.0247, "step": 56400 }, { "grad_norm": 0.43413564562797546, "learning_rate": 9.761188313527791e-07, "loss": 0.0214, "step": 56410 }, { "grad_norm": 0.27743640542030334, "learning_rate": 9.70707562394546e-07, "loss": 0.0156, "step": 56420 }, { "grad_norm": 0.5478072166442871, "learning_rate": 9.653111872408027e-07, "loss": 0.022, "step": 56430 }, { "grad_norm": 0.15016840398311615, "learning_rate": 9.599297075308434e-07, "loss": 0.0205, "step": 56440 }, { "grad_norm": 0.46270838379859924, "learning_rate": 9.545631248994048e-07, "loss": 0.0181, "step": 56450 }, { "grad_norm": 0.20059344172477722, "learning_rate": 9.492114409767217e-07, "loss": 0.0227, "step": 56460 }, { "grad_norm": 0.27468326687812805, "learning_rate": 9.438746573884938e-07, "loss": 0.0212, "step": 56470 }, { "grad_norm": 0.3678772747516632, "learning_rate": 9.385527757558909e-07, "loss": 0.016, "step": 56480 }, { "grad_norm": 0.36558961868286133, "learning_rate": 9.332457976955644e-07, "loss": 0.0244, "step": 56490 }, { "grad_norm": 0.3072582483291626, "learning_rate": 9.279537248196247e-07, "loss": 0.032, "step": 56500 }, { "grad_norm": 0.3395232856273651, "learning_rate": 9.2267655873568e-07, "loss": 0.0212, "step": 56510 }, { "grad_norm": 0.35498547554016113, "learning_rate": 9.174143010467762e-07, "loss": 0.0279, "step": 56520 }, { "grad_norm": 0.3972140848636627, "learning_rate": 9.12166953351462e-07, "loss": 0.0174, "step": 56530 }, { "grad_norm": 0.25762075185775757, "learning_rate": 9.069345172437404e-07, "loss": 0.0173, "step": 56540 }, { "grad_norm": 0.3170675039291382, "learning_rate": 9.017169943130843e-07, "loss": 0.0204, "step": 56550 }, { "grad_norm": 0.4584876298904419, "learning_rate": 8.965143861444425e-07, "loss": 0.0152, "step": 56560 }, { "grad_norm": 0.5429971814155579, "learning_rate": 8.91326694318223e-07, "loss": 0.0248, "step": 56570 }, { "grad_norm": 0.307708740234375, "learning_rate": 8.861539204103098e-07, "loss": 0.0144, "step": 56580 }, { "grad_norm": 0.24751587212085724, "learning_rate": 8.809960659920735e-07, "loss": 0.0246, "step": 56590 }, { "grad_norm": 0.28221964836120605, "learning_rate": 8.758531326303055e-07, "loss": 0.016, "step": 56600 }, { "grad_norm": 0.31268319487571716, "learning_rate": 8.707251218873169e-07, "loss": 0.0173, "step": 56610 }, { "grad_norm": 0.28248822689056396, "learning_rate": 8.656120353208507e-07, "loss": 0.0184, "step": 56620 }, { "grad_norm": 0.4500970244407654, "learning_rate": 8.605138744841312e-07, "loss": 0.0201, "step": 56630 }, { "grad_norm": 0.2677329182624817, "learning_rate": 8.554306409258417e-07, "loss": 0.0194, "step": 56640 }, { "grad_norm": 0.3578583300113678, "learning_rate": 8.503623361901358e-07, "loss": 0.0161, "step": 56650 }, { "grad_norm": 0.6503327488899231, "learning_rate": 8.453089618166377e-07, "loss": 0.0243, "step": 56660 }, { "grad_norm": 0.49715378880500793, "learning_rate": 8.402705193404137e-07, "loss": 0.0234, "step": 56670 }, { "grad_norm": 0.390621155500412, "learning_rate": 8.352470102920174e-07, "loss": 0.0168, "step": 56680 }, { "grad_norm": 0.49970605969429016, "learning_rate": 8.302384361974669e-07, "loss": 0.0219, "step": 56690 }, { "grad_norm": 0.4149988889694214, "learning_rate": 8.252447985782231e-07, "loss": 0.0129, "step": 56700 }, { "grad_norm": 0.20790210366249084, "learning_rate": 8.202660989512279e-07, "loss": 0.0189, "step": 56710 }, { "grad_norm": 0.2907707691192627, "learning_rate": 8.153023388288772e-07, "loss": 0.0152, "step": 56720 }, { "grad_norm": 0.4367403984069824, "learning_rate": 8.103535197190204e-07, "loss": 0.0218, "step": 56730 }, { "grad_norm": 0.5765858292579651, "learning_rate": 8.054196431249993e-07, "loss": 0.0216, "step": 56740 }, { "grad_norm": 0.456114262342453, "learning_rate": 8.005007105455709e-07, "loss": 0.0209, "step": 56750 }, { "grad_norm": 0.4597046673297882, "learning_rate": 7.95596723474995e-07, "loss": 0.0205, "step": 56760 }, { "grad_norm": 0.683719277381897, "learning_rate": 7.907076834029692e-07, "loss": 0.0204, "step": 56770 }, { "grad_norm": 0.3622368574142456, "learning_rate": 7.858335918146498e-07, "loss": 0.0334, "step": 56780 }, { "grad_norm": 0.12585711479187012, "learning_rate": 7.809744501906635e-07, "loss": 0.0129, "step": 56790 }, { "grad_norm": 0.34160345792770386, "learning_rate": 7.761302600070797e-07, "loss": 0.0188, "step": 56800 }, { "grad_norm": 0.3829641342163086, "learning_rate": 7.713010227354545e-07, "loss": 0.0242, "step": 56810 }, { "grad_norm": 0.5877901911735535, "learning_rate": 7.664867398427589e-07, "loss": 0.0267, "step": 56820 }, { "grad_norm": 0.5224180817604065, "learning_rate": 7.616874127914619e-07, "loss": 0.0273, "step": 56830 }, { "grad_norm": 0.40108588337898254, "learning_rate": 7.569030430394641e-07, "loss": 0.0183, "step": 56840 }, { "grad_norm": 0.5273886322975159, "learning_rate": 7.521336320401306e-07, "loss": 0.0202, "step": 56850 }, { "grad_norm": 0.45853474736213684, "learning_rate": 7.473791812422915e-07, "loss": 0.0199, "step": 56860 }, { "grad_norm": 0.3999346196651459, "learning_rate": 7.426396920902134e-07, "loss": 0.0188, "step": 56870 }, { "grad_norm": 0.31934186816215515, "learning_rate": 7.379151660236283e-07, "loss": 0.021, "step": 56880 }, { "grad_norm": 0.24230806529521942, "learning_rate": 7.332056044777324e-07, "loss": 0.0152, "step": 56890 }, { "grad_norm": 0.20672887563705444, "learning_rate": 7.285110088831537e-07, "loss": 0.0205, "step": 56900 }, { "grad_norm": 0.19980204105377197, "learning_rate": 7.238313806659902e-07, "loss": 0.0194, "step": 56910 }, { "grad_norm": 0.5904955267906189, "learning_rate": 7.191667212477993e-07, "loss": 0.0238, "step": 56920 }, { "grad_norm": 0.39962446689605713, "learning_rate": 7.145170320455697e-07, "loss": 0.0195, "step": 56930 }, { "grad_norm": 0.48691731691360474, "learning_rate": 7.098823144717604e-07, "loss": 0.0196, "step": 56940 }, { "grad_norm": 0.4059668481349945, "learning_rate": 7.052625699342674e-07, "loss": 0.0239, "step": 56950 }, { "grad_norm": 0.42681172490119934, "learning_rate": 7.006577998364628e-07, "loss": 0.0287, "step": 56960 }, { "grad_norm": 0.27430006861686707, "learning_rate": 6.96068005577133e-07, "loss": 0.0154, "step": 56970 }, { "grad_norm": 0.2542240023612976, "learning_rate": 6.914931885505627e-07, "loss": 0.0202, "step": 56980 }, { "grad_norm": 0.2544598877429962, "learning_rate": 6.869333501464347e-07, "loss": 0.0267, "step": 56990 }, { "grad_norm": 0.24623596668243408, "learning_rate": 6.823884917499246e-07, "loss": 0.0185, "step": 57000 }, { "grad_norm": 0.42630550265312195, "learning_rate": 6.778586147416278e-07, "loss": 0.0174, "step": 57010 }, { "grad_norm": 0.4770656228065491, "learning_rate": 6.733437204976156e-07, "loss": 0.0206, "step": 57020 }, { "grad_norm": 0.22100359201431274, "learning_rate": 6.688438103893857e-07, "loss": 0.0265, "step": 57030 }, { "grad_norm": 0.2803400754928589, "learning_rate": 6.64358885783889e-07, "loss": 0.0268, "step": 57040 }, { "grad_norm": 0.48158982396125793, "learning_rate": 6.598889480435299e-07, "loss": 0.0254, "step": 57050 }, { "grad_norm": 0.3576102554798126, "learning_rate": 6.554339985261615e-07, "loss": 0.0247, "step": 57060 }, { "grad_norm": 0.3260915279388428, "learning_rate": 6.509940385850733e-07, "loss": 0.0224, "step": 57070 }, { "grad_norm": 0.20287640392780304, "learning_rate": 6.465690695690141e-07, "loss": 0.0214, "step": 57080 }, { "grad_norm": 0.2564356327056885, "learning_rate": 6.421590928221699e-07, "loss": 0.0296, "step": 57090 }, { "grad_norm": 0.753498375415802, "learning_rate": 6.377641096841691e-07, "loss": 0.0243, "step": 57100 }, { "grad_norm": 0.37434279918670654, "learning_rate": 6.333841214901048e-07, "loss": 0.0226, "step": 57110 }, { "grad_norm": 0.33960750699043274, "learning_rate": 6.290191295704906e-07, "loss": 0.0198, "step": 57120 }, { "grad_norm": 0.234640434384346, "learning_rate": 6.246691352513046e-07, "loss": 0.0382, "step": 57130 }, { "grad_norm": 0.4016323983669281, "learning_rate": 6.203341398539452e-07, "loss": 0.0209, "step": 57140 }, { "grad_norm": 0.4031032919883728, "learning_rate": 6.160141446952872e-07, "loss": 0.0367, "step": 57150 }, { "grad_norm": 0.4020065665245056, "learning_rate": 6.11709151087625e-07, "loss": 0.0211, "step": 57160 }, { "grad_norm": 0.20647412538528442, "learning_rate": 6.074191603386958e-07, "loss": 0.0207, "step": 57170 }, { "grad_norm": 0.19174985587596893, "learning_rate": 6.031441737516907e-07, "loss": 0.0264, "step": 57180 }, { "grad_norm": 0.19396956264972687, "learning_rate": 5.988841926252431e-07, "loss": 0.0225, "step": 57190 }, { "grad_norm": 0.1764955371618271, "learning_rate": 5.946392182534066e-07, "loss": 0.0176, "step": 57200 }, { "grad_norm": 0.3204851448535919, "learning_rate": 5.90409251925711e-07, "loss": 0.0319, "step": 57210 }, { "grad_norm": 0.2388501763343811, "learning_rate": 5.861942949270949e-07, "loss": 0.0221, "step": 57220 }, { "grad_norm": 0.397731214761734, "learning_rate": 5.819943485379564e-07, "loss": 0.0206, "step": 57230 }, { "grad_norm": 0.41949301958084106, "learning_rate": 5.778094140341306e-07, "loss": 0.0233, "step": 57240 }, { "grad_norm": 0.5248616337776184, "learning_rate": 5.736394926868893e-07, "loss": 0.0204, "step": 57250 }, { "grad_norm": 0.4904094636440277, "learning_rate": 5.694845857629416e-07, "loss": 0.0146, "step": 57260 }, { "grad_norm": 0.2362263947725296, "learning_rate": 5.653446945244334e-07, "loss": 0.0165, "step": 57270 }, { "grad_norm": 1.018144965171814, "learning_rate": 5.612198202289698e-07, "loss": 0.0234, "step": 57280 }, { "grad_norm": 0.410300076007843, "learning_rate": 5.571099641295596e-07, "loss": 0.0213, "step": 57290 }, { "grad_norm": 0.6102570295333862, "learning_rate": 5.530151274746875e-07, "loss": 0.0189, "step": 57300 }, { "grad_norm": 0.4143592417240143, "learning_rate": 5.489353115082418e-07, "loss": 0.0218, "step": 57310 }, { "grad_norm": 0.5398693084716797, "learning_rate": 5.4487051746957e-07, "loss": 0.0281, "step": 57320 }, { "grad_norm": 0.4636707603931427, "learning_rate": 5.408207465934511e-07, "loss": 0.0133, "step": 57330 }, { "grad_norm": 0.30488163232803345, "learning_rate": 5.3678600011009e-07, "loss": 0.0186, "step": 57340 }, { "grad_norm": 0.5408939123153687, "learning_rate": 5.327662792451449e-07, "loss": 0.0244, "step": 57350 }, { "grad_norm": 0.5390161275863647, "learning_rate": 5.287615852196947e-07, "loss": 0.0234, "step": 57360 }, { "grad_norm": 0.36342889070510864, "learning_rate": 5.247719192502665e-07, "loss": 0.0181, "step": 57370 }, { "grad_norm": 0.1634533852338791, "learning_rate": 5.207972825488128e-07, "loss": 0.0133, "step": 57380 }, { "grad_norm": 0.31573715806007385, "learning_rate": 5.168376763227178e-07, "loss": 0.0254, "step": 57390 }, { "grad_norm": 0.30228888988494873, "learning_rate": 5.12893101774814e-07, "loss": 0.0126, "step": 57400 }, { "grad_norm": 0.5635353922843933, "learning_rate": 5.089635601033483e-07, "loss": 0.0191, "step": 57410 }, { "grad_norm": 0.4047757089138031, "learning_rate": 5.050490525020213e-07, "loss": 0.0252, "step": 57420 }, { "grad_norm": 0.2895248532295227, "learning_rate": 5.011495801599541e-07, "loss": 0.0171, "step": 57430 }, { "grad_norm": 0.19029873609542847, "learning_rate": 4.972651442616994e-07, "loss": 0.0256, "step": 57440 }, { "grad_norm": 0.468797504901886, "learning_rate": 4.933957459872574e-07, "loss": 0.0207, "step": 57450 }, { "grad_norm": 0.20081089437007904, "learning_rate": 4.895413865120324e-07, "loss": 0.0484, "step": 57460 }, { "grad_norm": 0.24995552003383636, "learning_rate": 4.857020670068935e-07, "loss": 0.0209, "step": 57470 }, { "grad_norm": 0.2668546140193939, "learning_rate": 4.818777886381132e-07, "loss": 0.0179, "step": 57480 }, { "grad_norm": 0.3173988461494446, "learning_rate": 4.780685525674122e-07, "loss": 0.0251, "step": 57490 }, { "grad_norm": 0.3505961298942566, "learning_rate": 4.7427435995193724e-07, "loss": 0.02, "step": 57500 }, { "grad_norm": 0.0898575559258461, "learning_rate": 4.7049521194425515e-07, "loss": 0.0282, "step": 57510 }, { "grad_norm": 0.29311883449554443, "learning_rate": 4.667311096923754e-07, "loss": 0.0158, "step": 57520 }, { "grad_norm": 0.38325417041778564, "learning_rate": 4.6298205433973895e-07, "loss": 0.0196, "step": 57530 }, { "grad_norm": 0.8134199976921082, "learning_rate": 4.5924804702520696e-07, "loss": 0.0216, "step": 57540 }, { "grad_norm": 0.2861372232437134, "learning_rate": 4.5552908888306655e-07, "loss": 0.0218, "step": 57550 }, { "grad_norm": 0.3683083951473236, "learning_rate": 4.5182518104304185e-07, "loss": 0.0226, "step": 57560 }, { "grad_norm": 0.3981439173221588, "learning_rate": 4.4813632463028277e-07, "loss": 0.0224, "step": 57570 }, { "grad_norm": 0.23878513276576996, "learning_rate": 4.444625207653763e-07, "loss": 0.0256, "step": 57580 }, { "grad_norm": 0.42365455627441406, "learning_rate": 4.4080377056430753e-07, "loss": 0.0224, "step": 57590 }, { "grad_norm": 0.2328474372625351, "learning_rate": 4.371600751385263e-07, "loss": 0.0183, "step": 57600 }, { "grad_norm": 0.3911654055118561, "learning_rate": 4.3353143559487495e-07, "loss": 0.0181, "step": 57610 }, { "grad_norm": 0.43414509296417236, "learning_rate": 4.2991785303565513e-07, "loss": 0.0172, "step": 57620 }, { "grad_norm": 0.41024675965309143, "learning_rate": 4.2631932855856647e-07, "loss": 0.0235, "step": 57630 }, { "grad_norm": 0.23843088746070862, "learning_rate": 4.2273586325674576e-07, "loss": 0.0173, "step": 57640 }, { "grad_norm": 0.19736701250076294, "learning_rate": 4.1916745821876103e-07, "loss": 0.0132, "step": 57650 }, { "grad_norm": 0.1459006369113922, "learning_rate": 4.156141145285897e-07, "loss": 0.0164, "step": 57660 }, { "grad_norm": 0.30398812890052795, "learning_rate": 4.1207583326566267e-07, "loss": 0.0178, "step": 57670 }, { "grad_norm": 0.39095720648765564, "learning_rate": 4.085526155047925e-07, "loss": 0.0188, "step": 57680 }, { "grad_norm": 0.5341652035713196, "learning_rate": 4.050444623162564e-07, "loss": 0.0138, "step": 57690 }, { "grad_norm": 0.5818784236907959, "learning_rate": 4.015513747657351e-07, "loss": 0.0312, "step": 57700 }, { "grad_norm": 0.339880108833313, "learning_rate": 3.9807335391433554e-07, "loss": 0.0235, "step": 57710 }, { "grad_norm": 0.39587002992630005, "learning_rate": 3.946104008185847e-07, "loss": 0.0148, "step": 57720 }, { "grad_norm": 0.533129096031189, "learning_rate": 3.9116251653044113e-07, "loss": 0.0178, "step": 57730 }, { "grad_norm": 0.2489190250635147, "learning_rate": 3.877297020972781e-07, "loss": 0.022, "step": 57740 }, { "grad_norm": 0.1466773897409439, "learning_rate": 3.8431195856190036e-07, "loss": 0.0254, "step": 57750 }, { "grad_norm": 0.3867305517196655, "learning_rate": 3.8090928696251635e-07, "loss": 0.0245, "step": 57760 }, { "grad_norm": 0.3250673711299896, "learning_rate": 3.77521688332777e-07, "loss": 0.0176, "step": 57770 }, { "grad_norm": 0.3259287178516388, "learning_rate": 3.741491637017425e-07, "loss": 0.0174, "step": 57780 }, { "grad_norm": 0.3794943392276764, "learning_rate": 3.707917140939043e-07, "loss": 0.0145, "step": 57790 }, { "grad_norm": 0.24849702417850494, "learning_rate": 3.6744934052915235e-07, "loss": 0.0146, "step": 57800 }, { "grad_norm": 0.22514809668064117, "learning_rate": 3.641220440228188e-07, "loss": 0.0162, "step": 57810 }, { "grad_norm": 0.525413453578949, "learning_rate": 3.608098255856562e-07, "loss": 0.0212, "step": 57820 }, { "grad_norm": 0.4512537717819214, "learning_rate": 3.575126862238154e-07, "loss": 0.018, "step": 57830 }, { "grad_norm": 0.14788393676280975, "learning_rate": 3.5423062693888955e-07, "loss": 0.0167, "step": 57840 }, { "grad_norm": 0.5764786005020142, "learning_rate": 3.509636487278756e-07, "loss": 0.0182, "step": 57850 }, { "grad_norm": 0.31115832924842834, "learning_rate": 3.4771175258320186e-07, "loss": 0.0173, "step": 57860 }, { "grad_norm": 0.3591616749763489, "learning_rate": 3.4447493949270047e-07, "loss": 0.0154, "step": 57870 }, { "grad_norm": 0.5261187553405762, "learning_rate": 3.4125321043964045e-07, "loss": 0.0177, "step": 57880 }, { "grad_norm": 0.23104193806648254, "learning_rate": 3.380465664026833e-07, "loss": 0.0235, "step": 57890 }, { "grad_norm": 0.12053153663873672, "learning_rate": 3.348550083559388e-07, "loss": 0.0221, "step": 57900 }, { "grad_norm": 0.352042555809021, "learning_rate": 3.316785372689091e-07, "loss": 0.0217, "step": 57910 }, { "grad_norm": 0.2910352647304535, "learning_rate": 3.285171541065224e-07, "loss": 0.0135, "step": 57920 }, { "grad_norm": 0.3771366775035858, "learning_rate": 3.253708598291272e-07, "loss": 0.0229, "step": 57930 }, { "grad_norm": 0.5587508082389832, "learning_rate": 3.2223965539248116e-07, "loss": 0.0178, "step": 57940 }, { "grad_norm": 0.3169196844100952, "learning_rate": 3.1912354174776227e-07, "loss": 0.0148, "step": 57950 }, { "grad_norm": 0.18776348233222961, "learning_rate": 3.1602251984155786e-07, "loss": 0.0125, "step": 57960 }, { "grad_norm": 0.6596295833587646, "learning_rate": 3.1293659061589207e-07, "loss": 0.0146, "step": 57970 }, { "grad_norm": 0.538142204284668, "learning_rate": 3.098657550081707e-07, "loss": 0.0218, "step": 57980 }, { "grad_norm": 0.41861897706985474, "learning_rate": 3.068100139512475e-07, "loss": 0.0199, "step": 57990 }, { "grad_norm": 0.4119289517402649, "learning_rate": 3.037693683733689e-07, "loss": 0.0166, "step": 58000 }, { "grad_norm": 0.32662808895111084, "learning_rate": 3.007438191982015e-07, "loss": 0.0196, "step": 58010 }, { "grad_norm": 0.34207335114479065, "learning_rate": 2.9773336734482684e-07, "loss": 0.0205, "step": 58020 }, { "grad_norm": 0.4744192957878113, "learning_rate": 2.9473801372774667e-07, "loss": 0.0204, "step": 58030 }, { "grad_norm": 0.2504667341709137, "learning_rate": 2.91757759256861e-07, "loss": 0.0175, "step": 58040 }, { "grad_norm": 0.158586785197258, "learning_rate": 2.887926048375067e-07, "loss": 0.0202, "step": 58050 }, { "grad_norm": 0.13077345490455627, "learning_rate": 2.858425513704022e-07, "loss": 0.0146, "step": 58060 }, { "grad_norm": 0.41363734006881714, "learning_rate": 2.8290759975170834e-07, "loss": 0.0239, "step": 58070 }, { "grad_norm": 0.3119538426399231, "learning_rate": 2.799877508729787e-07, "loss": 0.0256, "step": 58080 }, { "grad_norm": 0.8739739060401917, "learning_rate": 2.770830056211926e-07, "loss": 0.0232, "step": 58090 }, { "grad_norm": 0.2962716519832611, "learning_rate": 2.741933648787331e-07, "loss": 0.0217, "step": 58100 }, { "grad_norm": 0.315327912569046, "learning_rate": 2.7131882952339263e-07, "loss": 0.0183, "step": 58110 }, { "grad_norm": 0.5910035967826843, "learning_rate": 2.684594004283836e-07, "loss": 0.0284, "step": 58120 }, { "grad_norm": 0.2635203003883362, "learning_rate": 2.6561507846232234e-07, "loss": 0.019, "step": 58130 }, { "grad_norm": 1.1080836057662964, "learning_rate": 2.6278586448924005e-07, "loss": 0.0244, "step": 58140 }, { "grad_norm": 0.9057651162147522, "learning_rate": 2.5997175936857685e-07, "loss": 0.0228, "step": 58150 }, { "grad_norm": 0.22720377147197723, "learning_rate": 2.57172763955188e-07, "loss": 0.0178, "step": 58160 }, { "grad_norm": 0.4013720154762268, "learning_rate": 2.543888790993265e-07, "loss": 0.0204, "step": 58170 }, { "grad_norm": 0.2315722405910492, "learning_rate": 2.5162010564666607e-07, "loss": 0.0182, "step": 58180 }, { "grad_norm": 0.5386664271354675, "learning_rate": 2.488664444382893e-07, "loss": 0.0177, "step": 58190 }, { "grad_norm": 0.3181219696998596, "learning_rate": 2.461278963106828e-07, "loss": 0.0191, "step": 58200 }, { "grad_norm": 0.38124343752861023, "learning_rate": 2.434044620957421e-07, "loss": 0.0186, "step": 58210 }, { "grad_norm": 0.565185010433197, "learning_rate": 2.406961426207832e-07, "loss": 0.0164, "step": 58220 }, { "grad_norm": 0.41961678862571716, "learning_rate": 2.380029387085203e-07, "loss": 0.0161, "step": 58230 }, { "grad_norm": 0.8911678791046143, "learning_rate": 2.353248511770767e-07, "loss": 0.0265, "step": 58240 }, { "grad_norm": 0.3414516746997833, "learning_rate": 2.3266188083997942e-07, "loss": 0.0166, "step": 58250 }, { "grad_norm": 0.3390418589115143, "learning_rate": 2.3001402850617027e-07, "loss": 0.0187, "step": 58260 }, { "grad_norm": 0.363781213760376, "learning_rate": 2.2738129498000581e-07, "loss": 0.0274, "step": 58270 }, { "grad_norm": 0.311284601688385, "learning_rate": 2.2476368106122414e-07, "loss": 0.026, "step": 58280 }, { "grad_norm": 0.40746089816093445, "learning_rate": 2.2216118754500582e-07, "loss": 0.0216, "step": 58290 }, { "grad_norm": 0.18175311386585236, "learning_rate": 2.1957381522190735e-07, "loss": 0.0187, "step": 58300 }, { "grad_norm": 0.2877090275287628, "learning_rate": 2.1700156487790558e-07, "loss": 0.0296, "step": 58310 }, { "grad_norm": 0.36807572841644287, "learning_rate": 2.1444443729439213e-07, "loss": 0.0196, "step": 58320 }, { "grad_norm": 0.29772713780403137, "learning_rate": 2.1190243324814007e-07, "loss": 0.0222, "step": 58330 }, { "grad_norm": 0.3751228451728821, "learning_rate": 2.0937555351135395e-07, "loss": 0.0301, "step": 58340 }, { "grad_norm": 0.4183208644390106, "learning_rate": 2.0686379885162532e-07, "loss": 0.0279, "step": 58350 }, { "grad_norm": 0.3525357246398926, "learning_rate": 2.0436717003196604e-07, "loss": 0.0249, "step": 58360 }, { "grad_norm": 0.30951181054115295, "learning_rate": 2.0188566781078057e-07, "loss": 0.0113, "step": 58370 }, { "grad_norm": 0.6977769732475281, "learning_rate": 1.994192929418881e-07, "loss": 0.026, "step": 58380 }, { "grad_norm": 0.44806769490242004, "learning_rate": 1.9696804617451158e-07, "loss": 0.0184, "step": 58390 }, { "grad_norm": 0.7450365424156189, "learning_rate": 1.9453192825326093e-07, "loss": 0.0251, "step": 58400 }, { "grad_norm": 0.4115981459617615, "learning_rate": 1.9211093991817753e-07, "loss": 0.0183, "step": 58410 }, { "grad_norm": 0.5096988677978516, "learning_rate": 1.8970508190468973e-07, "loss": 0.0204, "step": 58420 }, { "grad_norm": 0.1888100951910019, "learning_rate": 1.8731435494362958e-07, "loss": 0.0237, "step": 58430 }, { "grad_norm": 0.17962846159934998, "learning_rate": 1.849387597612495e-07, "loss": 0.0225, "step": 58440 }, { "grad_norm": 0.2066667377948761, "learning_rate": 1.8257829707917228e-07, "loss": 0.017, "step": 58450 }, { "grad_norm": 0.22807978093624115, "learning_rate": 1.8023296761446317e-07, "loss": 0.0213, "step": 58460 }, { "grad_norm": 0.3986574113368988, "learning_rate": 1.7790277207956341e-07, "loss": 0.0379, "step": 58470 }, { "grad_norm": 0.35622134804725647, "learning_rate": 1.7558771118232343e-07, "loss": 0.0172, "step": 58480 }, { "grad_norm": 0.3441099524497986, "learning_rate": 1.7328778562599734e-07, "loss": 0.0125, "step": 58490 }, { "grad_norm": 0.6010122895240784, "learning_rate": 1.7100299610924298e-07, "loss": 0.0184, "step": 58500 }, { "grad_norm": 0.26140090823173523, "learning_rate": 1.6873334332612733e-07, "loss": 0.0182, "step": 58510 }, { "grad_norm": 0.23566380143165588, "learning_rate": 1.6647882796609894e-07, "loss": 0.0246, "step": 58520 }, { "grad_norm": 0.44743114709854126, "learning_rate": 1.6423945071402102e-07, "loss": 0.0196, "step": 58530 }, { "grad_norm": 0.3286674916744232, "learning_rate": 1.6201521225016614e-07, "loss": 0.0196, "step": 58540 }, { "grad_norm": 0.6130770444869995, "learning_rate": 1.598061132501938e-07, "loss": 0.021, "step": 58550 }, { "grad_norm": 0.4841345548629761, "learning_rate": 1.576121543851672e-07, "loss": 0.0207, "step": 58560 }, { "grad_norm": 0.34778323769569397, "learning_rate": 1.5543333632155876e-07, "loss": 0.0304, "step": 58570 }, { "grad_norm": 0.411042720079422, "learning_rate": 1.5326965972123352e-07, "loss": 0.0164, "step": 58580 }, { "grad_norm": 0.3881392478942871, "learning_rate": 1.5112112524146016e-07, "loss": 0.0179, "step": 58590 }, { "grad_norm": 0.1717434972524643, "learning_rate": 1.4898773353489992e-07, "loss": 0.0204, "step": 58600 }, { "grad_norm": 0.09827342629432678, "learning_rate": 1.4686948524962884e-07, "loss": 0.0191, "step": 58610 }, { "grad_norm": 0.843288004398346, "learning_rate": 1.4476638102911556e-07, "loss": 0.0279, "step": 58620 }, { "grad_norm": 0.336485356092453, "learning_rate": 1.4267842151222123e-07, "loss": 0.025, "step": 58630 }, { "grad_norm": 0.33221498131752014, "learning_rate": 1.4060560733321626e-07, "loss": 0.0195, "step": 58640 }, { "grad_norm": 0.16355273127555847, "learning_rate": 1.385479391217692e-07, "loss": 0.0229, "step": 58650 }, { "grad_norm": 0.388053297996521, "learning_rate": 1.365054175029412e-07, "loss": 0.0225, "step": 58660 }, { "grad_norm": 0.16020329296588898, "learning_rate": 1.3447804309719702e-07, "loss": 0.0174, "step": 58670 }, { "grad_norm": 0.18345439434051514, "learning_rate": 1.3246581652040512e-07, "loss": 0.0193, "step": 58680 }, { "grad_norm": 0.3284805715084076, "learning_rate": 1.3046873838381546e-07, "loss": 0.0217, "step": 58690 }, { "grad_norm": 0.5434720516204834, "learning_rate": 1.2848680929409828e-07, "loss": 0.0244, "step": 58700 }, { "grad_norm": 0.32558178901672363, "learning_rate": 1.2652002985331091e-07, "loss": 0.0167, "step": 58710 }, { "grad_norm": 0.4878475069999695, "learning_rate": 1.2456840065889764e-07, "loss": 0.0253, "step": 58720 }, { "grad_norm": 0.3480880856513977, "learning_rate": 1.226319223037231e-07, "loss": 0.0174, "step": 58730 }, { "grad_norm": 0.39818045496940613, "learning_rate": 1.2071059537603902e-07, "loss": 0.0182, "step": 58740 }, { "grad_norm": 0.13438737392425537, "learning_rate": 1.1880442045948403e-07, "loss": 0.0178, "step": 58750 }, { "grad_norm": 0.24130350351333618, "learning_rate": 1.1691339813311164e-07, "loss": 0.0211, "step": 58760 }, { "grad_norm": 0.4537890553474426, "learning_rate": 1.1503752897136233e-07, "loss": 0.0167, "step": 58770 }, { "grad_norm": 0.2780086100101471, "learning_rate": 1.1317681354407472e-07, "loss": 0.0149, "step": 58780 }, { "grad_norm": 0.7581173181533813, "learning_rate": 1.1133125241649111e-07, "loss": 0.0311, "step": 58790 }, { "grad_norm": 0.30223801732063293, "learning_rate": 1.0950084614922973e-07, "loss": 0.0144, "step": 58800 }, { "grad_norm": 0.18243347108364105, "learning_rate": 1.0768559529834021e-07, "loss": 0.0185, "step": 58810 }, { "grad_norm": 0.5160495042800903, "learning_rate": 1.0588550041522594e-07, "loss": 0.0241, "step": 58820 }, { "grad_norm": 0.24206838011741638, "learning_rate": 1.0410056204672169e-07, "loss": 0.0166, "step": 58830 }, { "grad_norm": 0.3571574091911316, "learning_rate": 1.0233078073504376e-07, "loss": 0.0193, "step": 58840 }, { "grad_norm": 0.34279775619506836, "learning_rate": 1.0057615701780654e-07, "loss": 0.014, "step": 58850 }, { "grad_norm": 0.550478994846344, "learning_rate": 9.883669142801144e-08, "loss": 0.0258, "step": 58860 }, { "grad_norm": 0.24811893701553345, "learning_rate": 9.711238449406356e-08, "loss": 0.0145, "step": 58870 }, { "grad_norm": 0.2727779150009155, "learning_rate": 9.540323673976614e-08, "loss": 0.0264, "step": 58880 }, { "grad_norm": 0.3432424068450928, "learning_rate": 9.370924868430942e-08, "loss": 0.0303, "step": 58890 }, { "grad_norm": 0.25157877802848816, "learning_rate": 9.203042084228175e-08, "loss": 0.0176, "step": 58900 }, { "grad_norm": 0.17951565980911255, "learning_rate": 9.036675372366965e-08, "loss": 0.0216, "step": 58910 }, { "grad_norm": 0.21431699395179749, "learning_rate": 8.871824783385218e-08, "loss": 0.0146, "step": 58920 }, { "grad_norm": 0.37574124336242676, "learning_rate": 8.7084903673601e-08, "loss": 0.0174, "step": 58930 }, { "grad_norm": 0.2910180687904358, "learning_rate": 8.546672173908032e-08, "loss": 0.0199, "step": 58940 }, { "grad_norm": 0.20702187716960907, "learning_rate": 8.386370252185249e-08, "loss": 0.0267, "step": 58950 }, { "grad_norm": 0.2912044823169708, "learning_rate": 8.227584650887243e-08, "loss": 0.0209, "step": 58960 }, { "grad_norm": 0.2261659950017929, "learning_rate": 8.070315418249319e-08, "loss": 0.0158, "step": 58970 }, { "grad_norm": 0.23853175342082977, "learning_rate": 7.914562602044929e-08, "loss": 0.0195, "step": 58980 }, { "grad_norm": 0.7367177605628967, "learning_rate": 7.76032624958789e-08, "loss": 0.0275, "step": 58990 }, { "grad_norm": 1.059195876121521, "learning_rate": 7.607606407731282e-08, "loss": 0.0219, "step": 59000 }, { "grad_norm": 0.3559046983718872, "learning_rate": 7.45640312286744e-08, "loss": 0.0185, "step": 59010 }, { "grad_norm": 0.5096719861030579, "learning_rate": 7.306716440927952e-08, "loss": 0.0178, "step": 59020 }, { "grad_norm": 0.2614828050136566, "learning_rate": 7.15854640738367e-08, "loss": 0.0175, "step": 59030 }, { "grad_norm": 0.20604673027992249, "learning_rate": 7.011893067244701e-08, "loss": 0.0221, "step": 59040 }, { "grad_norm": 0.2553955316543579, "learning_rate": 6.866756465060408e-08, "loss": 0.0186, "step": 59050 }, { "grad_norm": 0.26057958602905273, "learning_rate": 6.723136644918859e-08, "loss": 0.0187, "step": 59060 }, { "grad_norm": 1.153708577156067, "learning_rate": 6.581033650449042e-08, "loss": 0.0229, "step": 59070 }, { "grad_norm": 0.27039799094200134, "learning_rate": 6.440447524817539e-08, "loss": 0.0384, "step": 59080 }, { "grad_norm": 0.42596736550331116, "learning_rate": 6.301378310730743e-08, "loss": 0.0193, "step": 59090 }, { "grad_norm": 0.4107165038585663, "learning_rate": 6.163826050434307e-08, "loss": 0.0167, "step": 59100 }, { "grad_norm": 0.45320671796798706, "learning_rate": 6.027790785713139e-08, "loss": 0.0199, "step": 59110 }, { "grad_norm": 0.23150089383125305, "learning_rate": 5.89327255789085e-08, "loss": 0.0157, "step": 59120 }, { "grad_norm": 0.3935047388076782, "learning_rate": 5.7602714078303085e-08, "loss": 0.0209, "step": 59130 }, { "grad_norm": 0.3805312216281891, "learning_rate": 5.628787375934197e-08, "loss": 0.0187, "step": 59140 }, { "grad_norm": 0.3106011748313904, "learning_rate": 5.498820502143898e-08, "loss": 0.0254, "step": 59150 }, { "grad_norm": 0.1428183913230896, "learning_rate": 5.370370825939497e-08, "loss": 0.0231, "step": 59160 }, { "grad_norm": 0.14292550086975098, "learning_rate": 5.243438386340893e-08, "loss": 0.0197, "step": 59170 }, { "grad_norm": 0.35030654072761536, "learning_rate": 5.118023221907242e-08, "loss": 0.0268, "step": 59180 }, { "grad_norm": 0.14950452744960785, "learning_rate": 4.994125370735292e-08, "loss": 0.0136, "step": 59190 }, { "grad_norm": 0.3346584737300873, "learning_rate": 4.871744870462713e-08, "loss": 0.015, "step": 59200 }, { "grad_norm": 0.3192692697048187, "learning_rate": 4.7508817582658794e-08, "loss": 0.0193, "step": 59210 }, { "grad_norm": 0.4391400218009949, "learning_rate": 4.631536070858755e-08, "loss": 0.0142, "step": 59220 }, { "grad_norm": 0.13826514780521393, "learning_rate": 4.513707844495674e-08, "loss": 0.0218, "step": 59230 }, { "grad_norm": 1.0197919607162476, "learning_rate": 4.3973971149702255e-08, "loss": 0.0183, "step": 59240 }, { "grad_norm": 0.27333158254623413, "learning_rate": 4.2826039176147025e-08, "loss": 0.0209, "step": 59250 }, { "grad_norm": 0.6137517690658569, "learning_rate": 4.169328287299545e-08, "loss": 0.0157, "step": 59260 }, { "grad_norm": 0.49093934893608093, "learning_rate": 4.057570258435006e-08, "loss": 0.0175, "step": 59270 }, { "grad_norm": 0.4008193910121918, "learning_rate": 3.947329864970595e-08, "loss": 0.0206, "step": 59280 }, { "grad_norm": 0.5147374272346497, "learning_rate": 3.8386071403939686e-08, "loss": 0.026, "step": 59290 }, { "grad_norm": 0.26667970418930054, "learning_rate": 3.731402117733152e-08, "loss": 0.019, "step": 59300 }, { "grad_norm": 0.237412229180336, "learning_rate": 3.625714829552651e-08, "loss": 0.0197, "step": 59310 }, { "grad_norm": 0.3829590380191803, "learning_rate": 3.5215453079590065e-08, "loss": 0.0278, "step": 59320 }, { "grad_norm": 0.47510528564453125, "learning_rate": 3.4188935845952396e-08, "loss": 0.0228, "step": 59330 }, { "grad_norm": 0.31703558564186096, "learning_rate": 3.3177596906447396e-08, "loss": 0.0192, "step": 59340 }, { "grad_norm": 0.3379102945327759, "learning_rate": 3.218143656829043e-08, "loss": 0.0185, "step": 59350 }, { "grad_norm": 0.49279677867889404, "learning_rate": 3.120045513408387e-08, "loss": 0.0208, "step": 59360 }, { "grad_norm": 0.4125659763813019, "learning_rate": 3.02346529018338e-08, "loss": 0.0213, "step": 59370 }, { "grad_norm": 0.34475964307785034, "learning_rate": 2.9284030164922204e-08, "loss": 0.0257, "step": 59380 }, { "grad_norm": 0.26994645595550537, "learning_rate": 2.8348587212123634e-08, "loss": 0.0167, "step": 59390 }, { "grad_norm": 0.27324768900871277, "learning_rate": 2.7428324327594125e-08, "loss": 0.0264, "step": 59400 }, { "grad_norm": 0.39314061403274536, "learning_rate": 2.6523241790893383e-08, "loss": 0.0194, "step": 59410 }, { "grad_norm": 0.4642446041107178, "learning_rate": 2.563333987695704e-08, "loss": 0.0339, "step": 59420 }, { "grad_norm": 0.4607559144496918, "learning_rate": 2.4758618856118852e-08, "loss": 0.0235, "step": 59430 }, { "grad_norm": 0.60435950756073, "learning_rate": 2.3899078994088497e-08, "loss": 0.0177, "step": 59440 }, { "grad_norm": 0.5731623768806458, "learning_rate": 2.3054720551973775e-08, "loss": 0.0232, "step": 59450 }, { "grad_norm": 0.6686480641365051, "learning_rate": 2.222554378627506e-08, "loss": 0.0405, "step": 59460 }, { "grad_norm": 0.25849124789237976, "learning_rate": 2.1411548948868653e-08, "loss": 0.0189, "step": 59470 }, { "grad_norm": 0.2075738161802292, "learning_rate": 2.0612736287023426e-08, "loss": 0.0227, "step": 59480 }, { "grad_norm": 0.4192596971988678, "learning_rate": 1.9829106043400826e-08, "loss": 0.0154, "step": 59490 }, { "grad_norm": 0.23238155245780945, "learning_rate": 1.9060658456043768e-08, "loss": 0.0246, "step": 59500 }, { "grad_norm": 0.2940039336681366, "learning_rate": 1.830739375838775e-08, "loss": 0.0192, "step": 59510 }, { "grad_norm": 0.22712092101573944, "learning_rate": 1.7569312179260832e-08, "loss": 0.023, "step": 59520 }, { "grad_norm": 0.2378648817539215, "learning_rate": 1.684641394286146e-08, "loss": 0.0217, "step": 59530 }, { "grad_norm": 0.29120635986328125, "learning_rate": 1.6138699268797296e-08, "loss": 0.0167, "step": 59540 }, { "grad_norm": 0.4450613558292389, "learning_rate": 1.5446168372046376e-08, "loss": 0.0177, "step": 59550 }, { "grad_norm": 0.3482188880443573, "learning_rate": 1.4768821462984861e-08, "loss": 0.0129, "step": 59560 }, { "grad_norm": 0.3732518255710602, "learning_rate": 1.4106658747370383e-08, "loss": 0.0215, "step": 59570 }, { "grad_norm": 0.1928289830684662, "learning_rate": 1.3459680426353149e-08, "loss": 0.0158, "step": 59580 }, { "grad_norm": 0.20452900230884552, "learning_rate": 1.2827886696464841e-08, "loss": 0.0244, "step": 59590 }, { "grad_norm": 0.3010455369949341, "learning_rate": 1.2211277749635264e-08, "loss": 0.0187, "step": 59600 }, { "grad_norm": 0.32138141989707947, "learning_rate": 1.1609853773164592e-08, "loss": 0.0185, "step": 59610 }, { "grad_norm": 0.5132644176483154, "learning_rate": 1.1023614949751127e-08, "loss": 0.0182, "step": 59620 }, { "grad_norm": 0.33849385380744934, "learning_rate": 1.0452561457485744e-08, "loss": 0.015, "step": 59630 }, { "grad_norm": 0.2413378208875656, "learning_rate": 9.896693469829688e-09, "loss": 0.0224, "step": 59640 }, { "grad_norm": 0.4605166018009186, "learning_rate": 9.35601115564788e-09, "loss": 0.032, "step": 59650 }, { "grad_norm": 0.8724208474159241, "learning_rate": 8.830514679186719e-09, "loss": 0.0222, "step": 59660 }, { "grad_norm": 0.3539263904094696, "learning_rate": 8.320204200074066e-09, "loss": 0.0216, "step": 59670 }, { "grad_norm": 0.20766448974609375, "learning_rate": 7.825079873324814e-09, "loss": 0.018, "step": 59680 }, { "grad_norm": 0.13525986671447754, "learning_rate": 7.345141849351977e-09, "loss": 0.0248, "step": 59690 }, { "grad_norm": 0.2763459384441376, "learning_rate": 6.880390273944493e-09, "loss": 0.0189, "step": 59700 }, { "grad_norm": 0.7462846040725708, "learning_rate": 6.4308252882838704e-09, "loss": 0.0245, "step": 59710 }, { "grad_norm": 0.34023916721343994, "learning_rate": 5.9964470289386455e-09, "loss": 0.0197, "step": 59720 }, { "grad_norm": 0.4585813581943512, "learning_rate": 5.577255627853273e-09, "loss": 0.0212, "step": 59730 }, { "grad_norm": 0.5436128973960876, "learning_rate": 5.173251212370334e-09, "loss": 0.0138, "step": 59740 }, { "grad_norm": 0.5341458916664124, "learning_rate": 4.784433905219432e-09, "loss": 0.0301, "step": 59750 }, { "grad_norm": 0.16818736493587494, "learning_rate": 4.4108038245060934e-09, "loss": 0.0196, "step": 59760 }, { "grad_norm": 0.23266373574733734, "learning_rate": 4.05236108373952e-09, "loss": 0.016, "step": 59770 }, { "grad_norm": 0.22807660698890686, "learning_rate": 3.7091057917937324e-09, "loss": 0.0199, "step": 59780 }, { "grad_norm": 0.3673882484436035, "learning_rate": 3.381038052946428e-09, "loss": 0.0347, "step": 59790 }, { "grad_norm": 0.7532874345779419, "learning_rate": 3.0681579668623283e-09, "loss": 0.0269, "step": 59800 }, { "grad_norm": 0.41027018427848816, "learning_rate": 2.7704656285709727e-09, "loss": 0.0211, "step": 59810 }, { "grad_norm": 0.46733760833740234, "learning_rate": 2.4879611285166803e-09, "loss": 0.0211, "step": 59820 }, { "grad_norm": 0.6380115747451782, "learning_rate": 2.2206445525085883e-09, "loss": 0.0212, "step": 59830 }, { "grad_norm": 0.43487274646759033, "learning_rate": 1.9685159817595112e-09, "loss": 0.0158, "step": 59840 }, { "grad_norm": 0.6487061977386475, "learning_rate": 1.7315754928470817e-09, "loss": 0.0209, "step": 59850 }, { "grad_norm": 0.3049255311489105, "learning_rate": 1.5098231577581611e-09, "loss": 0.0305, "step": 59860 }, { "grad_norm": 0.42982253432273865, "learning_rate": 1.3032590438499804e-09, "loss": 0.0231, "step": 59870 }, { "grad_norm": 0.35013458132743835, "learning_rate": 1.1118832138723444e-09, "loss": 0.0247, "step": 59880 }, { "grad_norm": 0.2482244074344635, "learning_rate": 9.356957259620825e-10, "loss": 0.0205, "step": 59890 }, { "grad_norm": 0.2953185439109802, "learning_rate": 7.74696633637495e-10, "loss": 0.016, "step": 59900 }, { "grad_norm": 0.5978455543518066, "learning_rate": 6.288859858039064e-10, "loss": 0.0288, "step": 59910 }, { "grad_norm": 0.5214975476264954, "learning_rate": 4.982638267647665e-10, "loss": 0.0217, "step": 59920 }, { "grad_norm": 0.33494675159454346, "learning_rate": 3.8283019618834406e-10, "loss": 0.0196, "step": 59930 }, { "grad_norm": 0.3668327331542969, "learning_rate": 2.825851291410331e-10, "loss": 0.0168, "step": 59940 }, { "grad_norm": 0.347065269947052, "learning_rate": 1.975286560873535e-10, "loss": 0.0194, "step": 59950 }, { "grad_norm": 0.39973312616348267, "learning_rate": 1.2766080285109284e-10, "loss": 0.0236, "step": 59960 }, { "grad_norm": 0.25761163234710693, "learning_rate": 7.298159065971533e-11, "loss": 0.0162, "step": 59970 }, { "grad_norm": 0.21721351146697998, "learning_rate": 3.349103612770854e-11, "loss": 0.0194, "step": 59980 }, { "grad_norm": 0.2793077528476715, "learning_rate": 9.189151245481142e-12, "loss": 0.0171, "step": 59990 }, { "grad_norm": 0.29749155044555664, "learning_rate": 7.594340156735769e-14, "loss": 0.0184, "step": 60000 } ], "logging_steps": 10, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }